Example #1
 def __init__(self, params):
     """See documentation in the base class"""
     Agent.__init__(self, params)
     # Q-table initialized with random values
     self.q = np.random.rand(self.num_states, self.num_actions)
     # one list of observed returns per (state, action) pair
     self.returns = {(state, action): [] for state in range(self.num_states) for action in range(self.num_actions)}
     # exploration rate, starts at 1 (fully exploratory)
     self.epsilon = 1
     # (state, action) tuples the agent has encountered
     self.tuple_state_agent_met = []
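The returns dictionary above keeps one list of sampled returns per (state, action) pair, which is the bookkeeping used by Monte Carlo control. A minimal sketch of the standard first-visit update, assuming an episode given as (state, action, reward) tuples (this helper is illustrative and not part of the example's repository):

import numpy as np

def first_visit_mc_update(q, returns, episode, gamma=0.99):
    """episode: list of (state, action, reward) tuples, in order."""
    g = 0.0
    for t in reversed(range(len(episode))):
        state, action, reward = episode[t]
        g = reward + gamma * g          # discounted return from step t
        # first-visit check: only record g for the earliest occurrence
        if (state, action) not in [(s, a) for s, a, _ in episode[:t]]:
            returns[(state, action)].append(g)
            q[state, action] = np.mean(returns[(state, action)])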
Example #2
    def __init__(self, params):
        Agent.__init__(self, params)

        # The following 3 attributes are provided by the environment.
        self.n_x = None
        self.n_y = None
        self.states = None

        self.X_memory = []
        self.Y_memory = []
        self.ia = MLPRegressor(warm_start=True,
                               max_iter=200,
                               early_stopping=False,
                               hidden_layer_sizes=(20, 10, 5),
                               learning_rate_init=1 * 10**-3,
                               activation='identity')

        self.epsilon = 0.5
        self.gamma = 0.8
        self.alpha = 0.5

        self.last_action = None
        self.last_state = None
        self.is_action_possible = None
        self.n_max_action = params["max_action_per_episode"]

        self.first_fit = True
        self.key_taken = []
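Because the regressor is created with warm_start=True, each additional fit() call continues from the current weights, so the memorised (X, Y) pairs can be refit incrementally between episodes. A small illustrative sketch of that pattern (the toy data below is an assumption, not taken from the repository); note that with activation='identity' the network is effectively a linear model:

import numpy as np
from sklearn.neural_network import MLPRegressor

model = MLPRegressor(warm_start=True, max_iter=200,
                     hidden_layer_sizes=(20, 10, 5),
                     learning_rate_init=1e-3, activation='identity')

X_memory = [[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]]   # toy state features
Y_memory = [0.0, 1.0, 0.5]                        # toy regression targets

for _ in range(3):                                # e.g. once per episode
    model.fit(np.array(X_memory), np.array(Y_memory))

print(model.predict(np.array([[0.5, 0.5]])))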
Example #3
class Game:
    def __init__(self, agent1, agent1parameters, agent2, agent2parameters, game, displayPrefix = None):
        self.board = games.getBoard(game)
        self.agent1 = Agent(agent1, agent1parameters, self.board)
        self.agent2 = Agent(agent2, agent2parameters, self.board)
        self.displayPrefix = displayPrefix


    # Return 1 if agent1 won
    # Return 0 if draw
    # Return -1 if agent2 won
    def playGame(self, displayState = True):
        gameState = self.board.start()
        agent1Turn = True
        turn = 0

        if displayState:
            self.board.display(gameState)

        while not self.board.isGameOver(gameState):
            if self.displayPrefix:
                print(self.displayPrefix + ' Turn {0}'.format(turn), end='\u001b[0K\r')
            turn += 1
            if displayState:
                print('{0}\'s turn'.format(self.agent1.agentType if agent1Turn else self.agent2.agentType))
            gameState = self.agent1.makeMove(gameState) if agent1Turn else self.agent2.makeMove(gameState)
            agent1Turn = not agent1Turn
            if displayState:
                self.board.display(gameState)
            
        # board.winner returns -1 when the game ends in a draw
        if self.board.winner(gameState) == -1:
            return 0

        # agent1Turn was toggled after the winning move, so the winner is the
        # agent whose flag is now False
        return -1 if agent1Turn else 1
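A hypothetical driver for the Game class above, tallying results with playGame's +1 / 0 / -1 convention (the agent type 'random', the empty parameter dicts, and the game name 'TicTacToe' are placeholders, not names taken from the repository):

results = {1: 0, 0: 0, -1: 0}
game = Game('random', {}, 'random', {}, 'TicTacToe')   # placeholder arguments
for _ in range(100):
    results[game.playGame(displayState=False)] += 1
print("agent1 wins: {0}, draws: {1}, agent2 wins: {2}".format(
    results[1], results[0], results[-1]))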
Example #4
    def __init__(self, params):
        """See documentation in the base class"""
        Agent.__init__(self, params)

        self.epsilon = 1
        self.n_key_max = 5
        self.alpha = 0.4
        self.gamma = 0.9999
        self.key_taken = []
        self.q = np.random.rand(self.num_states, self.num_actions, self.n_key_max)
Example #5
 def __init__(self, params):
     """See documentation in the base class"""
     Agent.__init__(self, params)

     # initial policy (at the start the agent does not know the optimal moves to make)
     # table of size (number of states) x (number of possible actions)
     self.Q = np.zeros((int(params['num_cells_grid1D']), 2))

     # Exploration probability
     self.exploration = 0.05
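For reference, the tabular Q-learning update such an agent typically applies to a table indexed as Q[state, action]; this helper is an illustrative sketch, not code from the repository:

import numpy as np

def q_update(Q, state, action, reward, next_state, alpha=0.1, gamma=0.99):
    # standard one-step Q-learning backup
    target = reward + gamma * np.max(Q[next_state])
    Q[state, action] += alpha * (target - Q[state, action])

Q = np.zeros((10, 2))      # 10 cells in the 1D grid, 2 actions (left / right)
q_update(Q, state=3, action=1, reward=1.0, next_state=4)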
Example #6
 def __init__(self,
              agent1,
              agent1parameters,
              agent2,
              agent2parameters,
              game,
              displayPrefix=None):
     self.board = games.getBoard(game)
     self.agent1 = Agent(agent1, agent1parameters, self.board)
     self.agent2 = Agent(agent2, agent2parameters, self.board)
     self.displayPrefix = displayPrefix
Example #7
    def train(self,
              Q: Agent,
              task: Task,
              epsilon: Epsilon,
              alpha: LearningRate,
              episodes,
              cache_train=True,
              test_times=1):
        Q.clear()
        epsilon.clear()
        alpha.clear()
        rewards_history = np.zeros(episodes, dtype=np.float32)
        steps_history = np.zeros(episodes, dtype=np.float32)
        episode_epsilon_history = np.zeros(episodes, dtype=np.float32)
        epsilon_history = []
        conseq_200 = 0
        self.episode = 0
        for e in range(episodes):
            steps, rewards, epsilons = self.run_episode(
                Q, task, epsilon, alpha)
            if cache_train:
                returns = 0.0
                for r in rewards[::-1]:
                    returns = r + self.gamma * returns
            else:
                returns, steps = 0.0, 0.0
                for _ in range(test_times):
                    returns_, steps_ = self.evaluate(Q, task)
                    returns += returns_ / test_times
                    steps += steps_ / test_times
            rewards_history[e] = returns
            steps_history[e] = steps
            episode_epsilon_history[e] = np.mean(epsilons)
            epsilon_history.append(epsilons)
            if e % 10 == 0:
                print('{} {} {}'.format(episode_epsilon_history[e], returns,
                                        steps))
            epsilon.update_end_of_episode(self.episode)
            alpha.update_end_of_episode(self.episode)
            self.episode += 1

            if steps >= 199.99:
                conseq_200 += 1
            else:
                conseq_200 = 0
            # if conseq_200 >= 4:
            #    rewards_history[e:] = rewards_history[e]
            #    steps_history[e:] = steps_history[e]
            #    episode_epsilon_history[e:] = episode_epsilon_history[e]
            #    break

        return steps_history, rewards_history, episode_epsilon_history, \
            np.concatenate(epsilon_history, axis=0)
Example #8
    def __init__(self, params):
        """See documentation in the base class"""
        Agent.__init__(self, params)

        # Number of rows
        self.l = int(params['ligne'])
        # Number of columns
        self.c = int(params['colonne'])

        # initial policy (at the start the agent does not know the optimal moves to make)
        # table of size (number of states) x (number of possible actions)
        self.Q = np.zeros((self.c * self.l, 4))

        # Exploration probability
        self.exploration = 0.05
Example #9
    def train(self, Q: Agent, task: Task, policy: Policy, episodes):
        """ Trains the specified agent on the specified task using the specified
        exploration policy using the current implementation. A specified number of episodes
        is generated for training.
        
        inputs:
            Q - an Agent object storing the Q-values
            task - a Task object representing the task the agent is learning
            policy - a Policy object representing the exploration policy used to 
            balance exploration and exploitation
            episodes - the number of episodes of training to perform
        outputs:
            - a one-dimensional numpy array containing the lengths of each episode - this
            can be used to check the learning progress of the agent
            - a one-dimensional numpy array containing the sum of the discounted 
            rewards from the environment obtained on each episode - this can be used to check
            the learning progress of the agent 
        """

        # initialization
        self.clear()
        Q.clear()
        policy.clear()

        # for storing history of trial
        rewards_history = np.zeros(episodes, dtype=float)
        steps_history = np.zeros(episodes, dtype=int)

        # run episodes
        for e in range(episodes):

            # run an episode of training
            steps, rewards = self.run_episode(Q, task, policy)

            # compute the value of the backup and update the history
            R = 0.0
            for reward in rewards[::-1]:
                R = reward + self.gamma * R
            rewards_history[e] = R
            steps_history[e] = steps

            # finish episode
            policy.finish_episode(e)
            Q.finish_episode(e)

        return steps_history, rewards_history
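The two histories returned by train() are usually smoothed before being inspected; a small illustrative helper (assumed, not from the repository) for that purpose:

import numpy as np

def smooth(history, window=50):
    """Moving average of a 1-D history array (episode lengths or returns)."""
    kernel = np.ones(window) / window
    return np.convolve(history, kernel, mode='valid')

# steps_history, rewards_history = learner.train(Q, task, policy, episodes=500)
# print(smooth(rewards_history)[-1])        # recent average discounted return
print(smooth(np.random.rand(200))[:3])      # toy demonstration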
Example #10
    def __init__(self, params):
        """Agent initialization"""

        Agent.__init__(self, params)

        self.l = int(params['ligne'])
        self.c = int(params['colonne'])

        # initial policy (at the start the agent does not know the optimal moves to make)
        # table of size (number of states) x (number of possible actions)
        self.Q = np.zeros((self.l * self.c, 4))

        # Exploration probability
        self.exploration = 0.05

        self.alpha = 0.6  # optimized coefficient
        self.gamma = 1
Example #11
def create_agent(agent_id, connections, config_filename):
    """
    Load config from file and create agent.
    :param agent_id: agent id
    :param connections: connections
    :param config_filename: name of the file with config
    :return: created agent
    """
    config = load_agent_config(config_filename)
    return Agent(agent_id, connections, config)
Example #12
 def sample_batch(self, batch_size=None):
     batch, self.sample_index = Agent.sample_batch(self)
     # copy the sampled numpy batch into the preallocated torch tensors
     self.r = self.r.resize_(batch['reward'].shape).copy_(torch.Tensor(batch['reward']))
     self.done = self.done.resize_(batch['done'].shape).copy_(torch.Tensor(batch['done']))
     self.a = self.a.resize_(batch['action'].shape).copy_(torch.Tensor(batch['action']))
     self.s = self.s.resize_(batch['state'].shape).copy_(torch.Tensor(batch['state']))
     self.s_ = self.s_.resize_(batch['next_state'].shape).copy_(torch.Tensor(batch['next_state']))
     self.logpac_old = self.logpac_old.resize_(batch['logpac'].shape).copy_(torch.Tensor(batch['logpac']))
     self.distri = self.distri.resize_(batch['distri'].shape).copy_(torch.Tensor(batch['distri']))
     self.other_data = batch['other_data']
     if self.other_data:
         for key in self.other_data.keys():
             self.other_data[key] = torch.Tensor(self.other_data[key]).type_as(self.s)
Example #13
    def __init__(self, _type, bandits, agent, runs, exp_nums, logdir, k_vec,
                 k_probs):
        self._type = _type
        self.esr_vector = []
        self.esr_probs = []
        self.f1_score = []
        self.f1 = []
        #self.plotter = Plotter()
        self.metrics = Metrics()
        self.k_vec = k_vec
        self.k_probs = k_probs
        self.f1_df = pd.DataFrame()
        self.esrBandit = Agent(0, 10)
        self.logdir = logdir

        for i in range(len(k_vec)):
            self.esrBandit.manual_distribution(k_vec[i], k_probs[i])

        if self._type == "bandit":
            self.bandits = bandits
            self.agent = agent
            self.runs = runs
            self.exp_nums = exp_nums
Example #14
    def __init__(self, params):
        """See documentation in the base class"""
        Agent.__init__(self, params)

        # Number of rows
        self.l = int(params['ligne'])
        # Number of columns
        self.c = int(params['colonne'])

        # initial policy (at the start the agent does not know the optimal moves to make)
        # table of size (number of states) x (number of possible actions)
        self.Q = np.zeros((self.c * self.l, 4))

        # Exploration probability
        self.exploration = 0.05

        # Coefficients for the Q-update formula
        self.alpha = 1
        self.gamma = 1

        # convergence speed (episode from which the total rewards exceed 60)
        self.vitesse = 0
        # index of the episode that just ended
        self.ep = 0
Example #15
    def act(self, Q: Agent, task: Task, state):

        # set the number of actions of the current task, if not set
        if self.valid_actions == 0:
            self.valid_actions = task.valid_actions()

        # get the distribution over actions for the current state
        pref = self.preferences[state]

        # sample an action from the preference distribution
        action = np.random.choice(self.valid_actions, 1, p=pref)

        # get the greedy action according to Q
        greedy = Q.max_action(state)

        # update the preference distribution
        pref *= (1.0 - self.beta)
        pref[greedy] /= (1.0 - self.beta)
        pref[greedy] += self.beta * (1.0 - pref[greedy])

        return action
Example #16
 def cuda(self):
     Agent.cuda(self)
     self.e_NAF = self.e_NAF.cuda()
     self.t_NAF = self.t_NAF.cuda()
Example #17
class Experiment():
    def __init__(self, _type, bandits, agent, runs, exp_nums, logdir, k_vec,
                 k_probs):
        self._type = _type
        self.esr_vector = []
        self.esr_probs = []
        self.f1_score = []
        self.f1 = []
        #self.plotter = Plotter()
        self.metrics = Metrics()
        self.k_vec = k_vec
        self.k_probs = k_probs
        self.f1_df = pd.DataFrame()
        self.esrBandit = Agent(0, 10)
        self.logdir = logdir

        for i in range(len(k_vec)):
            self.esrBandit.manual_distribution(k_vec[i], k_probs[i])

        if self._type == "bandit":
            self.bandits = bandits
            self.agent = agent
            self.runs = runs
            self.exp_nums = exp_nums

    def run(self):
        if self._type == "bandit":
            avg_log = self.logdir + 'average' + '/'
            if not os.path.exists(avg_log):
                os.makedirs(avg_log, exist_ok=True)

            for run in range(self.runs):
                self.run_df = pd.DataFrame()
                start = time.perf_counter()
                run_log = self.logdir + 'run_' + str(run + 1) + '/'

                if not os.path.exists(run_log):
                    os.makedirs(run_log, exist_ok=True)

                for i in range(self.exp_nums):
                    self.esr_vector = []
                    self.esr_probs = []
                    if i == 0:
                        for j in range(len(self.bandits)):
                            _return_ = self.bandits[j].pull_arm()
                            self.agent.update(j, _return_)

                    action = self.agent.select_action()
                    _return_ = self.bandits[action].pull_arm()
                    self.agent.update(action, _return_)
                    esr_index = self.agent.esr_dominance()

                    self.esr_agent = deepcopy(self.agent)
                    self.esr_agent.distribution = np.array(
                        self.esr_agent.distribution)[esr_index]

                    for val in esr_index:
                        self.esr_vector.append(
                            self.agent.distribution[val].get_distribution()[0])
                        self.esr_probs.append(
                            self.agent.distribution[val].get_distribution()[1])

                    #self.f1.append(self.metrics.precision_recall(self.esr_vector, self.esr_probs, self.k_vec, self.k_probs))
                    self.f1.append(
                        self.metrics.pr_kl(self.esr_agent, self.esrBandit))

                self.run_df['run' + str(run)] = self.f1
                self.run_df['mean'] = self.run_df.mean(axis=1)
                self.f1_df['run' + str(run)] = self.f1
                end = time.perf_counter()

                #self.run_df['average'] = self.f1_df.mean(axis=1)
                #print(self.f1_df)
                #self.f1_score = self.f1_df['Average']
                #self.run_df['average'] = np.mean(np.array(self.f1_score).reshape(-1, 10), axis=1)
                self.run_df.to_csv(run_log + "/f1_score.csv", index=False)

                ser = SER(self.k_vec, self.k_probs)
                ser_expectations = ser.expectations()
                ser_pareto_front = ser.pareto_front()

                print("")
                print(
                    '**** Run ' + str(run + 1) + ' - Execution Time: ' +
                    str(round((end - start), 2)) + ' seconds ****', )
                print(str(len(esr_index)) + " distributions in the ESR set")
                print("ESR Vector and Probabilities")
                for a in range(len(self.esr_vector)):
                    print(self.esr_vector[a])
                    print(self.esr_probs[a])
                    print(" ")
                print("")
                print("SER - Pareto Front")
                print("Number of policies on the pareto front : " +
                      str(len(ser_pareto_front)))
                print(ser_pareto_front)
                print("")

                self.plotter = Plotter(self.esr_vector, self.esr_probs,
                                       run_log, self.exp_nums, True, True)
                self.plotter.plot_run()

            self.f1_df['mean'] = self.f1_df.mean(axis=1)
            #self.f1_df['average'] = np.mean(np.array(self.f1_df['mean']).reshape(-1, 10), axis=1)
            self.f1_df.to_csv(avg_log + "/f1_score.csv", index=False)
            self.plotter = Plotter(self.esr_vector, self.esr_probs, avg_log,
                                   self.exp_nums, True, True)
            self.plotter.plot_run()

        return
Example #18
 def cuda(self):
     Agent.cuda(self)
     self.policy = self.policy.cuda()
     if self.value_type is not None:
         self.value = self.value.cuda()
Example #19
graph.create_network()

disable_spade_warnings()
f = open('log2.csv', 'w+')
logger.initialize_default_logger(f)

conf1 = AgentConfig()
conf2 = AgentConfig()

conf1.initial_resource = 100
conf1.storage_limit = 100
conf1._policy_builder = SimplePolicy
conf1._policy_builder_args = [0.1, 2, 2, 0.01, 0.01]

conf2.initial_money = 10
conf2.needs_satisfaction_timeout = 5
conf2._needs = random_uniform
conf2._needs_args = [3, 4]
conf2.needs_satisfaction_cost = 0.5
conf2._policy_builder = SimplePolicy
conf2._policy_builder_args = [0.1, 2, 2, 0.01, 0.01]

if __name__ == '__main__':
    a1 = Agent(1, [2], conf1)
    a2 = Agent(2, [1], conf2)

    a1.start()
    a2.start()
    visualisation(graph, logger)
    time.sleep(10)
Example #20
 def __init__(self, params):
     """See documentation in the base class"""
     Agent.__init__(self, params)
Example #21
 def distribution(self, Q: Agent, task: Task, state):
     values = Q.values(state)
     values = np.exp(values / self.temp)
     values /= np.sum(values)
     return values
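The method above implements a Boltzmann (softmax) distribution over Q-values. A numerically safer variant, shown purely as an illustrative sketch (not the project's code), subtracts the maximum before exponentiating so that low temperatures do not overflow:

import numpy as np

def boltzmann(values, temp):
    scaled = np.asarray(values, dtype=float) / temp
    scaled -= scaled.max()               # shift for numerical stability
    probs = np.exp(scaled)
    return probs / probs.sum()

print(boltzmann([1.0, 2.0, 3.0], temp=0.5))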
Example #22
    def create_system(self):

        for i in range(self.num_agents):
            self.dict_agents[i] = Agent(self.num_obs, self.num_messages,
                                        self.num_actions, self.epsilon,
                                        self.learning_rate, i, self.T)
Example #23
 def cuda(self):
     Agent.cuda(self)
     self.e_Actor = self.e_Actor.cuda()
     self.e_Critic = self.e_Critic.cuda()
     self.t_Actor = self.t_Actor.cuda()
     self.t_Critic = self.t_Critic.cuda()
Example #24
 def __init__(self, params):
     """See documentation in the base class"""
     Agent.__init__(self, params)
     self.final_state = EnvironmentGrid2D(params).terminal_state
Example #25
 def epsilon_greedy(self, Q: Agent, task: Task, state, epsilon):
     if np.random.rand() <= epsilon:
         return random.randrange(task.valid_actions())
     else:
         return Q.max_action(state)
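A self-contained version of the same epsilon-greedy rule over a plain numpy Q-row (the Agent and Task wrappers are replaced by an array; this sketch is illustrative, not code from the repository):

import numpy as np

def epsilon_greedy(q_row, epsilon, rng=np.random):
    """q_row: 1-D array of Q-values for the current state."""
    if rng.rand() <= epsilon:
        return rng.randint(len(q_row))   # explore: uniform random action
    return int(np.argmax(q_row))         # exploit: greedy action

print(epsilon_greedy(np.array([0.1, 0.5, 0.2]), epsilon=0.1))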
Example #26
File: DQN.py  Project: phymucs/HTRPO
 def cuda(self):
     Agent.cuda(self)
     self.e_DQN = self.e_DQN.cuda()
     self.t_DQN = self.t_DQN.cuda()
Example #27
def main():
	from agents.Agent import Agent
	from envs.Bandit import Bandit
	from wrappers.Distribution import Distribution
	from experiments.Experiment import Experiment
	from plot.Plotter import Plotter
	from wrappers.SER import SER
	import argparse

	parser = argparse.ArgumentParser(description='')
	parser.add_argument('--max_val', default=10, type=int)
	parser.add_argument('--timesteps', default=100000, type=int)
	parser.add_argument('--runs', default=1, type=int)
	parser.add_argument('--type_bandits', default='random', type=str)
	args = parser.parse_args()
	print(args)

	max_val = args.max_val
	actions = 5
	episodes = args.timesteps
	runs = args.runs
	plotter = Plotter()
	agent = Agent(actions, max_val)
	
	bandits = []
	bandit_distributions = []
	bandit_probabilities = []

	#logdir = f'runs/bandit/{args.type_bandits}/'
    #logdir += datetime.now().strftime('%Y-%m-%d_%H-%M-%S_') + str(uuid.uuid4())[:4] + '/'

	if args.type_bandits == 'random':

		# Random Distributions
		#param = [True/False, number of obs in distribution, num objective, max value]
		bandits.append(Bandit(param = [True, 5, 2, 5]))
		bandits.append(Bandit(param = [True, 5, 2, 2]))
		bandits.append(Bandit(param = [True, 5, 2, 3]))
		bandits.append(Bandit(param = [True, 5, 2, 5]))
		bandits.append(Bandit(param = [True, 5, 2, 2]))

		print("*** Bandit Distributions ***")
		for i in range(actions):
			print(bandits[i].vectors)
			print(bandits[i].probs)
			print("")

	if args.type_bandits == 'bandit':

		# Manual Distributions
		bandits.append(Bandit([[2, 0], [2, 1]], [0.05, 0.05]))
		bandits.append(Bandit([[0, 0], [1, 1]], [0.1, 0.1]))
		bandits.append(Bandit([[1, 0], [1, 3]], [0.1, 0.1]))
		bandits.append(Bandit([[1, 0], [2, 1]], [0.1, 0.4]))
		bandits.append(Bandit([[1, 1], [1, 2]], [0.05, 0.05]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))

	
	if args.type_bandits == 'realworld':

		# Manual Distributions
		bandits.append(Bandit([[2, 0], [2, 1], [3, 2], [4, 2]], [0.05, 0.05, 0.1, 0.8]))
		bandits.append(Bandit([[0, 0], [1, 1], [2, 0], [2, 1]], [0.1, 0.1, 0.5, 0.3]))
		bandits.append(Bandit([[1, 0], [1, 3], [3, 4], [5, 4]], [0.1, 0.1, 0.2, 0.6]))
		bandits.append(Bandit([[1, 0], [2, 1], [3, 1], [3, 2]], [0.1, 0.4, 0.4, 0.1]))
		bandits.append(Bandit([[1, 1], [1, 2], [4, 0], [0, 0]], [0.05, 0.05, 0.1, 0.8]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
		#bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))


	for i in range(actions):
		#print(bandits[i].vectors)
		#print(bandits[i].probs)
		bandit_distributions.append(bandits[i].vectors)
		bandit_probabilities.append(bandits[i].probs)
		#print(" ")

	esr_vectors = []
	esr_probabilities = []
	esr_vectors.append(bandit_distributions[0])
	esr_vectors.append(bandit_distributions[2])
	esr_probabilities.append(bandit_probabilities[0])
	esr_probabilities.append(bandit_probabilities[2])

	experiment = Experiment("bandit", bandits, agent, runs, episodes, esr_vectors, esr_probabilities)
	experiment.run()
	#plotter.multi_cdf_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.multi_pdf_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.heatmap_plot([[2, 0], [2, 1], [3, 2], [4, 2]], [0.05, 0.05, 0.1, 0.8])

	#plots to generate 
	#plotter.multi_heatmap_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.multi_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.multi_cdf_plot(experiment.esr_vector, experiment.esr_probs)
	


	#plotter.multi_joint_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.multi_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)
	#plotter.multi_3d_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)



	'''