Code example #1
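# Note: this example assumes `inter` is the module that provides AgentInterface,
# e.g. something like `import interface as inter`; the exact import is not shown
# in this snippet.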
def run(team_name, number, first_player):
    interface = inter.AgentInterface(team_name, number)
    if first_player:
        interface.set_home(3, 2)
    else:
        interface.set_home(4, 2)
    obs = interface.observe_from_server()
    while ("start", 0) not in obs:
        obs = interface.observe_from_server()

    while ("stop", 0) not in obs:
        new_cycle = False
        for o in obs:
            if o[0] == "cycle":
                new_cycle = True
                obs = []
                break
        if new_cycle:
            interface.send_action("move", 1)
        obs = interface.observe_from_server()
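A minimal driver sketch for the run() loop above (not part of the original snippet): it assumes run() is defined as shown here and simply launches both players of one team in separate processes. The team name "demo" and the use of multiprocessing are illustrative assumptions.

from multiprocessing import Process

def start_team(team_name="demo"):
    # First player homes at (3, 2), second at (4, 2), matching run() above.
    procs = [
        Process(target=run, args=(team_name, 1, True)),
        Process(target=run, args=(team_name, 2, False)),
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

if __name__ == "__main__":
    start_team()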
Code example #2
    def train(self):
        """
		Performs training of Q

		Args:
			exp_schedule: Exploration instance s.t.
				exp_schedule.get_action(best_action) returns an action
			lr_schedule: Schedule for learning rate
		"""

        # initialize replay buffer and variables
        num_episodes = 0
        self.agents = []
        self.agent_obs = []
        # agents = []
        # agent_obs = []
        for a in range(Config.num_players_per_team):
            agent = AgentInterface('COMA', a + 1)
            agent.set_home(
                int(Config.rows / 2) - int(Config.num_players_per_team / 2) +
                1 + a, 2)
            self.agents.append(agent)
            obs = agent.observe_from_server()
            self.agent_obs.append(obs)

        initial_state = None
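        # Block until every agent has received the ("start", 0) message from the server.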
        for a in range(Config.num_players_per_team):
            while ("start", 0) not in self.agent_obs[a]:
                self.agent_obs[a] = self.agents[a].observe_from_server()

        initial_obs = 0
        while initial_obs < Config.num_players_per_team:
            for a in range(Config.num_players_per_team):
                new_cycle = False
                agent = self.agents[a]
                self.agent_obs[a] = agent.observe_from_server()
                obs = self.agent_obs[a]
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    initial_state = self.get_state(agent, obs)
                    initial_obs += 1
        i = 0
        self.t = 0
        episodes = 0
        goals_scored = 0
        while True:
            states = None
            actions = None
            TDTargets = None
            for _ in range(Config.batch_size):  # `i` is reserved for the target-update counter below
                estates, eactions, erewards, next_state, won = self.sample_episode(
                    initial_state)
                episodes += 1
                if won:
                    goals_scored += 1
                if episodes % Config.num_train_episodes == 0:
                    print "TRAIN PROPORTION OF EPISODES WON %s" % (
                        float(goals_scored) / Config.num_train_episodes)
                    with open(Config.model_name + '.out', 'a') as f:
                        f.write('%s,%s\n' % (self.t, float(goals_scored) /
                                             Config.num_train_episodes))
                    if Config.save_model:
                        self.save()
                    episodes = 0
                    goals_scored = 0

                initial_state = next_state
                batch_targets = self.getTDTargets(estates, eactions, erewards)
                if TDTargets is None:
                    TDTargets = batch_targets
                else:
                    TDTargets = np.append(TDTargets, batch_targets, axis=0)
                if states is None:
                    states = estates
                else:
                    states = np.append(states, estates, axis=0)
                if actions is None:
                    actions = eactions
                else:
                    actions = np.append(actions, eactions, axis=0)

            self.t += len(states)

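            # Process the collected batch in chunks of at most Config.max_batch_size.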
            mini_batch_start = 0
            while mini_batch_start < states.shape[0]:
                next_mini_batch = min(states.shape[0],
                                      mini_batch_start + Config.max_batch_size)
                mstates = states[mini_batch_start:next_mini_batch]
                mactions = actions[mini_batch_start:next_mini_batch]
                mTDTargets = TDTargets[mini_batch_start:next_mini_batch]

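                # Critic update: for each agent, fit the centralized Q to its TD targets,
                # conditioning on the other agents' actions (a_minus).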
                for a in range(Config.num_players_per_team):
                    a_minus = np.delete(mactions, a, 1)
                    agent_actions = mactions[:, a]
                    loss, _ = self.sess.run(
                        [self.loss, self.train_op_global],
                        feed_dict={
                            self.s:
                            mstates,
                            self.a:
                            agent_actions,
                            self.a_minus:
                            a_minus,
                            self.TD_targets:
                            mTDTargets[:, a],
                            self.agent: [
                                a for _ in range(next_mini_batch -
                                                 mini_batch_start)
                            ],
                            self.e:
                            self.get_epsilon()
                        })

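                # Actor update: compute the counterfactual advantage for each agent and
                # take a policy-gradient step with it.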
                for a in range(Config.num_players_per_team):
                    a_minus = np.delete(mactions, a, 1)
                    agent_actions = mactions[:, a]
                    advantages = self.sess.run(
                        [self.advantage],
                        feed_dict={
                            self.s:
                            mstates,
                            self.s_local:
                            self.get_states_local(a, mstates),
                            self.a:
                            agent_actions,
                            self.a_minus:
                            a_minus,
                            self.agent: [
                                a for _ in range(next_mini_batch -
                                                 mini_batch_start)
                            ],
                            self.e:
                            self.get_epsilon()
                        })[0]
                    self.sess.run(
                        [self.policy_op],
                        feed_dict={
                            self.s:
                            mstates,
                            self.s_local:
                            self.get_states_local(a, mstates),
                            self.a:
                            agent_actions,
                            self.agent: [
                                a for _ in range(next_mini_batch -
                                                 mini_batch_start)
                            ],
                            self.advantages_in:
                            advantages,
                            self.e:
                            self.get_epsilon()
                        })
                if i > Config.target_update_freq:
                    self.update_target_params()
                    i = 0  # reset the counter only after the target network has been updated

                i += next_mini_batch - mini_batch_start
                mini_batch_start = next_mini_batch
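For reference, a small standalone illustration (not from the original code) of how np.delete is used in the COMA update above to split a joint-action minibatch into one agent's actions and the other agents' actions (a_minus):

import numpy as np

# Joint actions for a mini-batch of 2 steps and 3 agents.
mactions = np.array([[0, 3, 7],
                     [2, 2, 5]])
a = 1                                   # index of the agent being updated
agent_actions = mactions[:, a]          # array([3, 2])
a_minus = np.delete(mactions, a, 1)     # array([[0, 7], [2, 5]]) - the other agents' actions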
Code example #3
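# Note: `player`, `get_state`, `get_action`, `reward` and `update_Q` are assumed to be
# defined at module level elsewhere in the original file; they are not shown here.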
def train():
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """

    # initialize replay buffer and variables
    score_team_prev = 0
    score_opp_prev = 0
    i = 0
    while True:
        i += 1
        # agents = []
        # agent_obs = []
        # for a in range(Config.num_players_per_team):
        agent = AgentInterface('Q-Tabular', player)
        agent.set_home(3 * player - 1, 3)
        # agents.append(agent)
        obs = agent.observe_from_server()
        # agent_obs.append(obs)

        state_prev = None
        action_prev = None
        # for a in range(Config.num_players_per_team):
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
        while True:
            # for a in range(Config.num_players_per_team):
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                state_new = get_state(agent, obs)
                if action_prev is not None:
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    score_team_new, score_opp_new = score[0], score[1]
                    reward_prev = reward(state_prev, state_new, action_prev,
                                         score_team_prev, score_opp_prev,
                                         score_team_new, score_opp_new)
                    print(reward_prev)
                    score_team_prev = score_team_new
                    score_opp_prev = score_opp_new
                    update_Q(state_prev, action_prev, reward_prev, state_new)
                action_new = get_action(state_new)
                if action_new <= 8:
                    agent.send_action("move", action_new)
                else:
                    teammate = 2 if player == 1 else 1
                    agent.send_action("pass", teammate)
                state_prev = state_new.copy()
                action_prev = action_new
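The snippet above calls update_Q but does not show it. For orientation only, here is a minimal one-step Q-learning sketch of what such an update could look like; the table Q, ALPHA, GAMMA and NUM_ACTIONS are assumptions, not the original implementation.

from collections import defaultdict

ALPHA = 0.1          # learning rate (assumed)
GAMMA = 0.99         # discount factor (assumed)
NUM_ACTIONS = 10     # moves 0-8 plus one pass action, matching the action handling above

Q = defaultdict(lambda: [0.0] * NUM_ACTIONS)

def update_Q_sketch(state_prev, action_prev, reward_prev, state_new):
    # One-step Q-learning: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    s, s_ = tuple(state_prev), tuple(state_new)
    td_target = reward_prev + GAMMA * max(Q[s_])
    Q[s][action_prev] += ALPHA * (td_target - Q[s][action_prev])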
Code example #4
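    # Note: this method relies on members defined elsewhere in the class
    # (self.player, self.sess, self.loss, self.train_op, the placeholders
    # self.s / self.s_ / self.r / self.a, get_state, get_action, reward and
    # update_target_params), which are not shown in this snippet.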
    def train(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        score_team_prev = 0
        score_opp_prev = 0
        i = 0
        while True:
            i += 1
            # agents = []
            # agent_obs = []
            # for a in range(Config.num_players_per_team):
            agent = AgentInterface('SMART', self.player)
            agent.set_home(2 + self.player, 2)
            # agents.append(agent)
            obs = agent.observe_from_server()
            # agent_obs.append(obs)

            state_prev = None
            action_prev = None
            t = 0
            # for a in range(Config.num_players_per_team):
            while ("start", 0) not in obs:
                obs = agent.observe_from_server()
            while True:
                # for a in range(Config.num_players_per_team):
                obs = agent.observe_from_server()
                new_cycle = False
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    state_new = self.get_state(agent, obs)
                    if action_prev is not None:
                        score = None
                        for o in obs:
                            if o[0] == "score":
                                if agent.left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        score_team_new, score_opp_new = score[0], score[1]
                        reward_prev = self.reward(state_prev, state_new,
                                                  action_prev, score_team_prev,
                                                  score_opp_prev,
                                                  score_team_new,
                                                  score_opp_new)
                        print(reward_prev)
                        score_team_prev = score_team_new
                        score_opp_prev = score_opp_new
                        loss, _ = self.sess.run(
                            [self.loss, self.train_op],
                            feed_dict={
                                self.s: [state_prev],
                                self.s_: [state_new],
                                self.r: [reward_prev],
                                self.a: [action_prev]
                            })

                        if t % Config.target_update_freq == 0:
                            self.update_target_params()
                        t += 1
                    action_new = self.get_action(state_new)[0]
                    if action_new <= 8:
                        agent.send_action("move", action_new)
                    else:
                        teammate = 2 if self.player == 1 else 1
                        agent.send_action("pass", teammate)
                    state_prev = state_new.copy()
                    action_prev = action_new
Code example #5
    def train(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        score_team = 0
        score_opp = 0
        i = 0
        while True:
            i += 1
            agents = []
            agent_obs = []
            for a in range(Config.num_players_per_team):
                agent = AgentInterface('SMART', a)
                agent.set_home(3 + a, 2)
                agents.append(agent)
                obs = agent.observe_from_server()
                agent_obs.append(obs)

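            # Wait for each agent's ("start", 0) signal and build its initial state vector.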
            states = np.zeros((Config.num_players_per_team, Config.state_size))
            for a in range(Config.num_players_per_team):
                while ("start", 0) not in agent_obs[a]:
                    agent_obs[a] = agents[a].observe_from_server()
                    states[a] = self.get_state(agents[a], agent_obs[a])
            new_states = np.zeros(
                (Config.num_players_per_team, Config.state_size))
            while True:
                for a in range(Config.num_players_per_team):
                    agent_obs[a] = agents[a].observe_from_server()
                    new_cycle = False
                    for o in agent_obs[a]:
                        if o[0] == "cycle":
                            new_cycle = True
                            break
                    if new_cycle:
                        action = self.get_action([states[a]])[0]
                        if action <= 8:
                            agents[a].send_action("move", action)
                        else:
                            if action - 9 < a:
                                agents[a].send_action("pass", action - 9)
                            else:
                                agents[a].send_action("pass", action - 8)
                        new_states[a] = self.get_state(agents[a], agent_obs[a])
                        score = None
                        for o in agent_obs[a]:
                            if o[0] == "score":
                                if agents[a].left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        reward = 0.
                        if score[0] > score_team:
                            reward = 1.
                        elif score[1] > score_opp:
                            reward = -1.
                        score_team = score[0]
                        score_opp = score[1]
                        loss, _ = self.sess.run(
                            [self.loss, self.train_op],
                            feed_dict={
                                self.s: [states[a]],
                                self.r: [reward],
                                self.a: [action]
                            })
                        states[a] = new_states[a]
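The pass branch above (action - 9 vs. action - 8) encodes "pass to teammate t" while skipping the acting player's own index. A standalone helper illustrating the same mapping (the function name is hypothetical, not from the original code):

def decode_action(action, own_index):
    """Actions 0-8 are moves; 9 and above are passes to the other players, skipping own_index."""
    if action <= 8:
        return ("move", action)
    target = action - 9
    if target >= own_index:      # indices at or above the actor shift up by one
        target += 1
    return ("pass", target)

# decode_action(9, own_index=0) -> ("pass", 1); decode_action(9, own_index=1) -> ("pass", 0)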
Code example #6
def train():
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for learning rate
    """

    # initialize replay buffer and variables
    score_team_prev = [0 for a in range(Config.num_players_per_team)]
    score_opp_prev = [0 for a in range(Config.num_players_per_team)]
    i = 0
    while True:
        i += 1
        agents = []
        agent_obs = []
        for a in range(Config.num_players_per_team):
            agent = AgentInterface('Q-Tabular-PS', a)
            agent.set_home(3 * a + 2, 3)
            agents.append(agent)
            obs = agent.observe_from_server()
            agent_obs.append(obs)

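        # Note: this example assumes Config.num_players_per_team == 2
        # (see the [None, None] initializations and `teammate = 1 - a` below).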
        state_prev = [None, None]
        action_prev = [None, None]
        for a in range(Config.num_players_per_team):
            while ("start", 0) not in agent_obs[a]:
                agent_obs[a] = agents[a].observe_from_server()
        while True:
            for a in range(Config.num_players_per_team):
                agent = agents[a]
                agent_obs[a] = agent.observe_from_server()
                obs = agent_obs[a]
                new_cycle = False
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    state_new = get_state(agent, obs)
                    if action_prev[a] is not None:
                        score = None
                        for o in obs:
                            if o[0] == "score":
                                if agent.left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        score_team_new, score_opp_new = score[0], score[1]
                        reward_prev = reward(state_prev[a], state_new,
                                             action_prev[a], score_team_prev[a],
                                             score_opp_prev[a], score_team_new,
                                             score_opp_new)
                        score_team_prev[a] = score_team_new
                        score_opp_prev[a] = score_opp_new
                        update_Q(state_prev[a], action_prev[a], reward_prev, state_new)
                    action_new = get_action(state_new)
                    if action_new <= 8:
                        agent.send_action("move", action_new)
                    else:
                        teammate = 1-a
                        agent.send_action("pass", teammate)
                    state_prev[a] = state_new.copy()
                    action_prev[a] = action_new
Code example #7
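    # Note: `player` is referenced as a free variable below; it is assumed to be
    # defined elsewhere (e.g. at module level) in the original file.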
    def train(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        score_team = 0
        score_opp = 0
        i = 0
        while True:
            i += 1
            # agents = []
            # agent_obs = []
            # for a in range(Config.num_players_per_team):
            agent = AgentInterface('SMART', player)
            agent.set_home(3+player, 2) 
            # agents.append(agent)
            obs = agent.observe_from_server()
            # agent_obs.append(obs)

            state = np.zeros(Config.state_size)
            # for a in range(Config.num_players_per_team):
            while ("start", 0) not in obs:
                obs = agent.observe_from_server()
                state = self.get_state(agent, obs)
            new_state = np.zeros(Config.state_size)
            while True:
                # for a in range(Config.num_players_per_team):
                obs = agent.observe_from_server()
                new_cycle = False
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    action = self.get_action([state])[0]
                    if action <= 8:
                        agent.send_action("move", action)
                    else:
                        if action - 9 < player:
                            agent.send_action("pass", action-9)
                        else:
                            agent.send_action("pass", action-8)
                    new_state = self.get_state(agent, obs)
                    score = None
                    for o in obs:
                        if o[0] == "score":
                            if agent.left_team:
                                score = [o[1][0], o[1][1]]
                            else:
                                score = [o[1][1], o[1][0]]
                    reward = 0.
                    if score[0] > score_team:
                        reward = 1.
                    elif score[1] > score_opp:
                        reward = -1.
                    else:
                        reward = -0.2
                    score_team = score[0]
                    score_opp = score[1]
                    loss, _, grad_norm = self.sess.run(
                        [self.loss, self.train_op, self.grad_norm],
                        feed_dict={
                            self.s: [state],
                            self.r: [reward],
                            self.a: [action]
                        })
                    print(grad_norm)
                    state = new_state.copy()
Code example #8
    def train(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        score_team_prev = 0
        score_opp_prev = 0
        num_episodes = 0
        # agents = []
        # agent_obs = []
        # for a in range(Config.num_players_per_team):
        agent = AgentInterface(Config.model_name, self.player)
        agent.set_home(
            int(Config.rows / 2) - int(Config.num_players_per_team / 2) +
            self.player, 2)
        # agents.append(agent)
        obs = agent.observe_from_server()
        # agent_obs.append(obs)
        self.i = 0
        state_prev = None
        action_prev = None
        train_episodes_won = 0
        # eval_episodes_won = 0
        last_print_episode = 0
        last_save_episode = 0
        # for a in range(Config.num_players_per_team):
        while ("start", 0) not in obs:
            obs = agent.observe_from_server()
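        # Main loop: episodes are grouped into blocks of Config.num_train_episodes +
        # Config.num_eval_episodes; training updates only run during the training portion,
        # and win-rate stats are printed / the model saved at each block boundary.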
        while True:
            obs = agent.observe_from_server()
            new_cycle = False
            for o in obs:
                if o[0] == "cycle":
                    new_cycle = True
                    break
            if new_cycle:
                if num_episodes % (
                        Config.num_train_episodes + Config.num_eval_episodes
                ) == 0 and num_episodes > last_print_episode:
                    if self.player == 1:
                        print "NUMBER OF TRAINING ITERATIONS: %d" % (self.t)
                        last_print_episode = num_episodes
                        print "TRAIN PROPORTION OF EPISODES WON %s" % (
                            float(train_episodes_won) /
                            Config.num_train_episodes)
                        # print "EVAL. PROPORTION OF EPISODES WON %s" % (float(eval_episodes_won) / Config.num_eval_episodes)
                        with open(Config.model_name + '.out', 'a') as f:
                            f.write('%s,%s\n' %
                                    (self.t, float(train_episodes_won) /
                                     Config.num_train_episodes))
                        train_episodes_won = 0
                    if Config.save_model and num_episodes > last_save_episode:
                        self.save()
                        last_save_episode = num_episodes
                    # eval_episodes_won = 0

                if num_episodes % (Config.num_train_episodes +
                                   Config.num_eval_episodes
                                   ) < Config.num_train_episodes:  #TRAINING
                    # for a in range(Config.num_players_per_team):
                    state_new = self.get_state(agent, obs)
                    if action_prev is not None:
                        score = None
                        for o in obs:
                            if o[0] == "score":
                                if agent.left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        score_team_new, score_opp_new = score[0], score[1]
                        if (score_team_new, score_opp_new) != (score_team_prev,
                                                               score_opp_prev):
                            self.i = 0
                            num_episodes += 1
                        if score_team_new > score_team_prev:
                            train_episodes_won += 1

                        reward_prev = self.reward(state_prev, state_new,
                                                  action_prev, score_team_prev,
                                                  score_opp_prev,
                                                  score_team_new,
                                                  score_opp_new)
                        #--print(reward_prev)
                        # self.episode_opp_goals+=score_opp_new-score_opp_prev
                        # self.episode_team_goals+=score_team_new-score_team_prev

                        loss, _ = self.sess.run(
                            [self.loss, self.train_op],
                            feed_dict={
                                self.s: [state_prev],
                                self.s_: [state_new],
                                self.r: [reward_prev],
                                self.a: [action_prev]
                            })
                        # if self.t % Config.target_update_freq == 0:
                        if (score_team_new, score_opp_new) != (score_team_prev,
                                                               score_opp_prev):
                            self.update_target_params()
                            #print('final score:',self.episode_team_goals,self.episode_opp_goals)
                            # self.episode_opp_goals=0
                            # self.episode_team_goals=0
                        self.t += 1
                        score_team_prev = score_team_new
                        score_opp_prev = score_opp_new
                    action_new = self.get_action(state_new, True)[0]
                    # if self.i > Config.max_episode_length and agent.uni_number == 0:
                    #     agent.send_action("restart", False)
                    #     self.i = 0
                    #     num_episodes += 1
                    # else:
                    if action_new <= 8:
                        agent.send_action("move", action_new)
                    else:
                        if action_new - 8 < self.player:
                            agent.send_action("pass", action_new - 8)
                        else:
                            agent.send_action("pass", action_new - 7)
                    state_prev = state_new.copy()
                    action_prev = action_new
                # else: # EVALUATION
                #     # for a in range(Config.num_players_per_team):
                #     state_new = self.get_state(agent, obs)
                #     if action_prev is not None:
                #         score = None
                #         for o in obs:
                #             if o[0] == "score":
                #                 if agent.left_team:
                #                     score = [o[1][0], o[1][1]]
                #                 else:
                #                     score = [o[1][1], o[1][0]]
                #         score_team_new, score_opp_new = score[0], score[1]
                #         if (score_team_new, score_opp_new) != (score_team_prev, score_opp_prev):
                #             self.i = 0
                #             num_episodes += 1
                #         if score_team_new > score_team_prev:
                #             eval_episodes_won += 1
                #         reward_prev = self.reward(state_prev, state_new, action_prev, score_team_prev, score_opp_prev, score_team_new, score_opp_new)

                #         #--print(reward_prev)
                #         # self.episode_opp_goals+=score_opp_new-score_opp_prev
                #         # self.episode_team_goals+=score_team_new-score_team_prev

                #         score_team_prev = score_team_new
                #         score_opp_prev = score_opp_new
                #     action_new = self.get_action(state_new, False)[0]
                #     # if self.i > Config.max_episode_length and agent.uni_number == 0:
                #     #     agent.send_action("restart", False)
                #     #     self.i = 0
                #     #     num_episodes += 1
                #     # else:
                #     if action_new <= 8:
                #         agent.send_action("move", action_new)
                #     else:
                #         if action_new - 8 < self.player:
                #             agent.send_action("pass", action_new-8)
                #         else:
                #             agent.send_action("pass", action_new-7)
                #     state_prev = state_new.copy()
                #     action_prev = action_new
                self.i += 1
Code example #9
    def train(self):
        """
        Performs training of Q

        Args:
            exp_schedule: Exploration instance s.t.
                exp_schedule.get_action(best_action) returns an action
            lr_schedule: Schedule for learning rate
        """

        # initialize replay buffer and variables
        score_team_prev = 0
        score_opp_prev = 0
        i = 0
        while True:
            i += 1
            # agents = []
            # agent_obs = []
            # for a in range(Config.num_players_per_team):
            agent = AgentInterface('SMART', self.player)
            agent.set_home(
                int(Config.rows / 2) - int(Config.num_players_per_team / 2) +
                1 + self.player, 2)
            # agents.append(agent)
            obs = agent.observe_from_server()
            # agent_obs.append(obs)

            state_prev = None
            action_prev = None
            # for a in range(Config.num_players_per_team):
            while ("start", 0) not in obs:
                obs = agent.observe_from_server()
            while True:
                # for a in range(Config.num_players_per_team):
                obs = agent.observe_from_server()
                new_cycle = False
                for o in obs:
                    if o[0] == "cycle":
                        new_cycle = True
                        break
                if new_cycle:
                    state_new = self.get_state(agent, obs)
                    if action_prev is not None:
                        score = None
                        for o in obs:
                            if o[0] == "score":
                                if agent.left_team:
                                    score = [o[1][0], o[1][1]]
                                else:
                                    score = [o[1][1], o[1][0]]
                        score_team_new, score_opp_new = score[0], score[1]
                        reward_prev = self.reward(state_prev, state_new,
                                                  action_prev, score_team_prev,
                                                  score_opp_prev,
                                                  score_team_new,
                                                  score_opp_new)
                        #--print(reward_prev)
                        self.episode_opp_goals += score_opp_new - score_opp_prev
                        self.episode_team_goals += score_team_new - score_team_prev

                        score_team_prev = score_team_new
                        score_opp_prev = score_opp_new

                        self.write_sars(state_prev, state_new, reward_prev,
                                        action_prev, self.player)
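                        # The transition is handed off via write_sars; a separate learner process
                        # is presumably responsible for training on it and writing tmp/model.ckpt,
                        # which is reloaded below.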

                        if self.t % Config.target_update_freq == 0:
                            self.update_target_params()
                            #print('final score:',self.episode_team_goals,self.episode_opp_goals)
                            self.episode_opp_goals = 0
                            self.episode_team_goals = 0

                        #Update model
                        if self.t % Config.model_update_freq == 0 and os.path.exists(
                                'tmp/model.ckpt.data-00000-of-00001'
                        ) and self.last_model_update != os.path.getmtime(
                                'tmp/model.ckpt.data-00000-of-00001'):
                            self.last_model_update = os.path.getmtime(
                                'tmp/model.ckpt.data-00000-of-00001')
                            try:
                                self.saver.restore(self.sess, "tmp/model.ckpt")
                                print('load model successful')
                            except Exception:
                                print('load model failed')

                        self.t += 1
                    action_new = self.get_action(state_new)[0]
                    if action_new <= 8:
                        agent.send_action("move", action_new)
                    else:
                        if action_new - 9 < self.player:
                            agent.send_action("pass", action_new - 9)
                        else:
                            agent.send_action("pass", action_new - 8)
                    state_prev = state_new.copy()
                    action_prev = action_new