Example #1
    def __init__(self,
                 max_velocity,
                 turn_speed,
                 max_health,
                 max_armor,
                 spawn_point=(200, 200),
                 starting_angle=0,
                 starter_weapon_pack=None,
                 starter_ammo_pack=None,
                 color='#303030',
                 radius=10):
        BaseAgent.__init__(self,
                           max_velocity,
                           turn_speed,
                           max_health,
                           max_armor,
                           spawn_point,
                           starting_angle,
                           starter_weapon_pack,
                           starter_ammo_pack,
                           color,
                           radius)
        # Small dense network: the (17, 13) observation is flattened and mapped
        # through one hidden layer to 11 tanh-activated outputs.
        input_layer = Input(shape=(17, 13))
        flattened_input = Flatten()(input_layer)
        inner_layer = Dense(20, activation='relu')(flattened_input)
        output_layer = Dense(11, activation='tanh')(inner_layer)
        self.model = Model(input_layer, output_layer)
        self.model.compile(RMSprop(), loss='hinge')
        self.delta = 1 - 1e-5  # decay coefficient for epsilon-greedy exploration
        self.epsilon = 1       # probability of taking a random action
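Example #1 only shows the constructor; the sketch below is a hedged guess (not the project's code) at how an act() method could use self.model, self.epsilon and self.delta for epsilon-greedy action selection. The (17, 13) observation shape and the 11 outputs come from the __init__ above; the method name and everything else are assumptions.

import numpy as np

def act(self, observation):
    # Decay the exploration probability once per decision (assumed schedule).
    self.epsilon *= self.delta
    if np.random.rand() < self.epsilon:
        # Explore: pick one of the 11 outputs uniformly at random.
        return np.random.randint(11)
    # Exploit: score a single (17, 13) observation and take the best output.
    scores = self.model.predict(observation[np.newaxis, ...], verbose=0)[0]
    return int(np.argmax(scores))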
Example #2
    def __init__(self,
                 max_velocity,
                 turn_speed,
                 max_health,
                 max_armor,
                 spawn_point=(200, 200),
                 starting_angle=0,
                 starter_weapon_pack=None,
                 starter_ammo_pack=None,
                 color='#303030',
                 radius=10):
        BaseAgent.__init__(self,
                           max_velocity,
                           turn_speed,
                           max_health,
                           max_armor,
                           spawn_point,
                           starting_angle,
                           starter_weapon_pack,
                           starter_ammo_pack,
                           color,
                           radius)
        # Alternative convolutional architecture, left commented out:
        #input_layer = Input(shape=(17, 13))
        #inner_layer1 = Convolution1D(20, 5, activation='relu')(input_layer)
        #pooling1 = MaxPooling1D(2)(inner_layer1)
        #inner_layer2 = Convolution1D(20, 3, activation='relu')(pooling1)
        #pooling2 = MaxPooling1D(2)(inner_layer2)
        #flattened = Flatten()(pooling2)
        #inner_layer3 = Dense(20, activation='relu')(flattened)
        #bn = BatchNormalization()(inner_layer3)
        #output_layer = Dense(11, activation='tanh')(bn)
        #self.model = Model(input_layer, output_layer)
        #self.model.compile(RMSprop(),
        #                   loss='hinge')

        self.delta = 1 - 1e-5  # decay coefficient for epsilon-greedy exploration
        self.epsilon = 1       # probability of taking a random action

        # Long-term experience memory, limited to max_memory_size entries.
        self.max_memory_size = 50000
        self.observation_memory = []
        self.action_memory = []

        # Short-term rollout buffer, limited to max_buffer_size entries.
        self.max_buffer_size = 100
        self.observation_buffer = []
        self.action_buffer = []
        self.reward_buffer = []

        self.tau = 0.97

        self.batch_size = 16  # minibatch size for model updates

        self.skip = 5
        self.t = 0

        self.episode_rewards = []  # reward accumulated per episode

        self.age = 0

        self.to_learn = True  # whether the agent should keep learning
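The memory and buffer fields above are only initialized in this snippet; the sketch below is an assumption (not the original class) about how the long-term memory might be filled while respecting max_memory_size. The method name remember() is hypothetical.

    def remember(self, observation, action):
        # Append the newest transition to the long-term memory.
        self.observation_memory.append(observation)
        self.action_memory.append(action)
        # Drop the oldest entries once the capacity is exceeded.
        if len(self.observation_memory) > self.max_memory_size:
            self.observation_memory.pop(0)
            self.action_memory.pop(0)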
Example #3
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'), OnPolicyMCAgent('X', 0, 1)]

    start_mark = 'X'
    test_cases = 10
    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                env.show_result(True, mark, reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        test_cases -= 1
Example #4
def play_base(env):
    load_model(MC_MODEL_FILE)
    agents = [BaseAgent('O'),
              OffPolicyMCAgent('X', 0, 1)]

    start_mark = 'O'
    test_cases = 1000
    win1, win2 = 0, 0
    while test_cases:
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            env.show_turn(False, mark)
            action = agent.act(state, ava_actions)
            next_state, reward, done, _ = env.step(action)
            state = next_state
            if done:
                #env.show_result(True, mark, reward)
                if reward != 0 and mark == agents[0].mark:
                    win1 += 1
                elif reward != 0 and mark == agents[1].mark:
                    win2 += 1
                break
            else:
                _, mark = state

        # rotation start
        #start_mark = next_mark(start_mark)
        test_cases -= 1
    print(agents[0].mark, win1, agents[1].mark, win2)
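Examples #3 and #4 only show the game loop; the sketch below is a minimal stand-in for the 'O' opponent, under the assumption that BaseAgent simply plays a uniformly random available action. This is an illustration, not the original implementation.

import random

class RandomBaseAgent:
    def __init__(self, mark):
        self.mark = mark

    def act(self, state, ava_actions):
        # Ignore the board state and pick any legal cell.
        return random.choice(ava_actions)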
Example #5
def _bench(max_episode, model_file, show_result=True):
    """Benchmark given model.

    Args:
        max_episode (int): Episode count to benchmark.
        model_file (str): Learned model file name to benchmark.
        show_result (bool): Output result to stdout.

    Returns:
        (str): JSON-encoded benchmark result.
    """
    minfo = load_model(model_file)
    agents = [BaseAgent('O'), TDAgent('X', 0, 0)]
    show = False

    start_mark = 'O'
    env = TicTacToeEnv()
    env.set_start_mark(start_mark)

    episode = 0
    results = []
    for i in tqdm(range(max_episode)):
        env.set_start_mark(start_mark)
        state = env.reset()
        _, mark = state
        done = False
        while not done:
            agent = agent_by_mark(agents, mark)
            ava_actions = env.available_actions()
            action = agent.act(state, ava_actions)
            state, reward, done, info = env.step(action)
            print((state, reward, action))  # per-step trace output
            if show:
                env.show_turn(True, mark)
                env.render(mode='human')

            if done:
                if show:
                    env.show_result(True, mark, reward)
                results.append(reward)
                break
            else:
                _, mark = state

        # rotation start
        start_mark = next_mark(start_mark)
        episode += 1

    o_win = results.count(1)
    x_win = results.count(-1)
    draw = len(results) - o_win - x_win
    mfile = model_file.replace(CWD + os.sep, '')
    minfo.update(dict(base_win=o_win, td_win=x_win, draw=draw,
                      model_file=mfile))
    result = json.dumps(minfo)

    if show_result:
        print(result)
    return result
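A short usage sketch for _bench(); the episode count and the model file name below are illustrative assumptions, not values from the original project.

if __name__ == '__main__':
    # _bench returns the JSON string it prints, e.g.
    # '{"base_win": ..., "td_win": ..., "draw": ..., "model_file": "td_model.dat"}'
    summary = _bench(1000, 'td_model.dat', show_result=True)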
Example #6
    def __init__(self, config, session):
        BaseAgent.__init__(self, config, session)
        self.action_modes = {
            str(config.testing_epsilon) + "_greedy": self.e_greedy_action,
            "plan_" + str(config.testing_epsilon) + "_greedy": self.plan_e_greedy_action}
        # dict.items() is not subscriptable in Python 3, so materialize it first.
        self.default_action_mode = list(self.action_modes.items())[0]
        self.action_mode = self.default_action_mode
        # build the net
        with tf.device(config.device):
            # Create all variables and the FIFOQueue
            self.state_ph = tf.placeholder(
                tf.float32, [None, 84, 84, 4], name="state_ph")
            self.action_ph = tf.placeholder(tf.int64, [None], name="action_ph")
            self.reward_ph = tf.placeholder(tf.float32, [None], name="reward_ph")
            self.terminal_ph = tf.placeholder(tf.float32, [None], name="terminal_ph")
            self.stateT_ph = tf.placeholder(
                tf.float32, [None, 84, 84, 4], name="stateT_ph")
            # Define all the ops
            with tf.variable_scope("Q"):
                self.h_state = self.state_to_hidden(self.state_ph, config, "Normal")
                self.Q = self.hidden_to_Q(self.h_state, config, "Normal")
                self.predicted_reward = self.hidden_to_reward(self.h_state, config, "Normal")
                self.predicted_h_state = self.hidden_to_hidden(self.h_state, self.action_ph, config, "Normal")
                tf.get_variable_scope().reuse_variables()
                self.predicted_next_Q = self.hidden_to_Q(self.predicted_h_state, config, "Normal")
            with tf.variable_scope("QT"):
                self.h_stateT = self.state_to_hidden(self.stateT_ph, config, "Target")
                self.QT = self.hidden_to_Q(self.h_stateT, config, "Target")

            self.train_op = self.train_op(self.Q, self.predicted_reward,
                                self.predicted_next_Q, self.QT, self.reward_ph,
                                self.action_ph, self.terminal_ph, config, "Normal")
            self.sync_QT_op = []
            for W_pair in zip(
                    tf.get_collection("Target_weights"),
                    tf.get_collection("Normal_weights")):
                self.sync_QT_op.append(W_pair[0].assign(W_pair[1]))
            # Define the summary ops
            self.Q_summary_op = tf.merge_summary(
                tf.get_collection("Normal_summaries"))
            self.QT_summary_op = tf.merge_summary(
                tf.get_collection("Target_summaries"))
        if config.logging:
            self.summary_writter = tf.train.SummaryWriter(
                self.config.log_path, self.sess.graph, flush_secs=20)
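The action_modes dict above references self.e_greedy_action, which the snippet does not include. The sketch below is an assumption about how it could be written against the ops defined above (self.Q, self.state_ph) and the agent's session self.sess; the signature and the epsilon argument are hypothetical.

import random
import numpy as np

def e_greedy_action(self, state, epsilon):
    # Evaluate the Q head for a single (84, 84, 4) state.
    q_values = self.sess.run(self.Q, feed_dict={self.state_ph: state[np.newaxis]})
    if random.random() < epsilon:
        # Explore: sample any action index.
        return random.randrange(q_values.shape[-1])
    # Exploit: act greedily with respect to the predicted Q-values.
    return int(np.argmax(q_values[0]))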
Example #7
    def __init__(self,
                 max_velocity,
                 turn_speed,
                 max_health,
                 max_armor,
                 spawn_point=(200, 200),
                 starting_angle=0,
                 starter_weapon_pack=None,
                 starter_ammo_pack=None,
                 color='#303030',
                 radius=10):
        BaseAgent.__init__(self,
                           max_velocity,
                           turn_speed,
                           max_health,
                           max_armor,
                           spawn_point,
                           starting_angle,
                           starter_weapon_pack,
                           starter_ammo_pack,
                           color,
                           radius)
Example #8
def run_episode(env: gym.Env, agent: BaseAgent, render=False):
    start_time = time.time()
    print('Started', start_time)

    watcher = tw.Watcher(filename='random_agent.log')
    logger = watcher.create_stream(name='reward')
    watcher.make_notebook()

    obs = env.reset()
    agent.reset(env)

    reward, env_done, i, total_r = 0.0, False, 0, 0.0
    while not env_done:
        action = agent.act(obs, reward, env_done)
        obs, reward, env_done, info = env.step(action=action)
        if render:
            rendered = env.render(mode='human')
        total_r += reward
        logger.write((i, total_r))
        i += 1

    print('Done: reward, time', total_r, time.time() - start_time)
    return total_r
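A hedged usage sketch for run_episode(); the environment id 'CartPole-v1' and the RandomAgent stand-in are illustrative assumptions. The BaseAgent annotation is not enforced at runtime, so any object with reset(env) and act(obs, reward, done) methods works.

import gym

class RandomAgent:
    def reset(self, env):
        self.action_space = env.action_space

    def act(self, obs, reward, done):
        # Ignore the observation and sample a random action.
        return self.action_space.sample()

env = gym.make('CartPole-v1')
total_reward = run_episode(env, RandomAgent(), render=False)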
Example #9
    def __init__(self, config, url):
        BaseAgent.__init__(self, config, url)
Example #10
    def __init__(self, config, url):
        self.url = url
        self.config = config
        BaseAgent.__init__(self, config, url)
Example #11
File: run.py  Project: rsun0/sc2rl
def main():

    ### Change this map if you must
    map_name = "DefeatRoaches"
    render = False
    step_mul = 8


    ### Edit this to be a list of sc2_env.Agent() variables, one for each agent
    ### or bot you want, unless you are playing a minigame
    players = None


    env = FullStateActionEnvironment(map_name_=map_name,
                                render=render,
                                step_multiplier=step_mul,
                                players=players)


    ### Set this to construct your desired network inheriting from BaseNetwork
    model = None

    ### Change these parameters and dicts to customize training

    lr = 1e-4
    eps_max = 0.3
    eps_min = 0.05
    eps_duration = 1e5
    history_size = 20


    num_episodes = 1000000
    num_epochs = 2
    batch_size = 32
    train_every = 2048
    save_every = 10240
    graph_every = 50
    averaging_window = 100

    """
        :param optimizer: A class from torch.optim (instantiated later)
        :param learning_rate: The learning rate for the network
        :param epsilon_max: The starting epsilon
        :param epsilon_min: The final epsilon
        :param epsilon_duration: The number of frames to reach the final epsilon
    """
    agent_settings = AgentSettings(torch.optim.Adam,
                                lr,
                                eps_max,
                                eps_min,
                                eps_duration)

    ### Unless you are changing code in interface, you shouldn't change this dict
    run_settings = RunSettings(num_episodes,
                                num_epochs,
                                batch_size,
                                train_every,
                                save_every,
                                graph_every,
                                averaging_window)

    ### Unless you are changing memory, you shouldn't change this
    memory = ReplayMemory(train_every, batch_size, hist_size=history_size)
    """
    Custom to how you want to train your agent.
    Unless you are changing base_agent and changing the training algorithm,
    or you want to tune train parameters, you should not change this dict.
    """
    train_settings = {
        "discount_factor": 0.99,
        "lambda": 0.95,
        "hist_size": history_size,
        "device": device,
        "eps_denom": 1e-6,
        "c1": 0.1,
        "c2": 0.05,
        "c3": 0.01,
        "c4": 0.01,
        "clip_param": 0.1,
        "map": map_name
    }

    """
    Constructs the agent and trains it in an experiment.
    """
    agent = BaseAgent(model, agent_settings, memory, train_settings)
    experiment = Experiment([agent], env, run_settings)
    experiment.train()
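The snippet defines main() but does not show an entry point; a conventional guard (an assumption about the rest of run.py) would be:

if __name__ == "__main__":
    main()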