Example #1
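# Note: this snippet assumes GymEnvironment, TemporalMemory, QLearnerAgent,
# visualize_training and the EPOCH_SIZE constant are imported/defined at
# module level in the original script (malmopy package).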
def run_experiment(environment, backend, device_id, max_epoch, record, logdir,
                   visualizer):

    env = GymEnvironment(environment,
                         monitoring_path=logdir if record else None)

    if backend == 'cntk':
        from malmopy.model.cntk import QNeuralNetwork as CntkDQN
        model = CntkDQN((4, 84, 84),
                        env.available_actions,
                        momentum=0.95,
                        device_id=device_id,
                        visualizer=visualizer)
    else:
        from malmopy.model.chainer import DQNChain, QNeuralNetwork as ChainerDQN
        chain = DQNChain((4, 84, 84), env.available_actions)
        target_chain = DQNChain((4, 84, 84), env.available_actions)
        model = ChainerDQN(chain,
                           target_chain,
                           momentum=0.95,
                           device_id=device_id)

    memory = TemporalMemory(1000000, model.input_shape[1:])
    agent = QLearnerAgent("DQN Agent",
                          env.available_actions,
                          model,
                          memory,
                          0.99,
                          32,
                          train_after=10000,
                          reward_clipping=(-1, 1),
                          visualizer=visualizer)

    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []

    max_training_steps = max_epoch * EPOCH_SIZE
    for step in range(1, max_training_steps + 1):

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)

        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        if (step % EPOCH_SIZE) == 0:
            model.save('%s-%s-dqn_%d.model' %
                       (backend, environment, step // EPOCH_SIZE))
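
A minimal invocation sketch for the function above (the Gym environment id, log directory and hyper-parameters are placeholder assumptions; the original script builds them from command-line arguments):

from malmopy.visualization import ConsoleVisualizer

run_experiment('Breakout-v0', 'chainer', device_id=-1, max_epoch=50,
               record=False, logdir='results/dqn',
               visualizer=ConsoleVisualizer())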
Example #2
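# Note: this snippet assumes the pig-chase classes and helpers
# (PigChaseEnvironment, PigChaseSymbolicStateBuilder, PigChaseChallengeAgent,
# PigChaseQLearnerAgent, RandomAgent, MalmoALEStateBuilder, parse_clients_args,
# TemporalMemory, LinearEpsilonGreedyExplorer, visualize_training), six and the
# EPOCH_SIZE constant are imported/defined at module level in the original script.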
def agent_factory(name, role, clients, backend, device, max_epochs, logdir,
                  visualizer):

    assert len(clients) >= 2, 'Not enough clients (need at least 2)'
    clients = parse_clients_args(clients)

    if role == 0:

        builder = PigChaseSymbolicStateBuilder()
        env = PigChaseEnvironment(clients,
                                  builder,
                                  role=role,
                                  randomize_positions=True)
        agent = PigChaseChallengeAgent(name)
        if type(agent.current_agent) == RandomAgent:
            agent_type = PigChaseEnvironment.AGENT_TYPE_1
        else:
            agent_type = PigChaseEnvironment.AGENT_TYPE_2

        obs = env.reset(agent_type)
        reward = 0
        agent_done = False

        while True:
            if env.done:
                if type(agent.current_agent) == RandomAgent:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_1
                else:
                    agent_type = PigChaseEnvironment.AGENT_TYPE_2

                obs = env.reset(agent_type)
                while obs is None:
                    # this can happen if the episode ended with the first
                    # action of the other agent
                    print('Warning: received obs == None.')
                    obs = env.reset(agent_type)

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)

    else:
        env = PigChaseEnvironment(clients,
                                  MalmoALEStateBuilder(),
                                  role=role,
                                  randomize_positions=True)
        memory = TemporalMemory(100000, (84, 84))

        if backend == 'cntk':
            from malmopy.model.cntk import QNeuralNetwork
            model = QNeuralNetwork((memory.history_length, 84, 84),
                                   env.available_actions, device)
        else:
            from malmopy.model.chainer import QNeuralNetwork, DQNChain
            chain = DQNChain((memory.history_length, 84, 84),
                             env.available_actions)
            target_chain = DQNChain((memory.history_length, 84, 84),
                                    env.available_actions)
            model = QNeuralNetwork(chain, target_chain, device)

        explorer = LinearEpsilonGreedyExplorer(1, 0.1, 1000000)
        agent = PigChaseQLearnerAgent(name,
                                      env.available_actions,
                                      model,
                                      memory,
                                      0.99,
                                      32,
                                      50000,
                                      explorer=explorer,
                                      visualizer=visualizer)

        obs = env.reset()
        reward = 0
        agent_done = False
        viz_rewards = []

        max_training_steps = EPOCH_SIZE * max_epochs
        for step in six.moves.range(1, max_training_steps + 1):

            # check if env needs reset
            if env.done:

                visualize_training(visualizer, step, viz_rewards)
                agent.inject_summaries(step)
                viz_rewards = []

                obs = env.reset()
                while obs is None:
                    # this can happen if the episode ended with the first
                    # action of the other agent
                    print('Warning: received obs == None.')
                    obs = env.reset()

            # select an action
            action = agent.act(obs, reward, agent_done, is_training=True)
            # take a step
            obs, reward, agent_done = env.do(action)
            viz_rewards.append(reward)

            if (step % EPOCH_SIZE) == 0:
                if 'model' in locals():
                    model.save('pig_chase-dqn_%d.model' % (step // EPOCH_SIZE))
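
A sketch of how the factory above might be launched, one process per role (agent names, client ports and epoch count are placeholder assumptions; the original script reads them from the command line):

from multiprocessing import Process
from malmopy.visualization import ConsoleVisualizer

clients = ['127.0.0.1:10000', '127.0.0.1:10001']
visualizer = ConsoleVisualizer()
# role 0 runs the challenge (opponent) agent and loops until the process is
# terminated; role 1 runs the DQN learner
for role in range(2):
    Process(target=agent_factory,
            args=('Agent_%d' % (role + 1), role, clients,
                  'chainer', -1, 5, 'results/pig_chase', visualizer)).start()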
Example #3
from common import ENV_AGENT_NAMES
from evaluation import PigChaseEvaluator
from malmopy.agent import TemporalMemory, LinearEpsilonGreedyExplorer
from malmopy.environment.malmo import MalmoALEStateBuilder
from agent import PigChaseChallengeAgent, PigChaseQLearnerAgent
# PigChaseTopDownStateBuilder is assumed to come from the local 'environment'
# module, like the other pig-chase helpers imported above
from environment import PigChaseTopDownStateBuilder
from malmopy.visualization import ConsoleVisualizer
from malmopy.model.chainer import QNeuralNetwork, ReducedDQNChain



if __name__ == '__main__':
    device = -1
    nb_actions = 3
    visualizer = ConsoleVisualizer()

    clients = [('127.0.0.1', 10000), ('127.0.0.1', 10001)]
    memory = TemporalMemory(100000, (18, 18))
    chain = ReducedDQNChain((memory.history_length, 18, 18), nb_actions)
    target_chain = ReducedDQNChain((memory.history_length, 18, 18), nb_actions)
    model = QNeuralNetwork(chain, target_chain, device)
    explorer = LinearEpsilonGreedyExplorer(0.6, 0.1, 1000000)
    agent = PigChaseQLearnerAgent(ENV_AGENT_NAMES[1], nb_actions,
                                  model, memory, 0.99, 32, 50000,
                                  explorer=explorer, visualizer=visualizer)

    #builder = MalmoALEStateBuilder()
    builder = PigChaseTopDownStateBuilder(True)
    evaluator = PigChaseEvaluator(clients, agent, agent, builder)
    evaluator.run()
    evaluator.save('qlearner_exp', 'qlearner_results.json')
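
# Note: the following snippet assumes MazeEnvironment, the Malmo mission
# ('mission') and visualize_training are imported/defined at module level
# in the original script.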
def run_experiment(backend, device_id, max_epoch, record, clients, logdir,
                   visualizer):

    env = MazeEnvironment(mission,
                          [client.split(':') for client in clients])
    env.recording = False

    if backend == 'cntk':
        from malmopy.model.cntk import QNeuralNetwork as CntkDQN
        model = CntkDQN((4, 84, 84),
                        env.available_actions,
                        momentum=0.95,
                        device_id=device_id,
                        visualizer=visualizer)
    else:
        from malmopy.model.chainer import DQNChain, QNeuralNetwork as ChainerDQN
        chain = DQNChain((4, 84, 84), env.available_actions)
        target_chain = DQNChain((4, 84, 84), env.available_actions)
        model = ChainerDQN(chain,
                           target_chain,
                           momentum=0.95,
                           device_id=device_id)

    memory = TemporalMemory(1000000, model.input_shape[1:])
    agent = QLearnerAgent("DQN Agent",
                          env.available_actions,
                          model,
                          memory,
                          0.99,
                          32,
                          train_after=10000,
                          reward_clipping=(-1, 1),
                          visualizer=visualizer)

    # training-loop configuration and initial state
    EPOCH_SIZE = 250000
    max_training_steps = 50 * EPOCH_SIZE  # note: the max_epoch argument is not used here
    state = env.reset()
    reward = 0
    agent_done = False
    viz_rewards = []
    for step in range(1, max_training_steps + 1):

        # check if env needs reset
        if env.done:
            visualize_training(visualizer, step, viz_rewards)
            agent.inject_summaries(step)
            viz_rewards = []
            state = env.reset()

        # select an action
        action = agent.act(state, reward, agent_done, is_training=True)
        # log the chosen action (unwrap numpy scalars for readable output)
        if isinstance(action, int):
            print('ACTION BEING TAKEN: ', action)
        else:
            print('ACTION BEING TAKEN: ', action.item())

        # take a step
        state, reward, agent_done = env.do(action)
        viz_rewards.append(reward)

        if (step % EPOCH_SIZE) == 0:
            # 'environment' is not defined in this snippet, so checkpoints are
            # labelled with the backend and the epoch index only
            model.save('%s-maze-dqn_%d.model' % (backend, step // EPOCH_SIZE))
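
A hypothetical call for the maze experiment above, assuming the module also defines the Malmo mission as 'mission' (client address and log directory are placeholders):

from malmopy.visualization import ConsoleVisualizer

run_experiment('chainer', device_id=-1, max_epoch=50, record=False,
               clients=['127.0.0.1:10000'], logdir='results/maze',
               visualizer=ConsoleVisualizer())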