Example 1
 def __init__(self, env_id, visualize=False):
     self.env_id = env_id
     self.visualize = visualize
     env_dict = {
         "doom" : Doom,
         "flappybird" : FlappyBird,
         "monsterkong" : MonsterKong,
         "catcher" : Catcher,
         "pixelcopter" : Pixelcopter,
         "pong" : Pong,
         "puckworld" : PuckWorld,
         "raycastmaze" : RaycastMaze,
         "snake" : Snake,
         "waterworld" : WaterWorld
     }
     try:
         # doom-py is deprecated for Python 3.6; a Python 2.7 port might still be possible
         if self.env_id == "doom":
             raise TensorForceError("Doom-Py Deprecated")
         else:
             self.game = env_dict[env_id]()
             self.env = ple.PLE(self.game, display_screen=visualize)
     except KeyError:
         print('Game not implemented in PyGame-Learning-Environment or these bindings')
         print('Implemented environments include:')
         print('"flappybird", "monsterkong", "catcher", "pixelcopter",')
         print('"pong", "puckworld", "raycastmaze", "snake", "waterworld"')
Example 2
    def __init__(self, level, visualize=False, frame_skip=1, fps=30):
        super().__init__()

        import ple

        if isinstance(level, str):
            assert level in PyGameLearningEnvironment.levels()
            level = getattr(ple.games, level)()

        if not visualize:
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ['SDL_VIDEODRIVER'] = 'dummy'

        self.environment = ple.PLE(
            game=level,
            fps=fps,
            frame_skip=frame_skip,
            display_screen=visualize
            # num_steps=1, reward_values={}, force_fps=True, add_noop_action=True, NOOP=K_F15,
            # state_preprocessor=None, rng=24
        )
        self.environment.init()

        self.has_game_state = self.environment.getGameStateDims() is not None
        self.available_actions = tuple(self.environment.getActionSet())
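
The assert above relies on a levels() helper that is not shown here. A minimal sketch, assuming it simply enumerates the game classes exposed by ple.games (consistent with the getattr(ple.games, level)() call above); the real method may differ.

import ple.games

class PyGameLearningEnvironment:  # skeleton for illustration only
    @staticmethod
    def levels():
        # names of the game classes exposed by ple.games, e.g. 'FlappyBird'
        return [name for name in dir(ple.games)
                if isinstance(getattr(ple.games, name), type)]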
Example 3
    def __init__(self):
        num_inputs, num_outputs = 7, 1
        config.update(num_inputs, num_outputs)

        self.game = ple.games.pixelcopter.Pixelcopter(width=144, height=144)
        self.env = ple.PLE(self.game,
                           fps=240,
                           display_screen=False,
                           force_fps=True)
        self.action_set = self.env.getActionSet()
        self.env.init()
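
A possible episode rollout on top of the fields initialized above (the run_episode name and the random policy are assumptions, not part of the original class):

import random

def run_episode(self):
    self.env.reset_game()
    total_reward = 0.0
    while not self.env.game_over():
        action = random.choice(self.action_set)  # placeholder policy
        total_reward += self.env.act(action)     # PLE returns the per-frame reward
    return total_reward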
Example 4
        def __init__(self):
            os.putenv('SDL_VIDEODRIVER', 'fbcon')
            os.environ["SDL_VIDEODRIVER"] = "dummy"
            super().__init__()
            self.ple = ple.PLE(
                game_class(**kwargs),
                state_preprocessor=state_preprocessor,
                display_screen=False
            )

            self.ple.init()

            self.reward_range = (
                min(self.ple.game.rewards.values()),
                max(self.ple.game.rewards.values())
            )

            self.obs_type = obs_type
            if self.obs_type == 'rgb':
                self.get_obs = self.ple.getScreenRGB
                self.observation_space = gym.spaces.Box(
                    low=0, high=255, shape=(*self.ple.getScreenDims(), 3)
                )
            elif self.obs_type == 'state_vector':
                self.get_obs = self.ple.getGameState
                self.observation_space = gym.spaces.Box(
                    low=-1000, high=1000, shape=self.get_obs().shape, dtype=np.float64
                )
            else:
                assert False, "obs_type must be rgb or state_vector"

            self.action_space = gym.spaces.Discrete(6)
            assert len(self.ple.getActionSet()) < 6
            self._actions = self.ple.getActionSet()
            self._actions += [
                None for _ in range(6 - len(self._actions))
            ]
            self._action_mapping = self.ple.game.actions
            self._action_mapping['NOOP'] = None

            self.ale = self.ple
            self.np_random = np.random.RandomState(0)
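
The matching gym-style step() is not shown; a minimal sketch, assuming the padded self._actions list built above (indices past the real action set map to None, i.e. a no-op):

def step(self, action_idx):
    reward = self.ple.act(self._actions[action_idx])
    obs = self.get_obs()
    done = self.ple.game_over()
    return obs, reward, done, {}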
Example 5
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()
    plt.figure()

    all_scores = []
    all_losses = []
    all_t = []

    agent = PGAgent(len(p.getGameState()), len(p.getActionSet()))
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0

        transitions = []
        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()),
                              dtype=np.float32)
            reward_total += r_t1

            if r_t1 == 1.0:
                pipes += 1

            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            transitions.append([s_t0, a_t0_idx, r_t1])
            s_t0 = s_t1

            if is_end:
                all_scores.append(reward_total)
                break

        for t in range(len(transitions)):
            R = 0
            for t_c, (s_t0, a_t0_idx, r_t) in enumerate(transitions[t:]):
                R += args.gamma**t_c * r_t

            s_t0, a_t0_idx, r_t1 = transitions[t]
            tr = [s_t0, a_t0_idx, R]
            agent.replay_memory.push(tr)

        loss = 0
        if len(agent.replay_memory) > args.batch_size:
            loss = agent.replay()
            all_losses.append(loss)

        all_t.append(t)

        metrics_episode = {
            'loss': loss,
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }

        if args.is_csv is True:
            CsvUtils.add_hparams(sequence_dir=os.path.join(
                '.', args.sequence_name),
                                 sequence_name=args.sequence_name,
                                 run_name=args.run_name,
                                 args_dict=args.__dict__,
                                 metrics_dict=metrics_episode,
                                 global_step=e)
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()

            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)

            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)

            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)

            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.p_model.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}.pt'))
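
The nested return computation in the loop above is O(n²) in episode length; the same discounted returns can be accumulated in a single reverse pass. An equivalent sketch, assuming the same transitions list of [state, action_idx, reward] entries and the same args.gamma:

R = 0.0
returns = []
for s_t0, a_t0_idx, r_t in reversed(transitions):
    R = r_t + args.gamma * R            # R_t = r_t + gamma * R_{t+1}
    returns.append([s_t0, a_t0_idx, R])
for tr in reversed(returns):            # restore chronological order
    agent.replay_memory.push(tr)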
Example 6
    game = ple.games.waterworld.WaterWorld(
        width=WIDTH,
        height=HEIGHT,
        num_creeps=CREEPS,
    )

    reward_values = {
        'positive': 10.0,
        'negative': -11.0,
        'tick': -0.01,
        'loss': -5.0,
        'win': 1000000.0
    }

    env = ple.PLE(game,
                  fps=FPS,
                  display_screen=DISPLAY,
                  reward_values=reward_values)
    print("rewards :", game.rewards)

    # agent = Agent(actions=env.getActionSet(),
    #               load=LOAD, game=game)

    agent = Sensors(actions=env.getActionSet(), load=LOAD, game=game)

    env.init()

    rewards_a = []
    scores = []
    reward = 0.0

    won = 0
Example 7
    def __init__(self, checkpoint_path="deep_suna_networks"):
        """
        Example of deep q network for pong

        :param checkpoint_path: directory to store checkpoints in
        :type checkpoint_path: str
        """

        self._time = self.START_TIME
        self._checkpoint_path = checkpoint_path

        # set the first action to do nothing
        self._last_action = np.zeros(self.ACTIONS_COUNT)
        self._last_action[1] = 1

        self._last_state = None

        # Create an optimizer that performs gradient descent.
        self.opt = tf.train.AdamOptimizer(self.LEARN_RATE)

        self._input_states = tf.placeholder("float", [
            None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y,
            self.STATE_FRAMES
        ])
        # with tf.device('/gpu:0'):
        with tf.variable_scope('conv1'):
            self.kernel1 = _variable_with_weight_decay(
                'weights',
                shape=[8, 8, self.STATE_FRAMES, 32],
                stddev=0.01,
                wd=None)
            self.biases1 = _variable_on_gpu('biases', [32],
                                            tf.constant_initializer(0.01))
            conv = tf.nn.conv2d(self._input_states,
                                self.kernel1, [1, 2, 2, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases1)
            conv1 = tf.nn.relu(pre_activation)

        with tf.variable_scope('conv2'):
            self.kernel2 = _variable_with_weight_decay('weights',
                                                       shape=[4, 4, 32, 64],
                                                       stddev=0.01,
                                                       wd=0.0)
            self.biases2 = _variable_on_gpu('biases', [64],
                                            tf.constant_initializer(0.01))
            # conv2
            conv = tf.nn.conv2d(conv1,
                                self.kernel2, [1, 2, 2, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases2)
            conv2 = tf.nn.relu(pre_activation)
            # _activation_summary(conv2)

        with tf.variable_scope('conv3'):
            self.kernel3 = _variable_with_weight_decay('weights',
                                                       shape=[3, 3, 64, 64],
                                                       stddev=0.01,
                                                       wd=0.0)
            self.biases3 = _variable_on_gpu('biases', [64],
                                            tf.constant_initializer(0.01))
            # conv3
            conv = tf.nn.conv2d(conv2,
                                self.kernel3, [1, 1, 1, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases3)
            conv3 = tf.nn.relu(pre_activation)

        with tf.variable_scope('local3'):
            self.weights4 = _variable_with_weight_decay('weights',
                                                        shape=[6400, 256],
                                                        stddev=0.01,
                                                        wd=0.0)
            self.biases4 = _variable_on_gpu('biases', [256],
                                            tf.constant_initializer(0.01))
            # local3
            # Move everything into depth so we can perform a single matrix multiply.
            reshape = tf.reshape(conv3, [-1, 6400])
            # dim = reshape.get_shape()[1].value
            local3 = tf.nn.relu(
                tf.matmul(reshape, self.weights4) + self.biases4)

        with tf.variable_scope('softmax_linear'):
            self.weights6 = _variable_with_weight_decay(
                'weights', [256, self.ACTIONS_COUNT], stddev=0.01, wd=0.0)
            self.biases6 = _variable_on_gpu('biases', [self.ACTIONS_COUNT],
                                            tf.constant_initializer(0.01))
            self.output_layer = tf.add(tf.matmul(local3, self.weights6),
                                       self.biases6)
            # _activation_summary(self.output_layer[d])

        self._session = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        self.replay_number = 0
        self.terminal = True

        self.restore_filename = 100000
        self.full_file_eval = False
        self.file_folder = "deep_suna_networks"
        self.initial_env_number = 2

        self.environment = simple.Agent()
        self.env = ple.PLE(self.environment,
                           fps=10,
                           force_fps=self.full_file_eval,
                           display_screen=True)
        self.ple_action_list = self.env.getActionSet()

        self.saver = tf.train.Saver()
        self.saver.restore(
            self._session,
            self.file_folder + "/model" + str(self.restore_filename))
        if self.full_file_eval:
            self.stepsfile = open(self.file_folder + "/Stepsdata.txt", "w")
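
An evaluation step with the restored network might look like the following sketch; stacked_frames stands in for the screen preprocessing implied by RESIZED_SCREEN_X/Y and STATE_FRAMES, which this snippet does not show:

        # hypothetical evaluation step inside a method of this class
        q_values = self._session.run(
            self.output_layer,
            feed_dict={self._input_states: [stacked_frames]})
        action = self.ple_action_list[int(np.argmax(q_values[0]))]
        reward = self.env.act(action)
        terminal = self.env.game_over()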
Example 8
    def __init__(self, checkpoint_path="deep_suna_networks"):
        """
        Example of deep q network for pong

        :param checkpoint_path: directory to store checkpoints in
        :type checkpoint_path: str
        """

        self._time = self.START_TIME
        self._checkpoint_path = checkpoint_path

        # pygame.init()
        self.environment = simple.Agent()
        self.env = ple.PLE(self.environment, display_screen=False)
        self.ple_action_list = self.env.getActionSet()

        # self.env.init()

        # set the first action to do nothing
        self._last_action = np.zeros(self.ACTIONS_COUNT)
        self._last_action[1] = 1

        self._last_state = None

        global_step = tf.Variable(0, trainable=False)
        # Decay the learning rate exponentially based on the number of steps.
        self.learning_rate = tf.train.exponential_decay(
            self.INITIAL_LEARNING_RATE,
            global_step,
            self.DECAY_STEPS,
            self.LEARNING_RATE_DECAY_FACTOR,
            staircase=True)

        # Create an optimizer that performs gradient descent.
        self.opt = tf.train.AdamOptimizer(self.LEARN_RATE)

        # Calculate the gradients for each model tower.
        # self.tower_grads = []

        self._input_states = tf.placeholder("float", [
            None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y,
            self.STATE_FRAMES
        ])
        self._action = tf.placeholder("float", [None, self.ACTIONS_COUNT])
        self._target = tf.placeholder("float", [None], name="target_Q")
        self._target_input_states = tf.placeholder("float", [
            None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y,
            self.STATE_FRAMES
        ])
        # self.readout_action.append(None)
        # self.cost.append(None)

        # with tf.device('/gpu:0'):
        with tf.variable_scope('conv1'):
            self.kernel1 = _variable_with_weight_decay(
                'weights',
                shape=[8, 8, self.STATE_FRAMES, 32],
                stddev=0.01,
                wd=None)
            self.biases1 = _variable_on_gpu('biases', [32],
                                            tf.constant_initializer(0.01))
            conv = tf.nn.conv2d(self._input_states,
                                self.kernel1, [1, 2, 2, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases1)
            batch_norm, self.beta1, self.gamma1 = _batch_normalization(
                pre_activation)
            conv1 = tf.nn.relu(batch_norm)

        with tf.variable_scope('conv2'):
            self.kernel2 = _variable_with_weight_decay('weights',
                                                       shape=[4, 4, 32, 64],
                                                       stddev=0.01,
                                                       wd=0.0)
            self.biases2 = _variable_on_gpu('biases', [64],
                                            tf.constant_initializer(0.01))
            # conv2
            conv = tf.nn.conv2d(conv1,
                                self.kernel2, [1, 2, 2, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases2)
            batch_norm, self.beta2, self.gamma2 = _batch_normalization(
                pre_activation)
            conv2 = tf.nn.relu(batch_norm)

        with tf.variable_scope('conv3'):
            self.kernel3 = _variable_with_weight_decay('weights',
                                                       shape=[3, 3, 64, 64],
                                                       stddev=0.01,
                                                       wd=0.0)
            self.biases3 = _variable_on_gpu('biases', [64],
                                            tf.constant_initializer(0.01))
            # conv3
            conv = tf.nn.conv2d(conv2,
                                self.kernel3, [1, 1, 1, 1],
                                padding='SAME')
            pre_activation = tf.nn.bias_add(conv, self.biases3)
            batch_norm, self.beta3, self.gamma3 = _batch_normalization(
                pre_activation)
            conv3 = tf.nn.relu(batch_norm)

        with tf.variable_scope('local3'):
            self.weights4 = _variable_with_weight_decay('weights',
                                                        shape=[6400, 256],
                                                        stddev=0.01,
                                                        wd=0.0)
            self.biases4 = _variable_on_gpu('biases', [256],
                                            tf.constant_initializer(0.01))
            # local3
            # Move everything into depth so we can perform a single matrix multiply.
            reshape = tf.reshape(conv3, [-1, 6400])
            # dim = reshape.get_shape()[1].value
            fully_connected = tf.matmul(reshape, self.weights4) + self.biases4
            # batch_norm = _batch_normalization(fully_connected, 256, 2)
            local3 = tf.nn.relu(fully_connected)
            # _activation_summary(local3)
            # fiction_dropout = tf.nn.dropout(local3, keep_prob=0.5)

        with tf.variable_scope('softmax_linear'):
            self.weights6 = _variable_with_weight_decay(
                'weights', [256, self.ACTIONS_COUNT], stddev=0.01, wd=0.0)
            self.biases6 = _variable_on_gpu('biases', [self.ACTIONS_COUNT],
                                            tf.constant_initializer(0.01))
            self.output_layer = tf.add(tf.matmul(local3, self.weights6),
                                       self.biases6)
            # _activation_summary(self.output_layer[d])

        self.readout_action = tf.reduce_sum(tf.multiply(
            self.output_layer, self._action),
                                            axis=1)
        self.cost = tf.reduce_mean(
            tf.square(self._target - self.readout_action))
        # tf.scalar_summary("loss%d" % d, self.cost)

        # tf.add_to_collection('losses', self.cost)
        # losses = tf.get_collection('losses')
        # total_loss = tf.add_n(losses, name='total_loss')
        grads = self.opt.compute_gradients(self.cost)
        self.tower_grads = grads

        # with tf.device('/gpu:0'):
        with tf.variable_scope('target_conv1'):
            self.target_kernel1 = _variable_with_weight_decay(
                'target_weights',
                shape=[8, 8, self.STATE_FRAMES, 32],
                stddev=0.01,
                wd=0.0)
            self.target_biases1 = _variable_on_gpu(
                'target_biases', [32], tf.constant_initializer(0.01))
            target_conv = tf.nn.conv2d(self._target_input_states,
                                       self.target_kernel1, [1, 2, 2, 1],
                                       padding='SAME')
            target_pre_activation = target_conv + self.target_biases1
            target_batch_norm, self.target_beta1, self.target_gamma1 = _batch_normalization(
                target_pre_activation)
            target_conv1 = tf.nn.relu(target_batch_norm)

        with tf.variable_scope('target_conv2'):
            self.target_kernel2 = _variable_with_weight_decay(
                'target_weights', shape=[4, 4, 32, 64], stddev=0.01, wd=0.0)
            self.target_biases2 = _variable_on_gpu(
                'target_biases', [64], tf.constant_initializer(0.01))
            # conv2
            target_conv = tf.nn.conv2d(target_conv1,
                                       self.target_kernel2, [1, 2, 2, 1],
                                       padding='SAME')
            target_pre_activation = target_conv + self.target_biases2
            target_batch_norm, self.target_beta2, self.target_gamma2 = _batch_normalization(
                target_pre_activation)
            target_conv2 = tf.nn.relu(target_batch_norm)

        with tf.variable_scope('target_conv3'):
            self.target_kernel3 = _variable_with_weight_decay(
                'target_weights', shape=[3, 3, 64, 64], stddev=0.01, wd=0.0)
            self.target_biases3 = _variable_on_gpu(
                'target_biases', [64], tf.constant_initializer(0.01))
            # conv3
            target_conv = tf.nn.conv2d(target_conv2,
                                       self.target_kernel3, [1, 1, 1, 1],
                                       padding='SAME')
            target_pre_activation = target_conv + self.target_biases3
            target_batch_norm, self.target_beta3, self.target_gamma3 = _batch_normalization(
                target_pre_activation)
            target_conv3 = tf.nn.relu(target_batch_norm)

        with tf.variable_scope('target_local3'):
            self.target_weights4 = _variable_with_weight_decay(
                'target_weights', shape=[6400, 256], stddev=0.01, wd=0.0)
            self.target_biases4 = _variable_on_gpu(
                'target_biases', [256], tf.constant_initializer(0.01))
            # local3
            # Move everything into depth so we can perform a single matrix multiply.
            reshape = tf.reshape(target_conv3, [-1, 6400])
            # dim = reshape.get_shape()[1].value

            target_local3 = tf.nn.relu(
                tf.matmul(reshape, self.target_weights4) + self.target_biases4)
            # _activation_summary(target_local3)
            # target_fiction_dropout = tf.nn.dropout(target_local3, keep_prob=0.5)

        with tf.variable_scope('target_softmax_linear'):
            self.target_weights6 = _variable_with_weight_decay(
                'target_weights', [256, self.ACTIONS_COUNT],
                stddev=0.01,
                wd=0.0)
            self.target_biases6 = _variable_on_gpu(
                'target_biases', [self.ACTIONS_COUNT],
                tf.constant_initializer(0.01))

            self.target_output_layer = tf.add(
                tf.matmul(target_local3, self.target_weights6),
                self.target_biases6)
        # self._train_operation1 = tf.train.AdamOptimizer(self.LEARN_RATE).minimize(self.cost[0])
        # self._train_operation2 = tf.train.AdamOptimizer(self.LEARN_RATE).minimize(self.cost[1])

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        # grads = average_gradients(self.tower_grads)

        # Apply the gradients to adjust the shared variables.
        self.apply_gradient_op = self.opt.apply_gradients(grads, global_step)

        self._observations = deque()
        self._last_scores = deque()

        self._probability_of_random_action = self.INITIAL_RANDOM_ACTION_PROB

        init_op = tf.global_variables_initializer()
        self._session = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True, log_device_placement=False))

        # self.merged = tf.summary.merge_all()
        # self.writer = tf.summary.FileWriter("/home/uchi/catkin_ws/environment/apple_game/na_logs",
        #                                      self._session.graph)

        self._session.run(init_op)

        self.duration = 0
        self.terminal = True

        if not os.path.exists(self._checkpoint_path):
            os.mkdir(self._checkpoint_path)
        # write into a file
        self.fileqdata = open(self._checkpoint_path + "/Qdata.txt", "w")
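
Not shown in this constructor is how the target_* variables get refreshed from the online network. A minimal TF1 sketch using the variable names defined above (the method name and sync schedule are assumptions; the batch-norm beta/gamma pairs would be copied the same way):

    def _build_target_sync_op(self):
        pairs = [
            (self.target_kernel1, self.kernel1), (self.target_biases1, self.biases1),
            (self.target_kernel2, self.kernel2), (self.target_biases2, self.biases2),
            (self.target_kernel3, self.kernel3), (self.target_biases3, self.biases3),
            (self.target_weights4, self.weights4), (self.target_biases4, self.biases4),
            (self.target_weights6, self.weights6), (self.target_biases6, self.biases6),
        ]
        # copy each online variable into its target counterpart
        return tf.group(*[tf.assign(target, online) for target, online in pairs])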
Example 9
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()
    plt.figure()

    all_scores = []
    all_losses = []
    all_losses_a = []
    all_losses_c = []
    all_t = []

    agent = A2CAgent(len(p.getGameState()), len(p.getActionSet()))
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0

        transitions = []
        states_t1 = []
        end_t1 = []
        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
            end_t1.append(is_end)
            reward_total += r_t1

            if r_t1 == 1.0:
                pipes += 1

            transitions.append([s_t0, a_t0_idx, r_t1])
            states_t1.append(s_t1)
            s_t0 = s_t1

            if is_end:
                all_scores.append(reward_total)
                break

        t_states_t1 = torch.FloatTensor(states_t1).to(args.device)
        v_t1 = agent.model_c.forward(t_states_t1)
        np_v_t1 = v_t1.cpu().data.numpy().squeeze()
        for t in range(len(transitions)):
            s_t0, a_t0_idx, r_t1 = transitions[t]
            is_end = end_t1[t]
            delta = r_t1
            if not is_end:
                delta = r_t1 + args.gamma * np_v_t1[t]
            agent.replay_memory.push([s_t0, a_t0_idx, delta])

        loss = loss_a = loss_c = 0
        if len(agent.replay_memory) > args.batch_size:
            loss_a, loss_c = agent.replay()
            loss = loss_a + loss_c

            all_losses.append(loss)
            all_losses_a.append(loss_a)
            all_losses_c.append(loss_c)

        all_t.append(t)

        metrics_episode = {
            'loss': loss,
            'loss_a': loss_a,
            'loss_c': loss_c,
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }

        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)

        if e % 100 == 0:
            plt.clf()

            plt.subplot(5, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)

            plt.subplot(5, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)

            plt.subplot(5, 1, 3)
            plt.ylabel('Loss Actor')
            plt.plot(all_losses_a)

            plt.subplot(5, 1, 4)
            plt.ylabel('Loss Critic')
            plt.plot(all_losses_c)

            plt.subplot(5, 1, 5)
            plt.ylabel('Steps')
            plt.plot(all_t)

            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.model_c.cpu().state_dict(), os.path.join(seq_run_name, f'model-{e}-c.pt'))
            torch.save(agent.model_a.cpu().state_dict(), os.path.join(seq_run_name, f'model-{e}-a.pt'))
Example 10
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()

    plt.figure()

    all_scores = []
    all_losses = []
    all_t = []

    agent = DQNAgent(len(p.getGameState()), len(p.getActionSet()), args)
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        episode_loss = []
        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)

            reward_total += r_t1

            '''
            from /PyGame-Learning-Environment/ple/games/base/pygamewrapper.py
            self.rewards = {
            "positive": 1.0,
            "negative": -1.0,
            "tick": 0,
            "loss": -5.0,
            "win": 5.0
            }
            '''
            if r_t1 == 1.0:
                pipes += 1

            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            agent.replay_memory.push(
                (s_t0, a_t0_idx, r_t1, s_t1, is_end)
            )
            s_t0 = s_t1

            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)

            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                break

        all_t.append(t)

        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }

        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} ', metrics_episode)

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()

            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)

            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)

            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)

            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.q_model.cpu().state_dict(), os.path.join(seq_run_name, f'model-{e}.pt'))