def __init__(self, env_id, visualize=False):
    self.env_id = env_id
    self.visualize = visualize
    env_dict = {
        "doom": Doom,
        "flappybird": FlappyBird,
        "monsterkong": MonsterKong,
        "catcher": Catcher,
        "pixelcopter": Pixelcopter,
        "pong": Pong,
        "puckworld": PuckWorld,
        "raycastmaze": RaycastMaze,
        "snake": Snake,
        "waterworld": WaterWorld
    }
    try:
        # Doom-Py is deprecated (it might still work under Python 2.7,
        # but definitely not under 3.6), so reject it explicitly.
        if self.env_id == "doom":
            raise TensorForceError("Doom-Py Deprecated")
        else:
            self.game = env_dict[env_id]()
            self.env = ple.PLE(self.game, display_screen=visualize)
    except KeyError:
        print('Game not implemented in PyGame-Learning-Environment or these bindings')
        print('Implemented environments include:')
        print('"flappybird", "monsterkong", "catcher", "pixelcopter",')
        print('"pong", "puckworld", "raycastmaze", "snake", "waterworld"')
def __init__(self, level, visualize=False, frame_skip=1, fps=30):
    super().__init__()
    import ple

    if isinstance(level, str):
        assert level in PyGameLearningEnvironment.levels()
        level = getattr(ple.games, level)()

    if not visualize:
        # Render headlessly via SDL's dummy video driver.
        os.putenv('SDL_VIDEODRIVER', 'fbcon')
        os.environ['SDL_VIDEODRIVER'] = 'dummy'

    self.environment = ple.PLE(
        game=level, fps=fps, frame_skip=frame_skip, display_screen=visualize
        # num_steps=1, reward_values={}, force_fps=True, add_noop_action=True,
        # NOOP=K_F15, state_preprocessor=None, rng=24
    )
    self.environment.init()

    self.has_game_state = self.environment.getGameStateDims() is not None
    self.available_actions = tuple(self.environment.getActionSet())
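# A minimal usage sketch for the wrapper above (hedged: the enclosing class
# name PyGameLearningEnvironment is taken from the levels() assert, and the
# level string must name a class in ple.games, e.g. 'Catcher').
env = PyGameLearningEnvironment(level='Catcher', visualize=False, frame_skip=4)
print(env.available_actions)  # PLE key codes (plus None when a NOOP is added)
print(env.has_game_state)     # True when the game exposes a state vector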
def __init__(self):
    num_inputs, num_outputs = 7, 1
    config.update(num_inputs, num_outputs)
    self.game = ple.games.pixelcopter.Pixelcopter(width=144, height=144)
    self.env = ple.PLE(self.game, fps=240, display_screen=False, force_fps=True)
    self.action_set = self.env.getActionSet()
    self.env.init()
def __init__(self, game_class, obs_type='rgb', state_preprocessor=None, **kwargs):
    # Render headlessly via SDL's dummy video driver.
    os.putenv('SDL_VIDEODRIVER', 'fbcon')
    os.environ["SDL_VIDEODRIVER"] = "dummy"
    super().__init__()
    self.ple = ple.PLE(
        game_class(**kwargs),
        state_preprocessor=state_preprocessor,
        display_screen=False
    )
    self.ple.init()
    self.reward_range = (
        min(self.ple.game.rewards.values()),
        max(self.ple.game.rewards.values())
    )

    self.obs_type = obs_type
    if self.obs_type == 'rgb':
        self.get_obs = self.ple.getScreenRGB
        self.observation_space = gym.spaces.Box(
            low=0, high=255, shape=(*self.ple.getScreenDims(), 3)
        )
    elif self.obs_type == 'state_vector':
        self.get_obs = self.ple.getGameState
        self.observation_space = gym.spaces.Box(
            low=-1000, high=1000, shape=self.get_obs().shape, dtype=np.float64
        )
    else:
        assert False, "obs_type must be rgb or state_vector"

    # Pad the action set to a fixed Discrete(6) space; unused slots are NOOPs.
    self.action_space = gym.spaces.Discrete(6)
    assert len(self.ple.getActionSet()) < 6
    self._actions = self.ple.getActionSet()
    self._actions += [None for _ in range(6 - len(self._actions))]
    self._action_mapping = self.ple.game.actions
    self._action_mapping['NOOP'] = None

    self.ale = self.ple  # alias so Atari-style code paths keep working
    self.np_random = np.random.RandomState(0)
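# A usage sketch for the gym wrapper above; the class name PLEGymEnv is an
# assumption, since the snippet doesn't show it. Note that 'state_vector'
# mode reads its observation shape from get_obs(), so it needs a
# state_preprocessor that turns PLE's game-state dict into a NumPy array.
def flatten_state(state):
    return np.asarray(list(state.values()), dtype=np.float64)

env = PLEGymEnv(ple.games.Catcher, obs_type='state_vector',
                state_preprocessor=flatten_state)
print(env.observation_space, env.action_space)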
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()

    plt.figure()
    all_scores = []
    all_losses = []
    all_t = []

    agent = PGAgent(len(p.getGameState()), len(p.getActionSet()))
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        transitions = []

        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)

            reward_total += r_t1
            if r_t1 == 1.0:
                pipes += 1
            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            transitions.append([s_t0, a_t0_idx, r_t1])
            s_t0 = s_t1

            if is_end:
                all_scores.append(reward_total)
                break

        # Compute discounted Monte-Carlo returns for each visited state.
        for t in range(len(transitions)):
            R = 0
            for t_c, (s_t0, a_t0_idx, r_t) in enumerate(transitions[t:]):
                R += args.gamma ** t_c * r_t
            s_t0, a_t0_idx, r_t1 = transitions[t]
            agent.replay_memory.push([s_t0, a_t0_idx, R])

        loss = 0
        if len(agent.replay_memory) > args.batch_size:
            loss = agent.replay()

        all_losses.append(loss)
        all_t.append(t)

        metrics_episode = {
            'loss': loss,
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }
        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} {metrics_episode}')

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)
            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)
            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)
            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.p_model.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}.pt'))
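# A sketch of the same discounted-return computation as a single reverse pass
# (O(n) rather than the O(n^2) nested loop above); assumes `transitions`
# holds [state, action_idx, reward] triples exactly as in run().
R = 0.0
returns = []
for s, a_idx, r in reversed(transitions):
    R = r + args.gamma * R
    returns.append(R)
returns.reverse()  # returns[t] now equals the R pushed for transitions[t]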
game = ple.games.waterworld.WaterWorld(
    width=WIDTH,
    height=HEIGHT,
    num_creeps=CREEPS,
)
# Custom reward shaping passed to PLE (overrides the game's defaults).
reward_values = {
    'positive': 10.0,
    'negative': -11.0,
    'tick': -0.01,
    'loss': -5.0,
    'win': 1000000.0
}
env = ple.PLE(game, fps=FPS, display_screen=DISPLAY, reward_values=reward_values)
print("rewards:", game.rewards)

# agent = Agent(actions=env.getActionSet(), load=LOAD, game=game)
agent = Sensors(actions=env.getActionSet(), load=LOAD, game=game)
env.init()

rewards_a = []
scores = []
reward = 0.0
won = 0
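# A minimal evaluation-loop sketch using the PLE calls seen throughout these
# snippets; the random policy is a stand-in, since the Sensors agent's
# action-selection interface isn't shown here.
import random

for episode in range(5):
    env.reset_game()
    while not env.game_over():
        action = random.choice(env.getActionSet())
        reward += env.act(action)
    scores.append(env.score())  # PLE's cumulative score for the episode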
def __init__(self, checkpoint_path="deep_suna_networks"):
    """
    Example of deep q network for pong
    :param checkpoint_path: directory to store checkpoints in
    :type checkpoint_path: str
    """
    self._time = self.START_TIME
    self._checkpoint_path = checkpoint_path
    # set the first action to do nothing
    self._last_action = np.zeros(self.ACTIONS_COUNT)
    self._last_action[1] = 1
    self._last_state = None

    # Create an optimizer that performs gradient descent.
    self.opt = tf.train.AdamOptimizer(self.LEARN_RATE)

    self._input_states = tf.placeholder("float", [
        None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y, self.STATE_FRAMES
    ])

    # with tf.device('/gpu:0'):
    with tf.variable_scope('conv1'):
        self.kernel1 = _variable_with_weight_decay(
            'weights', shape=[8, 8, self.STATE_FRAMES, 32],
            stddev=0.01, wd=None)
        self.biases1 = _variable_on_gpu('biases', [32],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(self._input_states, self.kernel1,
                            [1, 2, 2, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases1)
        conv1 = tf.nn.relu(pre_activation)

    with tf.variable_scope('conv2'):
        self.kernel2 = _variable_with_weight_decay(
            'weights', shape=[4, 4, 32, 64], stddev=0.01, wd=0.0)
        self.biases2 = _variable_on_gpu('biases', [64],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(conv1, self.kernel2, [1, 2, 2, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases2)
        conv2 = tf.nn.relu(pre_activation)
        # _activation_summary(conv2)

    with tf.variable_scope('conv3'):
        self.kernel3 = _variable_with_weight_decay(
            'weights', shape=[3, 3, 64, 64], stddev=0.01, wd=0.0)
        self.biases3 = _variable_on_gpu('biases', [64],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(conv2, self.kernel3, [1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases3)
        conv3 = tf.nn.relu(pre_activation)

    with tf.variable_scope('local3'):
        self.weights4 = _variable_with_weight_decay(
            'weights', shape=[6400, 256], stddev=0.01, wd=0.0)
        self.biases4 = _variable_on_gpu('biases', [256],
                                        tf.constant_initializer(0.01))
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(conv3, [-1, 6400])
        local3 = tf.nn.relu(tf.matmul(reshape, self.weights4) + self.biases4)

    with tf.variable_scope('softmax_linear'):
        self.weights6 = _variable_with_weight_decay(
            'weights', [256, self.ACTIONS_COUNT], stddev=0.01, wd=0.0)
        self.biases6 = _variable_on_gpu('biases', [self.ACTIONS_COUNT],
                                        tf.constant_initializer(0.01))
        self.output_layer = tf.add(tf.matmul(local3, self.weights6),
                                   self.biases6)

    self._session = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))

    self.replay_number = 0
    self.terminal = True
    self.restore_filename = 100000
    self.full_file_eval = False
    self.file_folder = "deep_suna_networks"
    self.initial_env_number = 2

    self.environment = simple.Agent()
    self.env = ple.PLE(self.environment, fps=10,
                       force_fps=self.full_file_eval, display_screen=True)
    self.ple_action_list = self.env.getActionSet()

    self.saver = tf.train.Saver()
    self.saver.restore(
        self._session,
        self.file_folder + "/model" + str(self.restore_filename))

    if self.full_file_eval:
        self.stepsfile = open(self.file_folder + "/Stepsdata.txt", "w")
def __init__(self, checkpoint_path="deep_suna_networks"):
    """
    Example of deep q network for pong
    :param checkpoint_path: directory to store checkpoints in
    :type checkpoint_path: str
    """
    self._time = self.START_TIME
    self._checkpoint_path = checkpoint_path

    # pygame.init()
    self.environment = simple.Agent()
    self.env = ple.PLE(self.environment, display_screen=False)
    self.ple_action_list = self.env.getActionSet()
    # self.env.init()

    # set the first action to do nothing
    self._last_action = np.zeros(self.ACTIONS_COUNT)
    self._last_action[1] = 1
    self._last_state = None

    global_step = tf.Variable(0, trainable=False)
    # Decay the learning rate exponentially based on the number of steps.
    # NOTE: the optimizer below is built with the fixed LEARN_RATE, so this
    # decayed rate is defined but never actually applied.
    self.learning_rate = tf.train.exponential_decay(
        self.INITIAL_LEARNING_RATE, global_step, self.DECAY_STEPS,
        self.LEARNING_RATE_DECAY_FACTOR, staircase=True)
    # Create an optimizer that performs gradient descent.
    self.opt = tf.train.AdamOptimizer(self.LEARN_RATE)

    self._input_states = tf.placeholder("float", [
        None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y, self.STATE_FRAMES
    ])
    self._action = tf.placeholder("float", [None, self.ACTIONS_COUNT])
    self._target = tf.placeholder("float", [None], name="target_Q")
    self._target_input_states = tf.placeholder("float", [
        None, self.RESIZED_SCREEN_X, self.RESIZED_SCREEN_Y, self.STATE_FRAMES
    ])

    # Online network.
    # with tf.device('/gpu:0'):
    with tf.variable_scope('conv1'):
        self.kernel1 = _variable_with_weight_decay(
            'weights', shape=[8, 8, self.STATE_FRAMES, 32],
            stddev=0.01, wd=None)
        self.biases1 = _variable_on_gpu('biases', [32],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(self._input_states, self.kernel1,
                            [1, 2, 2, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases1)
        batch_norm, self.beta1, self.gamma1 = _batch_normalization(
            pre_activation)
        conv1 = tf.nn.relu(batch_norm)

    with tf.variable_scope('conv2'):
        self.kernel2 = _variable_with_weight_decay(
            'weights', shape=[4, 4, 32, 64], stddev=0.01, wd=0.0)
        self.biases2 = _variable_on_gpu('biases', [64],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(conv1, self.kernel2, [1, 2, 2, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases2)
        batch_norm, self.beta2, self.gamma2 = _batch_normalization(
            pre_activation)
        conv2 = tf.nn.relu(batch_norm)

    with tf.variable_scope('conv3'):
        self.kernel3 = _variable_with_weight_decay(
            'weights', shape=[3, 3, 64, 64], stddev=0.01, wd=0.0)
        self.biases3 = _variable_on_gpu('biases', [64],
                                        tf.constant_initializer(0.01))
        conv = tf.nn.conv2d(conv2, self.kernel3, [1, 1, 1, 1], padding='SAME')
        pre_activation = tf.nn.bias_add(conv, self.biases3)
        batch_norm, self.beta3, self.gamma3 = _batch_normalization(
            pre_activation)
        conv3 = tf.nn.relu(batch_norm)

    with tf.variable_scope('local3'):
        self.weights4 = _variable_with_weight_decay(
            'weights', shape=[6400, 256], stddev=0.01, wd=0.0)
        self.biases4 = _variable_on_gpu('biases', [256],
                                        tf.constant_initializer(0.01))
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(conv3, [-1, 6400])
        fully_connected = tf.matmul(reshape, self.weights4) + self.biases4
        local3 = tf.nn.relu(fully_connected)
        # fiction_dropout = tf.nn.dropout(local3, keep_prob=0.5)

    with tf.variable_scope('softmax_linear'):
        self.weights6 = _variable_with_weight_decay(
            'weights', [256, self.ACTIONS_COUNT], stddev=0.01, wd=0.0)
        self.biases6 = _variable_on_gpu('biases', [self.ACTIONS_COUNT],
                                        tf.constant_initializer(0.01))
        self.output_layer = tf.add(tf.matmul(local3, self.weights6),
                                   self.biases6)

    # Q(s, a) for the actions actually taken, and the squared TD-error loss.
    self.readout_action = tf.reduce_sum(
        tf.multiply(self.output_layer, self._action), axis=1)
    self.cost = tf.reduce_mean(tf.square(self._target - self.readout_action))
    grads = self.opt.compute_gradients(self.cost)
    self.tower_grads = grads

    # Target network: a structurally identical copy of the online network,
    # fed from self._target_input_states.
    with tf.variable_scope('target_conv1'):
        self.target_kernel1 = _variable_with_weight_decay(
            'target_weights', shape=[8, 8, self.STATE_FRAMES, 32],
            stddev=0.01, wd=0.0)
        self.target_biases1 = _variable_on_gpu(
            'target_biases', [32], tf.constant_initializer(0.01))
        target_conv = tf.nn.conv2d(self._target_input_states,
                                   self.target_kernel1, [1, 2, 2, 1],
                                   padding='SAME')
        target_pre_activation = target_conv + self.target_biases1
        target_batch_norm, self.target_beta1, self.target_gamma1 = \
            _batch_normalization(target_pre_activation)
        target_conv1 = tf.nn.relu(target_batch_norm)

    with tf.variable_scope('target_conv2'):
        self.target_kernel2 = _variable_with_weight_decay(
            'target_weights', shape=[4, 4, 32, 64], stddev=0.01, wd=0.0)
        self.target_biases2 = _variable_on_gpu(
            'target_biases', [64], tf.constant_initializer(0.01))
        target_conv = tf.nn.conv2d(target_conv1, self.target_kernel2,
                                   [1, 2, 2, 1], padding='SAME')
        target_pre_activation = target_conv + self.target_biases2
        target_batch_norm, self.target_beta2, self.target_gamma2 = \
            _batch_normalization(target_pre_activation)
        target_conv2 = tf.nn.relu(target_batch_norm)

    with tf.variable_scope('target_conv3'):
        self.target_kernel3 = _variable_with_weight_decay(
            'target_weights', shape=[3, 3, 64, 64], stddev=0.01, wd=0.0)
        self.target_biases3 = _variable_on_gpu(
            'target_biases', [64], tf.constant_initializer(0.01))
        target_conv = tf.nn.conv2d(target_conv2, self.target_kernel3,
                                   [1, 1, 1, 1], padding='SAME')
        target_pre_activation = target_conv + self.target_biases3
        target_batch_norm, self.target_beta3, self.target_gamma3 = \
            _batch_normalization(target_pre_activation)
        target_conv3 = tf.nn.relu(target_batch_norm)

    with tf.variable_scope('target_local3'):
        self.target_weights4 = _variable_with_weight_decay(
            'target_weights', shape=[6400, 256], stddev=0.01, wd=0.0)
        self.target_biases4 = _variable_on_gpu(
            'target_biases', [256], tf.constant_initializer(0.01))
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(target_conv3, [-1, 6400])
        target_local3 = tf.nn.relu(
            tf.matmul(reshape, self.target_weights4) + self.target_biases4)

    with tf.variable_scope('target_softmax_linear'):
        self.target_weights6 = _variable_with_weight_decay(
            'target_weights', [256, self.ACTIONS_COUNT], stddev=0.01, wd=0.0)
        self.target_biases6 = _variable_on_gpu(
            'target_biases', [self.ACTIONS_COUNT],
            tf.constant_initializer(0.01))
        self.target_output_layer = tf.add(
            tf.matmul(target_local3, self.target_weights6),
            self.target_biases6)

    # With multiple towers we would average gradients here (the
    # synchronization point across towers), e.g.:
    # grads = average_gradients(self.tower_grads)
    # Apply the gradients to adjust the shared variables.
    self.apply_gradient_op = self.opt.apply_gradients(grads, global_step)

    self._observations = deque()
    self._last_scores = deque()
    self._probability_of_random_action = self.INITIAL_RANDOM_ACTION_PROB

    init_op = tf.global_variables_initializer()
    self._session = tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True, log_device_placement=False))
    # self.merged = tf.summary.merge_all()
    # self.writer = tf.summary.FileWriter(
    #     "/home/uchi/catkin_ws/environment/apple_game/na_logs",
    #     self._session.graph)
    self._session.run(init_op)

    self.duration = 0
    self.terminal = True

    if not os.path.exists(self._checkpoint_path):
        os.mkdir(self._checkpoint_path)
    # write Q-value logs into a file
    self.fileqdata = open(self._checkpoint_path + "/Qdata.txt", "w")
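    # The target network above is never synchronized in this snippet. A
    # standard DQN periodically copies the online weights into it; a hedged
    # sketch of such copy ops over the variables defined above (whether the
    # original trainer syncs this way is an assumption):
    self.sync_target_ops = [
        tf.assign(target, online) for target, online in [
            (self.target_kernel1, self.kernel1),
            (self.target_biases1, self.biases1),
            (self.target_kernel2, self.kernel2),
            (self.target_biases2, self.biases2),
            (self.target_kernel3, self.kernel3),
            (self.target_biases3, self.biases3),
            (self.target_weights4, self.weights4),
            (self.target_biases4, self.biases4),
            (self.target_weights6, self.weights6),
            (self.target_biases6, self.biases6),
        ]
    ]
    # Run every N training steps: self._session.run(self.sync_target_ops)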
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()

    plt.figure()
    all_scores = []
    all_losses = []
    all_losses_a = []
    all_losses_c = []
    all_t = []

    agent = A2CAgent(len(p.getGameState()), len(p.getActionSet()))
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        transitions = []
        states_t1 = []
        end_t1 = []

        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
            end_t1.append(is_end)

            reward_total += r_t1
            if r_t1 == 1.0:
                pipes += 1

            transitions.append([s_t0, a_t0_idx, r_t1])
            states_t1.append(s_t1)
            s_t0 = s_t1

            if is_end:
                all_scores.append(reward_total)
                break

        # Bootstrap one-step targets r + gamma * V(s') from the critic.
        t_states_t1 = torch.FloatTensor(states_t1).to(args.device)
        v_t1 = agent.model_c.forward(t_states_t1)
        np_v_t1 = v_t1.cpu().data.numpy().squeeze()

        for t in range(len(transitions)):
            s_t0, a_t0_idx, r_t1 = transitions[t]
            is_end = end_t1[t]
            delta = r_t1
            if not is_end:
                delta = r_t1 + args.gamma * np_v_t1[t]
            agent.replay_memory.push([s_t0, a_t0_idx, delta])

        loss = loss_a = loss_c = 0
        if len(agent.replay_memory) > args.batch_size:
            loss_a, loss_c = agent.replay()
            loss = loss_a + loss_c

        all_losses.append(loss)
        all_losses_a.append(loss_a)
        all_losses_c.append(loss_c)
        all_t.append(t)

        metrics_episode = {
            'loss': loss,
            'loss_a': loss_a,
            'loss_c': loss_c,
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }
        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} {metrics_episode}')

        if e % 100 == 0:
            plt.clf()
            plt.subplot(5, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)
            plt.subplot(5, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)
            plt.subplot(5, 1, 3)
            plt.ylabel('Loss Actor')
            plt.plot(all_losses_a)
            plt.subplot(5, 1, 4)
            plt.ylabel('Loss Critic')
            plt.plot(all_losses_c)
            plt.subplot(5, 1, 5)
            plt.ylabel('Steps')
            plt.plot(all_t)
            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.model_c.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}-c.pt'))
            torch.save(agent.model_a.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}-a.pt'))
def run():
    game = ple.games.flappybird.FlappyBird()
    # game = ple.games.snake.Snake(width=512, height=512)
    # game = ple.games.pong.Pong(width=512, height=512)
    p = ple.PLE(game, fps=30, display_screen=args.is_render)
    p.init()

    plt.figure()
    all_scores = []
    all_losses = []
    all_t = []

    agent = DQNAgent(len(p.getGameState()), len(p.getActionSet()), args)
    is_end = p.game_over()

    for e in range(args.episodes):
        p.reset_game()
        s_t0 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
        reward_total = 0
        pipes = 0
        episode_loss = []

        for t in range(args.max_steps):
            a_t0_idx = agent.act(s_t0)
            a_t0 = p.getActionSet()[a_t0_idx]
            r_t1 = p.act(a_t0)
            is_end = p.game_over()
            s_t1 = np.asarray(list(p.getGameState().values()), dtype=np.float32)
            reward_total += r_t1

            # Default reward scheme, from
            # PyGame-Learning-Environment/ple/games/base/pygamewrapper.py:
            # self.rewards = {
            #     "positive": 1.0,
            #     "negative": -1.0,
            #     "tick": 0,
            #     "loss": -5.0,
            #     "win": 5.0
            # }
            if r_t1 == 1.0:
                pipes += 1
            if t == args.max_steps - 1:
                r_t1 = -100
                is_end = True

            agent.replay_memory.push((s_t0, a_t0_idx, r_t1, s_t1, is_end))
            s_t0 = s_t1

            if len(agent.replay_memory) > args.batch_size:
                loss = agent.replay()
                episode_loss.append(loss)

            if is_end:
                all_scores.append(reward_total)
                all_losses.append(np.mean(episode_loss))
                break

        all_t.append(t)
        metrics_episode = {
            'loss': all_losses[-1],
            'score': reward_total,
            't': t,
            'e': agent.epsilon,
            'pipes': pipes
        }
        if args.is_csv is True:
            CsvUtils.add_hparams(
                sequence_dir=os.path.join('.', args.sequence_name),
                sequence_name=args.sequence_name,
                run_name=args.run_name,
                args_dict=args.__dict__,
                metrics_dict=metrics_episode,
                global_step=e
            )
        else:
            logging.info(f'episode: {e}/{args.episodes} {metrics_episode}')
            print(f'episode: {e}/{args.episodes} {metrics_episode}')

        if e % 100 == 0 and not args.is_inference:
            # save logs, graphics and weights during training
            plt.clf()
            plt.subplot(3, 1, 1)
            plt.ylabel('Score')
            plt.plot(all_scores)
            plt.subplot(3, 1, 2)
            plt.ylabel('Loss')
            plt.plot(all_losses)
            plt.subplot(3, 1, 3)
            plt.ylabel('Steps')
            plt.plot(all_t)
            plt.xlabel('Episode')
            plt.savefig(os.path.join(seq_run_name, f'plt-{e}.png'))
            torch.save(agent.q_model.cpu().state_dict(),
                       os.path.join(seq_run_name, f'model-{e}.pt'))
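# agent.replay() is not shown in this snippet; below is a hedged sketch of
# the standard DQN update it presumably performs over the (s0, a, r, s1, done)
# tuples pushed above. The attribute names (q_model, optimizer, gamma,
# batch_size) are assumptions, and the memory is assumed iterable.
import random

def replay(self):
    batch = random.sample(list(self.replay_memory), self.batch_size)
    s0, a, r, s1, done = zip(*batch)
    s0 = torch.FloatTensor(np.asarray(s0))
    s1 = torch.FloatTensor(np.asarray(s1))
    r = torch.FloatTensor(r)
    not_done = torch.FloatTensor([0.0 if d else 1.0 for d in done])

    idx = torch.arange(len(batch))
    q = self.q_model(s0)[idx, torch.LongTensor(a)]     # Q(s0, a)
    with torch.no_grad():
        q_next = self.q_model(s1).max(dim=1).values    # max_a' Q(s1, a')
    target = r + self.gamma * q_next * not_done        # TD target

    loss = torch.nn.functional.mse_loss(q, target)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()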