Code Example #1
File: apl_ddpg.py  Project: mdheller/costar_plan
    def fit(self, *args, **kwargs):
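        """Train the DDPG actor and critic on self.env.

        Builds the actor/critic networks and the replay memory, then runs
        training episodes, evaluating every 10th episode without exploration
        noise and periodically saving weights and the reward plot.
        """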

        MEM_SZ = MEM_SIZE_FCL

        # Reuse the Keras backend session and put layers such as Dropout and
        # BatchNorm into training mode (learning phase 1)
        sess = K.get_session()
        K.set_learning_phase(1)

        self.actor = ActorNetwork(sess,
                                  self.state_dim,
                                  self.nn_action_dim,
                                  BATCH_SIZE,
                                  TAU,
                                  LRA,
                                  convolutional=CONVOLUTIONAL,
                                  output_activation=ACTION_ACTIVATION)
        self.critic = CriticNetwork(sess,
                                    self.state_dim,
                                    self.nn_action_dim,
                                    BATCH_SIZE,
                                    TAU,
                                    LRC,
                                    convolutional=CONVOLUTIONAL)

        self.memory = Memory(MEM_SZ)
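        # Replay memory: (state, action, reward, next_state, done) transitions
        # are added in the training loop below and sampled with getMiniBatch()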

        self.actor.target_model.summary()
        self.critic.target_model.summary()

        # Optionally resume from weights saved by an earlier run (both the
        # online and the target networks)
        if LOAD_WEIGHTS:
            self.actor.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                          "actor_model_" +
                                          LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.model.load_weights(LOAD_WEIGHTS_PREFIX +
                                           "critic_model_" +
                                           LOAD_WEIGHTS_EPISODE + ".h5")
            self.actor.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                 "actor_target_model_" +
                                                 LOAD_WEIGHTS_EPISODE + ".h5")
            self.critic.target_model.load_weights(LOAD_WEIGHTS_PREFIX +
                                                  "critic_target_model_" +
                                                  LOAD_WEIGHTS_EPISODE + ".h5")
            print("Weights Loaded!")

        # ====================================================
        # Initialize noise processes -- the per-dimension OU processes below
        # are disabled; a single self.noise process is reset per episode instead
        # self.noise_procs = []
        # for i in range(NUM_NOISE_PROCS):
        #     self.noise_procs.append(OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV))
        # ====================================================

        PRE_LEARNING_EPISODES = STARTING_EPISODE + PRE_LEARNING_EPS
        steps = STARTING_EPISODE * EPISODE_LENGTH
        start_time = time.time()
        last_ep_time = time.time()
        if MAKE_PLOT:
            reward_graph = Grapher()

        for ep in range(STARTING_EPISODE, EPISODES):

            # Reset the exploration noise process at the start of each episode
            # (previously: for ou in self.noise_procs: ou.reset())
            self.noise.reset()

            # Restart the uptime clock once the pre-learning episodes are done
            if ep == PRE_LEARNING_EPISODES:
                start_time = time.time()

            print("Episode: " + str(ep) + "  Frames: " +
                  str(ep * EPISODE_LENGTH) + "  Uptime: " + str(
                      (time.time() - start_time) / 3600.0) +
                  " hrs    ===========")

            state = self.env.reset()

            # Every 10th episode is a test/evaluation episode
            play_only = (ep % 10 == 0)

            total_reward = 0

            if play_only or ALREADY_TRAINED:
                for step in range(TEST_EPISODE_LENGTH):

                    # Earlier image preprocessing, kept here for reference:
                    # img = np.array([np.subtract(img, 128)], dtype=np.float32)  # zero center
                    # img = np.multiply(img, 1.0 / 128.0)  # scale to [-1, 1]
                    # img = np.array(state)
                    # img = np.transpose(img, (1, 2, 0))

                    state = np.reshape(state, state.shape + (1, ))

                    # Act deterministically with the target networks (no
                    # exploration noise) during evaluation
                    action, control_action = self.selectAction(
                        state, can_be_random=False, use_target=True)

                    nstate, reward, done, info = self.env.step(control_action)
                    total_reward += reward
                    state = nstate
            else:
                for step in range(EPISODE_LENGTH):

                    # ACT ==============================
                    # Linearly anneal epsilon from EPSILON_RANGE[0] to
                    # EPSILON_RANGE[1] over EPSILON_STEPS steps, then hold it
                    frac = min(float(steps) / float(EPSILON_STEPS), 1.0)
                    epsilon = EPSILON_RANGE[0] + frac * (
                        EPSILON_RANGE[1] - EPSILON_RANGE[0])

                    state = np.reshape(state, state.shape + (1, ))

                    action, control_action = self.selectAction(state,
                                                               epsilon=epsilon)
                    new_state, reward, done, info = self.env.step(
                        control_action)
                    # range() never reaches EPISODE_LENGTH itself, so use the last
                    # step index to mark the timeout transition as terminal
                    done = done or (step >= EPISODE_LENGTH - 1)
                    self.memory.addMemory(state, action, reward, new_state,
                                          done)
                    state = new_state

                    # LEARN ============================
                    if ep > PRE_LEARNING_EPISODES:
                        batch, idxs = self.memory.getMiniBatch(BATCH_SIZE)
                        self.learnFromBatch(batch)
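                        # learnFromBatch() (defined elsewhere in apl_ddpg.py)
                        # presumably performs the DDPG update on this minibatch:
                        # a critic regression toward the Bellman target, an actor
                        # policy-gradient step, and TAU-weighted target updates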

                    if done:
                        break
                    # CLEANUP ==========================
                    steps += 1

            # Evaluation episodes run without exploration noise, so they give a
            # truer picture of how the system is actually doing
            if play_only and MAKE_PLOT:
                reward_graph.addSample(total_reward)
                reward_graph.displayPlot()

            # Calculate frames-per-hour (fph) over all frames since learning began
            total_frames = (ep - PRE_LEARNING_EPISODES) * EPISODE_LENGTH
            elapsed = time.time() - start_time
            fps = total_frames / elapsed
            fph = fps * 3600.0

            # Re-calculate fps over just this episode so the readout updates quickly
            fps = EPISODE_LENGTH / (time.time() - last_ep_time)
            last_ep_time = time.time()
            print("fps: " + str(fps) + "  fph: " + str(fph) + "\n")

            # Periodically save the reward plot and the network weights
            if (ep > 0 and ep % EPISODE_SAVE_FREQUENCY == 0
                    and not ALREADY_TRAINED):

                # Plot
                if MAKE_PLOT:
                    reward_graph.savePlot(SAVE_WEIGHTS_PREFIX + "graph_" +
                                          str(ep) + ".jpg")

                # Weights
                self.actor.model.save_weights(SAVE_WEIGHTS_PREFIX +
                                              "actor_model_" + str(ep) + ".h5",
                                              overwrite=True)
                self.actor.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)
                self.critic.model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) + ".h5",
                    overwrite=True)
                self.critic.target_model.save_weights(
                    SAVE_WEIGHTS_PREFIX + "critic_target_model_" + str(ep) +
                    ".h5",
                    overwrite=True)

                # Network structures (although I don't think I ever actually use these)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "actor_target_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.actor.target_model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_model_" + str(ep) +
                        ".json", "w") as outfile:
                    json.dump(self.critic.model.to_json(), outfile)
                with open(
                        SAVE_WEIGHTS_PREFIX + "critic_target_model_" +
                        str(ep) + ".json", "w") as outfile:
                    json.dump(self.critic.target_model.to_json(), outfile)
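
The Memory and OUProcess classes referenced above are not included in this
example. Purely as an illustrative sketch of the interfaces fit() relies on
(Memory(size), addMemory(state, action, reward, next_state, done),
getMiniBatch(batch_size) returning (batch, idxs), and an Ornstein-Uhlenbeck
noise process constructed with (mean, theta, std_dev) and offering reset()),
minimal stand-ins might look like the code below. The real implementations in
costar_plan may differ: the returned idxs suggest the actual buffer could
support prioritized replay, the noise-sampling step is not visible in this
snippet, and the size/dt arguments of the sketch are additions, not taken
from the original.

import random
from collections import deque, namedtuple

import numpy as np

Transition = namedtuple(
    "Transition", ["state", "action", "reward", "next_state", "done"])


class Memory(object):
    """Minimal uniform replay buffer (illustrative stand-in only)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def addMemory(self, state, action, reward, next_state, done):
        self.buffer.append(Transition(state, action, reward, next_state, done))

    def getMiniBatch(self, batch_size):
        # Uniform random sampling; returns (batch, idxs) as unpacked in fit()
        idxs = random.sample(range(len(self.buffer)),
                             min(batch_size, len(self.buffer)))
        return [self.buffer[i] for i in idxs], idxs


class OUProcess(object):
    """Minimal Ornstein-Uhlenbeck noise process (illustrative stand-in only)."""

    def __init__(self, mean, theta, std_dev, size=1, dt=1.0):
        self.mean, self.theta, self.std_dev = mean, theta, std_dev
        self.size, self.dt = size, dt
        self.reset()

    def reset(self):
        # Restart the process at its mean, as done at the top of each episode
        self.x = np.ones(self.size) * self.mean

    def sample(self):
        # dx = theta * (mean - x) * dt + std_dev * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mean - self.x) * self.dt +
              self.std_dev * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x = self.x + dx
        return self.x

With these stand-ins, Memory(MEM_SZ) and the commented-out
OUProcess(OU_MEAN, OU_THETA, OU_STD_DEV) calls above would run unchanged;
how selectAction() actually consumes the exploration noise is outside the
scope of this example.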