var=var)
    net.load()

    # Train
    reward_list = []
    for i in range(episode):
        s = env.reset()
        total_reward = 0
        for j in range(max_iter):
            # env.render()
            a = net.chooseAction(s)
            s_, r, finish, info = env.step(a)

            # 將資料存到記憶庫並更新參數
            net.store_path(s, a, r / 10, s_)
            net.update()

            # 更新total reward和s_t資訊
            s = s_
            total_reward += r
            if j == max_iter - 1:
                print(
                    "Episode: %d \tReward: %i \t Explore: %.2f \t Pointer: %d"
                    % (i, total_reward, net.var, net.memory_counter))
                reward_list.append(total_reward)
                break
    net.save()
    env.close()
    plt.plot(range(len(reward_list)), reward_list, '-o')
    plt.title('The reward curve of DDPG')
    plt.show()
Ejemplo n.º 2
0
def master_loop(env):

    logger = logging.getLogger()
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    fileHandler = logging.FileHandler('./log/test.log')
    fileHandler.setFormatter(formatter)

    logger.addHandler(fileHandler)
    logger.setLevel(logging.INFO)

    s_dim = env.get_s_dim()
    a_dim = env.get_a_dim()
    a_high = env.get_a_high()
    a_low = env.get_a_low()
    # print(a_bound)
    print("s_dim: {}, a_dim{}, a_high:{}, a_low:{}".format(
        s_dim, a_dim, a_high, a_low))
    ddpg = DDPG(a_dim,
                s_dim,
                a_high,
                a_low,
                lr_a=LR_A,
                lr_c=LR_C,
                gamma=GAMMA,
                tau=TAU,
                rpm_size=MEMORY_CAPACITY,
                batch_size=BATCH_SIZE)

    status = MPI.Status()
    start_time = time.time()
    reset_time = time.time()

    total_eps = 0
    total_step = 0

    n_step = 0
    n_eps = 0

    max_reward = -9999
    max_reward_rank = 0

    ddpg.load()

    while total_eps < MAX_EPISODES:
        data = comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
        source = status.Get_source()
        tag = status.Get_tag()

        if tag == REQ_ACTION:
            # action = env.action_space.sample()
            action = ddpg.choose_action(data)
            comm.send((action, total_eps, total_step),
                      dest=source,
                      tag=RSP_ACTION)

        elif tag == OBS_DATA:
            n_step += 1
            total_step += 1
            (s, a, r, s_, done, ep_reward, ep_step) = data
            is_done = 0.0
            if done:
                is_done = 1.0

            ddpg.store_transition(s, a, r, s_, is_done)

            if ddpg.pointer > LEARN_START and total_step % 3 == 0:
                ddpg.learn()

            if done:
                total_eps += 1
                if ep_reward > max_reward:
                    max_reward = ep_reward
                    max_reward_rank = source

                s = "eps: {:>8}, worker: {:>3}, ep_reward:{:7.4f}, max:{:7.4f}/{:>3}, step:{:4}".format(
                    total_eps, source, ep_reward, max_reward, max_reward_rank,
                    ep_step)
                #print(s)
                logging.info(s)

                if total_eps % 500 == 0:
                    ddpg.save(total_eps)
                    interval = time.time() - reset_time
                    s = "# total_step: {:>8} ,total_eps: {:>6} eps/min: {:>6}, frame/sec: {:>6}".format(
                        total_step, total_eps, n_eps / interval * 60,
                        n_step / interval)
                    #print(s)
                    logging.info(s)

                    n_step = 0
                    n_eps = 0
                    reset_time = time.time()