def create_model(self):
    pi = policy(model_path=self.model_path,
                have_model=False,
                need_log=False,
                action_space=self.act_dim,
                state_space=self.obs_dim)
    pi.save_model()
Example 2
def train(batch):

    pi = policy(have_model=True,
                need_log=True,
                action_space=act_dim,
                state_space=obs_dim)
    pi.train(batch)
    pi.save_model()
Example 3
    def train(self, batch):

        pi = policy(model_path=self.model_path,
                    have_model=True,
                    need_log=True,
                    action_space=self.act_dim,
                    state_space=self.obs_dim)
        pi.train(batch)
        pi.save_model()
Example 4
def evaluateWorker(perform_type: str, eva_num, share_lock):
    """
    evaluate
    :param perform_type: target or source
    :param policy_type: expert or policy
    :return:
    """

    if perform_type == "expert":
        pi = policy(have_model=True,
                    need_log=False,
                    action_space=act_dim,
                    state_space=obs_dim)
    else:
        pi = existing_pi(model_path='./Documents/success/%s/policy' %
                         env_name.lower())

    batch = {"sum_reward": []}

    game_num = 0

    while True:
        game_num += 1

        s = env.reset()

        sum_reward = 0

        while True:
            if eva_num.value > 200:
                return batch

            a = pi.get_means(s)  # deterministic evaluation: act with the policy mean

            s_, r, done, _ = env.step(a * high[0])  # scale the action by the env's action bound

            sum_reward += r
            s = s_

            if done:
                batch["sum_reward"].append(sum_reward)

                share_lock.acquire()
                eva_num.value += 1
                share_lock.release()

                break
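A rough sketch of how this evaluator might be driven follows. It is not part of the original source: it assumes evaluateWorker and the module-level names it reads (env, policy, existing_pi, env_name, act_dim, obs_dim, high) are already defined, and it uses a Manager list to collect each worker's episode rewards.

import multiprocessing as mp

def _collect(perform_type, eva_num, share_lock, results):
    # run one evaluator and stash its per-episode rewards (hypothetical helper)
    results.append(evaluateWorker(perform_type, eva_num, share_lock))

if __name__ == "__main__":
    eva_num = mp.Value("i", 0)  # shared count of finished evaluation episodes
    share_lock = mp.Lock()      # guards updates to eva_num
    with mp.Manager() as manager:
        results = manager.list()
        procs = [mp.Process(target=_collect,
                            args=("expert", eva_num, share_lock, results))
                 for _ in range(4)]  # 4 evaluator processes is an arbitrary choice
        for p in procs:
            p.start()
        for p in procs:
            p.join()
        rewards = [r for batch in results for r in batch["sum_reward"]]
        print("episodes:", len(rewards),
              "mean reward:", sum(rewards) / max(len(rewards), 1))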
Example 5
    def see_performance_worker(self):
        pi = policy(model_path=self.model_path,
                    have_model=True,
                    need_log=False,
                    action_space=self.act_dim,
                    state_space=self.obs_dim)

        batch = {}
        for key in self.rl_keys:
            batch[key] = []

        traj = 0
        while traj < 3:

            s = self.env.reset()

            traj_batch = {"reward": []}

            step = 0

            while True:

                a = pi.get_means(s)

                s_, r, done, info = self.env.step(a * self.high[0])

                traj_batch["reward"].append(r)

                s = s_
                step += 1

                if done:
                    batch["sum_reward"].append(sum(traj_batch["reward"]))

                    traj += 1
                    break

        return batch
Example 6
def worker(points_num, share_lock):

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    high = env.action_space.high

    # reload the most recently saved policy weights for this collection round
    pi = policy(have_model=True,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)

    batch = {}
    for key in rl_keys:
        batch[key] = []

    point = 0
    while points_num.value <= args.points_num:

        s = env.reset()

        traj_batch = {
            "state": [],
            "actions": [],
            "reward": [],
            "gae": [],
            "value": []
        }

        step = 0

        while True:
            if points_num.value > args.points_num:
                break

            ret = pi.get_action(s)
            a = ret["actions"]

            s_, r, done, info = env.step(a * high[0])

            r *= (1 - pi.gamma)  # keep discounted returns on the per-step reward scale

            traj_batch["state"].append(s)
            traj_batch["reward"].append(r)
            traj_batch["actions"].append(ret["actions"])
            traj_batch["value"].append(ret["value"])

            s = s_
            step += 1
            point += 1

            if done:
                # bootstrap with the value of the final state so each step has a successor value for GAE
                v = pi.get_value(s_)
                real_next = traj_batch["value"][1:] + [np.array(v)]

                ret = pi.get_return(traj_batch["reward"])

                gae = pi.get_gaes(traj_batch["reward"], traj_batch["value"],
                                  real_next)

                batch["state"].append(traj_batch["state"])
                batch["reward"].append(traj_batch["reward"])
                batch["action"].append(traj_batch["actions"])
                batch["gae"].append(gae)
                batch["return"].append(ret)
                batch["trajectory_len"].append(len(traj_batch["state"]))
                # batch["sum_reward"].append(sum(traj_batch["reward"]))

                share_lock.acquire()
                points_num.value += len(traj_batch["state"])
                share_lock.release()

                break

    # finish with 3 deterministic (mean-action) episodes to track performance
    traj = 0
    while traj < 3:

        s = env.reset()

        traj_batch = {"reward": []}

        step = 0

        while True:

            a = pi.get_means(s)

            s_, r, done, info = env.step(a * high[0])

            traj_batch["reward"].append(r)

            s = s_
            step += 1

            if done:
                batch["sum_reward"].append(sum(traj_batch["reward"]))

                traj += 1
                break

    return batch
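The batch returned above keeps one nested list per trajectory under each key. Before it is fed to pi.train(batch), those lists would typically be flattened; the helper below is only a sketch of that step (the key names come from the code above, but the exact layout pi.train expects is an assumption).

import numpy as np

def flatten_batch(batch):
    # concatenate the per-trajectory lists end to end (layout assumed, not confirmed by the source)
    flat = {}
    for key in ("state", "action", "reward", "gae", "return"):
        flat[key] = np.concatenate([np.asarray(traj) for traj in batch[key]], axis=0)
    flat["trajectory_len"] = np.asarray(batch["trajectory_len"])
    flat["sum_reward"] = np.asarray(batch["sum_reward"])
    return flat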
Example 7
def create_model():
    pi = policy(have_model=False,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)
    pi.save_model()
Example 8
    def worker(self, points_num, share_lock):

        pi = policy(model_path=self.model_path,
                    have_model=True,
                    need_log=False,
                    action_space=self.act_dim,
                    state_space=self.obs_dim)

        batch = {}
        for key in self.rl_keys:
            batch[key] = []

        point = 0
        while True:

            s = self.env.reset()

            traj_batch = {
                "state": [],
                "actions": [],
                "reward": [],
                "gae": [],
                "value": []
            }

            step = 0

            while True:
                if points_num.value > args.points_num:
                    return batch

                ret = pi.get_action(s)
                a = ret["actions"]

                s_, r, done, info = self.env.step(a * self.high[0])

                r *= (1 - pi.gamma)

                traj_batch["state"].append(s)
                traj_batch["reward"].append(r)
                traj_batch["actions"].append(ret["actions"])
                traj_batch["value"].append(ret["value"])

                s = s_
                step += 1
                point += 1

                if done:
                    v = pi.get_value(s_)
                    real_next = traj_batch["value"][1:] + [np.array(v)]

                    ret = pi.get_return(traj_batch["reward"])

                    gae = pi.get_gaes(traj_batch["reward"],
                                      traj_batch["value"], real_next)

                    batch["state"].append(traj_batch["state"])
                    batch["reward"].append(traj_batch["reward"])
                    batch["action"].append(traj_batch["actions"])
                    batch["gae"].append(gae)
                    batch["return"].append(ret)
                    batch["trajectory_len"].append(len(traj_batch["state"]))
                    batch["sum_reward"].append(sum(traj_batch["reward"]))

                    share_lock.acquire()
                    points_num.value += len(traj_batch["state"])
                    share_lock.release()

                    break
Example 9
def worker():

    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    high = env.action_space.high

    pi = policy(have_model=True,
                need_log=False,
                action_space=act_dim,
                state_space=obs_dim)

    batch = {}
    for key in rl_keys:
        batch[key] = []

    point = 0
    # each process collects its own share of the requested transitions
    points_per_process = args.points_num / args.process_num
    while point < points_per_process:

        s = env.reset()

        traj_batch = {
            "state": [],
            "actions": [],
            "reward": [],
            "gae": [],
            "value": [],
        }

        step = 0

        while True:

            ret = pi.get_action(s)
            a = ret["actions"]

            s_, r, done, info = env.step(a * high[0])

            r *= (1 - pi.gamma)

            traj_batch["state"].append(s)
            traj_batch["reward"].append(r)
            traj_batch["actions"].append(ret["actions"])
            traj_batch["value"].append(ret["value"])

            s = s_
            step += 1
            point += 1

            if done:

                v = pi.get_value(s_)
                real_next = traj_batch["value"][1:] + [np.array(v)]

                ret = pi.get_return(traj_batch["reward"])

                gae = pi.get_gaes(traj_batch["reward"], traj_batch["value"],
                                  real_next)

                batch["state"].append(traj_batch["state"])
                batch["reward"].append(traj_batch["reward"])
                batch["action"].append(traj_batch["actions"])
                batch["gae"].append(gae)
                batch["return"].append(ret)
                batch["sum_reward"].append(sum(traj_batch["reward"]))
                batch["trajectory_len"].append(len(traj_batch["state"]))

                break

    return batch
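Because this variant takes no shared state and divides args.points_num evenly over args.process_num processes, a plain Pool can drive it. The function below is an assumption about the surrounding training loop rather than code from the source; it only relies on worker() returning a dict keyed by rl_keys.

import multiprocessing as mp

def run_workers(process_num, rl_keys):
    # launch one copy of worker() per process and merge the returned batches key by key
    merged = {key: [] for key in rl_keys}
    with mp.Pool(process_num) as pool:
        results = [pool.apply_async(worker) for _ in range(process_num)]
        for res in results:
            batch = res.get()
            for key in rl_keys:
                merged[key].extend(batch.get(key, []))
    return merged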