Example #1
 def get_demonstration(self, fold):
     state1 = self.get_instance(fold)
     action1 = (0, 2)
     state2, r1, _ = state1.step(action1)
     action2 = (0, state1.target)
     state3, r2, _ = state2.step(action2)
     assert r2 == 1
     ep = []
     ep.append(Experience(state1, None, action1, state2, None, r1, False))
     ep.append(Experience(state2, None, action2, state3, None, r2, True))
     return ep
    def generate_experience(self, nr, serialize=False):
        result = []

        for x in range(nr):
            domain = self.__domains[random.randint(0, len(self.__domains) - 1)]
            projects = self.generate_projects(random.randint(1, 5), False)
            exp = Experience(domain, random.randint(1, 15), projects)
            result.append(exp)

        if serialize:
            return [exp.serialize() for exp in result]
        else:
            return result
Example #4
  def __init__(self, env=None, agent=None, logdir=None, should_render=None, should_load=None):
    self.env = env
    self.agent = agent
    self.config = self.agent.config
    self.logdir = logdir
    self.should_render = should_render
    self.experience = Experience(self.config)

    if should_load:
      self.load()
    else:
      self.step = 0
      self.epsilon = 0.3
      self.train_rewards = [0] * 100
      self.current_episode = 0
    def add_grains(self, grains: List[List[Image.Image]]):
        """Add new grains to memory

        Params:
            grains: List[List[Image.Image]]
                2D List of new grains

        Returns:
            2D List of novelty for new grains
        """

        # print("Adding new grains to memory...")
        assert len(grains) == 2  # Currently, we only allow a 2x2 grid of grains (4 in total)
        assert len(grains[0]) == 2  # Currently, we only allow a 2x2 grid of grains (4 in total)
        nov_list = []

        for row in grains:
            temp_nov = []
            for g in row:
                grain_tf = self._grain_to_tensor(g)
                grain_tf = tf.reshape(
                    grain_tf, (1, grain_tf.shape[0], grain_tf.shape[1],
                               grain_tf.shape[2]))  # Reshape to (1,H,W,C)
                predicted_grain = self._network(grain_tf)
                nov = self.novelty_function(grain_tf, predicted_grain).numpy()
                temp_nov.append(nov)
                self._memory.push(Experience(nov, g))
            nov_list.append(temp_nov)

        return nov_list
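
The add_grains method above calls self.novelty_function(grain_tf, predicted_grain) without showing its definition. A minimal sketch of one plausible choice, assuming novelty is simply the mean squared reconstruction error between a grain and the network's prediction; the name mse_novelty and the tensor shapes are assumptions for illustration, not part of the original code:

import tensorflow as tf

def mse_novelty(original, predicted):
    # Hypothetical novelty score: mean squared reconstruction error between the
    # input grain and the network's prediction, both assumed to be (1, H, W, C) tensors.
    return tf.reduce_mean(tf.square(original - predicted))
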
Example #6
    def run_episode(self, max_steps, train=True):
        """
        Executes a single episode.

        Params
        ======
        max_steps (integer): The maximum time steps to run in a single episode.
        train (Boolean): If true, run episode in train mode.  If false, run in
            eval mode.
        """

        env_info = self.env.reset(train_mode=train)[self.brain_name]
        states = env_info.vector_observations
        scores = np.zeros(len(states))
        for i in range(max_steps):
            actions = self.agent.act(states, noise=train)
            env_info = self.env.step(actions)[self.brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            scores += rewards
            self.agent.learn(
                Experience(states, actions, rewards, next_states, dones))
            if dones[0]:
                break
            states = next_states
        self.agent.end_episode()
        return scores.max()
Example #7
 def get_experience(self, key):
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query = "SELECT TITLE, USERNAME, START,FINISH,PERIOD,LENGTH FROM EXPERIENCE WHERE (ID = %s)"
         cursor.execute(query, (key,))
         title, username, start, finish, period,length = cursor.fetchone()
     return Experience(title, username, start, finish, period,length)
Example #8
def _do_tr_rollout(code_agent, desc_agent, task, rollout_ph, model, desc_model,
                   desc_to_code, code_to_desc, session, config, h0, z0, fold,
                   mode):
    worlds = [
        task.get_instance(fold)
        for _ in range(config.trainer.n_rollout_episodes)
    ]
    done = [False] * config.trainer.n_rollout_episodes
    episodes = [[] for i in range(config.trainer.n_rollout_episodes)]
    hs, zs = h0, z0
    dhs = h0
    for t in range(config.trainer.n_timeout):
        hs_, zs_, qs = session.run(
            [model.tt_rollout_h, model.tt_rollout_z, model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        dhs_, dqs = session.run(
            [desc_model.tt_rollout_h, desc_model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        for i in range(config.trainer.n_rollout_episodes):
            if done[i]:
                continue

            actions = [None, None]
            actions[code_agent] = np.argmax(qs[code_agent][i, :])
            actions[desc_agent] = np.argmax(dqs[desc_agent][i, :])

            world_, reward, done_ = worlds[i].step(actions)

            code = desc_to_code(world_.l_msg[code_agent], mode)[0]
            zs_[desc_agent][i, :] = code

            l_words = code_to_desc(zs_[code_agent][i, :], mode)[:5]
            l_msg = np.zeros(len(task.lexicon))
            for l_word in l_words:
                l_msg[task.lexicon.index(l_word)] += 1
            l_msg /= np.sum(l_msg)

            world_.l_msg = list(world_.l_msg)
            world_.l_msg[desc_agent] = l_msg
            world_.l_msg = tuple(world_.l_msg)

            episodes[i].append(
                Experience(worlds[i], None, tuple(actions), world_, None,
                           reward, done_))
            worlds[i] = world_
            done[i] = done_

            if config.evaluator.simulate_l:
                assert False

        hs = hs_
        zs = zs_
        dhs = dhs_
        if all(done):
            break

    return (sum(e.r for ep in episodes
                for e in ep) * 1. / config.trainer.n_rollout_episodes,
            sum(ep[-1].s2.success
                for ep in episodes) * 1. / config.trainer.n_rollout_episodes)
def main():
    from experience import Experience
    from visualization import hist_classes, scatter_classes

    class_count_list = []
    agents = [
        'RainbowAgent', 'SimpleAgent', 'SecondAgent', 'ProbabilisticAgent'
    ]

    for agent in agents:

        exp = Experience(agent, load=True)
        labels, _, examples, _ = exp.load()
        class_count, _ = divide_and_count(examples, labels)
        class_count_list.append(class_count)

    scatter_classes(class_count_list, agents)
Example #10
 def get_myexperiences(self, name):
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query = "SELECT * FROM EXPERIENCE where (username=%s)"
         cursor.execute(query, (name,))
         experiences = [(key, Experience(title, username, start, finish, period, length))
                        for key, title, username, start, finish, period, length, userid, date in cursor]
         return experiences
Example #11
def run(env,
        num_episodes,
        num_time_steps,
        replay_batch_size,
        scores_filename=None):
    exploration = EpsilonGreedy(epsilon_start=1.0,
                                epsilon_min=0.01,
                                epsilon_decay=0.999)

    # [Mnih 2015] used:
    #  - replay over 2% of the total experience
    #  - batch size of 32
    #  - minimum replay start size of 0.1%
    experience_max_size = int(num_episodes * num_time_steps * 0.02)
    replay_start_size = int(num_episodes * num_time_steps * 0.001)
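    # For example (hypothetical numbers): with num_episodes = 1000 and num_time_steps = 200,
    # experience_max_size = 1000 * 200 * 0.02 = 4000 transitions and
    # replay_start_size = 1000 * 200 * 0.001 = 200 transitions.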
    experience_replay = Experience(max_size=experience_max_size,
                                   batch_size=replay_batch_size,
                                   replay_start_size=replay_start_size)
    # experience_replay = PrioritizedExperience(
    #     max_size=experience_max_size, batch_size=replay_batch_size, replay_start_size=replay_start_size,
    #     initial_td_error=10, alpha=0.4, beta=0.4, anneal_rate=0.95, epsilon=0.001)

    model = ExampleModel(state_size=env.state_size,
                         action_size=env.action_size,
                         learning_rate=0.001)
    model.build()
    target_model = ExampleModel(state_size=env.state_size,
                                action_size=env.action_size,
                                learning_rate=0.001)
    target_model.build()

    qmodel = QModel(model=model,
                    target_model=target_model,
                    experience_replay=experience_replay,
                    tau=0.1,
                    use_double_q=True)

    agent = QAgent(state_size=env.state_size,
                   action_size=env.action_size,
                   model=qmodel,
                   exploration=exploration,
                   discount_rate=0.95)

    scores = np.empty((num_episodes, ))
    time_start = time.time()

    for e in range(num_episodes):
        scores[e] = agent.train(env=env, episode_length=num_time_steps)
        print('episode: {}/{}, score: {}, e: {:.2}'.format(
            e + 1, num_episodes, scores[e], agent.exploration.epsilon))

    time_end = time.time()
    print('Average score excluding the first 10% of episodes:',
          np.mean(scores[int(np.floor(num_episodes * 0.1)):]))
    print('Time taken:', time_end - time_start, 'seconds')

    if scores_filename is not None:
        np.savetxt(scores_filename, scores, delimiter=',')
Example #12
 def search_experience(self, keyword):
     with dbapi2.connect(self.app.config['dsn']) as connection:
         cursor = connection.cursor()
         query = "SELECT * FROM EXPERIENCE WHERE (TITLE ILIKE %s OR START ILIKE %s OR FINISH ILIKE %s) ORDER BY ID"
         keyword = '%' + keyword + '%'
         cursor.execute(query, (keyword, keyword, keyword))
         experiences = [(key, Experience(title, username, start, finish, period, length))
                        for key, title, username, start, finish, period, length, userid, date in cursor]
     return experiences
def extract_tensors(experiences):
    # Convert batch of Experiences to Experience of batches
    batch = Experience(*zip(*experiences))

    t1 = torch.cat(batch.state)
    t2 = torch.cat(batch.action)
    t3 = torch.cat(batch.reward)
    t4 = torch.cat(batch.next_state)

    return t1, t2, t3, t4
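
The Experience(*zip(*experiences)) line above relies on Experience being a namedtuple: zip(*experiences) transposes a list of per-step tuples into per-field tuples, and rebuilding an Experience from them gives one object whose fields are batches. A self-contained sketch of the same trick; the field names and dummy tensors are assumptions for illustration, not the original repo's definitions:

from collections import namedtuple

import torch

# Assumed field layout for illustration only.
Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state'))

experiences = [
    Experience(torch.zeros(1, 4), torch.tensor([[0]]), torch.tensor([[1.0]]), torch.ones(1, 4)),
    Experience(torch.ones(1, 4), torch.tensor([[1]]), torch.tensor([[0.5]]), torch.zeros(1, 4)),
]

# zip(*experiences) groups the i-th field of every Experience together, so the
# result is a single Experience whose fields are tuples of per-sample tensors.
batch = Experience(*zip(*experiences))
states = torch.cat(batch.state)    # shape (2, 4)
actions = torch.cat(batch.action)  # shape (2, 1)
rewards = torch.cat(batch.reward)  # shape (2, 1)
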
Example #14
    def __init__(self, model, env, action_size, args, state):
        self.model = model
        self.env = env
        self.action_size = action_size
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True  # initialized; a new state value can be set once
        self.info = None
        self.reward = 0
        self.gpu_id = -1
        # parameters

        self.memory = Experience(history_size=2000)
Example #15
 def to_experiences(self, states, actions, rewards, next_states, dones):
     """
     Turns vectors representing components of multiple experiences into a
     vector of Experience objects.
     """
     experiences = []
     for (state, action, reward, next_state,
          done) in zip(states, actions, rewards, next_states, dones):
         experiences.append(
             Experience(state, action, reward, next_state, done))
     return experiences
Example #16
    def __init__(self, rank, args, shared_model, optimizer, lr):
        # CUDA-related setup
        self.gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
        torch.manual_seed(args.seed + rank)
        if self.gpu_id >= 0:
            torch.cuda.manual_seed(args.seed + rank)

        self.replay_buffer = Experience(history_size=2000)
        self.cx = None  # todo: still does a forward pass on every single step
        self.hx = None
        self.episodic_score = 0
        self.rank = rank
        self.args = args
        self.shared_model = shared_model
        self.optimizer = optimizer
        self.local_t = 0
        # initialization
        # initialize the environment
        print('Training Agent: {}'.format(self.rank))
        # todo: need to add pc, etc. to the gym environment

        # agent object
        model = UNREAL(in_channels=3, action_size=6, enable_pixel_control=True)

        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                model = model.cuda()

        model.train()

        # learning rate
        self.initial_learning_rate = lr
        self.max_global_time_step = 10 * 10**7
        # time tracking
        # For log output
        self.prev_local_t = 0

        self.model = model
        self.env = None
        self.reset()  # cx hx
Example #17
    def __init__(self,
                 num_action,
                 frame_height,
                 frame_width,
                 rng,
                 network_type,
                 algorithm,
                 network_file=None,
                 num_ignore=0,
                 exp_file=None):
        self.rng = rng
        self.num_action = num_action
        self.mbsize = Agent.MINIBATCH_SIZE
        self.validate_size = Agent.VALID_SIZE
        self.num_train_obs = 0
        self.network_type = network_type
        self.eps_decay = (Agent.FINAL_EXPLORE - Agent.INIT_EXPLORE) \
            / Agent.EXPLORE_FRAMES

        self.validate_states = None
        self.exp_file = exp_file
        if exp_file is not None:
            with open(exp_file, 'rb') as f:
                npz = np.load(exp_file)
                self.num_train_obs = np.sum(npz['num_train_obs'])
                self.validate_states = npz['validate_states']
                self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                            frame_width, Agent.HISTORY, rng,
                                            npz)
        else:
            self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                        frame_width, Agent.HISTORY, rng)
        self.exp_eval = Experience(Agent.HISTORY + 1, frame_height,
                                   frame_width, Agent.HISTORY, rng)

        self.network = Network(num_action, self.mbsize, Agent.HISTORY,
                               frame_height, frame_width, Agent.DISCOUNT,
                               Agent.UPDATE_FREQ, rng, network_type, algorithm,
                               network_file, num_ignore)
Example #18
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)

        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0
Example #19
def read_experience():
    experience = Experience()
    experience.employer = input("Enter name of employer: ")
    experience.title = input("Enter title: ")
    experience.responsibilities = input("Enter responsibilities: ")
    experience.duration_years = int(
        input("Enter number of years of experience: "))

    return experience
Example #20
 def run_episode(self, train=True):
     env_info = self.env.reset(train_mode=train)[self.brain_name]
     state = env_info.vector_observations[0]
     score = 0
     while True:
         action = self.agent.act(state, explore=train)
         env_info = self.env.step(action)[self.brain_name]
         next_state = env_info.vector_observations[0]
         reward = env_info.rewards[0]
         done = env_info.local_done[0]
         score += reward
         if train:
             self.agent.learn(
                 Experience(state, action, reward, next_state, done))
         state = next_state
         if done:
             break
     return score
Example #21
 def run_episode(self):
     self.env.reset()
     self.env.random_start()
     t = 0
     experiences = []
     
     
     while not self.env.terminal:
         #predict action, value
         action, value = self.predict(self.env.state)
         self.env.step(action)
         
         experience = Experience(self.env.state, action, self.env.reward,
                                 None,
                                 self.env.terminal)
         experiences.append(experience)
         yield experience
         t += 1
Example #22
def main(args):
    with tf.device(args['device']):

        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'], args['clip_val'],
                        batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict,
                                 a_dim,
                                 noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
Example #23
    def run_episode(self):
        self.env.reset()
        self.history.add(self.env.state)

        random_start_steps = max(self.config.history_length,
                                 self.env.random_start_steps)
        for _ in range(random_start_steps):
            self.env.step(self.env.random_step())
            self.history.add(self.env.state)

        t = 0
        while not self.env.terminal:
            #predict action, value
            prev_state = self.env.state
            action = self.predict(self.history.get())
            self.env.step(action)
            experience = Experience(prev_state, action, self.env.reward,
                                    self.env.state, self.env.terminal)
            yield experience
            t += 1
Example #24
def index():
    app.logger.info("Connection from %s" % str(request.environ['REMOTE_ADDR']))
    database = databaseconnector.databaseObject(app)
    skills = dict(database.query("SELECT title, rating FROM skills"))
    tools = dict(database.query("SELECT title, rating FROM tools"))
    languages = dict(database.query("SELECT title, rating FROM languages"))
    experience = database.query(
        "SELECT title, dates, shortDescription, longDescription, image, id FROM experience"
    )
    experienceStructs = []
    for line in experience:
        newStruct = Experience(line[0], line[1], line[2], line[3], line[4],
                               'a' + str(line[5]))
        experienceStructs.append(newStruct)

    database.close()
    return render_template('index.html',
                           skills=skills,
                           tools=tools,
                           languages=languages,
                           experiences=experienceStructs)
Example #25
def main():

    candidate_database = CandidateDatabase()

    candidate = Candidate()
    candidate.name = "Alicia Toomtest"
    candidate.title = "Python Developer"
    candidate.address = "Gothenburg, Sweden"
    candidate.phone = "0722879879"
    candidate.email = "*****@*****.**"
    candidate.hobbies = "Gardening"

    candidate.education = [
        Education(name="Education", school="School", level="Level"),
        Education(name="Education2", school="School2", level="Level2")
    ]

    candidate.experience = [
        Experience(employer="Volvo",
                   title="Python developer",
                   responsibilities="code",
                   duration_years="2018-present")
    ]

    candidate.note = Note()
    candidate.note.summary = "Gslf9ehdlsdfnjslsleofjfms,"
    candidate.note.comment = "dki9eufsklwodudndjskwoeifjdk"

    # print_candidate(candidate)
    candidate_database.add_candidate(candidate)
    find_result = candidate_database.find_candidates("toom")

    if len(find_result) > 0:
        print_candidates(find_result)
    else:
        print("No result found")

    print_candidate(candidate)

    return
Example #26
def qtest(model_eval, model_target,epsilon=0.05, n_vehicles=20, **opt):
    # global epsilon
    epoch = 0
    n_epoch = opt.get('n_epoch', 1500)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    weights_file = opt.get('weights_file', "")
    start_time = datetime.datetime.now()
    n_vehicles = n_vehicles

    get_dest_count = 0
    failed_count = 0

    eval_train_time = 0
    # If you want to continue training from a previous model,
    # just supply the h5 file name to weights_file option
    if weights_file:
        print("loading weights from file: %s" % (weights_file,))
        model_eval.load_weights(weights_file)

    # Initialize experience replay object
    experience = Experience(model_eval, model_target, max_memory=max_memory)

    # records: track the various metrics
    # reward-related records:
    records_of_total_reward = []
    records_of_veh_reward = []
    records_of_se_reward = []
    # this total reward is the plain sum of the veh and se rewards, without weighting.
    records_of_pure_total_reward = []
    # records that need to be output:
    records_of_veh_drive = []
    records_of_veh_drive_speed = []
    records_of_se_delay = []
    records_of_se_SR = []

    veh_actions_record = [ [] for veh in range(n_vehicles) ]
    se_actions_record = [ [] for veh in range(n_vehicles) ]

    for epoch in range(n_epoch):

        qmaze.reset()
        # print("训练一轮结束,重置qmaze")
        # print(qmaze.vehs_og_list)
        seenv.reset(qmaze)#SE相关环境信息的重置
        game_over = False

        # get initial envstate
        envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
        se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)]

        # reward_list accumulates the two rewards, the SE part and the vehicle-driving part, i.e. the "total reward".
        reward_list = np.zeros((n_vehicles))
        veh_step_reward_list = np.zeros((n_vehicles))
        se_step_reward_list = np.zeros((n_vehicles))

        n_episodes = 0

        while not game_over:
            # before starting, zero out the tags that mark what needs to be recorded
            qmaze.should_save = [0] * n_vehicles

            # before starting, zero out each step's step_cost_list
            veh_step_cost_list = np.zeros((n_vehicles))
            se_step_cost_list = np.zeros((n_vehicles))
            pure_reward_list = np.zeros((n_vehicles))
            prev_envstates_list = envstates_list
            prev_se_envstates_list = se_envstates_list
            # get the current per-cell vehicle information: the number of vehicles in each cell before the vehicles act
            # _2nd_channel = background_vehs[np.newaxis,:]
            # background_vehs = background_vehs[np.newaxis,:]
            vehsnum_before_act = qmaze.count_cells_vehsnum()
            sesnum_before_act = seenv.SE_count(qmaze)

            actions = -1 * np.ones((n_vehicles))  # default -1 means no action is taken
            se_actions = -1 * np.ones((n_vehicles))  # default -1 means no action is taken

            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list  = seenv.SE_count_list(qmaze)
            # print(nVehsCell_list)
            # print(veh_actions_record)
            # print(nSEsCell_list)
            # print(se_actions_record)
            # beginning part of each time slot:
            # at the start of the slot, first count the vehicles in each cell,
            # so we know the congestion cost the vehicles and SEs may incur during this slot.
            # this accumulates the congestion cost before it is counted into the reward at each step.
            qmaze.get_veh_cost(veh_step_cost_list)
            seenv.get_se_cost(se_step_cost_list, qmaze)
            # compute drive_cost, a metric for evaluating driving
            qmaze.get_drive_cost(background_vehs, nVehsCell_list)
            # (beginning part of each time slot, continued)
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] != 0:
                    # the vehicle is already 1 or -1 (finished), so no further action is needed.
                    continue
                else:
                    # entering this branch means the vehicle has not finished yet
                    if qmaze.last_act_done[veh] == 1:
                        # the vehicle has just started or has finished its previous action, so a new action must be assigned.
                        valid_actions = qmaze.valid_actions(qmaze.vehs_cur_list[veh])

                        qmaze.vehs_change_act[veh] = actions[veh] = np.argmax(
                            experience.predict_e(prev_envstates_list[veh], vehsnum_before_act,
                                                 prev_se_envstates_list[veh], sesnum_before_act))
                        seenv.SEs_next_mrg_list[veh] = se_actions[veh] = np.argmax(
                            experience.predict_e_se(prev_envstates_list[veh], vehsnum_before_act,
                                                    prev_se_envstates_list[veh], sesnum_before_act))
                        veh_actions_record[veh].append(qmaze.vehs_change_act[veh])
                        se_actions_record[veh].append(seenv.SEs_next_mrg_list[veh])
                        # reassign time_remain_list
                        # obtain reward_will_get
                        # also record the vehicles' next-to-go cells
                        qmaze.get_some_will(actions[veh], veh)

                    if qmaze.last_act_done[veh] == 0:
                        # the previous migration action is not finished; both SEs and Vehs remain in the GOING state
                        actions[veh] = GOING
                        se_actions[veh] = GOING
                        continue
            # ending part of each time slot
            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list = seenv.SE_count_list(qmaze)
            vehcurpos = qmaze.vehs_cur_list

            for veh in range(n_vehicles):
                # check whether the vehicle is in the "finished" state, i.e. it has reached its destination or its accumulated reward is past the limit;
                if qmaze.status_flag[veh] != 0:
                    # the vehicle is finished: it has reached its destination or its accumulated reward is past the limit
                    continue
                else:
                    # the vehicle is not in the "finished" state.
                    qmaze.time_remain_list[veh] -= 1
                    qmaze.action_times_list[veh] += 1
                    nVehsCell = nVehsCell_list[transfer_dict[vehcurpos[veh]]]
                    nSEsCell = nSEsCell_list[transfer_dict[seenv.SE_curpos_list[veh]]]
                    veh_pos = vehcurpos[veh]
                    se_pos = seenv.SE_curpos_list[veh]
                    bv = seenv.SE_data_list[veh]
                    cpuneed = seenv.CPUNeedList[veh]
                    if qmaze.time_remain_list[veh] != 0:
                        # first set last_act_done to 0, meaning the previous action is not finished yet
                        qmaze.last_act_done[veh] = 0
                        # no migration happened, so the rewards are all 0; only the service delay is recorded
                        veh_step_reward_list[veh] = 0
                        # compute the service delay,
                        # record the delay (the total delay time is inspected later);
                        # record the service-delay success rate;
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        se_step_reward_list[veh] = 0
                        continue
                    if qmaze.time_remain_list[veh] == 0:
                        # the migration is about to complete
                        # first set last_act_done to 1, meaning the previous action has finished
                        qmaze.last_act_done[veh] = 1
                        # qmaze.should_change[veh] = 1
                        # the vehicle's reward is read directly from reward_will_get
                        # for the SE part, compute the delay and the reward
                        veh_step_reward_list[veh] = qmaze.reward_will_get[veh] + veh_step_cost_list[veh]
                        qmaze.reward_will_get[veh] = 0
                        veh_step_cost_list[veh] = 0
                        # compute the service delay,
                        # record the delay (the total delay time is inspected later);
                        # record the service-delay success rate;
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        avg_delay = seenv.SE_delay_list[veh] / qmaze.action_times_list[veh]
                        if avg_delay > Delay_Threshold:
                            delay_cost = - 0.20
                        else:
                            delay_cost = 0
                        seenv.SE_delay_list[veh] = 0  # once delay_cost has been computed, reset the accumulated delay to zero
                        se_step_reward_list[veh] = delay_cost + -1 * CostMrgS + se_step_cost_list[veh]
                        se_step_cost_list[veh] = 0  # clear the congestion cost once it has been counted into the reward

                        # accumulate the rewards (these include the congestion cost)
                        qmaze.game_acc_veh_reward[veh] += veh_step_reward_list[veh]
                        seenv.game_acc_se_reward[veh] += se_step_reward_list[veh]
                        pure_reward_list[veh] = veh_step_reward_list[veh] +  se_step_reward_list[veh]
                        reward_list[veh] = vr_weights * veh_step_reward_list[veh] + (1 - vr_weights) * se_step_reward_list[veh]
                        qmaze.game_acc_reward[veh] += reward_list[veh]
                        qmaze.game_acc_pure_reward[veh] += pure_reward_list[veh]
                        # check whether the driving accumulated reward has fallen below the minimum, meaning the game has failed
                        if qmaze.game_acc_veh_reward[veh] < qmaze.min_reward:
                            # driving failed
                            qmaze.status_flag[veh] = -1
                        else:

                            # the failure criterion has not been reached, so keep driving:
                            # update the veh and SE states according to the action the vehicle committed to.
                            # the reached-destination flag both updates the vehicle state and feeds into the experience record.
                            qmaze.update_state_single__(veh)
                            seenv.update_se_pos_(veh)

                            # maintain should_save, marking experience data that is worth storing
                            qmaze.should_save[veh] = 1

            print("E:",n_episodes)
            print("cur:",qmaze.vehs_cur_list)
            print("remain:",qmaze.time_remain_list)
            print("next:",qmaze.vehs_next_go_list)
            envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
            se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)]
            # get the current per-cell vehicle information: the number of vehicles in each cell after the vehicles act
            vehsnum_after_act = qmaze.count_cells_vehsnum()
            sesnum_after_act = seenv.SE_count(qmaze)

            # print(qmaze.count_cell_vehsnum_list())
            # print(qmaze.acc_drive_cost)
            for veh in range(n_vehicles):
                if qmaze.should_save[veh] == 1:
                    # record one episode of experience
                    if qmaze.status_flag[veh] == 1:
                        get_dest = True
                    else:
                        get_dest = False
                    episode = [prev_envstates_list[veh], prev_se_envstates_list[veh],
                               qmaze.vehs_change_act[veh], seenv.SEs_next_mrg_list[veh],
                               reward_list[veh], veh_step_reward_list[veh], se_step_reward_list[veh],
                               envstates_list[veh], se_envstates_list[veh],
                               get_dest, vehsnum_before_act, vehsnum_after_act,
                               sesnum_before_act, sesnum_after_act]
                    experience.remember(episode)
                else:
                    continue

            # reset the two counters
            get_dest_count = 0
            failed_count = 0
            # todo: the termination condition needs to change
            # count the actors that have already finished: 1 (success) or -1 (failure)
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] == 1:
                    get_dest_count += 1
                elif qmaze.status_flag[veh] == -1:
                    failed_count += 1

            # when every actor has either reached its destination (not necessarily optimally) or failed to reach it, this episode of training ends
            if get_dest_count + failed_count == n_vehicles:
                game_over = True
            else:
                game_over = False

            n_episodes += 1


        # aggregate the various reward statistics
        sum_pure_total_reward = 0
        sum_total_reward = 0
        sum_veh_reward = 0
        sum_se_reward = 0
        for veh in qmaze.done_list:
            sum_total_reward += qmaze.game_acc_reward[veh]
            sum_veh_reward += qmaze.game_acc_veh_reward[veh]
            sum_se_reward += seenv.game_acc_se_reward[veh]
            sum_pure_total_reward += qmaze.game_acc_pure_reward[veh]

        if len(qmaze.done_list) == 0:
            records_veh = 0
            records_total = 0
            records_se = 0
            records_total_pure = 0
        else:
            records_veh = sum_veh_reward / len(qmaze.done_list)
            records_total = sum_total_reward / len(qmaze.done_list)
            records_se = sum_se_reward / len(qmaze.done_list)
            records_total_pure = sum_pure_total_reward / len(qmaze.done_list)
        records_of_total_reward.append(records_total)
        records_of_veh_reward.append(records_veh)
        records_of_se_reward.append(records_se)
        records_of_pure_total_reward.append(records_total_pure)
        # aggregate the vehicles' driving cost
        sum_veh_drive = 0
        sum_veh_drive_speed = 0
        # print(qmaze.acc_drive_cost)
        for veh in qmaze.done_list:
            qmaze.game_drive_cost[veh] = qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh]
            qmaze.game_drive_speed[veh] = 1/qmaze.game_drive_cost[veh]
            sum_veh_drive += qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh]
            sum_veh_drive_speed += qmaze.game_drive_speed[veh]
        if len(qmaze.done_list) == 0:
            records_drive = 0
            records_drive_speed = 0
        else:
            records_drive = sum_veh_drive / len(qmaze.done_list)
            records_drive_speed = sum_veh_drive_speed / len(qmaze.done_list)
        records_of_veh_drive.append(records_drive)
        records_of_veh_drive_speed.append(records_drive_speed)

        # compute each vehicle's average delay and service success rate for this round (training epoch)
        se_delay_avg = [0]*n_vehicles
        se_success_rate = [0]*n_vehicles


        for veh in qmaze.done_list:
            se_delay_avg[veh] = sum(seenv.record_delay_list[veh])/ len(seenv.record_delay_list[veh])
            se_success_rate[veh]  = sum(seenv.record_success_rate[veh])/ len(seenv.record_success_rate[veh])

        # average the delay and the service success rate over the vehicles within this round
        AvgDelayforAll = 0
        SRforAll = 0
        SRcount = 0
        for veh in qmaze.done_list:
            AvgDelayforAll += se_delay_avg[veh]
            SRforAll += sum(seenv.record_success_rate[veh])
            SRcount += len(seenv.record_success_rate[veh])
        if len(qmaze.done_list) == 0:
            records_delay = 0
        else:
            records_delay = AvgDelayforAll / len(qmaze.done_list)
        if SRcount == 0:
            records_SR = 0
        else:
            records_SR = SRforAll / SRcount
        records_of_se_delay.append(records_delay)
        records_of_se_SR.append(records_SR)

        dt = datetime.datetime.now() - start_time
        t = format_time(dt.total_seconds())
        template = "Epoch: {:03d}/{:d} | Episodes: {:d} | GetDestCount: {:d}/{:d} |FailedCount: {:d}/{:d}| time: {},| loss_weight:{}"
        print(template.format(epoch, n_epoch - 1,n_episodes ,get_dest_count, n_vehicles ,failed_count, n_vehicles, t, model_eval.loss_weights))
        print("Arrived vehs:", qmaze.done_list)
        print("SE_delay_avg:",se_delay_avg)
        print("SE_success_rate:",se_success_rate)
        print("veh_drive", qmaze.game_drive_speed)
        print("【AVG_Veh_drive】:", records_drive)
        print("【AVG_Veh_drive_speed】:", records_drive_speed)

    end_time = datetime.datetime.now()
    dt = datetime.datetime.now() - start_time
    seconds = dt.total_seconds()
    t = format_time(seconds)
    weight_rate = '_'+ str(vr_weights) + '_'+str(seenv.datasize_base)+'_'+str(n_vehicles) +'_'
    # save the metrics recorded during training
    parent_path = 'save_res0108/'
    method_name = 'merge_test' + weight_rate
    print("Saving Reward:")
    with open(parent_path + method_name + "total_reward.pl", 'wb') as f:
        print("recording total_reward..")
        pickle.dump(records_of_total_reward, f)

    with open(parent_path + method_name +"pure_total_reward.pl", 'wb') as f:
        print("recording pure total_reward..")
        pickle.dump(records_of_pure_total_reward, f)

    with open(parent_path + method_name + "veh_reward.pl", 'wb') as f:
        print("recording veh_reward..")
        pickle.dump(records_of_veh_reward, f)

    with open(parent_path + method_name + "se_reward.pl", 'wb') as f:
        print("recording se_reward..")
        pickle.dump(records_of_se_reward, f)

    print("Saving Index:")
    with open(parent_path + method_name + "veh_drive.pl", 'wb') as f:
        print("recording veh_drive..")
        pickle.dump(records_of_veh_drive, f)

    with open(parent_path + method_name + "veh_drive_speed.pl", 'wb') as f:
        print("recording veh_drive_speed..")
        pickle.dump(records_of_veh_drive_speed, f)

    with open(parent_path + method_name + "se_delay.pl", 'wb') as f:
        print("recording se_delay..")
        pickle.dump(records_of_se_delay, f)

    with open(parent_path + method_name + "se_SR.pl", 'wb') as f:
        print("recording se_SR..")
        pickle.dump(records_of_se_SR, f)

    # print(veh_actions_record)
    # print(se_actions_record)
    print(seenv.SE_data_list)
Example #27
class Agent(object):
    def __init__(self, model, env, action_size, args, state):
        self.model = model
        self.env = env
        self.action_size = action_size
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True  # initialized; a new state value can be set once
        self.info = None
        self.reward = 0
        self.gpu_id = -1
        # parameters

        self.memory = Experience(history_size=2000)

    def fill_experience(self):
        prev_state = self.env.last_state
        last_action = self.env.last_action
        last_reward = self.env.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)
        with torch.no_grad():
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)
            lar = torch.from_numpy(last_action_reward).unsqueeze(0)
            _, pi, (self.hx, self.cx) = self.model(task_type='a3c',
                                                   states=state,
                                                   hx=self.hx,
                                                   cx=self.cx,
                                                   last_action_rewards=lar)

            action_index = pi.max(1)[1].view(1, 1).item()

        new_state, reward, terminal, pixel_change = self.env.step(
            action_index)  # stored as an array

        frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                pixel_change, last_action, last_reward)
        self.memory.add_frame(frame)

        if terminal:
            self.env.reset()
        if self.memory.is_full():
            self.env.reset()
            print("Replay buffer filled")
        self.done = terminal

    def a3c_process(self):
        """
        Run the procedure on-policy.
        :return:
        """
        states = []
        last_action_rewards = []
        actions = []  #
        rewards = []
        values = []  # V
        actions_prob = []

        terminal_end = False

        # t_max times loop
        for _ in range(self.args.num_steps):
            # Prepare last action reward
            last_action = self.env.last_action
            last_reward = self.env.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)
            lar = torch.from_numpy(last_action_reward)

            v, pi, (self.hx,
                    self.cx) = self.model(task_type='a3c',
                                          states=state,
                                          hx=self.hx,
                                          cx=self.cx,
                                          last_action_rewards=lar.unsqueeze(0))

            action_index = pi.max(1)[1].view(1, 1).item()

            states.append(torch.from_numpy(self.env.last_state))
            actions_prob.append(torch.squeeze(pi, dim=0))
            last_action_rewards.append(lar)
            actions.append(action_index)
            values.append(v)

            prev_state = self.env.last_state

            new_state, reward, terminal, pixel_change = self.env.step(
                action_index)
            frame = ExperienceFrame(prev_state, reward, action_index, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.memory.add_frame(frame)

            # self.episode_reward += reward

            rewards.append(reward)

            self.update_lstm_state()
            if terminal:
                self.env.reset()
                break

        R = torch.zeros(1, 1)
        if not terminal_end:
            state = torch.from_numpy(new_state).unsqueeze(0)
            lar = torch.from_numpy(frame.get_action_reward(
                self.action_size)).unsqueeze(0)
            value, _, _ = self.model(task_type='a3c',
                                     states=state,
                                     hx=self.hx,
                                     cx=self.cx,
                                     last_action_rewards=lar)
            R = value.data
        # construct the error terms
        actions.reverse()
        rewards.reverse()
        values.reverse()

        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, Vi) in zip(actions, rewards, values):
            R = ri + self.args.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size], dtype=np.float32)
            a[ai] = 1.0

            batch_a.append(torch.from_numpy(a))
            batch_adv.append(adv)
            batch_R.append(R)

        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()
        # convert to tensors

        return batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values

    def a3c_loss(self, batch_a, batch_adv, batch_R, last_action_rewards,
                 states, actions_prob, values):
        batch_a = torch.stack(batch_a)  # batch, 6
        batch_adv = torch.stack(batch_adv)  # batch,1,1
        last_action_rewards = torch.stack(last_action_rewards)  # batch,7
        batch_R = torch.stack(batch_R)  # batch,1,1
        states = torch.stack(states)  # batch,3,84,84
        actions_prob = torch.stack(actions_prob)  # batch,6
        values = torch.stack(values)
        # loss function
        log_pi = torch.log(torch.clamp(actions_prob, min=1e-20, max=1.0))
        entropy = -torch.sum(log_pi * actions_prob, dim=1)
        # probability of the corresponding action a_i
        log_pi_a_i = torch.sum(torch.mul(log_pi, batch_a), dim=1)
        policy_loss = torch.sum(log_pi_a_i * batch_adv + entropy * 0.001)
        # value_loss
        value_loss = 0.5 * F.mse_loss(batch_R, values)
        return policy_loss + value_loss

    def action_train(self):
        value, logit, (self.hx, self.cx) = self.model(
            (Variable(self.state.unsqueeze(0)), (self.hx, self.cx)))
        prob = F.softmax(logit, dim=1)
        log_prob = F.log_softmax(logit, dim=1)
        entropy = -(log_prob * prob).sum(1)
        self.entropies.append(entropy)
        action = prob.multinomial(1).data
        log_prob = log_prob.gather(1, Variable(action))
        state, self.reward, self.done, self.info = self.env.step(
            action.cpu().numpy())
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.reward = max(min(self.reward, 1), -1)
        self.values.append(value)
        self.log_probs.append(log_prob)
        self.rewards.append(self.reward)
        return self

    def action_test(self):
        with torch.no_grad():
            self.update_lstm_state()
            state = torch.from_numpy(self.env.last_state).unsqueeze(0)

            last_action = self.env.last_action
            last_reward = np.clip(self.env.last_reward, -1, 1)
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)
            lar = torch.from_numpy(last_action_reward)

            v, pi, (self.hx,
                    self.cx) = self.model(task_type='a3c',
                                          states=state,
                                          hx=self.hx,
                                          cx=self.cx,
                                          last_action_rewards=lar.unsqueeze(0))
        prob = F.softmax(pi, dim=1)
        action = prob.max(1)[1].data.cpu().numpy()
        state, self.reward, self.done, pixel_change = self.env.step(action[0])
        self.info = 5
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1
        return self

    def update_lstm_state(self):
        if self.done:
            if self.gpu_id >= 0:
                with torch.cuda.device(self.gpu_id):
                    self.cx = Variable(torch.zeros(1, 256).cuda())
                    self.hx = Variable(torch.zeros(1, 256).cuda())
            else:
                self.cx = Variable(torch.zeros(1, 256))
                self.hx = Variable(torch.zeros(1, 256))
        else:
            self.cx = Variable(self.cx.data)
            self.hx = Variable(self.hx.data)

    def clear_actions(self):
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        return self
Example #28
class Trainer(object):
    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate_input, grad_applier, env_type, env_name,
                 use_pixel_change, use_value_replay, use_reward_prediction,
                 pixel_change_lambda, entropy_beta, local_t_max, gamma,
                 gamma_pc, experience_history_size, max_global_time_step,
                 device):

        self.thread_index = thread_index
        self.learning_rate_input = learning_rate_input
        self.env_type = env_type
        self.env_name = env_name
        self.use_pixel_change = use_pixel_change
        self.use_value_replay = use_value_replay
        self.use_reward_prediction = use_reward_prediction
        self.local_t_max = local_t_max
        self.gamma = gamma
        self.gamma_pc = gamma_pc
        self.experience_history_size = experience_history_size
        self.max_global_time_step = max_global_time_step
        self.action_size = Environment.get_action_size(env_type, env_name)

        self.local_network = UnrealModel(self.action_size, thread_index,
                                         use_pixel_change, use_value_replay,
                                         use_reward_prediction,
                                         pixel_change_lambda, entropy_beta,
                                         device)
        self.local_network.prepare_loss()

        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.total_loss, global_network.get_vars(),
            self.local_network.get_vars())

        self.sync = self.local_network.sync_from(global_network)
        self.experience = Experience(self.experience_history_size)
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0
        # For log output
        self.prev_local_t = 0

    def prepare(self):
        print('')
        print('trainer creating env...')
        print('')
        self.environment = Environment.create_environment(
            self.env_type, self.env_name)

    def stop(self):
        self.environment.stop()

    def _anneal_learning_rate(self, global_time_step):
        learning_rate = self.initial_learning_rate * (
            self.max_global_time_step -
            global_time_step) / self.max_global_time_step
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def choose_action(self, pi_values):
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        summary_writer.flush()

    def set_start_time(self, start_time):
        self.start_time = start_time

    def _fill_experience(self, sess):
        """
    Fill experience buffer until buffer is full.
    """
        prev_state = self.environment.last_state
        last_action = self.environment.last_action
        last_reward = self.environment.last_reward
        last_action_reward = ExperienceFrame.concat_action_and_reward(
            last_action, self.action_size, last_reward)

        pi_, _ = self.local_network.run_base_policy_and_value(
            sess, self.environment.last_state, last_action_reward)
        action = self.choose_action(pi_)

        new_state, reward, terminal, pixel_change = self.environment.process(
            action)

        #print('action:', action, terminal)

        frame = ExperienceFrame(prev_state, reward, action, terminal,
                                pixel_change, last_action, last_reward)
        self.experience.add_frame(frame)

        if terminal:
            self.environment.reset()
        if self.experience.is_full():
            self.environment.reset()
            print("Replay buffer filled")

    def _print_log(self, global_t):
        if (self.thread_index == 0) and (self.local_t - self.prev_local_t >=
                                         PERFORMANCE_LOG_INTERVAL):
            self.prev_local_t += PERFORMANCE_LOG_INTERVAL
            elapsed_time = time.time() - self.start_time
            steps_per_sec = global_t / elapsed_time
            print(
                "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. {:.2f}M STEPS/hour"
                .format(global_t, elapsed_time, steps_per_sec,
                        steps_per_sec * 3600 / 1000000.))

    def _process_base(self, sess, global_t, summary_writer, summary_op,
                      score_input):
        # [Base A3C]
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        terminal_end = False

        start_lstm_state = self.local_network.base_lstm_state_out

        # t_max times loop
        for _ in range(self.local_t_max):
            # Prepare last action reward
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = ExperienceFrame.concat_action_and_reward(
                last_action, self.action_size, last_reward)

            pi_, value_ = self.local_network.run_base_policy_and_value(
                sess, self.environment.last_state, last_action_reward)

            action = self.choose_action(pi_)

            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0):
                print("pi={}".format(pi_))
                print(" V={}".format(value_))

            prev_state = self.environment.last_state

            # Process game
            new_state, reward, terminal, pixel_change = self.environment.process(
                action)
            frame = ExperienceFrame(prev_state, reward, action, terminal,
                                    pixel_change, last_action, last_reward)

            # Store to experience
            self.experience.add_frame(frame)

            self.episode_reward += reward

            rewards.append(reward)

            self.local_t += 1

            if terminal:
                terminal_end = True
                print("score={}".format(self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward, global_t)

                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        R = 0.0
        if not terminal_end:
            R = self.local_network.run_base_value(
                sess, new_state, frame.get_action_reward(self.action_size))

        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        batch_si = []
        batch_a = []
        batch_adv = []
        batch_R = []

        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.zeros([self.action_size])
            a[ai] = 1.0

            batch_si.append(si)
            batch_a.append(a)
            batch_adv.append(adv)
            batch_R.append(R)

        batch_si.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state

    def _process_pc(self, sess):
        # [Pixel change]
        # Sample local_t_max+1 frames (+1 for the last next state)
        pc_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse the sequence so targets can be accumulated from the last frame
        pc_experience_frames.reverse()

        batch_pc_si = []
        batch_pc_a = []
        batch_pc_R = []
        batch_pc_last_action_reward = []

        # 20x20 pixel-change map; bootstrap with zeros after a terminal frame,
        # otherwise with the network's max-Q estimate for the last next state.
        pc_R = np.zeros([20, 20], dtype=np.float32)
        if not pc_experience_frames[1].terminal:
            pc_R = self.local_network.run_pc_q_max(
                sess, pc_experience_frames[0].state,
                pc_experience_frames[0].get_last_action_reward(
                    self.action_size))

        for frame in pc_experience_frames[1:]:
            pc_R = frame.pixel_change + self.gamma_pc * pc_R
            a = np.zeros([self.action_size])
            a[frame.action] = 1.0
            last_action_reward = frame.get_last_action_reward(self.action_size)

            batch_pc_si.append(frame.state)
            batch_pc_a.append(a)
            batch_pc_R.append(pc_R)
            batch_pc_last_action_reward.append(last_action_reward)

        batch_pc_si.reverse()
        batch_pc_a.reverse()
        batch_pc_R.reverse()
        batch_pc_last_action_reward.reverse()

        return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R

    def _process_vr(self, sess):
        # [Value replay]
        # Sample local_t_max+1 frames (+1 for the last next state)
        vr_experience_frames = self.experience.sample_sequence(
            self.local_t_max + 1)
        # Reverse the sequence so targets can be accumulated from the last frame
        vr_experience_frames.reverse()

        batch_vr_si = []
        batch_vr_R = []
        batch_vr_last_action_reward = []

        # Bootstrap with the value of the last next state unless it follows a
        # terminal frame.
        vr_R = 0.0
        if not vr_experience_frames[1].terminal:
            vr_R = self.local_network.run_vr_value(
                sess, vr_experience_frames[0].state,
                vr_experience_frames[0].get_last_action_reward(
                    self.action_size))

        # Accumulate discounted value-replay targets over the sampled frames
        for frame in vr_experience_frames[1:]:
            vr_R = frame.reward + self.gamma * vr_R
            batch_vr_si.append(frame.state)
            batch_vr_R.append(vr_R)
            last_action_reward = frame.get_last_action_reward(self.action_size)
            batch_vr_last_action_reward.append(last_action_reward)

        batch_vr_si.reverse()
        batch_vr_R.reverse()
        batch_vr_last_action_reward.reverse()

        return batch_vr_si, batch_vr_last_action_reward, batch_vr_R

    def _process_rp(self):
        # [Reward prediction]
        # Sample 4 consecutive frames: the first 3 are the inputs and the 4th
        # supplies the target reward class.
        rp_experience_frames = self.experience.sample_rp_sequence()

        batch_rp_si = []
        batch_rp_c = []

        for i in range(3):
            batch_rp_si.append(rp_experience_frames[i].state)

        # One-hot target for the sign of the 4th frame's reward
        r = rp_experience_frames[3].reward
        rp_c = [0.0, 0.0, 0.0]
        if r == 0:
            rp_c[0] = 1.0  # zero
        elif r > 0:
            rp_c[1] = 1.0  # positive
        else:
            rp_c[2] = 1.0  # negative
        batch_rp_c.append(rp_c)
        return batch_rp_si, batch_rp_c

    def process(self, sess, global_t, summary_writer, summary_op, score_input):
        # Keep filling the experience replay buffer until it is full
        if not self.experience.is_full():
            self._fill_experience(sess)
            return 0

        start_local_t = self.local_t

        cur_learning_rate = self._anneal_learning_rate(global_t)

        # Copy weights from shared to local
        sess.run(self.sync)

        # [Base]
        batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \
              self._process_base(sess,
                                 global_t,
                                 summary_writer,
                                 summary_op,
                                 score_input)
        feed_dict = {
            self.local_network.base_input: batch_si,
            self.local_network.base_last_action_reward_input:
            batch_last_action_rewards,
            self.local_network.base_a: batch_a,
            self.local_network.base_adv: batch_adv,
            self.local_network.base_r: batch_R,
            self.local_network.base_initial_lstm_state: start_lstm_state,
            # [common]
            self.learning_rate_input: cur_learning_rate
        }

        # [Pixel change]
        if self.use_pixel_change:
            batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc(
                sess)

            pc_feed_dict = {
                self.local_network.pc_input: batch_pc_si,
                self.local_network.pc_last_action_reward_input:
                batch_pc_last_action_reward,
                self.local_network.pc_a: batch_pc_a,
                self.local_network.pc_r: batch_pc_R
            }
            feed_dict.update(pc_feed_dict)

        # [Value replay]
        if self.use_value_replay:
            batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr(
                sess)

            vr_feed_dict = {
                self.local_network.vr_input: batch_vr_si,
                self.local_network.vr_last_action_reward_input:
                batch_vr_last_action_reward,
                self.local_network.vr_r: batch_vr_R
            }
            feed_dict.update(vr_feed_dict)

        # [Reward prediction]
        if self.use_reward_prediction:
            batch_rp_si, batch_rp_c = self._process_rp()
            rp_feed_dict = {
                self.local_network.rp_input: batch_rp_si,
                self.local_network.rp_c_target: batch_rp_c
            }
            feed_dict.update(rp_feed_dict)

        # Calculate gradients and copy them to global network.
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        self._print_log(global_t)

        # Return the number of local steps advanced during this call
        diff_local_t = self.local_t - start_local_t
        return diff_local_t
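
Note on the batching logic above: _process_base, _process_pc and _process_vr all
build their targets the same way: walk the sampled frames from the end of the
rollout back to the start, accumulate a discounted return, and then reverse the
batch back into chronological order. A minimal standalone sketch of that
accumulation (function and variable names here are illustrative, not part of
the class above):

def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    # Accumulate R_t = r_t + gamma * R_{t+1}, starting from the bootstrap
    # value of the state that follows the last sampled frame.
    R = bootstrap_value
    returns = []
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()  # restore chronological order
    return returns

# Example: discounted_returns([0.0, 0.0, 1.0], 0.5, gamma=0.9)
# -> approximately [1.1745, 1.305, 1.45]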
Beispiel #29
0
        from visdom import Visdom
        viz = Visdom()

    # Build an environment template: a lazily evaluated callable for spawning environments
    env_template = build_env(args.env)

    # Build Distributed Environments
    envs = get_distributed_backend(env_template,
                                   args.num_processes,
                                   backend=args.distributed_backend)

    # Obtain Environment metadata
    metadata = envs.get_metadata()

    # Instantiate Policy
    policy = get_policy(args.policy, metadata)

    # Create agent, with the given training algorithm
    agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

    # Create Experience Buffer, with the environment metadata
    experience = Experience(metadata['max_episode_length'], args.num_processes,
                            metadata['obs_shape'], metadata['action_type'],
                            metadata['action_shape'])

    # Train agent
    agent.train(num_frames=args.num_frames)

    # Drop into an interactive shell for post-training inspection
    import IPython
    IPython.embed()
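
A note on the wiring above: the Experience buffer is sized entirely from the
metadata returned by the distributed environment backend, so no shapes are
hard-coded in the training script. A hedged sketch of that metadata-driven
setup, using a purely hypothetical metadata dict and a simplified numpy-backed
storage class (this is not the repository's actual Experience API):

import numpy as np

class RolloutStorage:
    def __init__(self, max_episode_length, num_processes, obs_shape):
        # Pre-allocate one slot per (time step, worker) pair.
        self.observations = np.zeros(
            (max_episode_length, num_processes) + tuple(obs_shape),
            dtype=np.float32)
        self.rewards = np.zeros((max_episode_length, num_processes),
                                dtype=np.float32)

metadata = {
    'max_episode_length': 100,       # hypothetical values for illustration
    'obs_shape': (4, 84, 84),
    'action_type': 'discrete',
    'action_shape': 6,
}

storage = RolloutStorage(metadata['max_episode_length'],
                         num_processes=4,
                         obs_shape=metadata['obs_shape'])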
Beispiel #30
0
 def update_learner(self):
     # Sample a weighted batch, update the learner, then feed the resulting
     # loss back to the replay buffer as the new priorities.
     sample_idxs, weights, sample = self.replay_buffer.sample(self.sample_size)
     loss, learner_info = self.learner.update(Experience.training_items(sample), weights)
     self.replay_buffer.update(sample_idxs, loss)
     return loss, learner_info

 def null_experience_list(self, count=100):
     # Return a list of empty placeholder Experience tuples.
     return [Experience(None, None, None, None, None) for _ in range(count)]
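
The update_learner method above follows the usual prioritized-replay loop: draw
a weighted batch, let the learner report a per-sample loss, and push that loss
back as the new sampling priority. A minimal sketch of that loop with a
hypothetical buffer class (none of these names are the repository's actual API,
and the importance weights are left uniform for brevity):

import random

class SimplePrioritizedBuffer:
    def __init__(self):
        self.items = []
        self.priorities = []

    def push(self, item, priority=1.0):
        self.items.append(item)
        self.priorities.append(priority)

    def sample(self, n):
        # Draw indices in proportion to their stored priorities.
        idxs = random.choices(range(len(self.items)),
                              weights=self.priorities, k=n)
        weights = [1.0] * n  # placeholder importance weights
        return idxs, weights, [self.items[i] for i in idxs]

    def update(self, idxs, losses):
        # One loss value per sampled index becomes that item's new priority.
        for i, loss in zip(idxs, losses):
            self.priorities[i] = float(loss) + 1e-6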