Example #1
    def apply_adam_updates(self,
                           variables_server,
                           gradients,
                           learning_rate,
                           epsilon=1e-6):
        # Shared Adam step: the optimizer state (step counter and both moment
        # estimates) lives in Redis, so several workers can update the same
        # parameters.
        update_steps = hlp.load_object(
            variables_server.get('update_steps')) + 1
        variables_server.set('update_steps', hlp.dump_object(update_steps))
        # Fold the bias correction for both moment estimates into the step size.
        learning_rate = learning_rate * (
            (1 - 0.999**update_steps)**0.5) / (1 - 0.9**update_steps)
        for i, gradient in enumerate(gradients):
            # Second-moment estimate, stored under 'momentum_{i}'.
            momentum = hlp.load_object(
                variables_server.get('momentum_{}'.format(i)))
            momentum = 0.999 * momentum + (1 - 0.999) * gradient * gradient
            variables_server.set('momentum_{}'.format(i),
                                 hlp.dump_object(momentum))
            # First-moment estimate, stored under 'velocity_{i}'.
            velocity = hlp.load_object(
                variables_server.get('velocity_{}'.format(i)))
            velocity = 0.9 * velocity + (1 - 0.9) * gradient
            variables_server.set('velocity_{}'.format(i),
                                 hlp.dump_object(velocity))
            weight = hlp.load_object(
                variables_server.get('weight_{}'.format(i)))
            new_weight = weight - velocity * learning_rate / (
                (momentum**0.5) + epsilon)
            variables_server.set('weight_{}'.format(i),
                                 hlp.dump_object(new_weight))
        return update_steps
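
This helper keeps the Adam optimizer state (step counter and both moment estimates) in Redis so that several workers can update one shared set of parameters. For reference, below is a minimal single-tensor sketch of the same arithmetic in plain NumPy; the function name and signature are illustrative only, and conventional Adam names are used (the listing stores the second moment under 'momentum_{i}' and the first moment under 'velocity_{i}').

import numpy as np

def adam_step(w, g, m, v, t, learning_rate, epsilon=1e-6,
              beta1=0.9, beta2=0.999):
    # One Adam update for a single parameter tensor w with gradient g,
    # given step counter t (starting at 1) and moment estimates m, v.
    m = beta1 * m + (1 - beta1) * g              # first moment ('velocity_{i}' above)
    v = beta2 * v + (1 - beta2) * g * g          # second moment ('momentum_{i}' above)
    step_size = learning_rate * (1 - beta2 ** t) ** 0.5 / (1 - beta1 ** t)
    w = w - step_size * m / (v ** 0.5 + epsilon)
    return w, m, v
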
Example #2
    def make_rollout(self):
        variables_server = Redis(port=12000)
        if self.scale != 'off':
            try:
                means = hlp.load_object(variables_server.get("means"))
                stds = hlp.load_object(variables_server.get("stds"))
                self.sess.run(self.norm_set_op, feed_dict=dict(zip(self.norm_phs, [means, stds])))
            except:
                pass
        try:
            weights = [hlp.load_object(variables_server.get("weight_{}".format(i))) for i in
                       range(len(self.weights))]
            self.set_weights(weights)
        except:
            pass
        env = self.env
        n_tasks = self.n_tests

        timestep = 0
        i_task = 0

        paths = []
        while i_task < n_tasks:
            path = {}
            observations, action_tuples, rewards, dist_tuples, timestamps = [], [], [], [], []
            sums = np.zeros((1, env.get_observation_space()))
            sumsqrs = np.zeros(sums.shape)

            env.reset()
            while not env.done and env.timestamp < self.timesteps_per_launch:
                sums += env.features
                sumsqrs += np.square(env.features)
                observations.append(env.features[0])
                timestamps.append(env.timestamp)

                actions = self.act(env.features)

                env.step(actions)
                timestep += 1

                action_tuples.append(actions)
                rewards.append(env.reward)

            path["observations"] = np.array(observations)
            path["action_tuples"] = np.array(action_tuples)
            path["rewards"] = np.array(rewards)
            if not self.test_mode:
                path["dist_tuples"] = np.array(dist_tuples)
            path["timestamps"] = np.array(timestamps)
            path["sumobs"] = sums
            path["sumsqrobs"] = sumsqrs
            path["terminated"] = env.done
            path["total"] = env.get_total_reward()
            paths.append(path)
            i_task += 1

        if self.distributed:
            variables_server.set("paths_{}".format(self.id_worker), hlp.dump_object(paths))
        else:
            self.paths = paths
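
The rollout paths are exchanged through Redis as opaque byte strings via hlp.dump_object and hlp.load_object, which are not shown in this listing. A plausible minimal stand-in, assuming they are thin pickle wrappers (any serializer that round-trips NumPy arrays and lists of dicts would do):

import pickle

def dump_object(obj):
    # Hypothetical hlp.dump_object: serialize to bytes for Redis SET/LPUSH.
    return pickle.dumps(obj)

def load_object(data):
    # Hypothetical hlp.load_object: inverse of dump_object.
    return pickle.loads(data)
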
Example #3
    def work(self):
        variables_server = Redis(port=12000)
        if self.scale != 'off':
            try:
                means = hlp.load_object(variables_server.get("means"))
                stds = hlp.load_object(variables_server.get("stds"))
                self.sess.run(self.norm_set_op, feed_dict=dict(zip(self.norm_phs, [means, stds])))
            except:
                pass
        try:
            weights = [hlp.load_object(variables_server.get("weight_{}".format(i))) for i in
                       range(len(self.weights))]
            self.set_weights(weights)
        except:
            pass
        env = self.env

        while True:
            observations, action_tuples, rewards, dist_tuples, timestamps = [], [], [], [], []
            for _ in range(self.n_steps):
                observations.append(env.features[0])
                timestamps.append(env.timestamp)

                actions = self.act(env.features)
                env.step(actions)

                action_tuples.append(actions)
                rewards.append(env.reward)
                if env.done or env.timestamp > self.timesteps_per_launch:
                    variables_server.lpush('results', hlp.dump_object(env.get_total_reward()))
                    print("Episode reward: {}".format(env.get_total_reward()), "Length: {}".format(env.timestamp))
                    break
            timestamps.append(env.timestamp)

            observations_batch = np.array(observations)
            actions_batch = np.array(action_tuples)

            feed_dict = {self.state_input: observations_batch,
                         self.targets["action"]: actions_batch}

            if env.done or env.timestamp > self.timesteps_per_launch:
                rewards.append(0)
                env.reset()
            else:
                obs = observations[-1]
                rewards.append(self.sess.run(self.value, feed_dict={self.state_input: obs.reshape((1,) + obs.shape)}))
            returns_batch = hlp.discount(np.array(rewards), self.gamma, np.array(timestamps))[:-1]
            values = self.sess.run(self.value, feed_dict)
            feed_dict[self.targets["advantage"]] = returns_batch - values
            feed_dict[self.targets["return"]] = returns_batch
            gradients = self.sess.run(self.gradients, feed_dict)
            self.apply_adam_updates(variables_server, gradients, self.learning_rate)
            weights = [hlp.load_object(variables_server.get("weight_{}".format(i))) for i in
                       range(len(self.weights))]
            self.set_weights(weights)
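
The worker bootstraps the final reward with the critic's value estimate and turns the reward sequence into discounted returns with hlp.discount, which is not shown here. The sketch below is a hypothetical reading of that helper, assuming the timestamps array is only used to detect episode boundaries (a timestamp that fails to increase marks a reset):

import numpy as np

def discount(rewards, gamma, timestamps):
    # Hypothetical sketch: right-to-left discounted cumulative sums,
    # restarted whenever the next timestamp does not increase.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        if t + 1 < len(timestamps) and timestamps[t + 1] <= timestamps[t]:
            running = 0.0                          # episode boundary
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns
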
Example #4
    def train(self):
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += len(path["rewards"])

            stds = np.sqrt((self.sumsqrs - np.square(self.sums) / self.sumtime) / (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op, feed_dict=dict(zip(self.norm_phs, [means, stds])))
        print("Let's go!")
        self.update_target_weights(alpha=1.0)
        index_replay = 0
        iteration = 0
        episode = 0
        idxs_range = np.arange(self.xp_size)
        xp_replay_state = np.zeros(shape=(self.xp_size, self.env.get_observation_space()))
        xp_replay_next_state = np.zeros(shape=(self.xp_size, self.env.get_observation_space()))
        xp_replay_reward = np.zeros(shape=(self.xp_size,))
        xp_replay_action = np.zeros(shape=(self.xp_size,))
        xp_replay_terminal = np.zeros(shape=(self.xp_size,))
        if self.prioritized:
            xp_replay_priority = np.zeros(shape=(self.xp_size,))
            self.max_prior = 1
        start_time = time.time()
        self.last_state = self.env.reset()
        discounts = self.gamma ** np.arange(self.n_steps)
        self.last_rewards = np.zeros(shape=(self.n_steps,))
        self.last_states = np.zeros(shape=(self.n_steps, self.n_features))
        self.last_actions = np.zeros(shape=(self.n_steps, ))
        buffer_index = 0
        env = self.env
        while True:
            if iteration <= self.random_steps:
                actions = env.env.action_space.sample()
            else:
                actions = self.act(env.features, exploration=True)
            self.last_states[buffer_index] = env.features.reshape(-1)
            self.last_actions[buffer_index] = actions
            env.step([actions])
            self.last_rewards[buffer_index] = env.reward
            buffer_index = (buffer_index + 1) % self.n_steps

            if env.timestamp >= self.n_steps:
                xp_replay_state[index_replay] = np.copy(self.last_states[buffer_index])
                xp_replay_next_state[index_replay] = env.features.reshape(-1)
                discounted_return = np.sum(discounts*self.last_rewards[np.roll(np.arange(self.n_steps), -(buffer_index))])
                xp_replay_reward[index_replay] = discounted_return
                xp_replay_action[index_replay] = self.last_actions[buffer_index]
                xp_replay_terminal[index_replay] = env.done
                if self.prioritized:
                    xp_replay_priority[index_replay] = self.max_prior
                index_replay = (index_replay + 1) % self.xp_size

            if env.done or env.timestamp > self.timesteps_per_launch:
                episode += 1
                print("Episode #{}".format(episode), env.get_total_reward())
                self.train_scores.append(env.get_total_reward())
                for i in range(1, self.n_steps):
                    buffer_index = (buffer_index + 1) % self.n_steps

                    xp_replay_state[index_replay] = np.copy(self.last_states[buffer_index])
                    xp_replay_next_state[index_replay] = env.features.reshape(-1)
                    discounted_return = np.sum(
                        discounts[:self.n_steps-i] * self.last_rewards[np.roll(np.arange(self.n_steps), -(buffer_index))[:self.n_steps-i]])
                    xp_replay_reward[index_replay] = discounted_return
                    xp_replay_action[index_replay] = self.last_actions[buffer_index]
                    xp_replay_terminal[index_replay] = env.done
                    index_replay = (index_replay + 1) % self.xp_size
                env.reset()
                self.last_rewards = np.zeros(shape=(self.n_steps,))
                self.last_states = np.zeros(shape=(self.n_steps, self.n_features))
                self.last_actions = np.zeros(shape=(self.n_steps,))
                buffer_index = 0

            self.last_state = env.features
            if iteration % 1000 == 0:
                print("Iteration #{}".format(iteration))
                self.save(self.config[:-5])

            if iteration > self.random_steps:
                if self.prioritized:
                    max_id = np.min([xp_replay_state.shape[0], iteration])
                    probs = xp_replay_priority[:max_id]/np.sum(xp_replay_priority[:max_id])
                    idxs = np.random.choice(idxs_range[:max_id], size=self.batch_size, p=probs)
                    importance_weights = (1/(max_id*probs[idxs]))**self.prior_beta
                else:
                    idxs = np.random.randint(np.min([xp_replay_state.shape[0], iteration]), size=self.batch_size)
                    importance_weights = np.ones(shape=(self.batch_size,))

                state_batch = xp_replay_state[idxs]
                next_state_batch = xp_replay_next_state[idxs]
                action_batch = xp_replay_action[idxs]
                reward_batch = xp_replay_reward[idxs]
                done_batch = xp_replay_terminal[idxs]

                feed_dict = {
                    self.state_input: state_batch,
                    self.next_state_input: next_state_batch,
                    self.action_input: action_batch,
                    self.importance_weights: importance_weights
                }
                target_atom_probs = self.sess.run(self.target_atom_probs, feed_dict)
                target_atom_probs = np.exp(target_atom_probs)

                if not self.double:
                    target_q_values = target_atom_probs * np.tile(np.arange(self.n_atoms).reshape((1, 1, self.n_atoms)), [self.batch_size, 1, 1])
                    target_q_values = np.sum(target_q_values, axis=2)
                    target_greedy_actions = np.argmax(target_q_values, axis=1).astype(np.int32).reshape((-1, 1))
                    target_probs = target_atom_probs[np.arange(self.batch_size).reshape((-1, 1)), target_greedy_actions]
                else:
                    feed_dict[self.state_input] = next_state_batch
                    atom_probs = self.sess.run(self.atom_probs, feed_dict)
                    atom_probs = np.exp(atom_probs)
                    q_values = atom_probs * np.tile(np.arange(self.n_atoms).reshape((1, 1, self.n_atoms)),
                                                                  [self.batch_size, 1, 1])
                    q_values = np.sum(q_values, axis=2)
                    greedy_actions = np.argmax(q_values, axis=1).astype(np.int32).reshape((-1, 1))
                    target_probs = target_atom_probs[np.arange(self.batch_size).reshape((-1, 1)), greedy_actions]
                    feed_dict[self.state_input] = state_batch

                atom_values = np.arange(self.n_atoms, dtype=np.float32).reshape((-1, self.n_atoms))
                atom_values = 2 * self.max_q_magnitude * (
                    np.tile(atom_values, [self.batch_size, 1]) / (self.n_atoms - 1) - 0.5)
                atom_new_values = np.clip(
                    (self.gamma**self.n_steps) * atom_values * (1 - done_batch).reshape(-1, 1)
                    + reward_batch.reshape((-1, 1)),
                    -self.max_q_magnitude, self.max_q_magnitude)
                new_positions = ((atom_new_values / (2 * self.max_q_magnitude) + 0.5) * (self.n_atoms - 1)).reshape((-1))
                lower = np.floor(new_positions).astype(np.int32).reshape(-1)
                upper = np.floor(new_positions).astype(np.int32).reshape(-1) + 1

                final_target_probs = np.zeros(shape=(self.batch_size, self.n_atoms+1, self.n_atoms))
                final_target_probs[np.sort(np.tile(np.arange(self.batch_size), [self.n_atoms])), lower, np.tile(np.arange(self.n_atoms), [self.batch_size])] += (upper-new_positions) * target_probs.reshape((-1))
                final_target_probs[np.sort(np.tile(np.arange(self.batch_size), [self.n_atoms])), upper, np.tile(np.arange(self.n_atoms), [self.batch_size])] += (new_positions-lower) * target_probs.reshape((-1))

                final_target_probs = np.sum(final_target_probs, axis=2)[:, :-1]
                feed_dict[self.target_probs] = final_target_probs
                KLs = self.sess.run([self.loss, self.train_op], feed_dict)[0]
                if self.prioritized:
                    xp_replay_priority[idxs] = KLs ** self.prior_alpha
                self.update_target_weights()

                if iteration % self.test_every == 0:
                    weights = self.get_weights()
                    for i, weight in enumerate(weights):
                        self.variables_server.set("weight_" + str(i), hlp.dump_object(weight))
                    print("Time to test!")
                    if self.distributed:
                        weights = self.get_weights()
                        for i, weight in enumerate(weights):
                            self.variables_server.set("weight_" + str(i), hlp.dump_object(weight))
                        worker_args = \
                            {
                                'config': self.config,
                                'test_mode': True,
                            }
                        hlp.launch_workers(worker_args, self.n_workers)
                        paths = []
                        for i in range(self.n_workers):
                            paths += hlp.load_object(self.variables_server.get("paths_{}".format(i)))
                    else:
                        self.test_mode = True
                        self.make_rollout()
                        paths = self.paths

                    total_rewards = np.array([path["total"] for path in paths])
                    eplens = np.array([len(path["rewards"]) for path in paths])
                    print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean test episode length:  {test_eplengths}
Max test score:            {max_test}
Time for iteration:        {tt}
Mean of features:          {means}
Std of features:           {stds}
-------------------------------------------------------------
                                    """.format(
                        means=means,
                        stds=stds,
                        test_scores=np.mean(total_rewards),
                        test_eplengths=np.mean(eplens),
                        max_test=np.max(total_rewards),
                        tt=time.time() - start_time
                    ))
                    start_time = time.time()
                    self.test_scores.append(np.mean(total_rewards))

            iteration += 1
            self.timestep += 1
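
The block above performs the categorical (C51-style) target projection with batched NumPy indexing, which is compact but hard to read. Below is a minimal single-sample sketch of the same projection; the function name and signature are illustrative, and atom_values is assumed to be the evenly spaced support on [-max_q_magnitude, max_q_magnitude] built in the listing.

import numpy as np

def project_onto_support(target_probs, atom_values, n_step_reward, done,
                         gamma, n_steps, max_q_magnitude):
    # Shift/shrink the support by the n-step Bellman backup, then split each
    # atom's probability mass between its two nearest atoms of the original
    # support (the spare slot at index n_atoms absorbs upper == n_atoms).
    n_atoms = atom_values.shape[0]
    new_values = np.clip(
        (gamma ** n_steps) * atom_values * (1.0 - done) + n_step_reward,
        -max_q_magnitude, max_q_magnitude)
    positions = (new_values / (2 * max_q_magnitude) + 0.5) * (n_atoms - 1)
    lower = np.floor(positions).astype(np.int32)
    upper = lower + 1
    projected = np.zeros(n_atoms + 1)
    np.add.at(projected, lower, (upper - positions) * target_probs)
    np.add.at(projected, upper, (positions - lower) * target_probs)
    return projected[:n_atoms]
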
Example #5
    def train(self):
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(
                            self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += len(path["rewards"])
                stds = np.sqrt(
                    (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                    (self.sumtime - 1))
                means = self.sums / self.sumtime
                print("Init means: {}".format(means))
                print("Init stds: {}".format(stds))
                self.variables_server.set("means", hlp.dump_object(means))
                self.variables_server.set("stds", hlp.dump_object(stds))
                self.sess.run(self.norm_set_op,
                              feed_dict=dict(zip(self.norm_phs,
                                                 [means, stds])))
        while True:
            print("Iteration {}".format(self.timestep))
            start_time = time.time()
            weight_noises = []
            random.seed()
            seed_for_random = random.randint(0, np.iinfo(np.int32).max)
            np.random.seed(seed_for_random)
            seeds = np.random.randint(-np.iinfo(np.int32).min +
                                      np.iinfo(np.int32).max,
                                      size=self.n_tasks_all)
            self.variables_server.set("seeds", hlp.dump_object(seeds))

            weights = self.get_weights()
            for i, weight in enumerate(weights):
                self.variables_server.set("weight_" + str(i),
                                          hlp.dump_object(weight))
                weight_noises.append(
                    np.empty((self.n_tasks_all, ) + weight.shape))

            for index in range(self.n_tasks_all):
                np.random.seed(seeds[index])
                for i, weight in enumerate(weights):
                    weight_noises[i][index] = np.random.normal(
                        size=weight.shape)

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': False,
                    }
                hlp.launch_workers(worker_args,
                                   self.n_workers,
                                   command='rollout_with_noise')
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = False
                self.make_rollout()
                paths = self.paths

            scores = []
            train_lengths = []
            for i in range(self.n_tasks_all):
                scores.append(
                    hlp.load_object(
                        self.variables_server.get("scores_" + str(i))))
                train_lengths.append(
                    hlp.load_object(
                        self.variables_server.get("eplen_" + str(i))))
                scores.append(
                    hlp.load_object(
                        self.variables_server.get("scores_" + str(-i))))
                train_lengths.append(
                    hlp.load_object(
                        self.variables_server.get("eplen_" + str(-i))))

            scores = np.array(scores)
            train_mean_score = np.mean(scores)
            ranks = np.zeros(shape=scores.shape)

            if self.normalize == 'ranks':
                ranks[np.argsort(scores)] = np.arange(
                    ranks.shape[0], dtype=np.float32) / (ranks.shape[0] - 1)
                ranks -= 0.5
            elif self.normalize == 'center':
                ranks = scores[:]
                ranks -= train_mean_score
                ranks /= (np.std(ranks, ddof=1) + 0.001)

            gradients = [np.zeros(w.get_shape()) for w in self.weights]
            for i, weight in enumerate(weights):
                for index in 2 * np.arange(seeds.shape[0]):
                    gradients[i] += weight_noises[i][index // 2] * (
                        ranks[index] - ranks[index + 1]) / self.n_tasks_all
                gradients[i] -= self.l1_reg * weights[i]

            if self.adam:
                self.apply_adam_updates(gradients)
            else:
                for i, weight in enumerate(weights):
                    weights[i] += self.learning_rate * gradients[i]
                self.sess.run(self.set_op,
                              feed_dict=dict(zip(self.weights_phs, weights)))

            print("Time to testing!")

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': True,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = True
                self.make_rollout()
                paths = self.paths

            total_rewards = np.array([path["total"] for path in paths])
            eplens = np.array([len(path["rewards"]) for path in paths])
            if self.scale:
                for i in range(self.n_tasks_all):
                    self.sums += hlp.load_object(
                        self.variables_server.get("sum_{}".format(i)))
                    self.sumsqrs += hlp.load_object(
                        self.variables_server.get("sumsqr_{}".format(i)))
                self.sumtime += np.sum(train_lengths)
                stds = np.sqrt(
                    (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                    (self.sumtime - 1))
                means = self.sums / self.sumtime
                self.variables_server.set("means", hlp.dump_object(means))
                self.variables_server.set("stds", hlp.dump_object(stds))
                self.sess.run(self.norm_set_op,
                              feed_dict=dict(zip(self.norm_phs,
                                                 [means, stds])))

            print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean train score:          {train_scores}
Mean test episode length:  {test_eplengths}
Mean train episode length: {train_eplengths}
Max test score:            {max_test}
Max train score:           {max_train}
Mean of features:          {means}
Std of features:           {stds}
Time for iteration:        {tt}
-------------------------------------------------------------
                """.format(means=means,
                           stds=stds,
                           test_scores=np.mean(total_rewards),
                           test_eplengths=np.mean(eplens),
                           train_scores=train_mean_score,
                           train_eplengths=np.mean(train_lengths),
                           max_test=np.max(total_rewards),
                           max_train=np.max(scores),
                           tt=time.time() - start_time))
            self.train_scores.append(train_mean_score)
            self.test_scores.append(np.mean(total_rewards))
            if self.timestep % self.save_every == 0:
                self.save(self.config[:-5])
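
The gradient loop near the end accumulates each noise direction weighted by the difference between the normalized scores of its +noise and -noise rollouts (mirrored sampling), then subtracts an L1-style weight decay. An equivalent vectorized sketch for a single weight tensor, with illustrative names:

import numpy as np

def es_gradient(noise, pos_scores, neg_scores, weight, l1_reg=0.0):
    # noise has shape (n_tasks,) + weight.shape; pos_scores / neg_scores are
    # the normalized scores of the +noise and -noise rollouts per task.
    diffs = np.asarray(pos_scores) - np.asarray(neg_scores)
    diffs = diffs.reshape((-1,) + (1,) * weight.ndim)
    return np.mean(diffs * noise, axis=0) - l1_reg * weight
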
Example #6
    def rollout_with_noise(self):
        variables_server = Redis(port=12000)
        if self.scale != 'off':
            means = hlp.load_object(variables_server.get("means"))
            stds = hlp.load_object(variables_server.get("stds"))
            self.sess.run(self.norm_set_op,
                          feed_dict=dict(zip(self.norm_phs, [means, stds])))
        weights = [
            hlp.load_object(variables_server.get("weight_{}".format(i)))
            for i in range(len(self.weights))
        ]
        self.set_weights(weights)
        env = self.env
        seeds = hlp.load_object(variables_server.get("seeds"))
        for id_task_for_worker in range(self.n_tasks):
            id_task = id_task_for_worker * self.n_workers + self.id_worker
            if id_task % self.report_every == 0:
                print("Rollout # {} of {}".format(id_task, self.n_tasks_all))
            seed = seeds[id_task]
            np.random.seed(seed)
            noises = []
            for i, weight in enumerate(weights):
                noise = np.random.normal(size=weight.shape)
                noises.append(noise)
                weights[i] += self.noise_scale * noise
            self.set_weights(weights)

            env.reset()

            sums = np.zeros((1, env.get_observation_space()))
            sumsqrs = np.zeros(sums.shape)

            while not env.done and env.timestamp < self.timesteps_per_launch:
                sums += env.features
                sumsqrs += np.square(env.features)

                actions = self.act(env.features)
                env.step(actions)

            variables_server.set("scores_{}".format(id_task),
                                 hlp.dump_object(env.get_total_reward()))
            variables_server.set("eplen_{}".format(id_task),
                                 hlp.dump_object(env.timestamp))

            for i, weight in enumerate(weights):
                noise = noises[i]
                weights[i] -= 2 * self.noise_scale * noise
            self.set_weights(weights)

            env.reset()

            while not env.done and env.timestamp < self.timesteps_per_launch:
                sums += env.features
                sumsqrs += np.square(env.features)

                actions = self.act(env.features)
                env.step(actions)

            variables_server.set("scores_{}".format(-id_task),
                                 hlp.dump_object(env.get_total_reward()))
            variables_server.set("eplen_{}".format(-id_task),
                                 hlp.dump_object(env.timestamp))
            variables_server.set("sum_{}".format(id_task),
                                 hlp.dump_object(sums))
            variables_server.set("sumsqr_{}".format(id_task),
                                 hlp.dump_object(sumsqrs))
            for i, weight in enumerate(weights):
                noise = noises[i]
                weights[i] += self.noise_scale * noise
            self.set_weights(weights)
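
Only seeds and scalar scores travel through Redis here: the worker regenerates the learner's noise tensors locally from the shared seed. A self-contained sketch of that shared-seed trick, with illustrative shapes:

import numpy as np

shapes = [(4, 3), (3,)]        # illustrative weight shapes
seed = 12345

np.random.seed(seed)
learner_noise = [np.random.normal(size=s) for s in shapes]

np.random.seed(seed)           # the worker reseeds with the same integer...
worker_noise = [np.random.normal(size=s) for s in shapes]

# ...and reproduces exactly the same perturbations without transferring them.
assert all(np.array_equal(a, b) for a, b in zip(learner_noise, worker_noise))
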
Example #7
    def train(self):
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(
                            self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += path["observations"].shape[0]

            stds = np.sqrt(
                (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op,
                          feed_dict=dict(zip(self.norm_phs, [means, stds])))

        weights = self.get_weights()
        for i, weight in enumerate(weights):
            self.variables_server.set("weight_" + str(i),
                                      hlp.dump_object(weight))
            self.variables_server.set('momentum_{}'.format(i),
                                      hlp.dump_object(np.zeros(weight.shape)))
            self.variables_server.set('velocity_{}'.format(i),
                                      hlp.dump_object(np.zeros(weight.shape)))
        self.variables_server.set('update_steps', hlp.dump_object(0))

        worker_args = \
            {
                'config': self.config,
                'test_mode': False,
            }
        hlp.launch_workers(worker_args,
                           self.n_workers,
                           command='work',
                           wait=False)

        while True:
            time.sleep(self.test_every)
            print("Time for testing!")
            if self.distributed:
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': True,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = True
                self.make_rollout()
                paths = self.paths

            total_rewards = np.array([path["total"] for path in paths])
            eplens = np.array([len(path["rewards"]) for path in paths])

            print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean test episode length:  {test_eplengths}
Max test score:            {max_test}
Number of train episodes:  {number}
Mean of features:          {means}
Std of features:           {stds}
-------------------------------------------------------------
                """.format(means=means,
                           stds=stds,
                           test_scores=np.mean(total_rewards),
                           test_eplengths=np.mean(eplens),
                           max_test=np.max(total_rewards),
                           number=self.variables_server.llen('results')))
            self.timestep += 1
            self.train_scores = [
                hlp.load_object(res)
                for res in self.variables_server.lrange('results', 0, -1)
            ][::-1]

            self.test_scores.append(np.mean(total_rewards))
            if self.timestep % self.save_every == 0:
                self.save(self.config[:-5])
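
Taken together, these examples communicate entirely through a small set of Redis keys. The mapping below is an informal reconstruction from the get/set calls in the listings, not part of the original code:

REDIS_KEYS = {
    "weight_{i}":             "pickled parameter tensors published by the learner",
    "momentum_{i}":           "shared Adam second-moment state (Examples #1, #7)",
    "velocity_{i}":           "shared Adam first-moment state (Examples #1, #7)",
    "update_steps":           "shared Adam step counter",
    "means":                  "feature means for observation normalization",
    "stds":                   "feature standard deviations for observation normalization",
    "paths_{id_worker}":      "pickled list of rollout paths per worker",
    "results":                "list of episode rewards pushed by async workers (Example #3)",
    "seeds":                  "per-task noise seeds for evolution strategies (Examples #5, #6)",
    "scores_{id}, eplen_{id}": "per-rollout return and episode length",
    "sum_{id}, sumsqr_{id}":  "running feature statistics from noisy rollouts",
}
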
Example #8
    def train(self):
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)

        if self.scale:
            if self.timestep == 0:
                self.test_mode = False
                self.make_rollout()
                paths = self.paths
                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += len(path["rewards"])
            stds = np.sqrt(
                (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op,
                          feed_dict=dict(zip(self.norm_phs, [means, stds])))
        print("Let's go!")
        self.update_target_weights(alpha=1.0)
        index_replay = 0
        iteration = 0
        episode = 0
        xp_replay_state = np.zeros(shape=(self.xp_size,
                                          self.env.get_observation_space()))
        xp_replay_next_state = np.zeros(
            shape=(self.xp_size, self.env.get_observation_space()))
        xp_replay_reward = np.zeros(shape=(self.xp_size, ))
        xp_replay_action = np.zeros(shape=(self.xp_size, len(self.n_actions)))
        xp_replay_terminal = np.zeros(shape=(self.xp_size, ))

        start_time = time.time()
        self.last_state = self.env.reset()
        env = self.env
        while True:
            if iteration <= self.random_steps:
                actions = env.env.action_space.sample()
            else:
                actions = self.act(env.features)
                actions += np.random.normal(0,
                                            scale=self.action_noise,
                                            size=actions.shape)

            env.step(actions)
            xp_replay_state[index_replay] = self.last_state.reshape(-1)
            xp_replay_next_state[index_replay] = env.features.reshape(-1)
            xp_replay_reward[index_replay] = env.reward
            xp_replay_action[index_replay] = actions.reshape(-1)
            xp_replay_terminal[index_replay] = env.done
            index_replay = (index_replay + 1) % self.xp_size
            if env.done or env.timestamp > self.timesteps_per_launch:
                episode += 1
                print("Episode #{}".format(episode), env.get_total_reward())
                self.train_scores.append(env.get_total_reward())
                env.reset()
            self.last_state = env.features
            if iteration % 1000 == 0:
                print("Iteration #{}".format(iteration))
                self.save(self.config[:-5])
            if iteration > self.random_steps:
                idxs = np.random.randint(np.min(
                    [xp_replay_state.shape[0], iteration]),
                                         size=self.batch_size)
                feed_dict = {
                    self.state_input: xp_replay_state[idxs],
                    self.next_state_input: xp_replay_next_state[idxs],
                    self.action_input: xp_replay_action[idxs],
                    self.reward_input: xp_replay_reward[idxs],
                    self.done_input: xp_replay_terminal[idxs]
                }

                self.sess.run([self.value_train_op], feed_dict)
                self.sess.run(self.train_actor_op, feed_dict)
                self.update_target_weights()
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                if iteration % self.test_every == 0:
                    print("Time to test!")
                    self.test_mode = True
                    self.make_rollout()
                    paths = self.paths
                    total_rewards = np.array([path["total"] for path in paths])
                    eplens = np.array([len(path["rewards"]) for path in paths])
                    print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean test episode length:  {test_eplengths}
Max test score:            {max_test}
Time for iteration:        {tt}
-------------------------------------------------------------
                                    """.format(
                        test_scores=np.mean(total_rewards),
                        test_eplengths=np.mean(eplens),
                        max_test=np.max(total_rewards),
                        tt=time.time() - start_time))
                    start_time = time.time()
                    self.test_scores.append(np.mean(total_rewards))

                self.timestep += 1
            iteration += 1
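
update_target_weights is not shown in this listing. A hypothetical sketch of what an alpha-parameterized soft (Polyak) update plausibly looks like, consistent with the hard copy performed with alpha=1.0 right after initialization:

def soft_update(online_weights, target_weights, alpha=0.001):
    # Hypothetical stand-in for update_target_weights(alpha): blend each
    # target tensor toward its online counterpart; alpha=1.0 is a hard copy.
    return [alpha * w + (1.0 - alpha) * t
            for w, t in zip(online_weights, target_weights)]
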
Example #9
    def train(self):
        cmd_server = 'redis-server --port 12000'
        p = subprocess.Popen(cmd_server, shell=True, preexec_fn=os.setsid)
        self.variables_server = Redis(port=12000)
        means = "-"
        stds = "-"
        if self.scale != 'off':
            if self.timestep == 0:
                print("Time to measure features!")
                if self.distributed:
                    worker_args = \
                        {
                            'config': self.config,
                            'test_mode': False,
                        }
                    hlp.launch_workers(worker_args, self.n_workers)
                    paths = []
                    for i in range(self.n_workers):
                        paths += hlp.load_object(
                            self.variables_server.get("paths_{}".format(i)))
                else:
                    self.test_mode = False
                    self.make_rollout()
                    paths = self.paths

                for path in paths:
                    self.sums += path["sumobs"]
                    self.sumsqrs += path["sumsqrobs"]
                    self.sumtime += path["observations"].shape[0]

            stds = np.sqrt(
                (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                (self.sumtime - 1))
            means = self.sums / self.sumtime
            print("Init means: {}".format(means))
            print("Init stds: {}".format(stds))
            self.variables_server.set("means", hlp.dump_object(means))
            self.variables_server.set("stds", hlp.dump_object(stds))
            self.sess.run(self.norm_set_op,
                          feed_dict=dict(zip(self.norm_phs, [means, stds])))
        while True:
            print("Iteration {}".format(self.timestep))
            start_time = time.time()

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': False,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = False
                self.make_rollout()
                paths = self.paths

            observations = np.concatenate(
                [path["observations"] for path in paths])
            actions = np.concatenate([path["action_tuples"] for path in paths])
            action_dists = []
            for _ in range(len(self.n_actions)):
                action_dists.append([])
            returns = []
            advantages = []
            for path in paths:
                self.sums += path["sumobs"]
                self.sumsqrs += path["sumsqrobs"]
                self.sumtime += path["rewards"].shape[0]
                dists = path["dist_tuples"]

                for i in range(len(self.n_actions)):
                    action_dists[i] += [dist[i][0] for dist in dists]
                returns += hlp.discount(path["rewards"], self.gamma,
                                        path["timestamps"]).tolist()
                values = self.sess.run(
                    self.value,
                    feed_dict={self.state_input: path["observations"]})
                values = np.append(values,
                                   0 if path["terminated"] else values[-1])
                deltas = (path["rewards"] + self.gamma * values[1:] -
                          values[:-1])
                advantages += hlp.discount(deltas, self.gamma,
                                           path["timestamps"]).tolist()
            returns = np.array(returns)
            advantages = np.array(advantages)

            if self.normalize == 'ranks':
                ranks = np.zeros_like(advantages)
                ranks[np.argsort(advantages)] = np.arange(
                    ranks.shape[0], dtype=np.float32) / (ranks.shape[0] - 1)
                ranks -= 0.5
                advantages = ranks[:]
            elif self.normalize == 'center':
                advantages -= np.mean(advantages)
                advantages /= (np.std(advantages, ddof=1) + 0.001)

            feed_dict = {
                self.state_input: observations,
                self.targets["return"]: returns,
                self.targets["advantage"]: advantages
            }

            for i in range(len(self.n_actions)):
                feed_dict[self.targets["old_dist_{}".format(i)]] = np.array(
                    action_dists[i])
                feed_dict[self.targets["action_{}".format(i)]] = actions[:, i]

            for i in range(self.value_updates):
                self.sess.run(self.value_train_op, feed_dict)

            train_rewards = np.array([path["rewards"].sum() for path in paths])
            train_lengths = np.array([len(path["rewards"]) for path in paths])

            thprev = self.get_flat()

            def fisher_vector_product(p):
                feed_dict[self.targets["flat_tangent"]] = p
                return self.sess.run(self.fisher_vector_product,
                                     feed_dict) + 0.1 * p

            g = self.sess.run(self.policy_grad, feed_dict)
            stepdir = hlp.conjugate_gradient(fisher_vector_product, -g)

            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / self.max_kl)
            fullstep = stepdir / (lm + 1e-18)

            def loss_kl(th):
                self.set_from_flat(th)
                return self.sess.run([self.loss, self.KL], feed_dict=feed_dict)

            theta = hlp.linesearch(loss_kl, thprev, fullstep, self.max_kl)
            self.set_from_flat(theta)

            lossafter, kloldnew = self.sess.run([self.loss, self.KL],
                                                feed_dict=feed_dict)

            print("Time for testing!")

            if self.distributed:
                weights = self.get_weights()
                for i, weight in enumerate(weights):
                    self.variables_server.set("weight_" + str(i),
                                              hlp.dump_object(weight))
                worker_args = \
                    {
                        'config': self.config,
                        'test_mode': True,
                    }
                hlp.launch_workers(worker_args, self.n_workers)
                paths = []
                for i in range(self.n_workers):
                    paths += hlp.load_object(
                        self.variables_server.get("paths_{}".format(i)))
            else:
                self.test_mode = True
                self.make_rollout()
                paths = self.paths

            total_rewards = np.array([path["total"] for path in paths])
            eplens = np.array([len(path["rewards"]) for path in paths])

            if self.scale != 'full':
                stds = np.sqrt(
                    (self.sumsqrs - np.square(self.sums) / self.sumtime) /
                    (self.sumtime - 1))
                means = self.sums / self.sumtime
                self.variables_server.set("means", hlp.dump_object(means))
                self.variables_server.set("stds", hlp.dump_object(stds))
                self.sess.run(self.norm_set_op,
                              feed_dict=dict(zip(self.norm_phs,
                                                 [means, stds])))

            print("""
-------------------------------------------------------------
Mean test score:           {test_scores}
Mean train score:          {train_scores}
Mean test episode length:  {test_eplengths}
Mean train episode length: {train_eplengths}
Max test score:            {max_test}
Max train score:           {max_train}
KL between old and new     {kl}
Loss after update          {loss}
Mean of features:          {means}
Std of features:           {stds}
-------------------------------------------------------------
                """.format(means=means,
                           stds=stds,
                           test_scores=np.mean(total_rewards),
                           test_eplengths=np.mean(eplens),
                           train_scores=np.mean(train_rewards),
                           train_eplengths=np.mean(train_lengths),
                           max_test=np.max(total_rewards),
                           max_train=np.max(train_rewards),
                           kl=kloldnew,
                           loss=lossafter))
            self.timestep += 1
            self.train_scores.append(np.mean(train_rewards))
            self.test_scores.append(np.mean(total_rewards))
            if self.timestep % self.save_every == 0:
                self.save(self.config[:-5])
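
The natural-gradient step direction comes from hlp.conjugate_gradient, which is not shown here. A hypothetical sketch of such a helper: it solves A x = b using only the (damped) Fisher-vector products supplied by fisher_vector_product, without ever forming the Fisher matrix:

import numpy as np

def conjugate_gradient(f_Ax, b, iters=10, tol=1e-10):
    # Hypothetical stand-in for hlp.conjugate_gradient: standard CG on the
    # implicit linear system A x = b, where f_Ax(p) returns A @ p.
    x = np.zeros_like(b)
    r = b.copy()                 # residual b - A x, with x = 0 initially
    p = r.copy()
    r_dot = r.dot(r)
    for _ in range(iters):
        Ap = f_Ax(p)
        alpha = r_dot / (p.dot(Ap) + 1e-12)
        x += alpha * p
        r -= alpha * Ap
        new_r_dot = r.dot(r)
        if new_r_dot < tol:
            break
        p = r + (new_r_dot / r_dot) * p
        r_dot = new_r_dot
    return x
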