Example #1
    def __init__(self,
                 num_actors,
                 env_kwargs,
                 env_interface,
                 run_name='temp',
                 load_model=False):
        self.num_actors = num_actors
        self.pipes = []
        self.processes = []
        self.threads = []
        self.trajectory_queue = []

        self.name = run_name

        project_root = os.path.dirname(os.path.realpath(__file__))
        self.save_dir = os.path.join(project_root, 'saves', run_name)
        self.weights_dir = os.path.join(self.save_dir, 'weights')
        self.code_dir = os.path.join(self.save_dir, 'code')

        self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')
        if not os.path.exists(self.weights_dir):
            os.makedirs(self.weights_dir)
        if not os.path.exists(self.code_dir):
            os.makedirs(self.code_dir)
        os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' +
                  self.code_dir)
        self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

        self.epoch = 0
        self.env_kwargs = env_kwargs

        self.discount_factor = 0.95
        self.td_lambda = 0.95

        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)

        with self.agent.graph.as_default():
            self.rewards_input = tf.placeholder(tf.float32, [None],
                                                name="rewards")  # T
            self.behavior_log_probs_input = tf.placeholder(
                tf.float32, [None], name="behavior_log_probs")  # T
            self.loss = self._ac_loss()
            # self.loss = self._impala_loss()
            self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
            self.session = self.agent.session
            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            if load_model:
                try:
                    self._load_model()
                except Exception:
                    print("Could not load model")
Example #2
class Actor:
    def __init__(self, pipe, env_interface, env_kwargs):
        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)
        with self.agent.graph.as_default():
            self.session = self.agent.session
            self.session.run(tf.global_variables_initializer())
            self.variable_names = [v.name for v in tf.trainable_variables()]
            self.assign_placeholders = {t.name: tf.placeholder(t.dtype, t.shape)
                                        for t in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)}
            self.assigns = [tf.assign(tensor, self.assign_placeholders[tensor.name])
                            for tensor in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]

        self.env = SCSingleEnvironment(self.env_interface, env_kwargs=env_kwargs)
        # self.env = MultipleEnvironment(lambda: SCEnvironmentWrapper(self.env_interface, env_kwargs=env_kwargs),
        #                                   num_instance=1)

        self.curr_iteration = 0
        self.pipe = pipe

    def generate_trajectory(self):
        """
        Repeatedly generates actions from the agent and steps in the environment until all environments have reached a
        terminal state. Returns each trajectory in the form of rollouts.
        """
        states, masks, _, _ = self.env.reset()
        memory = None
        rollout = Rollout()

        while True:
            action_indices, memory, log_probs = self.agent.step(states, masks, memory)
            new_states, new_masks, rewards, dones = self.env.step(action_indices)

            rollout.add_step(states[0], action_indices[0], rewards[0], masks[0], dones[0], log_probs[0])
            states = new_states
            masks = new_masks
            if all(dones):
                # Add in the done state for rollouts which just finished for calculating the bootstrap value.
                rollout.add_step(states[0])
                break
        self.curr_iteration += 1
        print("=============== Reward on iteration %d is [%.1f]" % (self.curr_iteration, rollout.total_reward()))
        return rollout

    def get_params(self):
        self.pipe.send(("get_params", None))
        names_to_params = self.pipe.recv()

        with self.agent.graph.as_default():
            self.session.run(self.assigns, feed_dict={
                self.assign_placeholders[name]: names_to_params[name] for name in self.variable_names
            })

    def send_trajectory(self, trajectory):
        # print("[ACTOR] sending trajectory:")
        self.pipe.send(("add_trajectory", trajectory))
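
The Actor above communicates with the learner over a multiprocessing Pipe using two message types: ("get_params", None), which expects a dict mapping variable names to parameter values in reply, and ("add_trajectory", trajectory). The run_actor and learn functions referenced by Learner.start_children in Example #5 are not shown in these snippets; below is a minimal sketch of what they might look like given that protocol. The learner-side get_params handling and the exact loop structure are assumptions.

import tensorflow as tf


def run_actor(actor_factory):
    # Hypothetical actor process loop: sync weights, roll out, ship the trajectory back.
    actor = actor_factory()  # e.g. partial(Actor, child_conn, env_interface, env_kwargs)
    while True:
        actor.get_params()
        trajectory = actor.generate_trajectory()
        actor.send_trajectory(trajectory)


def learn(learner, pipe):
    # Hypothetical learner-side thread: serve a single actor's pipe messages.
    while True:
        command, payload = pipe.recv()
        if command == "get_params":
            with learner.agent.graph.as_default():
                variables = tf.trainable_variables()
                values = learner.session.run(variables)
            pipe.send({v.name: value for v, value in zip(variables, values)})
        elif command == "add_trajectory":
            learner.add_trajectory(payload)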
Example #3
    def __init__(self, pipe, env_interface, env_kwargs):
        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)
        with self.agent.graph.as_default():
            self.session = self.agent.session
            self.session.run(tf.global_variables_initializer())
            self.variable_names = [v.name for v in tf.trainable_variables()]
            self.assign_placeholders = {t.name: tf.placeholder(t.dtype, t.shape)
                                        for t in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)}
            self.assigns = [tf.assign(tensor, self.assign_placeholders[tensor.name])
                            for tensor in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)]

        self.env = SCSingleEnvironment(self.env_interface, env_kwargs=env_kwargs)
        # self.env = MultipleEnvironment(lambda: SCEnvironmentWrapper(self.env_interface, env_kwargs=env_kwargs),
        #                                   num_instance=1)

        self.curr_iteration = 0
        self.pipe = pipe
Example #4
def main(_):
    env_interface = EmbeddingInterfaceWrapper(BeaconEnvironmentInterface())
    # env_interface = EmbeddingInterfaceWrapper(TrainMarines())
    # learner = Learner(10, env_kwargs, env_interface, run_name="MineralWithBeacon2", load_name="Beacon2", load_model=True)

    # env_interface = EmbeddingInterfaceWrapper(BeaconEnvironmentInterface())
    # learner = NormalLearner(env_interface, load_model=False)
    # learner.train()
    # Refresh environment every once in a while to deal with memory leak
    environment = MultipleEnvironment(lambda: SCEnvironmentWrapper(env_interface, env_kwargs),
                                      num_instance=1)
    agent = LSTMAgent(env_interface)
    learner = ActorCriticLearner(environment, agent, run_name="SyncMarines", load_model=False)
    i = 0
    while True:
        i += 1
        print(i)
        learner.train_episode()
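
The loop in main never actually rebuilds the environment, even though the comment above it mentions refreshing it to work around a memory leak. A hypothetical variant that does so is sketched below; it assumes MultipleEnvironment exposes a close() method and that ActorCriticLearner keeps its environment in an env attribute, neither of which is shown in these snippets.

REFRESH_EVERY = 100  # illustrative value
i = 0
while True:
    i += 1
    print(i)
    learner.train_episode()
    if i % REFRESH_EVERY == 0:
        # Assumed API: shut down the old SC2 instances and start fresh ones.
        environment.close()
        environment = MultipleEnvironment(lambda: SCEnvironmentWrapper(env_interface, env_kwargs),
                                          num_instance=1)
        learner.env = environment  # assumed attribute name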
Example #5
class Learner:
    def __init__(self,
                 num_actors,
                 env_kwargs,
                 env_interface,
                 run_name='temp',
                 load_name=None,
                 load_model=False):
        self.num_actors = num_actors
        self.pipes = []
        self.processes = []
        self.threads = []
        self.trajectory_queue = []

        self.name = run_name
        if load_name is None:
            self.load_name = run_name
        else:
            self.load_name = load_name

        project_root = os.path.dirname(os.path.realpath(__file__))
        self.save_dir = os.path.join(project_root, 'saves', run_name)
        self.weights_path_load = os.path.join(project_root, 'saves',
                                              self.load_name, 'weights',
                                              'model.ckpt')
        self.code_dir = os.path.join(self.save_dir, 'code')
        self.weights_dir = os.path.join(self.save_dir, 'weights')
        self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')

        # body_keywords = ["pointer_head/dense/", "pointer_head/dense_1/", "shared", "lstm"]
        if not os.path.exists(self.weights_dir):
            os.makedirs(self.weights_dir)
        if not os.path.exists(self.code_dir):
            os.makedirs(self.code_dir)

        os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' +
                  self.code_dir)
        self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

        self.epoch = 0
        self.env_kwargs = env_kwargs

        self.discount_factor = 0.95
        self.td_lambda = 0.95

        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)

        with self.agent.graph.as_default():
            self.rewards_input = tf.placeholder(tf.float32, [None],
                                                name="rewards")  # T
            self.behavior_log_probs_input = tf.placeholder(
                tf.float32, [None], name="behavior_log_probs")  # T
            self.loss = self._ac_loss()
            # self.loss = self._impala_loss()

            print("head variables are")
            head_variables = [
                v for v in tf.trainable_variables() if "shared" not in v.name
            ]
            for var in head_variables:
                print(var)

            print("body variables are")
            body_variables = [
                v for v in tf.trainable_variables() if "shared" in v.name
            ]
            for var in body_variables:
                print(var)

            self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
            self.session = self.agent.session
            # self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            if load_model:
                try:
                    self._load_model()
                except Exception:
                    print("Could not load model")
            # self.session.run(tf.initialize_variables(head_variables))

    def start_children(self):
        for process_id in range(self.num_actors):
            parent_conn, child_conn = Pipe()
            self.pipes.append(parent_conn)
            p = Process(target=run_actor,
                        args=(partial(Actor, child_conn, self.env_interface,
                                      self.env_kwargs), ))
            self.processes.append(p)
            p.start()

        for i in range(self.num_actors):
            t = Thread(target=learn, args=(self, self.pipes[i]))
            self.threads.append(t)
            t.start()

    def train(self):
        print("Starting training")
        self.start_children()
        print("Finished starting children")
        while True:
            # print("[Learner] Sleeping")
            time.sleep(0.01)
            if len(self.trajectory_queue) >= 1:
                self.update_model(self.trajectory_queue)
                self.trajectory_queue = []

    def add_trajectory(self, trajectory):
        self.update_model([trajectory])
        # self.trajectory_queue.append(trajectory)

    def update_model(self, rollouts):
        for rollout in rollouts:
            if rollout.done:
                feed_dict = {
                    self.rewards_input:
                    rollout.rewards,
                    # self.behavior_log_probs_input: rollout.log_probs,
                    **self.agent.get_feed_dict(rollout.states, rollout.masks, rollout.actions, rollout.bootstrap_state)
                }

                loss, _ = self.session.run([self.loss, self.train_op],
                                           feed_dict=feed_dict)

        self.epoch += 1
        if self.epoch % 50 == 0:
            self.save_model()
        with open(self.rewards_path, 'a+') as f:
            for r in rollouts:
                f.write('%d\n' % r.total_reward())

    def save_model(self):
        """
        Saves the current model weights in current `save_path`.
        """
        save_path = self.saver.save(self.session, self.weights_path)
        print("Model Saved in %s" % save_path)

    def _load_model(self):
        """
        Loads the model from weights stored in the current `save_path`.
        """
        self.saver.restore(self.session, self.weights_path_load)
        print('Model Loaded from ', self.weights_path_load)

    def _impala_loss(self):
        num_steps = tf.shape(self.rewards_input)[0]
        discounts = tf.ones((num_steps, 1)) * self.discount_factor
        rewards = tf.expand_dims(self.rewards_input, axis=-1)

        values = tf.expand_dims(self.agent.train_values(), axis=-1)
        bootstrap = tf.expand_dims(self.agent.bootstrap_value(), axis=-1)
        train_log_probs = self.agent.train_log_probs()

        log_rhos = tf.expand_dims(train_log_probs -
                                  self.behavior_log_probs_input,
                                  axis=-1)
        vs, advantage = trfl.vtrace_from_importance_weights(
            log_rhos, discounts, rewards, values, bootstrap)

        loss_actor = tf.reduce_mean(-tf.stop_gradient(advantage) *
                                    train_log_probs)
        loss_critic = tf.reduce_mean((vs - values)**2)
        result = loss_actor + 0.5 * loss_critic
        return result

    def _ac_loss(self):
        num_steps = tf.shape(self.rewards_input)[0]
        discounts = tf.ones((num_steps, 1)) * self.discount_factor
        rewards = tf.expand_dims(self.rewards_input, axis=1)

        values = tf.expand_dims(self.agent.train_values(), axis=1)
        bootstrap = tf.expand_dims(self.agent.bootstrap_value(), axis=0)
        glr = trfl.generalized_lambda_returns(rewards,
                                              discounts,
                                              values,
                                              bootstrap,
                                              lambda_=self.td_lambda)
        advantage = tf.squeeze(glr - values)

        loss_actor = tf.reduce_mean(-tf.stop_gradient(advantage) *
                                    self.agent.train_log_probs())
        loss_critic = tf.reduce_mean(advantage**2)
        result = loss_actor + 0.5 * loss_critic
        return result
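
_ac_loss delegates the TD(lambda) targets to trfl.generalized_lambda_returns. For reference, the recursion it computes is G_t = r_t + gamma * ((1 - lambda) * V(s_{t+1}) + lambda * G_{t+1}), with the bootstrap value playing the role of V for the state after the final transition. Below is a minimal NumPy sketch of that recursion; the function name and signature are illustrative and not part of the original code.

import numpy as np


def lambda_returns(rewards, values, bootstrap, discount=0.95, lambda_=0.95):
    # rewards: [T], values: [T] estimates V(s_t), bootstrap: scalar V(s_T).
    T = len(rewards)
    returns = np.zeros(T)
    next_return = bootstrap
    next_value = bootstrap
    for t in reversed(range(T)):
        returns[t] = rewards[t] + discount * ((1 - lambda_) * next_value + lambda_ * next_return)
        next_return = returns[t]
        next_value = values[t]
    return returns  # the advantage in _ac_loss is (returns - values)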
Example #6
#def main():
env = Env(trader=trader,
          symbol=symbol,
          commission=commission,
          action_space=action_space,
          share=exe_shares,
          time_total=execute_time,
          time_steps=exe_times,
          objPrice=exe_price,
          close_price_volumn=num_states)
agent = LSTMAgent(sess_=sess,
                  observations_dim=num_states + 2,
                  action_space=action_space,
                  batch_size=batch_size,
                  Q_function=Qf.ann,
                  optimizer=tf.train.AdamOptimizer,
                  GAMMA=GAMMA,
                  EPSILON=EPSILON,
                  LOAD=load,
                  learning_rate=0.001)
pool = SimpleReplayPool(max_pool_size=1000, pop_size=100)

for i in range(EPISODES):
    # Deal with the initialization for each episode
    print("*" * 100)
    print(f'THE {i+1} EPISODE \n\n')
    sum_rew4epi = 0
    if i % 2 == 1:
        bp = trader.get_best_price(symbol)
        exe_price = bp.get_bid_price()
        env.set_objective(share=-exe_shares,
Example #7
    def __init__(self, num_actors, env_kwargs, env_interface, run_name='temp', load_name=None):
        self.num_actors = num_actors
        self.pipes = []
        self.processes = []
        self.threads = []
        self.trajectory_queue = []

        self.name = run_name
        if load_name is None:
            self.load_name = run_name
        else:
            self.load_name = load_name

        project_root = os.path.dirname(os.path.realpath(__file__))
        self.save_dir = os.path.join(project_root, 'saves', run_name)
        self.weights_path_load = os.path.join(project_root, 'saves', self.load_name, 'weights', 'model.ckpt')

        self.code_dir = os.path.join(self.save_dir, 'code')
        self.weights_dir = os.path.join(self.save_dir, 'weights')
        self.weights_path = os.path.join(self.weights_dir, 'model.ckpt')
        # self.replay_dir = os.path.join(self.load_name, 'replays')

        # body_keywords = ["pointer_head/dense/", "pointer_head/dense_1/", "shared", "lstm"]
        if not os.path.exists(self.weights_dir):
            os.makedirs(self.weights_dir)
        if not os.path.exists(self.code_dir):
            os.makedirs(self.code_dir)
        # if not os.path.exists(self.replay_dir):
        #     os.makedirs(self.replay_dir)

        os.system('cp -r ' + os.path.join(project_root, './*.py') + ' ' + self.code_dir)
        self.rewards_path = os.path.join(self.save_dir, 'rewards.txt')

        self.epoch = 0
        self.env_kwargs = env_kwargs
        # self.env_kwargs['replay_dir'] = self.replay_dir
        print("Asdfasd")
        print(self.env_kwargs)

        self.discount_factor = 0.95
        self.td_lambda = 0.95

        self.env_interface = env_interface
        self.agent = LSTMAgent(self.env_interface)

        with self.agent.graph.as_default():
            self.rewards_input = tf.placeholder(tf.float32, [None], name="rewards")  # T
            self.behavior_log_probs_input = tf.placeholder(tf.float32, [None], name="behavior_log_probs")  # T
            self.loss = self._ac_loss()
            # self.loss = self._impala_loss()

            # head_variables = [v for v in tf.trainable_variables() if "shared" not in v.name]
            # for var in head_variables:
            #     print(var)
            #
            # print("body variables are")
            # body_variables = [v for v in tf.trainable_variables() if "shared" in v.name]
            # for var in body_variables:
            #     print(var)

            self.train_op = tf.train.AdamOptimizer(0.0003).minimize(self.loss)
            self.session = self.agent.session
            self.session.run(tf.global_variables_initializer())
            self.saver = tf.train.Saver()
            try:
                self._load_model()
            except Exception as e:
                print(e)
                print("Could not load model")