Code example #1
File: testbed.py  Project: jacobzweig/policy_gradient
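In this example, cross_validate builds a PolicyGradient learner with the given network dimensions and hyperparameters, trains it on the testbed's dynamics from a zero initial state, and returns the mean distance of the ending states from the goal state.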
    def cross_validate(self,
                       step_size,
                       momentum,
                       net_dims,
                       q_net_dims,
                       update_method,
                       seed_state,
                       num_iters=50,
                       batch_size=50,
                       traj_len=20,
                       seed=0):

        learner = policy_gradient.PolicyGradient(net_dims=net_dims,
                                                 q_net_dims=q_net_dims,
                                                 output_function='tanh',
                                                 seed=seed,
                                                 seed_state=seed_state)
        initial_state = np.zeros((self.A.shape[0], 1))
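        # Train the agent starting from the zero state, then measure how close
        # the mean ending states come to the goal state.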
        mean_rewards, ending_states = learner.train_agent(
            dynamics_func=self.calculate_next_state,
            reward_func=self.reward_function,
            update_method=update_method,
            initial_state=initial_state,
            num_iters=num_iters,
            batch_size=batch_size,
            traj_len=traj_len,
            step_size=step_size,
            momentum=momentum,
            normalize=False)
        mean_ending_states = np.mean(ending_states, axis=1)
        mean_ending_distances = [
            la.norm(s - self.goal_state) for s in mean_ending_states
        ]
        return mean_ending_distances
Code example #2
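This constructor initializes the environment dynamics: state and action transition matrices A and B (identity by default), a goal state, the policy network dimensions, and a PolicyGradient learner with a tanh output function.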
	def __init__(self, A=None, B=None, goal_state=None, net_dims=None):
		"""
		Initializes dynamics of environment.

		Parameters:
		A: array-like
			Transition matrix for states.
		B: array-like
			Transition matrix for actions.
		goal_state: array-like
			Target state the learner should drive the system toward.
		net_dims: list of int
			Layer sizes of the policy network.
		"""
		self.A = A if A is not None else np.diag(np.ones(2)) 
		self.B = B if B is not None else np.diag(np.ones(2))
		self.goal_state = goal_state if goal_state is not None else np.array([10,10]).reshape(2,1)
		self.net_dims = net_dims if net_dims is not None else [2,6,4,3,2]
		self.learner = policy_gradient.PolicyGradient(net_dims=self.net_dims, output_function='tanh')
Code example #3
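This script creates a trading-v0 gym environment, trains a PolicyGradient agent inside a TensorFlow 1.x-style InteractiveSession, and plots the expanding mean of the per-episode training summaries.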
import pandas as pd
import numpy as np
import gym
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
import pdb
import logging

log = logging.getLogger()
#log.addHandler(logging.StreamHandler())
import policy_gradient
# create gym
env = gym.make('trading-v0')

sess = tf.InteractiveSession()

# create policygradient
pg = policy_gradient.PolicyGradient(sess,
                                    obs_dim=5,
                                    num_actions=3,
                                    learning_rate=1e-2)

# train model, loading if possible
alldf, summrzed = pg.train_model(env, episodes=301,
                                 log_freq=100)  #, load_model=True)
#print alldf
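# Plot the expanding-window mean of the per-episode training summaries.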
pd.DataFrame(summrzed).expanding().mean().plot()
input("Press Enter to continue...")
Code example #4
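This snippet, shown from partway through its prediction helper, maps the predicted action to a label, writes the prediction and a timestamp to a per-ticker file, and in the main block creates the TensorFlow session and PolicyGradient instance before generating predictions.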
        pred_str = 'FLAT (1)'
    elif prediction == 2:
        pred_str = 'SELL (2)'

    print('---------Predicted action is:')
    print(pred_str)

    # Write the predicted action and a Unix timestamp to a per-ticker output file.
    time = int(dt.datetime.now().strftime('%s'))  # Unix timestamp in seconds; multiply by 1000 for milliseconds
    outputString = str(prediction) + ' ' + str(time)

    with open(conf.OUTPUT_PREDICT_PATH + conf.TICKER + '_prediction.txt',
              'w') as file:
        file.write(outputString)


if __name__ == '__main__':

    # Disable tensorflow compilation warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # create the tf session
    sess = tf.compat.v1.InteractiveSession()

    # create policygradient
    pg = policy_gradient.PolicyGradient(sess,
                                        obs_dim=conf.observation_dimension,
                                        num_actions=conf.number_of_actions,
                                        learning_rate=conf.first_lr)

    generate_predictions(conf.TICKER)
Code example #5
File: train_pg.py  Project: Knight-X/rLens
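train_PG logs the experiment parameters, seeds NumPy, builds a Gplayer environment and an ActorFunc policy network, and then runs the policy-gradient training loop through PolicyGradient.run.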
def train_PG(
    maxlength,
    idx2regs,
    regs2idx,
    height,
    weight,
    actionsize,
    exp_name,
    env_name='CartPole-v0',
    n_iter=100,
    gamma=1.0,
    min_timesteps_per_batch=1000,
    max_path_length=None,
    learning_rate=0.000000625,
    reward_to_go=True,
    animate=True,
    logdir=None,
    normalize_advantages=True,
    nn_baseline=False,
    seed=0,
    # network arguments
    n_layers=1,
    size=32,
    tofile=False,
):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]  # getargspec was removed in Python 3.11+
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    np.random.seed(seed)

    # Make the gym environment
    #env = gym.make(env_name)
    env = en.Gplayer(idx2regs, regs2idx, maxlength, tofile, "./data/log/")
    act = func.ActorFunc()

    # Is this env continuous, or discrete?
    discrete = True
    #discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    #max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = 3333

    # Observation and action sizes
    ob_dim = actionsize * actionsize
    ac_dim = actionsize
    #ob_dim = env.observation_space.shape[0]
    #ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Build the actor's prediction network and optimizer, then run initialization.
    act.createPred(ac_dim, n_layers, size)
    act.createOptimizer(learning_rate)
    act.run_init()
    #========================================================================================#
    # Training Loop
    #========================================================================================#
    pg = policy_gradient.PolicyGradient(n_iter, env, act, animate,
                                        min_timesteps_per_batch,
                                        max_path_length, reward_to_go)

    pg.run(gamma, logz, start)