def cross_validate(self, step_size, momentum, net_dims, q_net_dims, update_method,
                   seed_state, num_iters=50, batch_size=50, traj_len=20, seed=0):
    """
    Trains a fresh PolicyGradient learner with the given hyperparameters and
    returns the distances between the batch-averaged ending states and the
    goal state.
    """
    learner = policy_gradient.PolicyGradient(net_dims=net_dims, q_net_dims=q_net_dims,
                                             output_function='tanh', seed=seed,
                                             seed_state=seed_state)
    initial_state = np.zeros((self.A.shape[0], 1))
    mean_rewards, ending_states = learner.train_agent(
        dynamics_func=self.calculate_next_state,
        reward_func=self.reward_function,
        update_method=update_method,
        initial_state=initial_state,
        num_iters=num_iters,
        batch_size=batch_size,
        traj_len=traj_len,
        step_size=step_size,
        momentum=momentum,
        normalize=False)
    # Average the ending states across the batch, then measure each one's
    # distance to the goal state.
    mean_ending_states = np.mean(ending_states, axis=1)
    mean_ending_distances = [la.norm(s - self.goal_state) for s in mean_ending_states]
    return mean_ending_distances
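# Usage sketch (not part of the original module; the class name `Environment`,
# the q_net_dims value, the 'sgd' update method, and seed_state=None are
# illustrative assumptions): grid-search step size and momentum with
# cross_validate and keep the setting whose batch-averaged ending states land
# closest to the goal.
env = Environment()
best = None
for step_size in [1e-3, 1e-2, 1e-1]:
    for momentum in [0.0, 0.5, 0.9]:
        distances = env.cross_validate(step_size=step_size, momentum=momentum,
                                       net_dims=[2, 6, 4, 3, 2], q_net_dims=[2, 8, 1],
                                       update_method='sgd', seed_state=None)
        score = np.mean(distances)
        if best is None or score < best[0]:
            best = (score, step_size, momentum)
print('best (mean distance, step_size, momentum):', best)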
def __init__(self, A=None, B=None, goal_state=None, net_dims=None):
    """
    Initializes dynamics of environment.

    Parameters:
    A: array-like
        Transition matrix for states.
    B: array-like
        Transition matrix for actions.
    goal_state: array-like
        Target state; defaults to (10, 10) as a column vector.
    net_dims: list
        Layer sizes of the policy network.
    """
    self.A = A if A is not None else np.diag(np.ones(2))
    self.B = B if B is not None else np.diag(np.ones(2))
    self.goal_state = goal_state if goal_state is not None else np.array([10, 10]).reshape(2, 1)
    self.net_dims = net_dims if net_dims is not None else [2, 6, 4, 3, 2]
    self.learner = policy_gradient.PolicyGradient(net_dims=self.net_dims, output_function='tanh')
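# Construction sketch (the class name `Environment` and the matrix values are
# illustrative assumptions): the defaults give identity dynamics in R^2,
# presumably x_{t+1} = A x_t + B u_t, with goal state (10, 10); custom
# matrices can be passed to model different linear dynamics.
A = np.array([[1.0, 0.1],
              [0.0, 1.0]])
B = 0.5 * np.eye(2)
goal = np.array([5.0, -5.0]).reshape(2, 1)
env = Environment(A=A, B=B, goal_state=goal, net_dims=[2, 6, 4, 3, 2])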
import pandas as pd
import numpy as np
import gym
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import interactive
interactive(True)
import pdb
import logging
log = logging.getLogger()
#log.addHandler(logging.StreamHandler())

import policy_gradient

# create gym
env = gym.make('trading-v0')
sess = tf.InteractiveSession()

# create policygradient
pg = policy_gradient.PolicyGradient(sess, obs_dim=5, num_actions=3, learning_rate=1e-2)

# train model, loading if possible
alldf, summrzed = pg.train_model(env, episodes=301, log_freq=100)  # , load_model=True)
#print alldf
pd.DataFrame(summrzed).expanding().mean().plot()
input("Press Enter to continue...")
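# Headless variant of the plotting step above (a sketch; assumes `summrzed`
# holds per-episode summary statistics as returned by train_model): write the
# expanding mean to disk instead of blocking on input().
summary_df = pd.DataFrame(summrzed)
summary_df.expanding().mean().plot()
plt.savefig('training_summary.png')
summary_df.to_csv('training_summary.csv', index=False)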
    # (tail of generate_predictions; the earlier branches of the
    # prediction-to-label mapping are not shown here)
        pred_str = 'FLAT (1)'
    elif prediction == 2:
        pred_str = 'SELL (2)'
    print('---------Predicted action is:')
    print(pred_str)

    time = int(dt.datetime.now().strftime('%s'))  # in millisec: * 1000
    outputString = str(prediction) + ' ' + str(time)
    file = open(conf.OUTPUT_PREDICT_PATH + conf.TICKER + '_prediction.txt', 'w')
    file.write(outputString)
    file.close()


if __name__ == '__main__':
    # Disable tensorflow compilation warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

    # create the tf session
    sess = tf.compat.v1.InteractiveSession()

    # create policygradient
    pg = policy_gradient.PolicyGradient(sess, obs_dim=conf.observation_dimension,
                                        num_actions=conf.number_of_actions,
                                        learning_rate=conf.first_lr)

    generate_predictions(conf.TICKER)
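# Consumer-side sketch (not part of the original script): read back the
# "<prediction> <unix_time>" pair written above; the path construction mirrors
# the writer and `conf` is the same config module.
def read_latest_prediction(ticker):
    path = conf.OUTPUT_PREDICT_PATH + ticker + '_prediction.txt'
    with open(path) as f:
        prediction, timestamp = f.read().split()
    return int(prediction), int(timestamp)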
def train_PG(
        maxlength,
        idx2regs,
        regs2idx,
        height,
        weight,
        actionsize,
        exp_name,
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=0.000000625,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        tofile=False,
        ):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    np.random.seed(seed)

    # Make the gym environment
    #env = gym.make(env_name)
    env = en.Gplayer(idx2regs, regs2idx, maxlength, tofile, "./data/log/")
    act = func.ActorFunc()

    # Is this env continuous, or discrete?
    discrete = True
    #discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    #max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = 3333

    # Observation and action sizes
    ob_dim = actionsize * actionsize
    ac_dim = actionsize
    #ob_dim = env.observation_space.shape[0]
    #ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    act.createPred(ac_dim, n_layers, size)
    act.createOptimizer(learning_rate)
    act.run_init()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    pg = policy_gradient.PolicyGradient(n_iter, env, act, animate,
                                        min_timesteps_per_batch, max_path_length,
                                        reward_to_go)
    pg.run(gamma, logz, start)
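# Invocation sketch (all values here are illustrative assumptions, and the
# dict structure of idx2regs/regs2idx is a guess at what en.Gplayer expects):
if __name__ == '__main__':
    regs = ['r%d' % i for i in range(16)]
    idx2regs = {i: r for i, r in enumerate(regs)}
    regs2idx = {r: i for i, r in enumerate(regs)}
    train_PG(maxlength=100, idx2regs=idx2regs, regs2idx=regs2idx,
             height=8, weight=8, actionsize=len(regs), exp_name='pg_registers',
             n_iter=100, logdir='./data/log/pg_registers', tofile=True)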