def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='ppaquette/SuperMarioBros-1-1-v0')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    args = parser.parse_args()

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])
    icm_model = make_icm(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])

    env_name = args.env
    actions = np.arange(14).tolist()

    master = Agent(model, icm_model, len(actions), name='global')

    global_step = tf.Variable(0, dtype=tf.int64, name='global_step')

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    workers = []
    for i in range(args.threads):
        render = args.render and i == 0
        worker = Worker('worker{}'.format(i), model, icm_model,
                        global_step, env_name, render=render)
        workers.append(worker)

    summary_writer = tf.summary.FileWriter('log', sess.graph)

    if args.render:
        sample_worker = workers.pop(0)

    initialize()

    coord = tf.train.Coordinator()
    threads = []
    for worker in workers:
        # bind the worker as a default argument; a plain closure over the loop
        # variable would make every thread run the last worker (late binding)
        thread = threading.Thread(
            target=lambda w=worker: w.run(sess, summary_writer, saver))
        thread.start()
        threads.append(thread)
        time.sleep(0.1)

    if args.render:
        sample_worker.run(sess, summary_writer)

    coord.join(threads)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='MyAnt-v1')
    parser.add_argument('--gpu', type=int, default=0)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--render', action='store_true')
    args = parser.parse_args()

    env = gym.make(args.env)
    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    network = make_network([100, 100, 100])
    # critic = make_critic_network()

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions, None)

    saver = tf.train.Saver()
    if args.load is not None:
        saver.restore(sess, args.load)

    global_step = 0
    episode = 0

    while True:
        sum_of_rewards = 0
        done = False
        step = 0
        state = env.reset()

        while True:
            if args.render:
                env.render()
            action = agent.act(state)
            if done:
                break
            print(action)
            state, reward, done, info = env.step(action)
            sum_of_rewards += reward
            step += 1
            global_step += 1

        episode += 1
        print('Episode: {}, Step: {}: Reward: {}'.format(
            episode, global_step, sum_of_rewards))
def build_vr_loss(convs, fcs, padding, lstm, obs_t, actions_tm1, rewards_t,
                  num_actions, lstm_unit, returns_t):
    init_state = tf.zeros((1, lstm_unit), dtype=tf.float32)
    rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(init_state, init_state)
    _, value_t, _ = make_network(
        convs, fcs, padding, lstm, obs_t, actions_tm1, rewards_t,
        rnn_state_tuple, num_actions, lstm_unit, scope='model', reuse=True)
    returns_t = tf.reshape(returns_t, [-1, 1])
    loss = tf.reduce_sum((returns_t - value_t) ** 2)
    return loss
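# A minimal self-contained sketch (not part of the original code) of how the
# `returns_t` target fed into build_vr_loss could be computed: the value-replay
# target is the discounted return from each step, bootstrapped with a value
# estimate. `rewards` and `bootstrap_value` are hypothetical inputs.
def discounted_returns(rewards, bootstrap_value, gamma=0.99):
    returns = []
    ret = bootstrap_value
    for r in reversed(rewards):
        ret = r + gamma * ret
        returns.append(ret)
    return list(reversed(returns))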
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='ppaquette/SuperMarioBros-1-1-v0')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--load', type=str)
    args = parser.parse_args()

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])
    icm_model = make_icm(
        [[32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0], [32, 3, 2, 0]])

    env_name = args.env
    actions = np.arange(14).tolist()

    global_step = tf.Variable(0, dtype=tf.int64, name='global_step')

    worker = Worker('global', model, icm_model, global_step, env_name,
                    render=args.render, training=False)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)
    if args.load:
        saver.restore(sess, args.load)

    summary_writer = tf.summary.FileWriter('log', sess.graph)

    worker.run(sess, summary_writer, saver)
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongDeterministic-v4')
    parser.add_argument('--threads', type=int, default=8)
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    parser.add_argument('--record', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1

    # box environment
    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        actions = range(tmp_env.action_space.n)
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    # atari environment
    else:
        constants = atari_constants
        actions = get_action_space(env_name)
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            # atari specific preprocessing
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS,
        lstm=constants.LSTM, padding=constants.PADDING)

    # share the optimizer (RMSProp or Adam) with all threads
    lr = tf.Variable(constants.LR)
    decayed_lr = tf.placeholder(tf.float32)
    decay_lr_op = lr.assign(decayed_lr)
    if constants.OPTIMIZER == 'rmsprop':
        optimizer = tf.train.RMSPropOptimizer(lr, decay=0.99, epsilon=0.1)
    else:
        optimizer = tf.train.AdamOptimizer(lr)

    master = make_agent(
        model, actions, optimizer, state_shape, phi, 'global', constants)

    global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')
    saver = tf.train.Saver(global_vars)

    agents = []
    envs = []
    for i in range(args.threads):
        name = 'worker{}'.format(i)
        agent = make_agent(
            model, actions, optimizer, state_shape, phi, name, constants)
        agents.append(agent)
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=lambda r: np.clip(r, -1, 1),
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)

    sess.run(tf.global_variables_initializer())

    # restore after initialization so the loaded weights are not overwritten
    if args.load:
        saver.restore(sess, args.load)

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    tflogger = TfBoardLogger(summary_writer)
    tflogger.register('reward', dtype=tf.float32)
    tflogger.register('eval_reward', dtype=tf.float32)
    end_episode = lambda r, gs, s, ge, e: tflogger.plot('reward', r, gs)

    def after_action(state, reward, shared_step, global_step, local_step):
        if constants.LR_DECAY == 'linear':
            decay = 1.0 - (float(shared_step) / constants.FINAL_STEP)
            if decay < 0.0:
                decay = 0.0
            sess.run(decay_lr_op, feed_dict={decayed_lr: constants.LR * decay})
        if shared_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=shared_step)

    trainer = AsyncTrainer(
        envs=envs,
        agents=agents,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo,
        n_threads=args.threads
    )
    trainer.start()
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1

    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            num_actions = tmp_env.action_space.shape[0]
        else:
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size)
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver()

    # create environments
    envs = []
    for i in range(constants.ACTORS):
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)
    batch_env = BatchEnvWrapper(envs)

    sess.run(tf.global_variables_initializer())

    # restore after initialization so a loaded checkpoint is not overwritten
    if args.load:
        saver.restore(sess, args.load)

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        if global_step % 10 ** 6 == 0:
            path = os.path.join(outdir, 'model.ckpt')
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,
        after_action=after_action,
        end_episode=end_episode,
        training=not args.demo
    )
    trainer.start()
def build_train(convs,
                fcs,
                padding,
                lstm,
                num_actions,
                optimizer,
                lstm_unit=256,
                state_shape=[84, 84, 1],
                grad_clip=40.0,
                value_factor=0.5,
                policy_factor=1.0,
                entropy_factor=0.01,
                rp_frame=3,
                scope='a3c',
                reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # placeholders
        obs_t_ph = tf.placeholder(tf.float32, [None] + state_shape, name='obs_t')
        rnn_state_ph0 = tf.placeholder(
            tf.float32, [1, lstm_unit], name='rnn_state_0')
        rnn_state_ph1 = tf.placeholder(
            tf.float32, [1, lstm_unit], name='rnn_state_1')
        actions_tm1_ph = tf.placeholder(tf.int32, [None], name='action_tm1')
        rewards_t_ph = tf.placeholder(tf.float32, [None], name='reward_t')

        # placeholders for A3C update
        actions_t_ph = tf.placeholder(tf.uint8, [None], name='action_t')
        returns_t_ph = tf.placeholder(tf.float32, [None], name='return_t')
        advantages_t_ph = tf.placeholder(tf.float32, [None], name='advantage_t')

        # placeholders for reward prediction update
        rp_obs_ph = tf.placeholder(
            tf.float32, [rp_frame] + state_shape, name='rp_obs')
        rp_reward_tp1_ph = tf.placeholder(tf.int32, [], name='rp_reward_tp1')

        # placeholders for value function replay update
        vr_obs_t_ph = tf.placeholder(
            tf.float32, [None] + state_shape, name='vr_obs_t')
        vr_actions_tm1_ph = tf.placeholder(tf.int32, [None], name='vr_action_tm1')
        vr_rewards_t_ph = tf.placeholder(tf.float32, [None], name='vr_reward_t')
        vr_returns_t_ph = tf.placeholder(tf.float32, [None], name='vr_returns_t')

        # rnn state in tuple
        rnn_state_tuple = tf.contrib.rnn.LSTMStateTuple(
            rnn_state_ph0, rnn_state_ph1)

        # network outputs
        actions_tm1_one_hot = tf.one_hot(
            actions_tm1_ph, num_actions, dtype=tf.float32)
        policy_t, value_t, state_out = make_network(
            convs, fcs, padding, lstm, obs_t_ph, actions_tm1_one_hot,
            rewards_t_ph, rnn_state_tuple, num_actions, lstm_unit,
            scope='model')

        actions_t_one_hot = tf.one_hot(actions_t_ph, num_actions, dtype=tf.float32)
        log_policy_t = tf.log(tf.clip_by_value(policy_t, 1e-20, 1.0))
        log_prob = tf.reduce_sum(
            log_policy_t * actions_t_one_hot, axis=1, keep_dims=True)

        # A3C loss
        advantages_t = tf.reshape(advantages_t_ph, [-1, 1])
        returns_t = tf.reshape(returns_t_ph, [-1, 1])
        with tf.variable_scope('value_loss'):
            value_loss = tf.reduce_sum((returns_t - value_t) ** 2)
        with tf.variable_scope('entropy_penalty'):
            entropy = -tf.reduce_sum(policy_t * log_policy_t)
        with tf.variable_scope('policy_loss'):
            policy_loss = tf.reduce_sum(log_prob * advantages_t)
        a3c_loss = value_factor * value_loss \
            - policy_factor * policy_loss - entropy_factor * entropy

        # reward prediction loss
        rp_loss = build_rp_loss(
            convs, padding, rp_frame, rp_obs_ph, rp_reward_tp1_ph)

        # value function replay loss
        vr_actions_tm1_one_hot = tf.one_hot(
            vr_actions_tm1_ph, num_actions, dtype=tf.float32)
        vr_loss = build_vr_loss(
            convs, fcs, padding, lstm, vr_obs_t_ph, vr_actions_tm1_one_hot,
            vr_rewards_t_ph, num_actions, lstm_unit, vr_returns_t_ph)

        # final loss
        loss = a3c_loss + rp_loss + vr_loss

        # local network weights
        local_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope)
        # global network weights
        global_vars = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, 'global')

        # gradients of the local loss are applied to the global weights
        gradients = tf.gradients(loss, local_vars)
        gradients, _ = tf.clip_by_global_norm(gradients, grad_clip)
        optimize_expr = optimizer.apply_gradients(zip(gradients, global_vars))

        # copy the global weights into the local network
        update_local_expr = []
        for local_var, global_var in zip(local_vars, global_vars):
            update_local_expr.append(local_var.assign(global_var))
        update_local_expr = tf.group(*update_local_expr)

        def update_local():
            sess = tf.get_default_session()
            sess.run(update_local_expr)

        def train(obs_t, rnn_state0, rnn_state1, actions_t, rewards_t,
                  actions_tm1, returns_t, advantages_t, rp_obs, rp_reward_tp1,
                  vr_obs_t, vr_actions_tm1, vr_rewards_t, vr_returns_t):
            feed_dict = {
                obs_t_ph: obs_t,
                rnn_state_ph0: rnn_state0,
                rnn_state_ph1: rnn_state1,
                actions_t_ph: actions_t,
                actions_tm1_ph: actions_tm1,
                rewards_t_ph: rewards_t,
                returns_t_ph: returns_t,
                advantages_t_ph: advantages_t,
                rp_obs_ph: rp_obs,
                rp_reward_tp1_ph: rp_reward_tp1,
                vr_obs_t_ph: vr_obs_t,
                vr_actions_tm1_ph: vr_actions_tm1,
                vr_rewards_t_ph: vr_rewards_t,
                vr_returns_t_ph: vr_returns_t
            }
            sess = tf.get_default_session()
            return sess.run([loss, optimize_expr], feed_dict=feed_dict)[0]

        def act(obs_t, actions_tm1, rewards_t, rnn_state0, rnn_state1):
            feed_dict = {
                obs_t_ph: obs_t,
                actions_tm1_ph: actions_tm1,
                rewards_t_ph: rewards_t,
                rnn_state_ph0: rnn_state0,
                rnn_state_ph1: rnn_state1
            }
            sess = tf.get_default_session()
            return sess.run([policy_t, value_t, state_out], feed_dict=feed_dict)

    return act, train, update_local
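# Self-contained numpy illustration (not from the original source) of the
# log-probability selection that build_train performs with tf.one_hot above:
# the one-hot mask keeps only the log-probability of the action actually taken.
import numpy as np

policy = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.6, 0.3]])              # [batch, num_actions]
actions = np.array([0, 2])                        # actions taken at each step
one_hot = np.eye(policy.shape[1])[actions]        # analogue of tf.one_hot
log_prob = np.sum(np.log(np.clip(policy, 1e-20, 1.0)) * one_hot,
                  axis=1, keepdims=True)          # [batch, 1], like keep_dims=True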
import tensorflow as tf
import numpy as np
from network import make_network
from data_provider import DataProvider
from tensorflow.core.protobuf import saver_pb2
import time
import os
from IPython import embed

with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    network = make_network()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(write_version=saver_pb2.SaverDef.V2)
    saver.restore(sess, './data/step-10500.ckpt')
    val_provider = DataProvider('val.tfrecords', sess)
    one_batch = val_provider.get_minibatch()
    for i in range(120):
        one_image = one_batch.images[i, ...][None]
        one_speed = one_batch.data[0][i][None]
        a = time.time()
        target_control, = sess.run(network['outputs'], feed_dict={
            network['inputs'][0]: one_image,
            network['inputs'][1]: one_speed
[0.229, 0.224, 0.225])])  # tail of the (truncated) training_transforms definition

data_transforms = transforms.Compose([transforms.Resize(256),
                                      transforms.CenterCrop(224),
                                      transforms.ToTensor(),
                                      transforms.Normalize([0.485, 0.456, 0.406],
                                                           [0.229, 0.224, 0.225])])

training_dataset = datasets.ImageFolder(train_dir, transform=training_transforms)
validation_dataset = datasets.ImageFolder(valid_dir, transform=data_transforms)

trainloader = torch.utils.data.DataLoader(training_dataset, batch_size=64, shuffle=True)
validationloader = torch.utils.data.DataLoader(validation_dataset, batch_size=32, shuffle=True)

with open('cat_to_name.json', 'r') as f:
    cat_to_name = json.load(f)

model = make_network(arch, hidden_units)

def train_network(model):
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.classifier.parameters(), lr=learning_rate)
    print_every = 40
    if gpu:
        model.to('cuda')
    curr_epoch = 0
    for e in range(epochs):
        curr_epoch += 1
        training_loss = 0
        steps = 0
        for images, labels in iter(trainloader):
            steps += 1
            if gpu:
    # beta = np.broadcast_to(beta[:, np.newaxis, np.newaxis, np.newaxis], shp)
    alpha1.set_value(alpha)
    alpha2.set_value(1 - alpha)
    beta1.set_value(beta)
    beta2.set_value(1 - beta)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    os.system("rm -r tbdata/")
    tb = TB("tbdata/")
    with TrainingEnv(name="lyy.{}.test".format(net_name), part_count=2,
                     custom_parser=parser) as env:
        net, SS_list = make_network(minibatch_size=minibatch_size)
        preloss = net.loss_var
        net.loss_var = WeightDecay(net.loss_var, {
            "*conv*:W": 1e-4,
            "*fc*:W": 1e-4,
            "*bnaff*:k": 1e-4,
            "*offset*": 1e-4
        })
        train_func = env.make_func_from_loss_var(net.loss_var, "train",
                                                 train_state=True)
        lr = 0.1
        optimizer = Momentum(lr, 0.9)
        optimizer(train_func)
args = parser.parse_args()

if args.outdir is None:
    args.outdir = os.path.join(os.path.dirname(__file__), 'results')
if not os.path.exists(args.outdir):
    os.makedirs(args.outdir)
if args.logdir is None:
    args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

env = gym.make(args.env)

obs_dim = env.observation_space.shape[0]
n_actions = env.action_space.shape[0]

# resolved leftover merge conflict: one branch used [512, 256, 128],
# the other [100, 100, 100]; keeping the HEAD version here
network = make_network([512, 256, 128])

sess = tf.Session()
sess.__enter__()

agent = Agent(network, obs_dim, n_actions)

initialize()

agent.sync_old()

saver = tf.train.Saver()
if args.load is not None:
    saver.restore(sess, args.load)
def main():
    date = datetime.now().strftime('%Y%m%d%H%M%S')
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='PongNoFrameskip-v4')
    parser.add_argument('--load', type=str)  # checkpoint to load
    parser.add_argument('--logdir', type=str, default=date)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--demo', action='store_true')  # training or not training
    args = parser.parse_args()

    outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.logdir)
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir)

    env_name = args.env
    tmp_env = gym.make(env_name)
    is_atari = len(tmp_env.observation_space.shape) != 1

    if not is_atari:
        observation_space = tmp_env.observation_space
        constants = box_constants
        if isinstance(tmp_env.action_space, gym.spaces.Box):
            # for a continuous action space, num_actions is the action dimension
            num_actions = tmp_env.action_space.shape[0]
        else:
            # for a discrete action space, num_actions is the number of selectable actions
            num_actions = tmp_env.action_space.n
        state_shape = [observation_space.shape[0], constants.STATE_WINDOW]
        state_preprocess = lambda s: s
        reward_preprocess = lambda r: r / 10.0
        # (window_size, dim) -> (dim, window_size)
        phi = lambda s: np.transpose(s, [1, 0])
    else:
        constants = atari_constants
        num_actions = tmp_env.action_space.n
        state_shape = constants.STATE_SHAPE + [constants.STATE_WINDOW]

        def state_preprocess(state):
            state = atari_preprocess(state, constants.STATE_SHAPE)
            state = np.array(state, dtype=np.float32)
            return state / 255.0

        reward_preprocess = lambda r: np.clip(r, -1.0, 1.0)
        # (window_size, H, W) -> (H, W, window_size), a transformation function
        phi = lambda s: np.transpose(s, [1, 2, 0])

    # flag of continuous action space ('gym.spaces.Box' means continuous)
    continuous = isinstance(tmp_env.action_space, gym.spaces.Box)
    upper_bound = tmp_env.action_space.high if continuous else None

    # save settings
    dump_constants(constants, os.path.join(outdir, 'constants.json'))

    sess = tf.Session()
    sess.__enter__()

    # model is a function instance (just a lambda):
    # mlp network for continuous action space, cnn network for discrete
    model = make_network(
        constants.CONVS, constants.FCS, use_lstm=constants.LSTM,
        padding=constants.PADDING, continuous=continuous)

    # learning rate with decay operation
    if constants.LR_DECAY == 'linear':
        lr = LinearScheduler(constants.LR, constants.FINAL_STEP, 'lr')
        epsilon = LinearScheduler(
            constants.EPSILON, constants.FINAL_STEP, 'epsilon')
    else:
        lr = ConstantScheduler(constants.LR, 'lr')
        epsilon = ConstantScheduler(constants.EPSILON, 'epsilon')

    agent = Agent(
        model,
        num_actions,
        nenvs=constants.ACTORS,
        lr=lr,
        epsilon=epsilon,
        gamma=constants.GAMMA,
        lam=constants.LAM,
        lstm_unit=constants.LSTM_UNIT,
        value_factor=constants.VALUE_FACTOR,
        entropy_factor=constants.ENTROPY_FACTOR,
        time_horizon=constants.TIME_HORIZON,
        batch_size=constants.BATCH_SIZE,
        grad_clip=constants.GRAD_CLIP,
        state_shape=state_shape,
        epoch=constants.EPOCH,
        phi=phi,
        use_lstm=constants.LSTM,
        continuous=continuous,
        upper_bound=upper_bound
    )

    saver = tf.train.Saver(max_to_keep=5)
    if args.load:
        saver.restore(sess, args.load)
    else:
        # this else is important: only initialize when not loading a checkpoint
        sess.run(tf.global_variables_initializer())

    # create environments
    envs = []
    for i in range(constants.ACTORS):  # e.g. 8 actors
        env = gym.make(args.env)
        env.seed(constants.RANDOM_SEED)
        if is_atari:
            env = NoopResetEnv(env, noop_max=30)
            env = MaxAndSkipEnv(env)
            env = EpisodicLifeEnv(env)
        wrapped_env = EnvWrapper(
            env,
            r_preprocess=reward_preprocess,
            s_preprocess=state_preprocess
        )
        envs.append(wrapped_env)  # append all wrapped envs
    batch_env = BatchEnvWrapper(envs)  # envs is a list

    # sess.run(tf.global_variables_initializer())
    # should not be here: it would override the loaded checkpoint

    summary_writer = tf.summary.FileWriter(logdir, sess.graph)
    logger = TfBoardLogger(summary_writer)
    logger.register('reward', dtype=tf.float32)
    # record the reward of each episode
    end_episode = lambda r, s, e: logger.plot('reward', r, s)

    def after_action(state, reward, global_step, local_step):
        # after an action, check whether the model needs to be saved
        # (demo mode does not save the model parameters);
        # save about every 10**5 steps: `global_step % 10**5 == 0` alone is not
        # reliable because global_step may skip over exact multiples of 10**5
        if (global_step % 10**5 >= 0 and global_step % 10**5 <= 10) and not args.demo:
            path = os.path.join(outdir, 'model.ckpt')
            print('model saved, global step: {}'.format(global_step))
            saver.save(sess, path, global_step=global_step)

    trainer = BatchTrainer(
        env=batch_env,
        agent=agent,  # Agent instance
        render=args.render,
        state_shape=state_shape[:-1],
        state_window=constants.STATE_WINDOW,
        final_step=constants.FINAL_STEP,  # total time step limit
        # final_step=12345,
        after_action=after_action,  # callback after each action
        end_episode=end_episode,
        training=not args.demo  # with --demo the policy and value nets are not trained
    )
    trainer.start()
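# A small self-contained sketch (an assumption, not from the original code) of
# a more direct way to implement "save roughly every 100k steps" than the
# modulo-window test in after_action above: track the step of the last save.
class PeriodicSaver(object):
    def __init__(self, interval=10**5):
        self.interval = interval
        self.last_saved_step = 0

    def should_save(self, global_step):
        # fires once per interval even when global_step skips exact multiples
        if global_step - self.last_saved_step >= self.interval:
            self.last_saved_step = global_step
            return True
        return False

# possible usage inside after_action:
#     if periodic_saver.should_save(global_step) and not args.demo:
#         saver.save(sess, path, global_step=global_step)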
def main(): parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='Pendulum-v0') parser.add_argument('--outdir', type=str, default="/home/aditya/NIPS18/output") parser.add_argument('--logdir', type=str, default="/home/aditya/NIPS18/output") parser.add_argument('--load', type=str, default=None) parser.add_argument('--final-steps', type=int, default=10**7) parser.add_argument('--render', action='store_true') parser.add_argument('--batch', type=int, default=64) parser.add_argument('--epoch', type=int, default=10) args = parser.parse_args() if args.outdir is None: args.outdir = os.path.join(os.path.dirname(__file__), 'results') if not os.path.exists(args.outdir): os.makedirs(args.outdir) if args.logdir is None: args.logdir = os.path.join(os.path.dirname(__file__), 'logs') #env = gym.make(args.env) env = ProstheticsEnv(visualize=False) obs_dim = 160 n_actions = env.action_space.shape[0] network = make_network([128, 128, 128]) sess = tf.Session() sess.__enter__() agent = Agent(network, obs_dim, n_actions) initialize() agent.sync_old() saver = tf.train.Saver() if args.load is not None: saver.restore(sess, args.load) reward_summary = tf.placeholder(tf.int32, (), name='reward_summary') tf.summary.scalar('reward_summary', reward_summary) merged = tf.summary.merge_all() train_writer = tf.summary.FileWriter(args.logdir, sess.graph) global_step = 0 episode = 0 while True: local_step = 0 while True: training_data = [] sum_of_reward = 0 reward = 0 obs = env.reset() last_obs = None last_action = None last_value = None done = False while not done: if args.render: env.render() action, value = agent.act_and_train(last_obs, last_action, last_value, reward, obs) last_obs = obs last_action = action last_value = value action = tf.clip_by_value(action, 1e-10, 1.0) print(action) obs, reward, done, info = env.step(action) sum_of_reward += reward global_step += 1 local_step += 1 # save model if global_step % 10**6 == 0: path = os.path.join(args.outdir, '{}/model.ckpt'.format(global_step)) saver.save(sess, path) # the end of episode if done: summary, _ = sess.run( [merged, reward_summary], feed_dict={reward_summary: sum_of_reward}) train_writer.add_summary(summary, global_step) agent.stop_episode(last_obs, last_action, last_value, reward) print('Episode: {}, Step: {}: Reward: {}'.format( episode, global_step, sum_of_reward)) episode += 1 break # append data for training training_data.append(agent.get_training_data()) if local_step > 2048: break # train network obs = [] actions = [] returns = [] deltas = [] for o, a, r, d in training_data: obs.extend(o) actions.extend(a) returns.extend(r) deltas.extend(d) for epoch in range(args.epoch): indices = random.sample(range(len(obs)), args.batch) sampled_obs = np.array(obs)[indices] sampled_actions = np.array(actions)[indices] sampled_returns = np.array(returns)[indices] sampled_deltas = np.array(deltas)[indices] ratio = agent.train(sampled_obs, sampled_actions, sampled_returns, sampled_deltas) if args.final_steps < global_step: break
])  # tail of the (truncated) data_transforms definition
# test_dataset = datasets.ImageFolder('flowers/test', transform=data_transforms)
# testloader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

if cat_names:
    with open(cat_names, 'r') as f:
        cat_to_name = json.load(f)

# import pretrained model from checkpoint
if gpu:
    checkpoint = torch.load('checkpoint.pth')
else:
    checkpoint = torch.load('checkpoint.pth', map_location=lambda storage, loc: storage)

model = make_network(checkpoint['arch'], checkpoint['hidden_units'])
model.load_state_dict(checkpoint['model_state_dict'])
class_to_idx = checkpoint['class_to_idx']

def process_image(image):
    image = Image.open(image)
    image = data_transforms(image)
    return image.numpy()

def predict(image_path, model, topk=3):
    model.eval()
    image = process_image(image_path)
    image = torch.from_numpy(image)
    image.unsqueeze_(0)
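# Hedged sketch (an assumption, not part of the original file): how a predict
# step could turn a log-softmax output into probabilities and top-k classes,
# matching the NLLLoss / log-softmax setup used during training above.
import torch

log_ps = torch.log_softmax(torch.randn(1, 102), dim=1)  # stand-in for model(image)
ps = torch.exp(log_ps)
top_p, top_idx = ps.topk(3, dim=1)  # top-3 probabilities and class indices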
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--outdir', type=str, default=None)
    parser.add_argument('--logdir', type=str, default=None)
    parser.add_argument('--load', type=str, default=None)
    parser.add_argument('--final-steps', type=int, default=10**7)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--batch', type=int, default=64)
    parser.add_argument('--epoch', type=int, default=10)
    args = parser.parse_args()

    if args.outdir is None:
        args.outdir = os.path.join(os.path.dirname(__file__), 'results')
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    if args.logdir is None:
        args.logdir = os.path.join(os.path.dirname(__file__), 'logs')

    env = gym.make(args.env)
    dam = gym.make("MyAntdam-v1")
    heal = env

    obs_dim = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]

    network = make_network([64, 64])

    sess = tf.Session()
    sess.__enter__()

    agent = Agent(network, obs_dim, n_actions)

    initialize()

    agent.sync_old()

    saver = tf.train.Saver(max_to_keep=50)
    if args.load is not None:
        saver.restore(sess, args.load)

    # float placeholder so episode rewards are not truncated in TensorBoard
    reward_summary = tf.placeholder(tf.float32, (), name='reward_summary')
    tf.summary.scalar('reward_summary', reward_summary)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(args.logdir, sess.graph)

    global_step = 0
    episode = 0
    prob = 0.9

    while True:
        local_step = 0

        if global_step > 100000:
            prob = 0.5
        elif global_step > 50000:
            prob = 0.7
        elif global_step > 30000:
            prob = 0.8

        while True:
            # pick the damaged environment with probability (1 - prob);
            # a uniform sample makes the prob schedule behave as intended
            # (the original compared a random integer 0/1 against prob)
            if np.random.uniform() > prob:
                env = dam
            else:
                env = heal

            training_data = []
            sum_of_reward = 0
            reward = 0
            obs = env.reset()
            last_obs = None
            last_action = None
            last_value = None
            done = False

            while not done:
                if args.render:
                    env.render()

                action, value = agent.act_and_train(
                    last_obs, last_action, last_value, reward, obs)

                if np.isnan(action).any():
                    print("NaN found")
                    path = os.path.join(args.outdir, '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)
                    local_step = 3000
                    global_step = args.final_steps
                    break

                last_obs = obs
                last_action = action
                last_value = value

                obs, reward, done, info = env.step(action)

                sum_of_reward += reward
                global_step += 1
                local_step += 1

                # save model
                if global_step % 5000 == 0:
                    path = os.path.join(args.outdir, '{}/model.ckpt'.format(global_step))
                    saver.save(sess, path)

                # the end of episode
                if done:
                    summary, _ = sess.run(
                        [merged, reward_summary],
                        feed_dict={reward_summary: sum_of_reward})
                    train_writer.add_summary(summary, global_step)
                    agent.stop_episode(last_obs, last_action, last_value, reward)
                    print('Episode: {}, Step: {}: Reward: {} Dam: {}'.format(
                        episode, global_step, sum_of_reward, last_obs[-1]))
                    episode += 1
                    break

            # append data for training
            training_data.append(agent.get_training_data())

            if local_step > 2048:
                break

        # train network
        obs = []
        actions = []
        returns = []
        deltas = []
        for o, a, r, d in training_data:
            obs.extend(o)
            actions.extend(a)
            returns.extend(r)
            deltas.extend(d)

        print("Now Training")
        for epoch in range(args.epoch):
            indices = random.sample(range(len(obs)), min(len(obs), args.batch))
            sampled_obs = np.array(obs)[indices]
            sampled_actions = np.array(actions)[indices]
            sampled_returns = np.array(returns)[indices]
            sampled_deltas = np.array(deltas)[indices]
            ratio = agent.train(sampled_obs, sampled_actions,
                                sampled_returns, sampled_deltas)

        if args.final_steps < global_step:
            break
    labels = []
    for i in range(size):
        # a = p.get()
        # (img, label) = msgpack.unpackb(a, object_hook=m.decode)
        (img, label) = p.get()
        data.append(img)
        labels.append(label)
    return {
        "data": np.array(data).astype(np.float32),
        "label": np.array(labels)
    }


if __name__ == '__main__':
    with TrainingEnv(name="lyy.{}.test".format(net_name), part_count=2) as env:
        net = make_network(minibatch_size=minibatch_size)
        preloss = net.loss_var
        net.loss_var = WeightDecay(net.loss_var, {
            "*conv*:W": 1e-4,
            "*fc*:W": 1e-4,
            "*bnaff*:k": 1e-4,
            "*offset*": 1e-4
        })
        train_func = env.make_func_from_loss_var(net.loss_var, "train",
                                                 train_state=True)
        valid_func = env.make_func_from_loss_var(net.loss_var, "val",
                                                 train_state=False)
def main(): date = datetime.now().strftime("%Y%m%d%H%M%S") parser = argparse.ArgumentParser() parser.add_argument('--env', type=str, default='CartPole-v0') parser.add_argument('--outdir', type=str, default=date) parser.add_argument('--logdir', type=str, default=date) parser.add_argument('--gpu', type=int, default=0) parser.add_argument('--load', type=str, default=None) parser.add_argument('--render', action='store_true') parser.add_argument('--eval-render', action='store_true') parser.add_argument('--record', action='store_true') parser.add_argument('--demo', action='store_true') args = parser.parse_args() # learned model path settings outdir = os.path.join(os.path.dirname(__file__), 'results/' + args.outdir) if not os.path.exists(outdir): os.makedirs(outdir) # log path settings logdir = os.path.join(os.path.dirname(__file__), 'logs/' + args.logdir) env = gym.make(args.env) # box environment if len(env.observation_space.shape) == 1: constants = box_constants actions = range(env.action_space.n) state_shape = [env.observation_space.shape[0], constants.STATE_WINDOW] state_preprocess = lambda state: state # (window_size, dim) -> (dim, window_size) phi = lambda state: np.transpose(state, [1, 0]) # atari environment else: constants = atari_constants actions = get_action_space(args.env) state_shape = [84, 84, constants.STATE_WINDOW] def state_preprocess(state): state = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY) state = cv2.resize(state, (84, 84)) return np.array(state, dtype=np.float32) / 255.0 # (window_size, H, W) -> (H, W, window_size) phi = lambda state: np.transpose(state, [1, 2, 0]) # save constant variables dump_constants(constants, os.path.join(outdir, 'constants.json')) # exploration if constants.EXPLORATION_TYPE == 'linear': duration = constants.EXPLORATION_DURATION explorer = LinearDecayExplorer(final_exploration_step=duration) else: explorer = ConstantExplorer(constants.EXPLORATION_EPSILON) # wrap gym environment env = EnvWrapper( env, s_preprocess=state_preprocess, r_preprocess=lambda r: np.clip(r, -1, 1) ) # create encoder network network = make_network( constants.CONVS, constants.FCS, constants.DND_KEY_SIZE ) replay_buffer = NECReplayBuffer(constants.REPLAY_BUFFER_SIZE) sess = tf.Session() sess.__enter__() # create DNDs dnds = [] for i in range(len(actions)): dnd = DND( constants.DND_KEY_SIZE, constants.DND_CAPACITY, constants.DND_P, device=constants.DEVICES[i], scope='dnd{}'.format(i) ) dnd._init_vars() dnds.append(dnd) # create NEC agent agent = Agent( network, dnds, actions, state_shape, replay_buffer, explorer, constants, phi=phi, run_options=run_options, run_metadata=run_metadata ) sess.run(tf.global_variables_initializer()) saver = tf.train.Saver() if args.load is not None: saver.restore(sess, args.load) # tensorboard logger train_writer = tf.summary.FileWriter(logdir, sess.graph) tflogger = TfBoardLogger(train_writer) tflogger.register('reward', dtype=tf.float32) tflogger.register('eval_reward', dtype=tf.float32) # json logger trainlogger = JsonLogger(os.path.join(outdir, 'train.json')) evallogger = JsonLogger(os.path.join(outdir, 'evaluation.json')) # callback on the end of episode def end_episode(reward, step, episode): tflogger.plot('reward', reward, step) trainlogger.plot(reward=reward, step=step, episode=episode) evaluator = Evaluator( env=copy.deepcopy(env), state_shape=state_shape[:-1], state_window=constants.STATE_WINDOW, eval_episodes=constants.EVAL_EPISODES, recorder=Recorder(outdir) if args.record else None, record_episodes=constants.RECORD_EPISODES, 
render=args.eval_render ) def should_eval(step, episode): return step > 0 and step % constants.EVAL_INTERVAL == 0 def end_eval(step, episode, rewards): mean_rewards = np.mean(rewards) tflogger.plot('eval_reward', mean_rewards, step) evallogger.plot(reward=mean_rewards, step=step, episode=episode) trainer = Trainer( env=env, agent=agent, render=args.render, state_shape=state_shape[:-1], # ignore last channel state_window=constants.STATE_WINDOW, final_step=constants.FINAL_STEP, end_episode=end_episode, training=not args.demo, evaluator=evaluator, should_eval=should_eval, end_eval=end_eval ) trainer.start()