def __init__(self, state_size, action_size, random_seed):
    """
    Args:
    ======
        state_size (int): state dim
        action_size (int): action dim
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)

    # actor net initialization
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # critic net initialization
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Ornstein-Uhlenbeck Exploration Noise Process
    self.noise = OUNoise(action_space=action_size, seed=random_seed)

    # Replay memory init
    self.memory = Memory(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)
def __init__(self, hidden_size, env):
    self.num_states = env.observation_space.shape[0]
    self.num_actions = env.action_space.shape[0]

    self.Actor = Actor(input_size=self.num_states, hidden_size=hidden_size, output_size=self.num_actions).cuda()
    self.Actor_target = Actor(input_size=self.num_states, hidden_size=hidden_size, output_size=self.num_actions).cuda()
    self.Critic = Critic(input_size=self.num_states, hidden_size=hidden_size, output_size=self.num_actions).cuda()
    self.Critic_target = Critic(input_size=self.num_states, hidden_size=hidden_size, output_size=self.num_actions).cuda()

    for target_param, param in zip(self.Actor_target.parameters(), self.Actor.parameters()):
        target_param.data = param.data
    for target_param, param in zip(self.Critic_target.parameters(), self.Critic.parameters()):
        target_param.data = param.data

    self.Memory = Memory(30000)
    self.criterion = nn.MSELoss().cuda()
    self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(), lr=1e-2)
    self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(), lr=1e-1)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)
    args = parser.parse_args()

    env = gym.make(args.environment)
    unroll = 20

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    agent = direct_policy_search(state_dim, action_dim, action_bound_high,
                                 action_bound_low, unroll, .9, 5,
                                 'direct_policy_search')

    # Replay memory
    memory = Memory(args.replay_mem_size)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            #env.render()
            action = agent.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)

            # Store tuple in replay memory
            memory.add([
                np.atleast_2d(state),
                np.atleast_2d(action),
                reward,
                np.atleast_2d(next_state),
                done
            ])

            # Training step
            batch = np.array(memory.sample(args.batch_size))
            assert len(batch) > 0
            states = np.concatenate(batch[:, 0], axis=0)

            # Train the agent
            agent.train(sess, states)

            # s <- s'
            state = np.copy(next_state)

            if done == True:
                print 'time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards, 'unroll', unroll
                epoch += 1
                total_rewards = 0.
                state = env.reset()
def main2():
    # Initialize environment.
    import gym
    env = gym.make('Pendulum-v0')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    action_bound_high = env.action_space.high
    action_bound_low = env.action_space.low

    # Initialize agent.
    pai = PAI(environment='Pendulum-v0',
              state_size=state_size,
              action_size=action_size,
              hidden_size=20,
              it_tloop=100,
              it_dyn=5000,
              bs_dyn=100,
              it_policy=1000,
              bs_policy=50,
              K=50,
              T=25,
              action_bound_low=action_bound_low,
              action_bound_high=action_bound_high,
              discount_factor=.9)

    # Initialize replay memory
    memory = Memory(400 * 10)  # Data from most recent 10 trials (each trial is 400 time steps long).

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(300):
            total_rewards = 0.
            state = env.reset()
            while True:
                action = pai.act(sess, state)
                next_state, reward, done, _ = env.step(action)
                total_rewards += float(reward)

                # Store tuple in replay memory
                memory.add([
                    np.atleast_2d(state),
                    np.atleast_2d(action),
                    reward,
                    np.atleast_2d(next_state),
                    done
                ])

                # s <- s'
                state = np.copy(next_state)

                if done == True:
                    print 'epoch', epoch, 'total rewards', total_rewards
                    # Train the agent
                    pai.train(sess, memory)
                    break
def _rollout_with_memory(self, env, network, args, running_state, max_episode_steps, keep_memory=False):
    memory = Memory()
    num_steps = 0
    reward_list = []
    len_list = []
    while num_steps < args.batch_size:
        state = env.reset()
        if args.state_norm:
            state = running_state(state)
        if args.append_time:
            state = np.append(state, 1.0)
        reward_sum = 0
        for t in range(max_episode_steps):
            action_mean, action_std, value = network(Tensor(state).unsqueeze(0))
            action_mean = action_mean[0]
            action_std = action_std[0]
            action, y = network.select_action(action_mean, action_std)
            action_mean = action_mean.data.numpy()
            action = action.data.numpy()
            y = y.data.numpy()
            next_state, reward, done, info = env.step(action)
            reward_sum += reward
            if args.state_norm:
                next_state = running_state(next_state)
            if args.append_time:
                next_state = np.append(next_state, 1 - (t + 1) / max_episode_steps)
            mask = 0 if (done or ((t + 1) == max_episode_steps)) else 1
            memory.push(state, value, action_mean, action, y, mask, next_state, reward)
            if done:
                break
            state = next_state
        num_steps += (t + 1)
        reward_list.append(reward_sum)
        len_list.append(t + 1)
    meanepreward = np.mean(reward_list)
    meaneplen = np.mean(len_list)
    if keep_memory:
        self.memory = memory
        self.old_std = network.action_std.data
        return meanepreward, meaneplen
    else:
        return memory, meanepreward, meaneplen, num_steps
def __init__(
    self,
    action_dim,
    filters_C,
    kernel_size,
    hidden_R,
    dropout,
    dropout_r,
    Hstep,
    activation,
    is_training_mode,
):
    self.policy_clip = 0.2
    self.value_clip = 0.2
    self.entropy_coef = 0.0
    self.vf_loss_coef = 0.5
    self.minibatch = 32
    self.PPO_epochs = 10

    # TODO use predicted results
    action_std = 1.0
    self.cov_mat = tf.fill([action_dim], action_std ** 2)

    self.is_training_mode = is_training_mode

    self.actor = Actor(
        action_dim, filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep,
    )
    self.actor_old = Actor(
        action_dim, filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep,
    )
    self.critic = Critic(
        filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep
    )
    self.critic_old = Critic(
        filters_C, kernel_size, hidden_R, dropout, dropout_r, activation, Hstep
    )

    self.optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)
    self.memory = Memory()
    self.utils = Utils()
class DDPG:
    def __init__(self, env, batch_size=32, gamma=0.99, hidden_units=32,
                 maxlen=10000, tau=0.1, actor_lr=0.001, critic_lr=0.001):
        self.env = env
        self.batch_size = batch_size
        self.gamma = gamma
        self.maxlen = maxlen
        self.sess = tf.Session()
        self.actor = Actor(env, self.sess, hidden_units, tau, actor_lr)
        self.critic = Critic(env, self.sess, hidden_units, tau, critic_lr)
        self.memory = Memory(maxlen)
        self.sess.run(tf.global_variables_initializer())
        self.step = 0

    def store(self, exp):
        self.memory.add(exp)

    def update(self):
        if len(self.memory.buffer) < 1000:  # self.batch_size:
            return
        self.step += 1
        data = self.memory.sample(self.batch_size)
        s = np.array([d[0] for d in data])
        a = np.array([d[1] for d in data])
        r = np.array([d[2] for d in data])
        s_ = np.array([d[3] for d in data])
        a_ = self.actor.target_model.predict(s_)
        target_q = self.critic.target_model.predict([s_, a_])
        #y = np.array([d[2] for d in data])
        #for i in range(self.batch_size):
        #    y[i] += self.gamma * target_q[i]
        y = r[:, np.newaxis] + self.gamma * target_q
        self.critic.model.train_on_batch([s, a], y)

        action = self.actor.model.predict(s)
        grads = self.critic.get_grads(s, action)
        self.actor.train(s, grads)
        if self.step % 10 == 0:
            self.actor.update_weights()
            self.critic.update_weights()

    def get_action(self, s):
        return self.actor.get_action(s)
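# The DDPG agent above only relies on three things from its Memory: a `buffer`
# holding past transitions, `add(exp)`, and `sample(batch_size)`. The real
# implementation differs per project; a minimal sketch that satisfies this
# interface (class and method names inferred from the calls above, not taken
# from any particular repository) could look like this:
import random
from collections import deque


class ReplayMemory:
    """Fixed-size FIFO replay buffer with uniform random sampling."""

    def __init__(self, maxlen):
        self.buffer = deque(maxlen=maxlen)

    def add(self, exp):
        # exp is whatever tuple/list the agent stores, e.g. (s, a, r, s_, done)
        self.buffer.append(exp)

    def sample(self, batch_size):
        # sample without replacement; never ask for more items than are stored
        return random.sample(self.buffer, min(batch_size, len(self.buffer)))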
def inference_speed_memory(self, batch_size, seq_length):
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)

    @jax.jit
    def ref_step():
        out = self.model(input_ids=input_ids)
        return out[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        ref_step().block_until_ready()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    timeit.repeat("ref_step().block_until_ready()", repeat=1, number=2, globals=locals())
    if self.jit:
        runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            runtimes = timeit.repeat("ref_step().block_until_ready()", repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
def collect_samples(self, batch_size=1):
    memory = Memory()
    num_trajs = (batch_size + args.sample_traj_length - 1) // args.sample_traj_length
    onehot_state, multihot_state, continuous_state = self.reset(num_trajs)
    for walk_step in range(self.max_traj_length - 1):
        with torch.no_grad():
            onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
                onehot_state, multihot_state, continuous_state, num_trajs)
        # Currently we assume the exploration step is not done until it reaches max_traj_length.
        mask = torch.ones((num_trajs, 1), device=device)
        memory.push(onehot_state.type(FloatTensor), multihot_state.type(FloatTensor), continuous_state,
                    onehot_action.type(FloatTensor), multihot_action.type(FloatTensor), continuous_action,
                    next_onehot_state.type(FloatTensor), next_multihot_state.type(FloatTensor),
                    next_continuous_state, old_log_prob, mask)
        onehot_state, multihot_state, continuous_state = next_onehot_state, next_multihot_state, next_continuous_state

    # one more step for push done
    with torch.no_grad():
        onehot_action, multihot_action, continuous_action, next_onehot_state, next_multihot_state, next_continuous_state, old_log_prob = self.step(
            onehot_state, multihot_state, continuous_state, num_trajs)
    mask = torch.zeros((num_trajs, 1), device=device)
    memory.push(onehot_state.type(FloatTensor), multihot_state.type(FloatTensor), continuous_state,
                onehot_action.type(FloatTensor), multihot_action.type(FloatTensor), continuous_action,
                next_onehot_state.type(FloatTensor), next_multihot_state.type(FloatTensor),
                next_continuous_state, old_log_prob, mask)
    return memory, num_trajs
def collect_samples(self, mini_batch_size, size=1):
    num_step = 0
    memory = Memory()
    while num_step < mini_batch_size:
        discrete_state, continuous_state = self.reset(size)
        for walk_step in range(self.max_traj_length - 1):
            with torch.no_grad():
                discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                    discrete_state, continuous_state, size)
            # Currently we assume the exploration step is not done until it reaches max_traj_length.
            mask = torch.ones((size, 1), device=device)
            memory.push(discrete_state.type(FloatTensor), continuous_state,
                        discrete_action.type(FloatTensor), continuous_action,
                        next_discrete_state.type(FloatTensor), next_continuous_state,
                        old_log_prob, mask)
            discrete_state, continuous_state = next_discrete_state, next_continuous_state
            num_step += 1
            if num_step >= mini_batch_size:
                return memory

        # one more step for push done
        with torch.no_grad():
            discrete_action, continuous_action, next_discrete_state, next_continuous_state, old_log_prob = self.step(
                discrete_state, continuous_state, size)
        mask = torch.zeros((size, 1), device=device)
        memory.push(discrete_state.type(FloatTensor), continuous_state,
                    discrete_action.type(FloatTensor), continuous_action,
                    next_discrete_state.type(FloatTensor), next_continuous_state,
                    old_log_prob, mask)
        num_step += 1
    return memory
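# The rollout collectors above (and `_rollout_with_memory` earlier) treat
# Memory as plain on-policy storage: `push(...)` appends one transition of
# arbitrary fields, and the whole buffer is consumed later for a policy
# update. A minimal, field-agnostic sketch of that pattern; the `sample`
# method returning column-wise tuples is an assumption about how the trainer
# reads it back, not part of any of the snippets above:
class RolloutMemory:
    """Append-only storage for one batch of on-policy transitions."""

    def __init__(self):
        self.transitions = []

    def push(self, *fields):
        # each call stores one transition; the caller decides the field layout
        self.transitions.append(fields)

    def sample(self):
        # transpose: one tuple per field, containing that field for every step
        return tuple(zip(*self.transitions))

    def __len__(self):
        return len(self.transitions)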
def train_speed_memory(self, batch_size, seq_length):
    key = jax.random.PRNGKey(0)
    input_ids = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    targets = jax.random.randint(key, (batch_size, seq_length), 0, self.vocab_size)
    labels = jax.random.randint(key, (batch_size, seq_length), 0, 2)
    # input_ids = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # targets = np.random.randint(0, self.vocab_size, (batch_size, seq_length))
    # labels = np.random.randint(0, 2, (batch_size, seq_length))

    @jax.jit
    def train_step():

        def loss_fn(params):
            token_mask = jnp.where(labels > 0, 1.0, 0.0).astype(self.dtype)
            logits = self.model(input_ids=input_ids, train=True, params=params, dropout_rng=jax.random.PRNGKey(0))[0]
            loss, normalizing_factor = cross_entropy(logits, targets, token_mask)
            jax.profiler.save_device_memory_profile(f"memory/{workload[0]}_{workload[1]}_memory.prof", "gpu")
            return loss / normalizing_factor

        if self.fp16 and jax.local_devices()[0].platform == 'gpu':
            grad_fn = self.dynamic_scale.value_and_grad(loss_fn)
            dyn_scale, is_fin, loss, grad = grad_fn(self.model.params)
        else:
            grad_fn = jax.value_and_grad(loss_fn)
            loss, grad = grad_fn(self.model.params)
        return tree_flatten(grad)[0]

    if jax.local_devices()[0].platform == 'gpu':
        nvml.nvmlInit()
        train_step()
        handle = nvml.nvmlDeviceGetHandleByIndex(0)
        meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
        max_bytes_in_use = meminfo.used
        memory = Memory(max_bytes_in_use)
        # shutdown nvml
        nvml.nvmlShutdown()
    else:
        memory = None

    # timeit.repeat(train_step, repeat=1, number=2)
    timeit.repeat("for i in train_step():i.block_until_ready()", repeat=1, number=2, globals=locals())
    if self.jit:
        # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
        runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3, globals=locals())
    else:
        with jax.disable_jit():
            # runtimes = timeit.repeat(train_step, repeat=self.repeat, number=3)
            runtimes = timeit.repeat("for i in train_step():i.block_until_ready()", repeat=self.repeat, number=3, globals=locals())

    return float(np.min(runtimes) / 3.0), memory
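# In the two JAX benchmarking helpers above, `Memory` only has to carry the
# peak byte count read from NVML so it can be returned next to the runtime.
# A minimal stand-in; the MB-formatting __repr__ is an assumption, not part of
# the original helpers:
from dataclasses import dataclass


@dataclass
class PeakMemory:
    bytes: int

    def __repr__(self):
        return f"{self.bytes / (1024 ** 2):.0f} MB"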
def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1, iterations_left=-1):
    try:
        self.buff
    except:
        self.buff = Memory(10000)

    for it in range(iterations):
        while len(self.buff.mem) < batch_size:
            state = replay_buffer.sample(1)[0][0]

            patches = []
            for i in range(0, state.shape[1]-self.w+1, self.s):
                for j in range(0, state.shape[2]-self.w+1, self.s):
                    patches.append(state[0, i:i+self.w, j:j+self.w, :])
                    assert patches[-1].shape[0] == patches[-1].shape[1]
                    assert patches[-1].shape[0] == self.w

            from random import shuffle
            shuffle(patches)
            self.buff.mem += patches

        batch = self.buff.mem[:batch_size]
        self.buff.mem = self.buff.mem[batch_size:]

        batch = np.concatenate([b[np.newaxis, ...] for b in batch], axis=0)
        batch = self.process_states(batch)
        batch = [b.astype(np.float64) / 255. for b in batch]

        feed_dict = {}
        for i in range(self.no_inputs):
            feed_dict[self.params[i]['input']] = batch[i]

        _, recon_loss, = sess.run([self.update_model_recon, self.recon_loss], feed_dict=feed_dict)
        print "train_feature_extractor - recon_loss:", recon_loss

    ########################################################################################################################
    if iterations_left <= 10:
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        import pickle
        recon_x, recon_y, conv1, conv2 = sess.run([self.params[0]['recon'], self.params[1]['recon'], variables[2], variables[4]], feed_dict=feed_dict)
        pickle.dump([conv1, conv2, batch, recon_x, recon_y], open("recons.p", "wb"))
    ########################################################################################################################
    return iterations
def __init__(self, stack_size=256, *args, **kwargs):
    self.stack = Stack(size=stack_size)
    self.memory = Memory(memory=kwargs.get('memory', b''))
    self.storage = dict()
    self.args = args
    self.kwargs = kwargs
    self.opcode = None
    self.current_opcode_name = None
    self.code = None
    self.msg = None
    self.debug = False
    self.pc = 0
    self.prev_pc = -1
    self.stop = False
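# The VM constructor above only shows that its Memory is seeded from raw bytes
# (`kwargs.get('memory', b'')`). For an EVM-style interpreter this is usually a
# byte-addressed, zero-extended scratch space; the read/write/extend methods
# below are an assumption about that shape, not the project's actual API:
class ByteMemory:
    """Byte-addressed memory that zero-extends on access."""

    def __init__(self, memory=b''):
        self.memory = bytearray(memory)

    def _extend(self, offset, size):
        # grow with zero bytes so offset + size stays addressable
        if offset + size > len(self.memory):
            self.memory.extend(b'\x00' * (offset + size - len(self.memory)))

    def write(self, offset, value):
        self._extend(offset, len(value))
        self.memory[offset:offset + len(value)] = value

    def read(self, offset, size):
        self._extend(offset, size)
        return bytes(self.memory[offset:offset + size])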
def train_feature_extractor(self, sess, replay_buffer, batch_size=100, iterations=1):
    try:
        self.buff
    except:
        self.buff = Memory(10000)

    for it in range(iterations):
        while len(self.buff.mem) < batch_size:
            state = replay_buffer.sample(1)[0][0]
            state = state.astype(np.float64)
            state = state / 255.

            patches = []
            for i in range(0, state.shape[1]-self.w+1, self.s):
                for j in range(0, state.shape[2]-self.w+1, self.s):
                    patches.append(state[0, i:i+self.w, j:j+self.w, :])
                    assert patches[-1].shape[0] == patches[-1].shape[1]
                    assert patches[-1].shape[0] == self.w

            from random import shuffle
            shuffle(patches)
            self.buff.mem += patches

        batch = self.buff.mem[:batch_size]
        self.buff.mem = self.buff.mem[batch_size:]

        _, recon_loss, = sess.run([self.update_model_recon, self.recon_loss], feed_dict={self.x: batch, self.y: batch})
        print "train_feature_extractor - recon_loss:", recon_loss

    ########################################################################################################################
    import pickle
    recon_x, recon_y = sess.run([self.recon_x_, self.recon_y_], feed_dict={self.x: batch, self.y: batch})
    pickle.dump([batch, recon_x, recon_y], open("recons.p", "wb"))
    ########################################################################################################################
    return iterations
def main():
    import gym
    import sys
    import copy
    sys.path.append('../..')
    from utils import Memory

    #env = gym.make('LunarLander-v2')
    env = gym.make('Pendulum-v0')
    #env = gym.make('CartPole-v0')

    mem = Memory(1000000)
    batch_size = 32

    try:
        a_size = env.action_space.n
        a_type = 'discrete'
    except:
        try:
            a_size = env.action_space.shape[0]
            a_type = 'continuous'
        except:
            raise ValueError('Cannot find action size.')

    emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size, out_shape=[None, env.observation_space.shape[0]], a_type=a_type, numfactors=256)
    #emg = gated_env_modeler(s_shape=[None, env.observation_space.shape[0]], a_size=a_size, out_shape=[None, 1], a_type=a_type, numfactors=256)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        while True:
            s = env.reset()
            done = False
            while done == False:
                #env.render()
                #a = np.random.randint(a_size)
                a = random_action(a_size, a_type)
                s_, r, done, _ = env.step(a)
                mem.add([s, a, r, s_, done])

                batch = mem.sample(batch_size)
                if len(batch) == batch_size:
                    states = []
                    actions = []
                    rewards = []
                    states_ = []
                    for i in range(batch_size):
                        states.append(batch[i][0])
                        actions.append(batch[i][1])
                        rewards.append(batch[i][2])
                        states_.append(batch[i][3])
                    states = np.stack(states, axis=0)
                    actions = np.stack(actions, axis=0)
                    rewards = np.stack(rewards, axis=0)
                    states_ = np.stack(states_, axis=0)

                    #_, loss_s, loss_a, loss_s_, loss = sess.run([emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss], feed_dict={emg.states:states, emg.states_:rewards[..., np.newaxis], emg.actions_placeholder:actions})
                    _, loss_s, loss_a, loss_s_, loss = sess.run(
                        [emg.update_model, emg.loss_s, emg.loss_a, emg.loss_s_, emg.loss],
                        feed_dict={
                            emg.states: states,
                            emg.states_: states_,
                            emg.actions_placeholder: actions
                        })
                    print 'loss_s', loss_s, 'loss_a', loss_a, 'loss_s_', loss_s_, 'loss', loss

                s = copy.deepcopy(s_)
                if done == True:
                    break
class ProtoNet(MetaTemplate): def __init__(self, model_func, n_way, n_support, jigsaw=False, lbda=0.0, rotation=False, tracking=False, use_bn=True, pretrain=False, image_loader=None, len_dataset=None): super(ProtoNet, self).__init__(model_func, n_way, n_support, use_bn, pretrain) self.loss_fn = nn.CrossEntropyLoss() self.len_dataset = len_dataset self.cuda() self.memory = Memory(size=len_dataset, weight=0.5, device='cuda') self.memory.initialize(self.feature, image_loader) self.jigsaw = jigsaw self.rotation = rotation self.lbda = lbda self.global_count = 0 self.indx = 0 if self.jigsaw: self.projection_transformed_features = nn.Linear( 512 * 9, 512) ### Self-supervision branch #self.fc6 = nn.Sequential() #self.fc6.add_module('fc6_s1',nn.Linear(512, 512))#for resnet #self.fc6.add_module('relu6_s1',nn.ReLU(inplace=True)) #self.fc6.add_module('drop6_s1',nn.Dropout(p=0.5)) #self.fc7 = nn.Sequential() #self.fc7.add_module('fc7',nn.Linear(9*512,4096))#for resnet #self.fc7.add_module('relu7',nn.ReLU(inplace=True)) #self.fc7.add_module('drop7',nn.Dropout(p=0.5)) #self.classifier = nn.Sequential() #self.classifier.add_module('fc8',nn.Linear(4096, 35)) if self.rotation: self.fc6 = nn.Sequential() self.fc6.add_module('fc6_s1', nn.Linear(512, 512)) #for resnet self.fc6.add_module('relu6_s1', nn.ReLU(inplace=True)) self.fc6.add_module('drop6_s1', nn.Dropout(p=0.5)) self.fc7 = nn.Sequential() self.fc7.add_module('fc7', nn.Linear(512, 128)) #for resnet self.fc7.add_module('relu7', nn.ReLU(inplace=True)) self.fc7.add_module('drop7', nn.Dropout(p=0.5)) self.classifier_rotation = nn.Sequential() self.classifier_rotation.add_module('fc8', nn.Linear(128, 4)) def train_loop(self, epoch, train_loader, optimizer, writer, base_loader_u=None): print_freq = 10 avg_loss = 0 avg_loss_proto = 0 avg_loss_jigsaw = 0 avg_loss_rotation = 0 if base_loader_u is not None: for i, inputs in enumerate(zip(train_loader, cycle(base_loader_u))): self.global_count += 1 x = inputs[0][0] self.n_query = x.size(1) - self.n_support if self.change_way: self.n_way = x.size(0) optimizer.zero_grad() loss_proto, acc = self.set_forward_loss(x) if self.jigsaw: #loss_jigsaw, acc_jigsaw = self.set_forward_loss_unlabel(inputs[1][2], inputs[1][3],x)# torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21]) loss_jigsaw = self.set_forward_loss_unlabel( inputs[1][2], inputs[1][3], x ) # torch.Size([5, 21, 9, 3, 64, 64]), torch.Size([5, 21]) loss = (1.0 - self.lbda) * loss_proto + self.lbda * loss_jigsaw writer.add_scalar('train/loss_proto', float(loss_proto.data.item()), self.global_count) writer.add_scalar('train/loss_jigsaw', float(loss_jigsaw.data.item()), self.global_count) elif self.rotation: loss_rotation, acc_rotation = self.set_forward_loss_unlabel( inputs[1][2], inputs[1][3], x ) # torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21]) loss = (1.0 - self.lbda) * loss_proto + self.lbda * loss_rotation writer.add_scalar('train/loss_proto', float(loss_proto.data.item()), self.global_count) writer.add_scalar('train/loss_rotation', float(loss_rotation.data.item()), self.global_count) else: loss = loss_proto loss.backward() optimizer.step() avg_loss = avg_loss + loss.data writer.add_scalar('train/loss', float(loss.data.item()), self.global_count) if self.jigsaw: avg_loss_proto += loss_proto.data avg_loss_jigsaw += loss_jigsaw.data writer.add_scalar('train/acc_proto', acc, self.global_count) writer.add_scalar('train/acc_jigsaw', acc_jigsaw, self.global_count) elif self.rotation: avg_loss_proto += loss_proto.data avg_loss_rotation += loss_rotation.data 
writer.add_scalar('train/acc_proto', acc, self.global_count) writer.add_scalar('train/acc_rotation', acc_rotation, self.global_count) if (i + 1) % print_freq == 0: if self.jigsaw: print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Jigsaw {:f}'.\ format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_jigsaw/float(i+1))) elif self.rotation: print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Rotation {:f}'.\ format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_rotation/float(i+1))) else: print( 'Epoch {:d} | Batch {:d}/{:d} | Loss {:f}'.format( epoch, i + 1, len(train_loader), avg_loss / float(i + 1))) else: #### This branch is used self.memory.update_weighted_count() self.indx = 0 for i, inputs in enumerate(train_loader): self.global_count += 1 x = inputs[0] ### [5,21,3,224,224] self.n_query = x.size(1) - self.n_support if self.change_way: self.n_way = x.size(0) optimizer.zero_grad() loss_proto, acc = self.set_forward_loss(x) if self.jigsaw: # print(x.size(), inputs[2].size(), inputs[3].size()) loss_jigsaw = self.set_forward_loss_unlabel( x, inputs[2], inputs[3] ) # torch.Size([5, 21, 9, 3, 64, 64]), torch.Size([5, 21]) loss = (1.0 - self.lbda) * loss_proto + self.lbda * loss_jigsaw writer.add_scalar('train/loss_proto', float(loss_proto.data.item()), self.global_count) writer.add_scalar('train/loss_jigsaw', float(loss_jigsaw.data.item()), self.global_count) elif self.rotation: loss_rotation, acc_rotation = self.set_forward_loss_unlabel( inputs[2], inputs[3], x ) # torch.Size([5, 21, 9, 3, 75, 75]), torch.Size([5, 21]) loss = (1.0 - self.lbda) * loss_proto + self.lbda * loss_rotation writer.add_scalar('train/loss_proto', float(loss_proto.data.item()), self.global_count) writer.add_scalar('train/loss_rotation', float(loss_rotation.data.item()), self.global_count) else: loss = loss_proto loss.backward() optimizer.step() avg_loss = avg_loss + loss.item() writer.add_scalar('train/loss', float(loss.data.item()), self.global_count) if self.jigsaw: avg_loss_proto += loss_proto.data avg_loss_jigsaw += loss_jigsaw.data writer.add_scalar('train/acc_proto', acc, self.global_count) # writer.add_scalar('train/acc_jigsaw', acc_jigsaw, self.global_count) elif self.rotation: avg_loss_proto += loss_proto.data avg_loss_rotation += loss_rotation.data writer.add_scalar('train/acc_proto', acc, self.global_count) writer.add_scalar('train/acc_rotation', acc_rotation, self.global_count) if (i + 1) % print_freq == 0: #print(optimizer.state_dict()['param_groups'][0]['lr']) if self.jigsaw: print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Jigsaw {:f}'.\ format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_jigsaw/float(i+1))) elif self.rotation: print('Epoch {:d} | Batch {:d}/{:d} | Loss {:f} | Loss Proto {:f} | Loss Rotation {:f}'.\ format(epoch, i+1, len(train_loader), avg_loss/float(i+1), avg_loss_proto/float(i+1), avg_loss_rotation/float(i+1))) else: print( 'Epoch {:d} | Batch {:d}/{:d} | Loss {:f}'.format( epoch, i + 1, len(train_loader), avg_loss / float(i + 1))) self.indx += 105 def test_loop(self, test_loader, record=None): # breakpoint() correct = 0 count = 0 acc_all = [] acc_all_jigsaw = [] acc_all_rotation = [] iter_num = len(test_loader) for i, inputs in enumerate(test_loader): x = inputs[0] self.n_query = x.size(1) - self.n_support if self.change_way: self.n_way = x.size(0) if self.jigsaw: # correct_this, 
correct_this_jigsaw, count_this, count_this_jigsaw = self.correct(x, inputs[2], inputs[3]) correct_this, count_this = self.correct(x) elif self.rotation: correct_this, correct_this_rotation, count_this, count_this_rotation = self.correct( x, inputs[2], inputs[3]) else: correct_this, count_this = self.correct(x) acc_all.append(correct_this / count_this * 100) # if self.jigsaw: # acc_all_jigsaw.append(correct_this_jigsaw/ count_this_jigsaw*100) # elif self.rotation: # acc_all_rotation.append(correct_this_rotation/ count_this_rotation*100) acc_all = np.asarray(acc_all) acc_mean = np.mean(acc_all) acc_std = np.std(acc_all) print('%d Test Protonet Acc = %4.2f%% +- %4.2f%%' % (iter_num, acc_mean, 1.96 * acc_std / np.sqrt(iter_num))) if self.jigsaw: # acc_all_jigsaw = np.asarray(acc_all_jigsaw) # acc_mean_jigsaw = np.mean(acc_all_jigsaw) # acc_std_jigsaw = np.std(acc_all_jigsaw) # print('%d Test Jigsaw Acc = %4.2f%% +- %4.2f%%' %(iter_num, acc_mean_jigsaw, 1.96* acc_std_jigsaw/np.sqrt(iter_num))) #return acc_mean, acc_mean_jigsaw return acc_mean elif self.rotation: acc_all_rotation = np.asarray(acc_all_rotation) acc_mean_rotation = np.mean(acc_all_rotation) acc_std_rotation = np.std(acc_all_rotation) print('%d Test Rotation Acc = %4.2f%% +- %4.2f%%' % (iter_num, acc_mean_rotation, 1.96 * acc_std_rotation / np.sqrt(iter_num))) return acc_mean, acc_mean_rotation else: return acc_mean def correct(self, x, patches=None, patches_label=None): scores = self.set_forward(x) #if self.jigsaw: # x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label) #elif self.rotation: # x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label) y_query = np.repeat(range(self.n_way), self.n_query) topk_scores, topk_labels = scores.data.topk(1, 1, True, True) topk_ind = topk_labels.cpu().numpy() top1_correct = np.sum(topk_ind[:, 0] == y_query) return float(top1_correct), len(y_query) #if self.jigsaw: # pred = torch.max(x_,1) # top1_correct_jigsaw = torch.sum(pred[1] == y_) # return float(top1_correct), float(top1_correct_jigsaw), len(y_query), len(y_) #elif self.rotation: # pred = torch.max(x_,1) # top1_correct_rotation = torch.sum(pred[1] == y_) # return float(top1_correct), float(top1_correct_rotation), len(y_query), len(y_) #else: # return float(top1_correct), len(y_query) def set_forward(self, x, is_feature=False): z_support, z_query = self.parse_feature(x, is_feature) z_support = z_support.contiguous() z_proto = z_support.view(self.n_way, self.n_support, -1).mean( 1) #the shape of z is [n_data, n_dim] z_query = z_query.contiguous().view(self.n_way * self.n_query, -1) dists = euclidean_dist(z_query, z_proto) scores = -dists return scores def set_forward_unlabel(self, patches=None, patches_label=None): # print(patches.size()) if len(patches.size()) == 6: patches_support = patches[:, :self.n_support] ###support pathces Way, S, T, C, H, W = patches_support.size( ) #torch.Size([5, 5, 9, 3, 64, 64]) ###new B = Way * S elif len(patches.size()) == 5: B, T, C, H, W = patches.size() #torch.Size([5, 15, 9, 3, 75, 75]) if self.jigsaw: patches_support = patches_support.reshape( B * T, C, H, W).cuda() #torch.Size([225, 3, 64, 64]) ###new if self.dual_cbam: patch_feat = self.feature(patches_support, jigsaw=True) #torch.Size([225, 512]) else: patch_feat = self.feature( patches_support) #torch.Size([225, 512]) x_ = patch_feat.view(B, T, -1) ### [25,9,512] x_ = x_[:, torch.randperm(x_.size()[1])] x_ = x_.view(B, -1) #[25,4608] ###new v_t = self.projection_transformed_features(x_) ### [25,512] 
v_t = v_t.view(self.n_way, self.n_way, -1) ### [5,5,512] #x_ = x_.transpose(0,1)#torch.Size([9, 75, 512]) #x_list = [] #for i in range(9): # z = self.fc6(x_[i])#torch.Size([75, 512]) # z = z.view([B,1,-1])#torch.Size([75, 1, 512]) # x_list.append(z) #x_ = torch.cat(x_list,1)#torch.Size([75, 9, 512]) #x_ = (x_.view(B,-1))#torch.Size([105, 9*512]) #x_= self.projection_transformed_features(x_) # [105,512] #x_ = self.classifier(x_) #y_ = patches_label.view(-1).cuda() return v_t elif self.rotation: patches = patches.view(B * T, C, H, W).cuda() x_ = self.feature(patches) #torch.Size([64, 512, 1, 1]) x_ = x_.squeeze() x_ = self.fc6(x_) x_ = self.fc7(x_) #64,128 x_ = self.classifier_rotation(x_) #64,4 pred = torch.max(x_, 1) y_ = patches_label.view(-1).cuda() return x_, y_ def set_forward_loss(self, x): y_query = torch.from_numpy(np.repeat(range(self.n_way), self.n_query)) scores = self.set_forward(x) topk_scores, topk_labels = scores.data.topk(1, 1, True, True) topk_ind = topk_labels.cpu().numpy() acc = np.sum(topk_ind[:, 0] == y_query.numpy()) / len(y_query.numpy()) y_query = Variable(y_query.cuda()) return self.loss_fn(scores, y_query), acc def contrastive_loss(self, original_features, patch_features, negative_nb, index): ###new loss = 0 # rng = np.random.default_rng() # print(z_support.size()) #negatives = torch.empty(5,20,512) #negatives[0] = torch.cat((z_support[1], z_support[2], z_support[3], z_support[4])) #negatives[1] = torch.cat((z_support[0], z_support[2], z_support[3], z_support[4])) #negatives[2] = torch.cat((z_support[0], z_support[1], z_support[3], z_support[4])) #negatives[3] = torch.cat((z_support[0], z_support[1], z_support[2], z_support[4])) #negatives[4] = torch.cat((z_support[0], z_support[1], z_support[2], z_support[3])) for i in range(original_features.shape[0]): temp = 0.07 cos = torch.nn.CosineSimilarity() criterion = torch.nn.CrossEntropyLoss() ### Obtaining negative images N=20 # Index=np.array(range(0,original_features.shape[0])) ### [,25] # Index=np.delete(Index,i) ### [,24] # numbers = rng.choice(24, size=negative_nb, replace=False) # [1,20] #for j in range(negative_nb): # if(j==1): # negative=z_support[Index[numbers[j]]] # else: # negative=torch.cat((negative,z_support[Index[numbers[j]]])) ### Negative should have a size of [20,512] # negative = negatives[i//5] negative = self.memory.return_random(size=negative_nb, index=[index[i]]) negative = torch.Tensor(negative).to('cuda').detach() image_to_modification_similarity = cos( original_features[None, i, :], patch_features[None, i, :]) / temp ### [,1] matrix_of_similarity = cos(patch_features[None, i, :], negative) / temp ### [,20] similarities = torch.cat( (image_to_modification_similarity, matrix_of_similarity)) loss += criterion(similarities[None, :], torch.tensor([0]).to('cuda')) return loss / original_features.shape[0] def set_forward_loss_unlabel(self, x, patches=None, patches_label=None): ###new if self.jigsaw: #x_, y_ = self.set_forward_unlabel(patches=patches,patches_label=patches_label) #pred = torch.max(x_,1) #acc_jigsaw = torch.sum(pred[1] == y_).cpu().numpy()*1.0/len(y_) #x = x.contiguous().view( self.n_way * (self.n_support + self.n_query), *x.size()[2:]) v_t = self.set_forward_unlabel( patches=patches, patches_label=patches_label) ###new [5,5,512] v_t = v_t.view(25, -1) ###new [25,512] z_support, z_query = self.parse_feature(x, is_feature=False) ###new v = z_support ###new [5,5,512] # print(v[0][0]) v = v.reshape(-1, 512) # print(v[0]) # print(v.size()) # v=v.view(25,-1) ###new [25,512] indxs = [ i + 
self.indx for i in [ 0, 1, 2, 3, 4, 21, 22, 23, 24, 25, 42, 43, 44, 45, 46, 63, 64, 65, 66, 67, 84, 85, 86, 87, 88 ] ] representations = self.memory.return_representations(indxs).to( 'cuda').detach() negative_nb = 2000 loss_weight = 0.5 loss_1 = self.contrastive_loss(representations, v_t, negative_nb, indxs) loss_2 = self.contrastive_loss(representations, v, negative_nb, indxs) loss = loss_weight * loss_1 + (1 - loss_weight) * loss_2 self.memory.update(indxs, v.detach().cpu().numpy()) elif self.rotation: x_, y_ = self.set_forward_unlabel(patches=patches, patches_label=patches_label) pred = torch.max(x_, 1) acc_rotation = torch.sum( pred[1] == y_).cpu().numpy() * 1.0 / len(y_) if self.jigsaw: return loss elif self.rotation: return self.loss_fn(x_, y_), acc_rotation def parse_feature(self, x, is_feature): x = Variable(x.cuda()) if is_feature: z_all = x else: x = x.contiguous().view( self.n_way * (self.n_support + self.n_query), *x.size()[2:]) z_all = self.feature(x) z_all = z_all.view(self.n_way, self.n_support + self.n_query, -1) z_support = z_all[:, :self.n_support] z_query = z_all[:, self.n_support:] return z_support, z_query
class ActorCriticAgent:
    """
    Advantage Actor Critic agent
    """

    def __init__(self, num_actions, checkpoint=None):
        self.network, self.trainable_parameters = self.init_network(num_actions)
        self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
        self.memory = Memory()
        if checkpoint is not None:
            load_checkpoint(self.network, self.optimizer, checkpoint)

    def init_network(self, num_actions):
        network = {'actor_critic': ActorCritic(num_actions)}
        trainable_parameters = list(network['actor_critic'].parameters())
        return network, trainable_parameters

    def play(self, environment, max_games=1, max_steps=500, train=False, verbose=False, recorder=None):
        n_steps = 0
        n_games = 0
        current_game_infos = {'game': n_games + 1, 'reward': 0, 'game_duration': 0}
        observation = environment.reset()
        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        while (n_steps < max_steps) and (n_games < max_games):
            self.init_rollout(observation)

            for rollout_step in range(20):
                value, log_policy, action = self.network['actor_critic'](observation)
                self.memory.append({'value': value, 'log_policy': log_policy, 'action': action})

                observation, extrinsic_reward, is_game_over, infos = environment.step(action.numpy()[0])
                if recorder is not None:
                    recorder.record(environment)
                reward = self.get_reward(observation, extrinsic_reward)
                self.memory.append({'reward': reward})

                current_game_infos['reward'] += reward
                current_game_infos['game_duration'] += 1
                n_steps += 1

                if is_game_over:
                    n_games += 1
                    print(current_game_infos)
                    current_game_infos = {'game': n_games + 1, 'reward': 0, 'game_duration': 0}
                    observation = environment.reset()
                    break

            self.end_rollout(observation, is_game_over)
            if verbose:
                print(current_game_infos)

            if train:
                loss = self.compute_loss()
                self.backpropagate(loss)

        if recorder is not None:
            recorder.stop()

    def init_rollout(self, observation):
        self.memory.reset()
        self.network['actor_critic'].detach_internal_state()

    def end_rollout(self, observation, is_game_over):
        if is_game_over:
            next_value = torch.Tensor([[0]])
            self.network['actor_critic'].reset_internal_state()
        else:
            next_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': next_value})

    def get_reward(self, observation, extrinsic_reward):
        return np.clip(extrinsic_reward, -1, 1)

    def compute_loss(self):
        loss = self.network['actor_critic'].loss(self.memory)
        return loss

    def backpropagate(self, loss, max_gradient_norm=40):
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.trainable_parameters, max_gradient_norm)
        self.optimizer.step()
def train(input_placeholder, output_data, sess): # build cost function action = tf.placeholder("float", [None, ACTIONS_CHOICE_NUMBER]) y = tf.placeholder("float", [None]) y_action = tf.reduce_sum(tf.multiply(output_data, action), reduction_indices=1) cost = tf.reduce_mean(tf.square(y - y_action)) train_step = tf.train.AdamOptimizer(1e-6).minimize(cost) # start game game_state = game.GameState() global_timestamp = 0 epsilon = EPSILON memory = Memory(MEMORY_SIZE, FRAME_NUM_PER_STACK) # start network sess.run(tf.global_variables_initializer()) # network checkpoint saver and restore loader saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_FOLDER) if checkpoint and checkpoint.model_checkpoint_path and os.path.exists( os.path.join(CHECKPOINT_FOLDER, "global_state.pkl")): data = load_history(os.path.join(CHECKPOINT_FOLDER, "global_state.pkl")) global_timestamp = data['global_timestamp'] epsilon = data['epsilon'] memory = data['memory'] game_state = data['game_state'] saver.restore(sess, checkpoint.model_checkpoint_path) logging.info("restored from checkpoint", extra={ 'stage': get_stage_name(global_timestamp), 'timestamp': global_timestamp, 'epsilon': epsilon, 'reward': "", 'action': "" }) else: image_data, _, _ = game_state.frame_step(actions.NOTHING) memory.initial_stack(image_data) prev_state = memory.get_current_stack() while True: actions_scores = output_data.eval( feed_dict={input_placeholder: [prev_state]})[0] action_name, action_choice = actions.get_next_action( epsilon, actions_scores) image_data, reward, game_terminate = game_state.frame_step( action_choice) memory.stack_frame(image_data) new_state = memory.get_current_stack() memory.remember(prev_state, action_choice, reward, new_state, game_terminate) # anneal if global_timestamp > OBSERVE_DURATION and epsilon > MIN_EPSILON: logging.info("start anneal", extra={ 'stage': get_stage_name(global_timestamp), 'timestamp': global_timestamp, 'epsilon': epsilon, 'reward': reward, 'action': action_name }) epsilon -= float(EPSILON - MIN_EPSILON) / ANNEAL_DURATION # explore + train if global_timestamp > OBSERVE_DURATION: prev_state_batch, action_batch, reward_batch, new_state_batch, game_terminate_batch = memory.get_sample_batches( BATCH_SIZE) y_batch = [] evaluate = output_data.eval( feed_dict={input_placeholder: new_state_batch}) for i, game_terminate in enumerate(game_terminate_batch): # train target to reward if game_terminate: y_batch.append(reward_batch[i]) else: y_batch.append(reward_batch[i] + GAMMA * np.max(evaluate[i])) # gradient train_step.run( feed_dict={ y: y_batch, action: action_batch, input_placeholder: new_state_batch }) if global_timestamp % CHECKPOINT_GAP == 0: saver.save(sess, os.path.join(CHECKPOINT_FOLDER, 'flappy-bird'), global_step=global_timestamp) save_history( os.path.join(CHECKPOINT_FOLDER, 'global_state.pkl'), { 'global_timestamp': global_timestamp, 'epsilon': epsilon, 'memory': memory, 'game_state': game_state }) logging.info("checkpoint saved", extra={ 'stage': get_stage_name(global_timestamp), 'timestamp': global_timestamp, 'epsilon': epsilon, 'reward': reward, 'action': "" }) # update state prev_state = new_state logging.info("finish epoch", extra={ 'stage': get_stage_name(global_timestamp), 'timestamp': global_timestamp, 'epsilon': epsilon, 'reward': reward, 'action': action_name }) global_timestamp += 1
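# The Flappy Bird training function above expects Memory to do two jobs:
# maintain the rolling stack of the most recent frames (initial_stack /
# stack_frame / get_current_stack) and serve as a replay buffer (remember /
# get_sample_batches). A rough sketch of that interface, inferred only from
# the calls made above:
import random
from collections import deque

import numpy as np


class FrameStackMemory:
    """Replay buffer that also keeps the rolling stack of recent frames."""

    def __init__(self, size, frames_per_stack):
        self.experiences = deque(maxlen=size)
        self.frames = deque(maxlen=frames_per_stack)
        self.frames_per_stack = frames_per_stack

    def initial_stack(self, frame):
        # seed the stack by repeating the first frame
        for _ in range(self.frames_per_stack):
            self.frames.append(frame)

    def stack_frame(self, frame):
        self.frames.append(frame)

    def get_current_stack(self):
        # shape: (H, W, frames_per_stack)
        return np.stack(self.frames, axis=-1)

    def remember(self, prev_state, action, reward, new_state, terminal):
        self.experiences.append((prev_state, action, reward, new_state, terminal))

    def get_sample_batches(self, batch_size):
        batch = random.sample(self.experiences, batch_size)
        # returns prev_states, actions, rewards, new_states, terminals
        return [list(column) for column in zip(*batch)]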
def main(): parser = argparse.ArgumentParser() parser.add_argument("--env-interface", type=str, default='gym') parser.add_argument("--environment", type=str, default='BreakoutDeterministic-v4') parser.add_argument("--action-size", type=int, default=4) parser.add_argument("--input-shape", type=str, default='None,84,84,4') parser.add_argument("--state-len-max", type=int, default=4) parser.add_argument("--target-update-freq", type=int, default=10000) parser.add_argument("--ep-greedy-speed", type=str, default='slow') parser.add_argument("--epsilon-max", type=float, default=1.) parser.add_argument("--epsilon-min", type=float, default=.01) parser.add_argument("--epsilon-decay-slow", type=int, default=1000000) parser.add_argument("--epsilon-decay-fast", type=float, default=.001) parser.add_argument("--learning-rate", type=float, default=.95) parser.add_argument("--replay-start-size", type=int, default=50000) parser.add_argument("--batch-size", type=int, default=32) parser.add_argument("--replay-mem-size", type=int, default=1000000) parser.add_argument("--epochs", type=int, default=30000) parser.add_argument("--pixel-feature", type=int, default=1) parser.add_argument("--padding", type=int, default=0) parser.add_argument("--model", type=str, default='nature') args = parser.parse_args() args.input_shape = str2list(args.input_shape) assert args.model in ['nature', 'gated'] assert args.ep_greedy_speed in ['fast', 'slow'] assert args.env_interface in [ 'gym', 'ale', 'custom_cart', 'custom_cartpole', 'ple' ] if args.env_interface in ['gym', 'ale']: env = env_interface(args.env_interface, args.environment) elif args.env_interface in ['custom_cart', 'custom_cartpole', 'ple']: env = env_interface(args.env_interface, args.environment, bool(args.pixel_feature), bool(args.padding)) args.input_shape = [None] + list(env.obs_space_shape) + [1] args.input_shape[-1] = args.state_len_max args.action_size = env.action_size assert args.state_len_max == args.input_shape[-1] print args #Other other paramters state_old = [] state = [] steps = 0 #Other parameters if args.ep_greedy_speed == 'slow': epsilon = args.epsilon_max epsilon_rate = 0. if args.epsilon_decay_slow != 0: epsilon_rate = ((args.epsilon_max - args.epsilon_min) / float(args.epsilon_decay_slow)) elif args.ep_greedy_speed == 'fast': epsilon = args.epsilon_max #Initialize replay memory memory = Memory(args.replay_mem_size, args.input_shape[1:]) #Initialize neural net qnet, tnet, update_ops = init_network(args.input_shape, args.action_size, args.model) #import time with tf.Session() as sess: sess.run(tf.global_variables_initializer()) sess.run(update_ops) for epoch in range(args.epochs): frame = env.reset() total_rewards = 0. total_losses = 0. state_old = [] state = [frame] * args.state_len_max done = False #start = time.time() while done == False: if np.random.rand() < epsilon: action = np.random.randint(args.action_size) else: image_in = np.stack(state, axis=-1)[np.newaxis, ...] 
action = qnet.get_action(sess, image_in) frame, reward, done, _ = env.step(action) total_rewards += reward state_old = state[:] state.append(frame) if len(state) > args.state_len_max: state = state[1:] #Add to memory memory.add([ np.stack(state_old, axis=-1)[np.newaxis, ...], action, min(1., max(-1., reward)), np.stack(state, axis=-1)[np.newaxis, ...], done ]) #Reduce epsilon if args.ep_greedy_speed == 'slow': epsilon = max(args.epsilon_min, epsilon - epsilon_rate) elif args.ep_greedy_speed == 'fast': epsilon = args.epsilon_min + ( args.epsilon_max - args.epsilon_min) * np.exp( -args.epsilon_decay_fast * float(steps)) if steps > args.replay_start_size: #Training step batch = np.array(memory.sample(args.batch_size)) states = np.concatenate(batch[:, 0], axis=0) actions = batch[:, 1] rewards = batch[:, 2] states1 = np.concatenate(batch[:, 3], axis=0) dones = batch[:, 4] l = qnet.train(sess, states, actions, rewards, states1, dones, args.learning_rate, tnet) total_losses += l #Increase the frame steps counter steps += 1 #Check if target network is to be updated if steps % args.target_update_freq == 0: print "Updating target..." sess.run(update_ops) if done == True: print "epoch:", epoch, "total rewards", total_rewards, "total losses", total_losses, qnet.string #print 'time:', time.time() - start break env.close()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='CartPole-v0')
    parser.add_argument("--action-size", type=int, default=2)
    parser.add_argument("--input-shape", type=list, default=[None, 4])
    parser.add_argument("--target-update-freq", type=int, default=200)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=float, default=.001)
    parser.add_argument("--learning-rate", type=float, default=.99)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--epochs", type=int, default=300)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    args = parser.parse_args()

    env = gym.make(args.environment)
    args.action_size = env.action_space.n
    args.input_shape = [None] + list(env.observation_space.shape)
    print args

    # Epsilon parameter
    epsilon = args.epsilon_max

    # Replay memory
    memory = Memory(args.replay_mem_size)

    # Time step
    time_step = 0.

    # Initialize the agent
    qnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='qnet')
    tnet = qnetwork(input_shape=args.input_shape, action_size=args.action_size, scope='tnet')
    update_ops = update_target_graph('qnet', 'tnet')

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(args.epochs):
            total_reward = 0
            state = env.reset()
            while True:
                #env.render()
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    action = qnet.act(sess, state)

                next_state, reward, done, _ = env.step(action)
                total_reward += reward

                # Add to memory
                memory.add([state, action, reward, next_state, done])

                # Reduce epsilon
                time_step += 1.
                epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(-args.epsilon_decay * time_step)

                # Training step
                batch = np.array(memory.sample(args.batch_size))
                qnet.train(sess, batch, args.learning_rate, tnet)

                # s <- s'
                state = np.copy(next_state)

                # Update target network
                if int(time_step) % args.target_update_freq == 0:
                    sess.run(update_ops)

                if done:
                    print 'epoch:', epoch, 'total_rewards:', total_reward
                    break
print(device)

actor = Actor(obs_space=config.obs_space, action_space=config.action_space, hidden_size=config.hidden_size).to(device)
critic = Critic(obs_space=config.obs_space, hidden_size=config.hidden_size).to(device)
# actor.load_state_dict(torch.load('actor_model.h5'))
# critic.load_state_dict(torch.load('critic_model.h5'))

wandb.watch(actor)
wandb.watch(critic)

optimizer_actor = Adam(actor.parameters(), lr=config.actor_lr)
optimizer_critic = Adam(critic.parameters(), lr=config.critic_lr)
memory = Memory(env.agent_ids)


def compute_GAE(rewards, state_values, done, gamma, lamb):
    """
    Computes Generalized Advantage Estimations.
    """
    returns = [rewards[-1] + state_values[-1]]
    running_sum = rewards[-1] - state_values[-1]
    for i in reversed(range(len(rewards) - 1)):
        mask = 0 if done[i + 1] else 1
        delta = rewards[i] + gamma * state_values[i + 1] * mask - state_values[i]
        running_sum = delta + gamma * lamb * running_sum * mask
        returns.insert(0, running_sum + state_values[i])
class Agent:
    def __init__(self, level_name):
        # state_size, action_size, learning_rate, memory_size, stack_size, pretrain_length,
        # total_episodes, max_steps, batch_size, gamma, explore_* and episode_render are
        # module-level hyperparameters defined elsewhere in this file.
        self.level_name = level_name
        # set up the environment
        self.env = gym_super_mario_bros.make(level_name)
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        # one-hot encoded version of our actions
        self.possible_actions = np.array(np.identity(self.env.action_space.n, dtype=int).tolist())
        # reset the graph
        tf.reset_default_graph()
        # instantiate the DQNetwork
        self.DQNetwork = DQNetwork(state_size, action_size, learning_rate)
        # instantiate memory
        self.memory = Memory(max_size=memory_size)
        # initialize the deque with zero images
        self.stacked_frames = deque([np.zeros((100, 128), dtype=int) for i in range(stack_size)], maxlen=4)

        # pre-fill the replay memory with experiences from a random policy
        for i in range(pretrain_length):
            # if it's the first step
            if i == 0:
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            # get the next state, the reward and done by taking a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
            next_state, reward, done, _ = self.env.step(choice)
            # stack the frames
            next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
            # if the episode is finished (we're dead)
            if done:
                # we finished the episode
                next_state = np.zeros(state.shape)
                # add the experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # start a new episode
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
            else:
                # add the experience to memory
                self.memory.add((state, action, reward, next_state, done))
                # our new state is now the next_state
                state = next_state

        # saver will help us save our model
        self.saver = tf.train.Saver()
        # set up the tensorboard writer
        self.writer = tf.summary.FileWriter("logs/")
        # losses
        tf.summary.scalar("Loss", self.DQNetwork.loss)
        self.write_op = tf.summary.merge_all()

    def predict_action(self, sess, explore_start, explore_stop, decay_rate, decay_step, state, actions):
        # first we draw a random number
        exp_exp_tradeoff = np.random.rand()
        explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
        if explore_probability > exp_exp_tradeoff:
            # take a random action
            choice = random.randint(1, len(self.possible_actions)) - 1
            action = self.possible_actions[choice]
        else:
            # estimate the Q values of the state
            Qs = sess.run(self.DQNetwork.output,
                          feed_dict={self.DQNetwork.inputs_: state.reshape((1, *state.shape))})
            # take the biggest Q value (= best action)
            choice = np.argmax(Qs)
            action = self.possible_actions[choice]
        return action, choice, explore_probability

    def play_notebook(self):
        import matplotlib.pyplot as plt
        # imports to render the env to a gif
        from JSAnimation.IPython_display import display_animation
        from matplotlib import animation
        from IPython.display import display

        # http://mckinziebrandon.me/TensorflowNotebooks/2016/12/21/openai.html
        def display_frames_as_gif(frames):
            """Displays a list of frames as a gif, with controls."""
            #plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
            patch = plt.imshow(frames[0])
            plt.axis('off')

            def animate(i):
                patch.set_data(frames[i])

            anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
            display(display_animation(anim, default_mode='loop'))

        frames = []
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.ckpt".format(self.level_name))
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get the action from the Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    frames.append(self.env.render(mode='rgb_array'))
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()
        display_frames_as_gif(frames)

    def play(self):
        with tf.Session() as sess:
            total_test_rewards = []
            # load the model
            self.saver.restore(sess, "models/{0}.ckpt".format(self.level_name))
            #self.env = wrap_env(self.env)
            for episode in range(1):
                total_rewards = 0
                state = self.env.reset()
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
                print("****************************************************")
                print("EPISODE ", episode)
                while True:
                    # reshape the state
                    state = state.reshape((1, *state_size))
                    # get the action from the Q-network: estimate the Q values of the state
                    Qs = sess.run(self.DQNetwork.output, feed_dict={self.DQNetwork.inputs_: state})
                    # take the biggest Q value (= the best action)
                    choice = np.argmax(Qs)
                    # perform the action and get the next_state, reward and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    self.env.render()
                    total_rewards += reward
                    if done:
                        print("Score", total_rewards)
                        total_test_rewards.append(total_rewards)
                        break
                    next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                    state = next_state
            self.env.close()

    def train(self):
        with tf.Session() as sess:
            # initialize the variables
            sess.run(tf.global_variables_initializer())
            # initialize the decay step (used to reduce epsilon)
            decay_step = 0
            for episode in range(total_episodes):
                # set step to 0
                step = 0
                # initialize the rewards of the episode
                episode_rewards = []
                # loss is only defined once the first learning step of the episode has run
                loss = None
                # make a new episode and observe the first state
                state = self.env.reset()
                # remember the stack_frames function
                state, self.stacked_frames = stack_frames(self.stacked_frames, state, True)
                print("Episode:", episode)
                while step < max_steps:
                    step += 1
                    # increase decay_step
                    decay_step += 1
                    # predict an action
                    action, choice, explore_probability = self.predict_action(
                        sess, explore_start, explore_stop, decay_rate, decay_step, state, self.possible_actions)
                    # perform the action and get the next_state, reward and done information
                    next_state, reward, done, _ = self.env.step(choice)
                    if episode_render:
                        self.env.render()
                    # add the reward to the episode rewards
                    episode_rewards.append(reward)
                    if done:
                        # the game is finished
                        print("done")
                        # the episode ends, so there is no next state
                        next_state = np.zeros((110, 84), dtype=int)
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                        # set step = max_steps to end the episode
                        step = max_steps
                        # get the total reward of the episode
                        total_reward = np.sum(episode_rewards)
                        print("Episode:", episode,
                              "Total reward:", total_reward,
                              "Explore P:", explore_probability,
                              "Training Loss:", loss)
                        #rewards_list.append((episode, total_reward))
                        # store the transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                    else:
                        # stack the frames of the next state
                        next_state, self.stacked_frames = stack_frames(self.stacked_frames, next_state, False)
                        # store the transition <s_i, a, r_{i+1}, s_{i+1}> in memory
                        self.memory.add((state, action, reward, next_state, done))
                        # s_i := s_{i+1}
                        state = next_state

                    ### Learning part
                    # obtain a random mini-batch from memory
                    batch = self.memory.sample(batch_size)
                    states_mb = np.array([each[0] for each in batch], ndmin=3)
                    actions_mb = np.array([each[1] for each in batch])
                    rewards_mb = np.array([each[2] for each in batch])
                    next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                    dones_mb = np.array([each[4] for each in batch])
                    target_Qs_batch = []
                    # get the Q values of next_state
                    Qs_next_state = sess.run(self.DQNetwork.output,
                                             feed_dict={self.DQNetwork.inputs_: next_states_mb})
                    # set Q_target = r if the episode ends at s+1, otherwise r + gamma * max_a' Q(s', a')
                    for i in range(len(batch)):
                        terminal = dones_mb[i]
                        if terminal:
                            # in a terminal state the target only equals the reward
                            target_Qs_batch.append(rewards_mb[i])
                        else:
                            target = rewards_mb[i] + gamma * np.max(Qs_next_state[i])
                            target_Qs_batch.append(target)
                    targets_mb = np.array([each for each in target_Qs_batch])
                    loss, _ = sess.run([self.DQNetwork.loss, self.DQNetwork.optimizer],
                                       feed_dict={self.DQNetwork.inputs_: states_mb,
                                                  self.DQNetwork.target_Q: targets_mb,
                                                  self.DQNetwork.actions_: actions_mb})
                    # write the tf summaries
                    summary = sess.run(self.write_op,
                                       feed_dict={self.DQNetwork.inputs_: states_mb,
                                                  self.DQNetwork.target_Q: targets_mb,
                                                  self.DQNetwork.actions_: actions_mb})
                    self.writer.add_summary(summary, episode)
                    self.writer.flush()
                # save the model every 5 episodes
                if episode % 5 == 0:
                    self.saver.save(sess, "models/{0}.ckpt".format(self.level_name))
                    print("Model Saved")
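# The Memory class the agent above relies on is not shown in this section; as a rough
# illustration, a minimal deque-based replay buffer that matches how it is called here
# (Memory(max_size=...), .add(experience), .sample(batch_size)) could be sketched as
# follows. This is an assumed sketch, not the repository's actual implementation.
from collections import deque

import numpy as np


class Memory:
    """Minimal uniform experience-replay buffer (sketch only)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def add(self, experience):
        # experience is a (state, action, reward, next_state, done) tuple
        self.buffer.append(experience)

    def sample(self, batch_size):
        # uniform sampling without replacement, capped at the current buffer size
        size = min(batch_size, len(self.buffer))
        idx = np.random.choice(len(self.buffer), size=size, replace=False)
        return [self.buffer[i] for i in idx]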
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--environment", type=str, default='Pendulum-v0')
    parser.add_argument("--action-dim", type=int, default=1)
    parser.add_argument("--state-dim", type=int, default=1)
    #parser.add_argument("--epochs", type=int, default=30000)
    parser.add_argument("--time-steps", type=int, default=30000)
    parser.add_argument('--tau', type=float, help='soft target update parameter', default=0.01)
    parser.add_argument("--action-bound", type=float, default=1.)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=.9)
    parser.add_argument("--latent-size", type=int, default=4, help='Size of vector for Z')
    parser.add_argument("--model", type=str, default='gan')
    parser.add_argument("--mode", type=str, default='none')
    args = parser.parse_args()

    assert args.mode in ['none', 'test', 'transfer']
    assert args.model in ['mlp', 'gan', 'gated', 'dmlac_mlp', 'dmlac_gan', 'dmlac_gated',
                          'ddpg_unrolled_pg_mlp', 'dmlac_gp', 'dmlac_truth', 'mpc']
    if args.model == 'dmlac_truth':
        assert args.environment == 'Pendulum-v0'

    # Initialize the environment
    env = gym.make(args.environment)
    args.state_dim = env.observation_space.shape[0]
    args.action_dim = env.action_space.shape[0]
    #assert args.action_dim == 1
    args.action_bound_high = env.action_space.high
    args.action_bound_low = env.action_space.low
    # the code below assumes a symmetric action space
    assert len(args.action_bound_high) == len(args.action_bound_low)
    for i in range(len(args.action_bound_high)):
        assert args.action_bound_high[i] == -args.action_bound_low[i]
    print(args)

    jointddpg, update_target_actor, update_target_critic, copy_target_actor, copy_target_critic = init_model(
        [None, args.state_dim], args.action_dim, args.latent_size, args.learning_rate,
        args.action_bound_low, args.action_bound_high, args.tau, args.model)

    # Replay memory
    memory = Memory(args.replay_mem_size)
    # Actor exploration noise
    exploration_strategy = OUStrategy(jointddpg, env)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        #sess.run(copy_target_critic)
        #sess.run(copy_target_actor)
        if args.mode in ['test', 'transfer']:
            env.seed(1)
        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(args.time_steps):
            env.render()
            # Choose an action; the exploration coefficient decays polynomially to 0
            exploration = (float(args.time_steps - time_steps) / float(args.time_steps)) ** 4
            action = exploration_strategy.action(sess, state[np.newaxis, ...], exploration)
            # Execute the action
            state1, reward, done, _ = env.step(action)
            total_rewards += float(reward)
            # Store the tuple in replay memory
            memory.add([state[np.newaxis, ...], action[np.newaxis, ...], reward,
                        state1[np.newaxis, ...], done])

            # Training step
            batch_B = np.array(memory.sample(args.batch_size))
            assert len(batch_B) > 0
            states_B = np.concatenate(batch_B[:, 0], axis=0)
            actions_B = np.concatenate(batch_B[:, 1], axis=0)
            rewards_B = batch_B[:, 2]
            states1_B = np.concatenate(batch_B[:, 3], axis=0)
            dones_B = batch_B[:, 4]

            # Get another batch
            batch_M = np.array(memory.sample(args.batch_size))
            assert len(batch_M) > 0
            states_M = np.vstack(batch_M[:, 0])
            actions_M = np.concatenate(batch_M[:, 1], axis=0)

            if args.model == 'dmlac_gp':
                jointddpg.update_hist(memory)

            jointddpg.train(sess, states_B, actions_B, rewards_B, states1_B, dones_B,
                            states_M, actions_M, len(batch_M), args.latent_size)

            # Update the target networks
            #jointddpg.update(self, sess, update_target_critic, update_target_actor)
            #sess.run(update_target_critic)
            #sess.run(update_target_actor)

            state = np.copy(state1)
            if done:
                print('time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards)
                epoch += 1
                total_rewards = 0.
                if args.mode == 'transfer':
                    if time_steps >= args.time_steps / 3:
                        env.seed(0)
                    else:
                        env.seed(1)
                elif args.mode == 'test':
                    env.seed(1)
                state = env.reset()
            if args.mode == 'transfer':
                # in transfer mode, the replay memory is reset a third of the way through training
                if time_steps == args.time_steps / 3:
                    memory = Memory(args.replay_mem_size)
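# OUStrategy above is project-specific and not shown in this section. As a rough
# illustration of the underlying idea, an Ornstein-Uhlenbeck noise process whose output
# is scaled by the decaying exploration coefficient could be sketched like this; the
# class name, defaults and usage line are assumptions, not the repository's code.
import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process (sketch of the kind of noise an OU strategy adds)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = np.copy(self.mu)

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the process mean-reverts toward mu
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state


# hypothetical usage: perturb a deterministic action, scaling the noise by the
# exploration coefficient computed in the training loop above
# noisy_action = np.clip(action + exploration * noise.sample(), action_low, action_high)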
class DDPGagent:
    def __init__(self, hidden_size, env):
        self.num_states = env.observation_space.shape[0]
        self.num_actions = env.action_space.shape[0]

        # actor / critic networks and their target copies
        self.Actor = Actor(input_size=self.num_states, hidden_size=hidden_size,
                           output_size=self.num_actions).cuda()
        self.Actor_target = Actor(input_size=self.num_states, hidden_size=hidden_size,
                                  output_size=self.num_actions).cuda()
        self.Critic = Critic(input_size=self.num_states, hidden_size=hidden_size,
                             output_size=self.num_actions).cuda()
        self.Critic_target = Critic(input_size=self.num_states, hidden_size=hidden_size,
                                    output_size=self.num_actions).cuda()

        # hard-copy the weights into the target networks
        # (copy_ avoids aliasing the online networks' parameter tensors)
        for target_param, param in zip(self.Actor_target.parameters(), self.Actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.Critic_target.parameters(), self.Critic.parameters()):
            target_param.data.copy_(param.data)

        self.Memory = Memory(30000)
        self.criterion = nn.MSELoss().cuda()
        self.actor_optimizer = torch.optim.Adam(self.Actor.parameters(), lr=1e-2)
        self.critic_optimizer = torch.optim.Adam(self.Critic.parameters(), lr=1e-1)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).cuda()
        action = self.Actor.forward(state)
        action = action.detach().cpu().numpy()
        return action

    def update(self, batch_size):
        states, actions, rewards, next_states, _ = self.Memory.sample(batch_size)
        states = torch.tensor(states).float().cuda()
        actions = torch.tensor(actions).float().cuda()
        rewards = torch.tensor(rewards).float().cuda()
        next_states = torch.tensor(next_states).float().cuda()

        # critic loss: TD target r + gamma * Q'(s', mu'(s'))
        # (terminal transitions are not masked here, and rewards are assumed to be
        # shaped so that they broadcast against next_Q)
        Q_Value = self.Critic.forward(states, action=actions)
        next_actions = self.Actor_target(next_states)
        next_Q = self.Critic_target.forward(next_states, next_actions.detach())
        Q_prime = rewards + 0.99 * next_Q
        critic_loss = self.criterion(Q_Value, Q_prime)

        # actor loss: maximize Q(s, mu(s))
        policy_loss = -self.Critic.forward(states, self.Actor.forward(states)).mean()

        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # soft-update the target networks with tau = 1e-2
        for target_param, param in zip(self.Actor_target.parameters(), self.Actor.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
        for target_param, param in zip(self.Critic_target.parameters(), self.Critic.parameters()):
            target_param.data.copy_(param.data * 1e-2 + target_param.data * (1.0 - 1e-2))
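# The target-network bookkeeping in DDPGagent is written out inline twice. The same
# logic is often factored into two small helpers; a sketch under the assumption that
# both networks are ordinary nn.Module instances (the helper names are mine, not the
# repository's):
import torch.nn as nn


def hard_update(target: nn.Module, source: nn.Module) -> None:
    """Copy the source parameters into the target (used once, right after construction)."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(s.data)


def soft_update(target: nn.Module, source: nn.Module, tau: float = 1e-2) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for t, s in zip(target.parameters(), source.parameters()):
        t.data.copy_(tau * s.data + (1.0 - tau) * t.data)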
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, default='Pendulum-v0')
    parser.add_argument("--unroll-steps", type=int, default=25)
    parser.add_argument("--no-samples", type=int, default=20)
    parser.add_argument("--no-basis", type=int, default=256)
    parser.add_argument("--discount-factor", type=float, default=.9)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--train-policy-batch-size", type=int, default=32)
    parser.add_argument("--train-policy-iterations", type=int, default=30)
    parser.add_argument("--replay-start-size-epochs", type=int, default=2)
    parser.add_argument("--train-hyperparameters-iterations", type=int, default=50000)
    parser.add_argument("--goal-position", type=float, default=.45)
    args = parser.parse_args()
    print(args)

    #env = gym.make(args.env, goal_position=args.goal_position)
    env = gym.make(args.env)
    # note: the goal position is also reused as the environment seed here
    env.seed(seed=args.goal_position)

    # Gather data to train the hyperparameters
    data = []
    rewards = []
    dones = []
    for _ in range(2):
        state = env.reset()
        while True:
            action = np.random.uniform(env.action_space.low, env.action_space.high, 1)
            next_state, reward, done, _ = env.step(action)
            data.append([state, action, next_state])
            rewards.append(reward)
            dones.append(done)
            state = np.copy(next_state)
            if done:
                break

    states, actions, next_states = [np.stack(d, axis=0) for d in zip(*data)]
    permutation = np.random.permutation(len(data))
    states_actions = np.concatenate([states, actions], axis=-1)[permutation]
    next_states = next_states[permutation]

    # Train the hyperparameters (one search per state dimension)
    hs = [hyperparameter_search(dim=env.observation_space.shape[0] + env.action_space.shape[0])
          for _ in range(env.observation_space.shape[0])]
    hyperparameters = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_size = 32
        iterations = args.train_hyperparameters_iterations
        #idxs = [np.random.randint(len(states_actions), size=batch_size) for _ in range(iterations)]
        for i in range(len(hs)):
            hs[i].train_hyperparameters(sess, states_actions, next_states[:, i], iterations, batch_size)
            hyperparameters.append(sess.run([hs[i].length_scale, hs[i].signal_sd, hs[i].noise_sd]))

    blr = blr_model(x_dim=env.observation_space.shape[0] + env.action_space.shape[0],
                    y_dim=env.observation_space.shape[0],
                    state_dim=env.observation_space.shape[0],
                    action_dim=env.action_space.shape[0],
                    observation_space_low=env.observation_space.low,
                    observation_space_high=env.observation_space.high,
                    action_bound_low=env.action_space.low,
                    action_bound_high=env.action_space.high,
                    unroll_steps=args.unroll_steps,
                    no_samples=args.no_samples,
                    no_basis=args.no_basis,
                    discount_factor=args.discount_factor,
                    train_policy_batch_size=args.train_policy_batch_size,
                    train_policy_iterations=args.train_policy_iterations,
                    hyperparameters=hyperparameters,
                    debugging_plot=False)

    # Initialize the memory
    memory = Memory(args.replay_mem_size)
    assert len(data) == len(rewards)
    assert len(data) == len(dones)
    for dat, reward, done in zip(data, rewards, dones):
        memory.add([dat[0], dat[1], reward, dat[2], done])
    memory2 = []

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        weights = pickle.load(open('../custom_environments/weights/mountain_car_continuous_reward'
                                   + str(args.goal_position) + '.p', 'rb'))
        sess.run(blr.assign_ops, feed_dict=dict(zip(blr.placeholders_reward, weights)))

        # Update the model with the data already used to train the hyperparameters
        blr.update(sess, states_actions, next_states)
        blr.train(sess, memory)

        state = env.reset()
        total_rewards = 0.0
        epoch = 1
        for time_steps in range(30000):
            action = blr.act(sess, state)
            next_state, reward, done, _ = env.step(action)
            total_rewards += float(reward)
            # Append to the batch
            memory.add([state, action, reward, next_state, done])
            memory2.append([state, action, reward, next_state, done])
            # s <- s'
            state = np.copy(next_state)
            if done:
                print('time steps', time_steps, 'epoch', epoch, 'total rewards', total_rewards)
                # Update the model with the transitions from the finished episode
                blr.update(sess, memory=memory2)
                # Train the policy
                blr.train(sess, memory)
                epoch += 1
                total_rewards = 0.
                state = env.reset()
                memory2 = []
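# The random-policy data-gathering loop at the top of the main() above is a reusable
# pattern. A small helper that collects such rollouts into the lists the hyperparameter
# search expects might look like this; it is a sketch assuming the standard gym step
# API, and the function name is mine, not the repository's.
import numpy as np


def collect_random_rollouts(env, episodes=2):
    """Roll out a uniform-random policy; return (state, action, next_state) triples
    plus the corresponding rewards and done flags."""
    data, rewards, dones = [], [], []
    for _ in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = np.random.uniform(env.action_space.low, env.action_space.high,
                                       env.action_space.shape[0])
            next_state, reward, done, _ = env.step(action)
            data.append([state, action, next_state])
            rewards.append(reward)
            dones.append(done)
            state = np.copy(next_state)
    return data, rewards, dones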
def main():
    # Arguments for the q-learner
    parser = argparse.ArgumentParser()
    parser.add_argument("--env-interface", type=str, default='gym')
    parser.add_argument("--environment", type=str, default='BreakoutDeterministic-v4')
    parser.add_argument("--action-size", type=int, default=4)
    parser.add_argument("--input-shape", type=str, default='None,84,84,4')
    parser.add_argument("--state-len-max", type=int, default=4)
    parser.add_argument("--target-update-freq", type=int, default=10000)
    parser.add_argument("--epsilon-max", type=float, default=1.)
    parser.add_argument("--epsilon-min", type=float, default=.01)
    parser.add_argument("--epsilon-decay", type=int, default=1000000)
    parser.add_argument("--learning-rate", type=float, default=.95)
    parser.add_argument("--replay-start-size", type=int, default=50000)
    parser.add_argument("--batch-size", type=int, default=32)
    parser.add_argument("--replay-mem-size", type=int, default=1000000)
    parser.add_argument("--epochs", type=int, default=30000)

    # Arguments for the feature extractor
    parser.add_argument("--train-fe-shape", type=str, default='None,12,12,4')
    parser.add_argument("--stop-gradient", type=int, default=0)
    parser.add_argument("--train-fe-iterations", type=int, default=1000)
    parser.add_argument("--train-fe-batch-size", type=int, default=100)
    parser.add_argument("--train-fe-lamb", type=float, default=0.)
    parser.add_argument("--train-fe-numfactors", type=int, default=200)
    parser.add_argument("--train-fe-nummap", type=int, default=100)
    parser.add_argument("--train-fe-learning-rate", type=float, default=.001)
    parser.add_argument("--train-fe-w", type=int, default=12)
    parser.add_argument("--train-fe-s", type=int, default=1)
    parser.add_argument("--use-conv-after-fe", type=int, default=0)
    parser.add_argument("--ep-greedy-speed", type=str, default='slow')

    # Arguments for the environment interface
    parser.add_argument("--pixel-features", type=int, default=1)
    parser.add_argument("--padding", type=int, default=0)
    args = parser.parse_args()

    # Parse arguments with respect to other arguments
    args.input_shape = str2list(args.input_shape)
    args.train_fe_shape = str2list(args.train_fe_shape)
    assert args.env_interface in ['gym', 'ale', 'custom_cart', 'custom_cartpole']
    assert args.ep_greedy_speed in ['fast', 'slow']

    env = env_interface(args.env_interface,
                        args.environment,
                        pixel_feature=bool(args.pixel_features),
                        padding=bool(args.padding),
                        render=True)
    args.action_size = env.action_size
    if args.env_interface in ['custom_cart', 'custom_cartpole']:
        args.input_shape = [None] + list(env.obs_space_shape) + [args.state_len_max]
    args.train_fe_shape[-1] = args.state_len_max
    print(args)

    # State-tracking variables
    state_old = []
    state = []
    steps = 0

    # Epsilon-greedy parameters
    epsilon_lambda = .001
    epsilon = args.epsilon_max
    epsilon_rate = 0.
    if args.epsilon_decay != 0:
        epsilon_rate = (args.epsilon_max - args.epsilon_min) / float(args.epsilon_decay)

    # Initialize the replay memory
    print(args.input_shape)
    memory = Memory(args.replay_mem_size, args.input_shape[1:])

    # Initialize the neural nets (online network qnet and target network tnet)
    from gated_qlearning import gated_qlearning
    qnet = gated_qlearning(shape=args.train_fe_shape,
                           nummap=args.train_fe_nummap,
                           numfactors=args.train_fe_numfactors,
                           learning_rate=args.train_fe_learning_rate,
                           frame_shape=args.input_shape,
                           a_size=args.action_size,
                           stop_gradient=bool(args.stop_gradient),
                           lamb=args.train_fe_lamb,
                           w=args.train_fe_w,
                           s=args.train_fe_s,
                           use_conv_after_fe=bool(args.use_conv_after_fe))
    qnet_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)

    tnet = gated_qlearning(shape=args.train_fe_shape,
                           nummap=args.train_fe_nummap,
                           numfactors=args.train_fe_numfactors,
                           learning_rate=args.train_fe_learning_rate,
                           frame_shape=args.input_shape,
                           a_size=args.action_size,
                           stop_gradient=bool(args.stop_gradient),
                           lamb=args.train_fe_lamb,
                           w=args.train_fe_w,
                           s=args.train_fe_s,
                           use_conv_after_fe=bool(args.use_conv_after_fe))
    tnet_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)[len(qnet_vars):]

    update_ops = update_target_graph_vars(qnet_vars, tnet_vars)
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        sess.run(update_ops)
        for epoch in range(args.epochs):
            frame = env.reset()
            total_rewards = 0.
            total_losses = 0.
            state_old = []
            state = [frame] * args.state_len_max
            done = False

            while not done:
                # epsilon-greedy action selection
                if np.random.rand() < epsilon:
                    action = np.random.randint(args.action_size)
                else:
                    image_in = np.stack(state, axis=-1)[np.newaxis, ...]
                    action = qnet.get_action(sess, image_in)

                frame, reward, done, _ = env.step(action)
                total_rewards += reward
                state_old = state[:]
                state.append(frame)
                if len(state) > args.state_len_max:
                    state = state[1:]

                # Add the transition (with the reward clipped to [-1, 1]) to memory
                memory.add([np.stack(state_old, axis=-1)[np.newaxis, ...],
                            action,
                            min(1., max(-1., reward)),
                            np.stack(state, axis=-1)[np.newaxis, ...],
                            done])

                # Reduce epsilon
                if args.ep_greedy_speed == 'slow':
                    epsilon = max(args.epsilon_min, epsilon - epsilon_rate)
                elif args.ep_greedy_speed == 'fast':
                    epsilon = args.epsilon_min + (args.epsilon_max - args.epsilon_min) * np.exp(
                        -epsilon_lambda * float(steps))

                # Train the reconstruction loss of the feature extractor
                if args.train_fe_iterations > 0:
                    args.train_fe_iterations -= qnet.train_feature_extractor(
                        sess, memory, args.train_fe_batch_size, 10)
                    print(args.train_fe_iterations)

                if steps > args.replay_start_size and args.train_fe_iterations <= 0:
                    # Training step
                    batch = np.array(memory.sample(args.batch_size))
                    states = np.concatenate(batch[:, 0], axis=0)
                    actions = batch[:, 1]
                    rewards = batch[:, 2]
                    states1 = np.concatenate(batch[:, 3], axis=0)
                    dones = batch[:, 4]
                    Q1 = qnet.get_Q1(sess, states1, tnet)
                    # note: the --learning-rate flag is used as the discount factor here
                    targetQ = rewards + (1. - dones) * args.learning_rate * np.amax(Q1, keepdims=False, axis=1)
                    l, _, _ = qnet.train(sess, states, actions, targetQ[..., np.newaxis])
                    total_losses += l

                # Increase the frame-step counter
                steps += 1
                # Check whether the target network is due for an update
                if steps % args.target_update_freq == 0:
                    print("Updating target...")
                    sess.run(update_ops)

                if done:
                    print("epoch", epoch, "total rewards", total_rewards, "total losses", total_losses)
                    break
    env.close()
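# The target computation in the training step above packs the Bellman backup into a
# single line. Written out as a standalone function (with an explicit gamma instead of
# the reused --learning-rate flag), the same calculation is simply the following; the
# function is an illustrative sketch, not part of the repository.
import numpy as np


def q_learning_targets(rewards, dones, q_next, gamma=0.95):
    """Bellman targets r + gamma * max_a' Q(s', a'); terminal states bootstrap to r only.

    rewards: shape (batch,), dones: shape (batch,) of 0/1 flags,
    q_next:  shape (batch, num_actions) of target-network Q values.
    """
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=np.float32)
    return rewards + (1.0 - dones) * gamma * np.max(q_next, axis=1)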