def __init__(self, env):
    self.env = env
    state_shape = self.env.observation_space.shape
    action_dim = self.env.action_space.shape[1]

    # for now, with single machine synchronous training, use a replay memory for training.
    # this replay memory stores states in a Variable (ie potentially in gpu memory)
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(opts.replay_memory_size,
                                                    state_shape, action_dim)

    # s1 and s2 placeholders
    batched_state_shape = [None] + list(state_shape)
    s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
    s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

    # initialise base models for actor / critic and their corresponding target networks
    # target_actor is never used for online sampling so doesn't need explore noise.
    self.actor = ActorNetwork("actor", s1, action_dim)
    self.critic = CriticNetwork("critic", self.actor)
    self.target_actor = ActorNetwork("target_actor", s2, action_dim)
    self.target_critic = CriticNetwork("target_critic", self.target_actor)

    # setup training ops;
    # training actor requires the critic (for getting gradients)
    # training critic requires target_critic (for RHS of bellman update)
    self.actor.init_ops_for_training(self.critic)
    self.critic.init_ops_for_training(self.target_critic)
def __init__(self, env, agent_opts):
    self.env = env
    state_dim = self.env.observation_space.shape[0]
    action_dim = self.env.action_space.shape[1]

    # for now, with single machine synchronous training, use a replay memory for training.
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(agent_opts.replay_memory_size,
                                                    state_dim, action_dim)

    # initialise base models for actor / critic and their corresponding target networks
    # target_actor is never used for online sampling so doesn't need explore noise.
    self.actor = ActorNetwork("actor", state_dim, action_dim,
                              agent_opts.actor_hidden_layers,
                              agent_opts.action_noise_theta,
                              agent_opts.action_noise_sigma,
                              agent_opts.actor_activation_init_magnitude)
    self.critic = CriticNetwork("critic", self.actor, agent_opts.critic_hidden_layers)
    self.target_actor = ActorNetwork("target_actor", state_dim, action_dim,
                                     agent_opts.actor_hidden_layers)
    self.target_critic = CriticNetwork("target_critic", self.target_actor,
                                       agent_opts.critic_hidden_layers)

    # setup training ops;
    # training actor requires the critic (for getting gradients)
    # training critic requires target_critic (for RHS of bellman update)
    self.actor.init_ops_for_training(self.critic,
                                     agent_opts.actor_learning_rate,
                                     agent_opts.actor_gradient_clip)
    self.critic.init_ops_for_training(self.target_critic,
                                      agent_opts.critic_learning_rate,
                                      agent_opts.critic_gradient_clip)
def get_batches(self):
    """Yields randomized batches of experiences from epsilon-greedy games.

    Maintains a replay memory at full capacity.
    """
    print("Initializing memory...")
    memory = replay_memory.ReplayMemory()
    while not memory.is_full():
        for experience in self.experience_collector.collect(play.random_strategy):
            memory.add(experience)
    memory.print_stats()

    for i in itertools.count():
        if i < START_DECREASE_EPSILON_GAMES:
            epsilon = 1.0
        else:
            epsilon = max(
                MIN_EPSILON,
                1.0 - (i - START_DECREASE_EPSILON_GAMES) / DECREASE_EPSILON_GAMES)
        strategy = play.make_epsilon_greedy_strategy(self.get_q_values, epsilon)
        for experience in self.experience_collector.collect(strategy):
            memory.add(experience)
        batch_experiences = memory.sample(BATCH_SIZE)
        yield self.experiences_to_batches(batch_experiences)
def __init__(self, name):
    self.game = DoomGame()
    self.game.load_config(CONFIG_FILE_PATH)
    self.game.set_window_visible(False)
    self.game.set_mode(Mode.PLAYER)
    # self.game.set_screen_format(ScreenFormat.GRAY8)
    self.game.set_screen_format(ScreenFormat.CRCGCB)
    self.game.set_screen_resolution(ScreenResolution.RES_640X480)
    self.game.init()

    health = self.game.get_game_variable(GameVariable.HEALTH)
    ammo = self.game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO)
    frag = self.game.get_game_variable(GameVariable.FRAGCOUNT)
    pos_x = self.game.get_game_variable(GameVariable.POSITION_X)
    pos_y = self.game.get_game_variable(GameVariable.POSITION_Y)

    self.reward_gen = RewardGenerater(health, ammo, frag, pos_x, pos_y)
    self.replay_buff = replay_memory.ReplayMemory(CAPACITY,
                                                  data_name="demodata_cig2017.npy")
    self.network = Network()
    self.agent = Agent(self.network, self.replay_buff, self.reward_gen)

    self.local_step = 0
    self.finished = False
    self.name = name
def initialize(self):
    if self.input_height != 0:
        self.policy_net = deep_q_network.DQN_Conv(
            self.input_height, self.input_width,
            self.n_actions).double().to(self.device)
        self.target_net = deep_q_network.DQN_Conv(
            self.input_height, self.input_width,
            self.n_actions).double().to(self.device)
    else:  # if not a convolutional network
        print("Linear Network")
        self.policy_net = deep_q_network.DQN_Linear(
            self.input_width, self.n_actions).to(self.device)
        self.target_net = deep_q_network.DQN_Linear(
            self.input_width, self.n_actions,
            requires_grad=False).to(self.device)

    self.target_net.load_state_dict(self.policy_net.state_dict())
    if self.model_path:
        self.loadModel(self.model_path)
    self.target_net.eval()

    self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-5)
    # self.optimizer = optim.RMSprop(self.policy_net.parameters())
    self.memory = replay_memory.ReplayMemory(self.capacity)
def __init__(self, candidates, dropout_prob=1.0, batch_size=64, replay_memory_size=10000):
    self._candidates = [candidates[i] for i in range(len(candidates))]
    self._candidate_number = len(candidates)
    self._dimension = len(candidates[0])
    self._droput_prob = dropout_prob
    self._batch_size = batch_size
    self._replay_memory_size = replay_memory_size
    self._replay_memory = replay_memory.ReplayMemory(
        replay_memory_size, [1, self._dimension, self._dimension, 1])
    self._count = 0

    # define a MLP
    self.graph = tf.Graph()
    with self.graph.as_default():
        self.nid = tf.placeholder(tf.int32, [None, 1], name='nid')
        self.doc = tf.placeholder(tf.float32, [None, 5], name='doc')
        self.ctx = tf.placeholder(tf.float32, [None, 5], name='ctx')
        self.label = tf.placeholder(tf.float32, [None, 1], name='label')
        self.dpp = tf.placeholder(tf.float32, shape=(), name='droput_prob')
        self.batch = tf.placeholder(tf.int64, shape=(), name='batch')

        self.ds = tf.contrib.data.Dataset.from_tensor_slices(
            (self.nid, self.doc, self.ctx, self.label))
        self.ds = self.ds.repeat(1).batch(self.batch)
        self.itr = self.ds.make_initializable_iterator()
        self.nxt = self.itr.get_next()

        with tf.variable_scope('nid_embedding'):
            self.nid_embs = tf.get_variable(
                'embedding', initializer=tf.random_uniform([200, 16], -1.0, 1.0))
            self.nid_emb = tf.nn.relu(
                tf.reduce_sum(tf.nn.embedding_lookup(self.nid_embs, self.nxt[0]), 1))
            print >> sys.stderr, self.nid_emb.get_shape().as_list()

        with tf.variable_scope('dot'):
            self.dot = tf.reduce_sum(
                tf.multiply(tf.nn.dropout(self.nxt[1], self.dpp),
                            tf.nn.dropout(self.nxt[2], self.dpp)),
                1, keep_dims=True)
            print >> sys.stderr, self.dot.get_shape().as_list()

        with tf.variable_scope('FC'):
            self.weight = tf.get_variable('weight', [17, 1], tf.float32,
                                          tf.random_normal_initializer(stddev=0.05))
            self.bias = tf.get_variable('bias', [1], tf.float32,
                                        tf.constant_initializer(0.0))
            self.feats = tf.concat([self.nid_emb, self.dot], 1)
            print >> sys.stderr, self.feats.get_shape().as_list()
            self.fc = tf.matmul(self.feats, self.weight) + self.bias
            print >> sys.stderr, self.fc.get_shape().as_list()
            self.pred_score = tf.nn.sigmoid(self.fc)
            self.max_idx = tf.argmax(self.pred_score)
            print >> sys.stderr, self.pred_score.get_shape().as_list()

        with tf.variable_scope('optimizer'):
            self.t_loss = tf.nn.sigmoid_cross_entropy_with_logits(
                labels=self.nxt[3], logits=self.fc)
            print >> sys.stderr, self.t_loss.get_shape().as_list()
            self.loss = tf.reduce_mean(self.t_loss)
            print >> sys.stderr, self.loss.get_shape().as_list()
            self.optim = tf.train.GradientDescentOptimizer(0.01).minimize(self.loss)

        self.init_var = tf.global_variables_initializer()

    self.sess = tf.Session(graph=self.graph)
    self.sess.run(self.init_var)
def test_overfit_simple_artificial_dataset(self):
    input_shape = 1
    batch_size = 10
    num_actions = 2
    num_hidden = 2
    discount = 1
    learning_rate = 1
    update_rule = 'adam'
    freeze_interval = 100
    regularization = 0
    rng = None
    num_hidden_layers = 1
    network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                num_actions, num_hidden, discount, learning_rate,
                                regularization, update_rule, freeze_interval, rng)
    rm = replay_memory.ReplayMemory(batch_size)

    # state 0 to state 1 reward +1
    for idx in range(20):
        state = np.array([0])
        next_state = np.array([1])
        action = 1
        reward = 1
        terminal = 1
        rm.store((state, action, reward, next_state, terminal))

    # state 0 to state 0 reward -1
    for idx in range(20):
        switch = random.randint(0, 1)
        state = np.array([0])
        next_state = np.array([0])
        action = 0
        reward = -1
        terminal = 0
        rm.store((state, action, reward, next_state, terminal))

    print rm.terminal_count

    print_data = False
    l = logger.Logger('test')
    counter = 0
    while True:
        counter += 1
        states, actions, rewards, next_states, terminals = rm.sample_batch()
        loss = network.train(states, actions, rewards, next_states, terminals)
        l.log_loss(loss)
        if counter % 100 == 0:
            l.log_epoch(counter)
            Q = {}
            s0 = network.get_q_values(np.array([0]))
            Q['s0_a0'] = s0[0]
            Q['s0_a1'] = s0[1]
            s1 = network.get_q_values(np.array([1]))
            Q['s1_a0'] = s1[0]
            Q['s1_a1'] = s1[1]
def __init__(self, candidates, emsemble_num=10, dropout_prob=1,
             batch_size=64, replay_memory_size=10000):
    self._candidates = [candidates[i] for i in range(len(candidates))]
    self._emsemble_num = emsemble_num
    self._candidate_number = len(candidates)
    self._dimension = len(candidates[0])
    self._droput_prob = dropout_prob
    self._batch_size = batch_size
    self._replay_memory_size = replay_memory_size
    self._replay_memory = replay_memory.ReplayMemory(
        replay_memory_size, [1, self._dimension, self._dimension, 1])
    self._count = 0
    self._model = [rmax.Rmax(self._candidates) for i in range(self._emsemble_num)]
def simulate():
    draw = False
    print 'building network...'
    if draw:
        pltAcas = plot_acas_xu.Plot_ACAS_XU(state_generator.RMAX, ICON_FILE, 1)
    sg = state_generator.StateGenerator(state_generator.RMAX, state_generator.RMIN,
                                        state_generator.VMIN, state_generator.VMAX,
                                        K_SIZE)
    q = qnetwork.QNetwork(state_generator.NUM_INPUTS, replay_memory.BATCH_SIZE,
                          state_generator.NUM_ACTIONS, GAMMA, SOLVER)
    repMem = replay_memory.ReplayMemory()
    count = 0
    dt = state_generator.DT
    dti = state_generator.DTI
    state = sg.randomStateGenerator()
    i = 0
    print 'starting training...'
    while True:
        for j in range(TRAIN_FREQ):
            i += 1
            action = q.getAction(state)
            nextStates, rewards = sg.getNextState(state, action, dt, dti)
            stateNorm, nextStateNorm = sg.normState(state, nextStates)
            repMem.store((stateNorm, action, rewards, nextStateNorm))
            state = nextStates[0]
            count += 1
            if draw:
                pltAcas.updateState(state, action)
                pltAcas.draw()
                time.sleep(0.3)
            if sg.checkRange(state) or i > 100:
                i = 0
                state = sg.randomStateGenerator()
            if count % PRINT_FREQ == 0 and count >= replay_memory.INIT_SIZE:
                print "Samples: %d, Trainings: %d" % (count, (count - replay_memory.INIT_SIZE) / TRAIN_FREQ), "Loss: %.3e" % q.test(repMem.sample_batch())
                sys.stdout.flush()
            elif (count % 10000 == 0):
                print "Samples: %d" % count
                sys.stdout.flush()
        q.train(repMem.sample_batch())
def __init__(self, opts):
    self.opts = opts

    config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #config.log_device_placement = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.5  #opts.gpu_mem_fraction
    self.sess = tf.Session(config=config)

    render_shape = (opts.height, opts.width, 3)
    self.replay_memory = replay_memory.ReplayMemory(opts=opts,
                                                    state_shape=render_shape,
                                                    action_dim=2,
                                                    load_factor=1.2)
    if opts.event_log_in:
        self.replay_memory.reset_from_event_log(opts.event_log_in,
                                                opts.event_log_in_num)

    # s1 and s2 placeholders
    batched_state_shape = [None] + list(render_shape)
    s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
    s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

    # initialise base models for value & naf networks. value subportion of net is
    # explicitly created separate because it has a target network. note: in the case of
    # --share-input-state-representation the input state network of the value_net will
    # be reused by the naf.l_value and naf.output_actions net
    self.value_net = models.ValueNetwork("value", s1, opts)
    self.target_value_net = models.ValueNetwork("target_value", s2, opts)
    self.network = models.NafNetwork("naf", s1, s2,
                                     self.value_net, self.target_value_net,
                                     action_dim=2, opts=opts)

    with self.sess.as_default():
        # setup saver util and either load latest ckpt or init variables
        self.saver_util = None
        if opts.ckpt_dir is not None:
            self.saver_util = util.SaverUtil(self.sess, opts.ckpt_dir, opts.ckpt_freq)
        else:
            self.sess.run(tf.initialize_all_variables())
        for v in tf.all_variables():
            print >> sys.stderr, v.name, util.shape_and_product_of(v)

        # setup target network
        self.target_value_net.set_as_target_network_for(self.value_net, 0.01)
def test_minibatch_sample_shapes_1D_state(self):
    batch_size = 100
    state_shape = 2
    rm = replay_memory.ReplayMemory(batch_size)
    for idx in range(1000):
        state = np.ones(state_shape)
        action = 0
        reward = 0
        next_state = np.ones(state_shape)
        terminal = 0
        rm.store((state, action, reward, next_state, terminal))

    states, actions, rewards, next_states, terminals = rm.sample_batch()
    self.assertEquals(states.shape, (batch_size, state_shape))
    self.assertEquals(actions.shape, (batch_size, 1))
    self.assertEquals(rewards.shape, (batch_size, 1))
    self.assertEquals(next_states.shape, (batch_size, state_shape))
    self.assertEquals(terminals.shape, (batch_size, 1))
def __init__(self, input_size=10, TICKER='MSFT', BATCH_SIZE=128, GAMMA=0.999,
             EPS_START=0.9, EPS_END=0.05, EPS_DECAY=200, TARGET_UPDATE=10,
             REPLAY_MEMORY_CAPACITY=10000, NUM_EPISODES=1, hidden_layer=120,
             actions=3):
    self.TICKER = TICKER
    self.BATCH_SIZE = BATCH_SIZE
    self.GAMMA = GAMMA
    self.EPS_START = EPS_START
    self.EPS_END = EPS_END
    self.EPS_DECAY = EPS_DECAY
    self.TARGET_UPDATE = TARGET_UPDATE
    self.NUM_EPISODES = NUM_EPISODES

    self.fd = financial_data.financial_data(input_size)
    self.date = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].date

    self.policy_net = dqn.DQN(input_size, hidden_layer, actions)
    self.target_net = dqn.DQN(input_size, hidden_layer, actions)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()

    self.optimizer = optim.RMSprop(self.policy_net.parameters())
    self.memory = replay_memory.ReplayMemory(REPLAY_MEMORY_CAPACITY)

    self.steps_done = 0
    self.episode_durations = []
    self.actions = actions
    self.input_size = input_size
    self.action_index = ['Buy', 'Sell', 'Hold']
    self.reward_list = []
    self.episode_list = []
    self.episode_len = 1200
    self.money = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].Close.values[0] * 20
    self.money_list = []
    self.loss_list = []
    self.action_list = []
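The PyTorch agents in these snippets only show the construction call replay_memory.ReplayMemory(capacity); the rest of the buffer's interface is not visible here. For reference, a minimal sketch in the style of the classic PyTorch DQN tutorial follows; the Transition fields and the push/sample method names are assumptions, not the actual classes these projects use.

# Hedged sketch: a minimal uniform replay buffer (PyTorch-DQN-tutorial style).
# The Transition fields and method names below are assumptions.
import random
from collections import deque, namedtuple

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    def __init__(self, capacity):
        # oldest transitions are discarded automatically once capacity is reached
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition."""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        """Return a uniformly random batch of stored transitions."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)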
def gather_data(mdp, numTrials=10000, maxIterations=1000):
    mdp.computeStates()
    actions = mdp.actions(None)
    replay = replay_memory.ReplayMemory()
    for trial in range(numTrials):
        state = mdp.start_state
        if replay.isFull():
            break
        for _ in range(maxIterations):
            action = random.choice(actions)
            transitions = mdp.succAndProbReward(state, action)
            if len(transitions) == 0:
                break
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            replay.store((state, action, reward, newState))
            state = newState
    return replay
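The gather_data helper above relies on a very small replay-memory surface: a no-argument constructor, isFull(), and store() of (state, action, reward, next_state) tuples. A minimal sketch of a buffer with that surface is shown below; the default capacity and the ring-buffer overwrite policy are assumptions, not the original implementation.

# Hedged sketch: the capacity default and overwrite policy are assumptions.
import random


class ReplayMemory(object):
    """Fixed-capacity store of (state, action, reward, next_state) tuples."""

    def __init__(self, capacity=10000):
        self.capacity = capacity
        self.experiences = []
        self.next_index = 0  # slot to overwrite once the buffer is full

    def isFull(self):
        return len(self.experiences) >= self.capacity

    def store(self, experience):
        if self.isFull():
            self.experiences[self.next_index] = experience  # overwrite the oldest entry
        else:
            self.experiences.append(experience)
        self.next_index = (self.next_index + 1) % self.capacity

    def sample_batch(self, batch_size=32):
        """Uniform random sample (without replacement) of stored experiences."""
        return random.sample(self.experiences, min(batch_size, len(self.experiences)))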
def test_minibatch_sample_shapes_multidimensional_state(self): batch_size = 100 state_shape = (1, 2, 2) rm = replay_memory.ReplayMemory(batch_size) for idx in range(1000): state = np.ones(state_shape) action = 0 reward = 0 next_state = np.ones(state_shape) terminal = 0 rm.store((state, action, reward, next_state, terminal)) states, actions, rewards, next_states, terminals = rm.sample_batch() expected_states_shape = (batch_size, ) + state_shape self.assertEquals(states.shape, expected_states_shape) self.assertEquals(actions.shape, (batch_size, 1)) self.assertEquals(rewards.shape, (batch_size, 1)) self.assertEquals(next_states.shape, expected_states_shape) self.assertEquals(terminals.shape, (batch_size, 1))
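The two shape tests above pin down the contract of sample_batch(): five NumPy arrays whose leading dimension is the batch size, with scalar fields expanded to column vectors. A sketch of a sampling helper that satisfies those assertions is given below; the internal storage (a plain list of tuples) is an assumption.

# Hedged sketch: stacks a random batch of (state, action, reward, next_state, terminal)
# tuples into arrays shaped the way the tests above assert.
import random

import numpy as np


def sample_batch(experiences, batch_size):
    batch = random.sample(experiences, batch_size)
    states, actions, rewards, next_states, terminals = zip(*batch)
    return (np.stack(states),                            # (batch_size,) + state_shape
            np.array(actions).reshape(batch_size, 1),    # (batch_size, 1)
            np.array(rewards).reshape(batch_size, 1),    # (batch_size, 1)
            np.stack(next_states),                       # (batch_size,) + state_shape
            np.array(terminals).reshape(batch_size, 1))  # (batch_size, 1)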
def test_agent(self):
    room_size = 5
    mdp = mdps.MazeMDP(room_size, 1)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.1
    discount = mdp.get_discount()
    num_actions = len(mdp.get_actions(None))
    network = qnetwork.QNetwork(input_shape=2 * room_size,
                                batch_size=1,
                                num_actions=4,
                                num_hidden=10,
                                discount=discount,
                                learning_rate=1e-3,
                                update_rule='sgd',
                                freeze_interval=10000,
                                rng=None)
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
    rm = replay_memory.ReplayMemory(1)
    log = logger.NeuralLogger(agent_name='QNetwork')
    adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    a = agent.NeuralAgent(network=network,
                          policy=p,
                          replay_memory=rm,
                          logger=log,
                          state_adapter=adapter)
    num_epochs = 2
    epoch_length = 10
    test_epoch_length = 0
    max_steps = 10
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length,
                              max_steps, run_tests, value_logging=False)
    e.run()
def __init__(self, env):
    self.env = env
    state_shape = self.env.state_shape
    action_dim = self.env.action_space.shape[1]
    self.obj_list = [i for i in range(10)]

    # for now, with single machine synchronous training, use a replay memory for training.
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(opts.replay_memory_size,
                                                    state_shape, action_dim, opts)

    # s1 and s2 placeholders
    batched_state_shape = [None] + list(state_shape)
    s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
    s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

    if opts.use_full_internal_state:
        temp = [18]
    else:
        temp = [9]
    batched_internal_state_shape = [None] + temp
    internal_state = tf.placeholder(shape=batched_internal_state_shape, dtype=tf.float32)

    temp = [10]  # object one hot
    batched_target_obj_shape = [None] + temp
    target_obj_hot = tf.placeholder(shape=batched_target_obj_shape, dtype=tf.float32)

    # initialise base models for value & naf networks. value subportion of net is
    # explicitly created separate because it has a target network. note: in the case of
    # --share-input-state-representation the input state network of the value_net will
    # be reused by the naf.l_value and naf.output_actions net
    self.value_net = ValueNetwork("value", s1, internal_state, target_obj_hot,
                                  opts.hidden_layers)
    self.target_value_net = ValueNetwork("target_value", s2, internal_state,
                                         target_obj_hot, opts.hidden_layers)
    self.naf = NafNetwork("naf", s1, s2,
                          self.value_net, self.target_value_net,
                          internal_state, target_obj_hot, action_dim)
        self.target_net = None
        self.optimizer = None
        self.steps_done = 0  # ToDo: Save and Load this value

    def initialize(self):
        if self.input_height != 0:
            self.policy_net = deep_q_network.DQN_Conv(self.input_height,
                                                      self.input_width,
                                                      self.n_actions).to(self.device)
            self.target_net = deep_q_network.DQN_Conv(self.input_height,
                                                      self.input_width,
                                                      self.n_actions).to(self.device)
        else:  # if not a convolutional network
            self.policy_net = deep_q_network.DQN_Linear(self.input_width,
                                                        self.n_actions).to(self.device)
            self.target_net = deep_q_network.DQN_Linear(self.input_width,
                                                        self.n_actions,
                                                        requires_grad=False).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(self.capacity)

    def selectAction(self, state):
        # greedy eps algorithm
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
def train(self, sess, train_writer):
    """ Performs Q-learning on the Block World Task. The agent interacts with the
    simulator and performs roll-out followed by MSE updates. """

    start = time.time()
    max_epoch = AbstractLearning.max_epochs
    dataset_size = AbstractLearning.dataset_size
    tuning_size = AbstractLearning.validation_datasize
    train_size = dataset_size - tuning_size
    logger.Log.info("Deep Q-Learning: Max Epoch: " + str(max_epoch) +
                    " Train/Tuning: " + str(train_size) + "/" + str(tuning_size))

    # Saver for logging the model
    saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

    # Iteration is the number of parameter update steps performed in the training
    iteration = 0

    # Validation metric
    avg_bisk_metric = self.agent.test(tuning_size)
    min_avg_bisk_metric = avg_bisk_metric
    patience = 0
    max_patience = AbstractLearning.max_patience
    logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

    for epoch in range(1, max_epoch + 1):
        logger.Log.info("=================\n Starting Epoch: " + str(epoch) + "\n=================")
        for data_point in range(1, train_size + 1):
            # Create a queue to handle history of states
            state = collections.deque([], 5)
            # Add the dummy images
            dummy_images = self.model.image_embedder.get_dummy_images()
            [state.append(v) for v in dummy_images]

            # Receive the instruction and the environment
            (_, bisk_metric, current_env, instruction,
             trajectory) = self.agent.receive_instruction_and_image()
            logger.Log.info("Train Bisk Metric " + str(bisk_metric))
            state.append(current_env)

            ########################
            text_indices = self.q_network.text_embedder.convert_text_to_indices(instruction)
            _, text_embedder_bucket = self.q_network.get_bucket_network(len(text_indices))
            (text_input_word_indices_bucket,
             text_mask_bucket) = text_embedder_bucket.pad_and_return_mask(text_indices)
            (text_input_word_indices,
             text_mask) = self.q_network.text_embedder.pad_and_return_mask(text_indices)
            ########################

            logger.Log.info("=================\n " + str(data_point) +
                            ": Instruction: " + str(instruction) + "\n=================")

            total_reward_episode = 0
            steps = 0
            previous_action = self.q_network.null_previous_action

            # Perform a roll out
            while True:
                # Compute the qVal of the current state
                q_val = self.q_network.evaluate_qfunction(
                    state, text_input_word_indices_bucket, text_mask_bucket,
                    previous_action, sess)

                # take an action using a behaviour policy
                action_id = self.behaviour_policy.get_action(q_val)
                action_str = self.agent.message_protocol_kit.encode_action(action_id)
                logger.Log.debug("Sending Message: " + action_str)
                self.agent.connection.send_message(action_str)

                # receive reward and a new environment as response on the completion of action
                (_, reward, new_env, is_reset) = self.agent.receive_response_and_image()
                logger.Log.debug("Received reward: " + str(reward))

                # compute target y = r + gamma * max_a' Q(s', a')
                copy_state = collections.deque(state, 5)
                copy_state.append(new_env)
                q_val_new = self.target_q_network.evaluate_qfunction(
                    copy_state, text_input_word_indices_bucket, text_mask_bucket,
                    previous_action, sess)
                if self.agent.message_protocol_kit.is_reset_message(is_reset):
                    # Terminal condition
                    y = reward
                else:
                    y = reward + self.agent.gamma * q_val_new.max()
                logger.Log.debug("Reward " + str(reward) + " Target " + str(y) +
                                 " max is " + str(q_val_new.max()) +
                                 " current " + str(q_val[action_id]) +
                                 " diff " + str(y - q_val[action_id]))

                # add to replay memory
                replay_memory_item = rm.ReplayMemory(
                    text_input_word_indices, text_mask, state, action_id, reward,
                    new_env, y, previous_action_id=previous_action)
                self.replay_memory.appendleft(replay_memory_item)
                state.append(new_env)

                # Update metric
                total_reward_episode += reward
                steps += 1

                block_id = int(action_id / 4)
                if action_id == 80:
                    direction_id = 4
                else:
                    direction_id = action_id % 4
                previous_action = (direction_id, block_id)

                # Reset episode
                if self.agent.message_protocol_kit.is_reset_message(is_reset):
                    logger.Log.debug("Resetting the episode")
                    self.agent.connection.send_message("Ok-Reset")
                    logger.Log.debug("Now waiting for response")

                    # Perform minibatch SGD
                    # Pick a sample using prioritized sweeping and perform backpropagation
                    sample = self.ps.sample(self.replay_memory, self.batch_size)
                    loss = self.min_loss(sample, sess, train_writer, factorized_actions=False)
                    iteration += 1
                    logger.Log.info("Number of sample " + str(len(sample)) +
                                    " size of replay memory " + str(len(self.replay_memory)) +
                                    " loss = " + str(loss))

                    # Decay the epsilon
                    self.behaviour_policy.decay_epsilon()
                    logger.Log.info("Total reward in this episode: " + str(total_reward_episode))

                    # Print time statistics
                    total_time = time.time() - start
                    logger.Log.info("Total time: " + str(total_time))
                    logger.Log.flush()
                    break

        # Synchronize the target network and main network
        self.copy_variables_to_target_network(sess)

        # Compute validation accuracy
        avg_bisk_metric = self.agent.test(tuning_size)
        logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: " +
                        str(avg_bisk_metric) + "Min was " + str(min_avg_bisk_metric))

        # Save the model
        save_path = saver.save(sess, "./saved/model_epoch_" + str(epoch) + ".ckpt")
        logger.Log.info("Model saved in file: " + str(save_path))

        if avg_bisk_metric >= min_avg_bisk_metric:
            if patience == max_patience:
                logger.Log.info("Max patience reached. Terminating learning after " +
                                str(epoch) + " epochs and " + str(iteration) + " iterations.")
                break
            else:
                logger.Log.info("Tuning accuracy did not improve. Increasing patience to " +
                                str(patience + 1))
                patience += 1
        else:
            logger.Log.info("Resetting patience to 0")
            patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

    logger.Log.close()
            healths[trans][step] = health
            ammos[trans][step] = ammo
            frags[trans][step] = frag
            deaths[trans][step] = death
            posxs[trans][step] = posx
            posys[trans][step] = posy


if __name__ == "__main__":
    game = initialize_vizdoom("./config/custom_config.cfg")
    n_actions = game.get_available_buttons_size()
    # actions = [list(a) for a in it.product([0, 1], repeat=n_actions)]
    commands = np.eye(n_actions, dtype=np.int32).tolist()

    replaymemory = replay_memory.ReplayMemory(n_transit, data_name="demodata_cig2017.npy")
    r_gen = reward_generater.reward_generater(game)
    # demo_data = replay_memory.ReplayMemory(resolution, n_transit)

    game.new_episode()
    for i in range(bots_num):
        game.send_game_command("addbot")

    total_reward = 0.0
    death_bias = 0
    for transit in tqdm(range(n_transit)):
        if game.is_episode_finished():
            print(
def do_reinforce_learning_self_critical(self):
    """ Performs policy gradient learning using Reinforce on the Block World Task.
    The agent interacts with the simulator and performs roll-out followed by
    REINFORCE updates. """

    start = time.time()
    max_epoch = 1000
    dataset_size = 667
    tuning_size = int(0.05 * dataset_size)
    train_size = dataset_size - tuning_size
    logger.Log.info("REINFORCE: Max Epoch: " + str(max_epoch) +
                    " Train/Tuning: " + str(train_size) + "/" + str(tuning_size))

    # Saver for logging the model
    saver = tf.train.Saver(max_to_keep=120)

    # Iteration is the number of parameter update steps performed in the training
    iteration = 0

    # Reinforce baseline
    baseline = 0

    # Validation metric
    avg_bisk_metric = self.test(tuning_size)
    min_avg_bisk_metric = avg_bisk_metric
    patience = 0
    max_patience = 1000
    logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

    for epoch in range(1, max_epoch + 1):
        logger.Log.info("=================\n Starting Epoch: " + str(epoch) + "\n=================")
        for data_point in range(1, train_size + 1):
            # Create a queue to handle history of states
            state = collections.deque([], 5)
            # Add the dummy images
            dummy_images = self.image_embedder.get_dummy_images()
            [state.append(v) for v in dummy_images]

            # Receive the instruction and the environment
            (_, _, current_env, instruction, trajectory) = self.receive_instruction_and_image()
            state.append(current_env)
            (text_input_word_indices,
             text_mask) = self.text_embedder.get_word_indices_and_mask(instruction)
            logger.Log.info("=================\n " + str(data_point) +
                            ": Instruction: " + str(instruction) + "\n=================")

            block_id = int(trajectory[0] / 4.0)
            total_reward_episode = 0
            steps = 0

            # Reinforce requires sampling from Q-function for the future.
            # So we cannot directly add entries to the global replay memory.
            replay_memory_items = []
            rewards = []

            # Perform a roll out
            while True:
                # Compute the probability of the current state
                prob = self.evaluate_qfunction(state, text_input_word_indices, text_mask)

                # Sample from the prob. distribution
                action_id = gp.GenericPolicy.sample_action_from_prob(prob)
                action_str = self.message_protocol_kit.encode_action_from_pair(block_id, action_id)
                logger.Log.debug("Sending Message: " + action_str +
                                 " with probability " + str(prob[action_id]))
                self.connection.send_message(action_str)

                # receive reward and a new environment as a response on the completion of action
                (_, reward, new_env, is_reset) = self.receive_response_and_image()
                logger.Log.debug("Received reward: " + str(reward))

                # add to replay memory
                replay_memory_item = rm.ReplayMemory(text_input_word_indices, text_mask, state,
                                                     action_id, reward, None, None, prob[action_id])
                replay_memory_items.append(replay_memory_item)
                rewards.append(reward)
                state.append(new_env)  ##### CHECK if state is being overwritten

                # Update metric
                total_reward_episode += reward
                steps += 1

                # Reset episode
                if self.message_protocol_kit.is_reset_message(is_reset):
                    logger.Log.debug("Resetting the episode")
                    self.connection.send_message("Ok-Reset")
                    logger.Log.debug("Now waiting for response")

                    # Compute monte carlo q values
                    baseline = self.get_reinforce_self_critical_baseline()
                    logger.Log.info("Reward: " + " ".join([str(v) for v in rewards]) +
                                    " steps: " + str(steps))
                    logger.Log.info(" Total Reward: " + str(total_reward_episode) +
                                    ", Self Critical Baseline: " + str(baseline))

                    # Define the targets
                    for replay_memory_item in replay_memory_items:
                        replay_memory_item.set_target_retroactively(total_reward_episode - baseline)

                    self.replay_memory.clear()
                    for replay_memory_item in replay_memory_items:
                        self.replay_memory.appendleft(replay_memory_item)

                    # Perform minibatch SGD
                    # Pick a sample using prioritized sweeping and perform backpropagation
                    sample = self.ps.sample(self.replay_memory, self.batch_size)
                    loss = self.min_loss(sample)
                    if np.isnan(loss):
                        logger.Log.error("NaN found. Exiting")
                        exit(0)
                    iteration += 1
                    logger.Log.info("Number of sample " + str(len(sample)) +
                                    " size of replay memory " + str(len(self.replay_memory)) +
                                    " loss = " + str(loss))
                    logger.Log.info("Total reward:" + str(total_reward_episode) +
                                    " Steps: " + str(steps))

                    # Print time statistics
                    total_time = time.time() - start
                    logger.Log.info("Total time: " + str(total_time))
                    logger.Log.flush()
                    break

        # Compute validation accuracy
        avg_bisk_metric = self.test(tuning_size)
        logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: " +
                        str(avg_bisk_metric) + "Min was " + str(min_avg_bisk_metric))

        # Save the model
        save_path = saver.save(self.sess, "./saved/model_epoch_" + str(epoch) + ".ckpt")
        logger.Log.info("Model saved in file: " + str(save_path))

        if avg_bisk_metric >= min_avg_bisk_metric:
            if patience == max_patience:
                logger.Log.info("Max patience reached. Terminating learning after " +
                                str(epoch) + " epochs and " + str(iteration) + " iterations.")
                break
            else:
                logger.Log.info("Tuning accuracy did not improve. Increasing patience to " +
                                str(patience + 1))
                patience += 1
        else:
            logger.Log.info("Resetting patience to 0")
            patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

    logger.Log.close()
import replay_memory
from collections import namedtuple

Config = namedtuple("Config", ["memory_size", "history_length", "batch_size", "state_num"])
config = Config(10, 2, 5, 1)

model = replay_memory.ReplayMemory(config)
for i in range(5):
    model.add(i, i * 0.1, i * 2, i % 4 == 0)
print(model.states, model.actions, model.terminals, model.rewards)

print("sampling")
for i in range(5):
    print(model.sample_one())

for i in range(8):
    model.add(i, i * 0.1, i * 2, i % 4 == 0)
print(model.states, model.actions, model.terminals, model.rewards)

print("sampling")
for i in range(5):
    print(model.sample_one())
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
'''

# Getting the Subway Surfers environment
senv = env()
number_actions = senv.action_space

# Building an AI
cnn = neural_net.CNN(number_actions)
softmax_body = neural_net.SoftmaxBody(T=10)
ai = neural_net.AI(body=softmax_body, brain=cnn)

# Setting up Experience Replay and n_step progress
n_steps = n_step.NStepProgress(ai=ai, env=senv, n_step=7)
memory = replay_memory.ReplayMemory(n_steps=n_steps, capacity=5000)
ma = moving_avg.MA(500)  # Moving average used to grade our model


# Functions to save and load the checkpoints created while training.
def load():
    if os.path.isfile('old_brain.pth'):
        print("=> loading checkpoint... ")
        checkpoint = torch.load('old_brain.pth')
        cnn.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("done !")
    else:
        print("no checkpoint found...")
def __init__(self, conf):
    self.conf = conf
    self.word_dim = conf['word_dim']
    self.word_size = conf['word_size']
    self.turn_len = conf['turn_len']
    self.dialogue_len = conf['dialogue_len']

    # epsilon setting
    self.ep_start = conf['ep_start']
    self.ep = conf['ep_start']
    self.ep_end = conf['ep_end']
    self.ep_step = conf['ep_step']

    self.discount = conf['discount']
    self.update_freq = conf['update_freq']
    self.max_reward = conf['max_reward']
    self.min_reward = conf['min_reward']
    self.num_actions = conf['num_actions']
    self.batch_size = conf['batch_size']
    self.learn_start = conf['learn_start']
    self.target_q_clone_step = conf['target_q_clone_step']
    self.debug = conf['debug']

    self.num_step = 0
    self.mini_batch_step = 0
    self.last_s = None
    self.last_ask = -1
    self.last_confirm = -1
    self.last_r = None
    self.last_t = None
    self.ask_loss = None
    self.confirm_loss = None
    self.v_ask_avg = 0
    self.v_confirm_avg = 0

    try:
        self.loss_log = open('../data/loss_log', 'w')
    except:
        print("open file failed!")
        sys.exit(1)

    replay_memory_conf = {'replay_memory_size': conf['replay_memory_size'],
                          'learn_start': conf['prioritized_learnt_start'],
                          'batch_size': conf['batch_size'],
                          'word_dim': conf['word_dim'],
                          'debug': conf['debug']}
    self.replay_memory = replay_memory.ReplayMemory(replay_memory_conf)

    embedding_init = np.random.rand(self.word_size, self.word_dim)
    embedding_init[0] *= 0
    embedding_init = embedding_init.astype('float32')

    output_network_conf = {'name': 'output_network',
                           'num_actions': conf['num_actions'],
                           'word_dim': conf['word_dim'],
                           'word_size': conf['word_size'],
                           'turn_len': conf['turn_len'],
                           'dialogue_len': conf['dialogue_len'],
                           'mlp_hidden_unit': conf['mlp_hidden_unit'],
                           'clip_delta': conf['clip_delta'],
                           'lr': conf['lr'],
                           'embedding_init': embedding_init}
    self.output_network = DQN.DQN(output_network_conf)

    target_network_conf = {'name': 'target_network',
                           'num_actions': conf['num_actions'],
                           'word_dim': conf['word_dim'],
                           'word_size': conf['word_size'],
                           'turn_len': conf['turn_len'],
                           'dialogue_len': conf['dialogue_len'],
                           'mlp_hidden_unit': conf['mlp_hidden_unit'],
                           'clip_delta': conf['clip_delta'],
                           'lr': conf['lr'],
                           'embedding_init': embedding_init}
    self.target_network = DQN.DQN(target_network_conf)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
    self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    self.writer = tf.train.SummaryWriter('../data/graph_logs', self.sess.graph)
    self.init = tf.initialize_all_variables()
    self.sess.run(self.init)
    # self.sess.run(self.output_network.embedding_init)
    # self.sess.run(self.target_network.embedding_init)
    self.sync = self.sync_func()
    self.sess.run(self.sync)
def train(self, sess, train_writer, max_epoch=AbstractLearning.max_epochs, model_name="./model"):
    """ Performs policy gradient learning using Reinforce on the Block World Task.
    The agent interacts with the simulator and performs roll-out followed by
    REINFORCE updates. """

    start = time.time()

    # Initialization using 2 epochs of MLE
    self.mle_policy_gradient.train(sess, train_writer, max_epoch=2,
                                   model_name="./model_mle", terminate=False)

    # Reinitialize the direction parameters
    w1, b1 = self.policy_model.mix_and_gen_prob.get_direction_weights()
    sess.run(tf.initialize_variables([w1, b1]))

    dataset_size = AbstractLearning.dataset_size
    tuning_size = AbstractLearning.validation_datasize
    train_size = dataset_size - tuning_size
    logger.Log.info("REINFORCE: Max Epoch: " + str(max_epoch) +
                    " Train/Tuning: " + str(train_size) + "/" + str(tuning_size))

    # Saver for logging the model
    saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

    # Iteration is the number of parameter update steps performed in the training
    iteration = 0

    # Validation metric
    avg_bisk_metric = self.agent.test(tuning_size, oracle=True)
    min_avg_bisk_metric = avg_bisk_metric
    patience = 0
    max_patience = AbstractLearning.max_patience
    logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

    for epoch in range(1, max_epoch + 1):
        logger.Log.info("=================\n Starting Epoch: " + str(epoch) + "\n=================")
        for data_point in range(1, train_size + 1):
            # Create a queue to handle history of states
            state = collections.deque([], 5)
            # Add the dummy images
            dummy_images = self.policy_model.image_embedder.get_padding_images()
            [state.append(v) for v in dummy_images]

            # Receive the instruction and the environment
            (_, _, current_env, instruction, trajectory) = self.agent.receive_instruction_and_image()
            state.append(current_env)

            # Convert text to indices
            text_indices = self.policy_model.text_embedder.convert_text_to_indices(instruction)
            _, text_embedder_bucket = self.policy_model.get_bucket_network(len(text_indices))
            (text_input_word_indices_bucket,
             text_mask_bucket) = text_embedder_bucket.pad_and_return_mask(text_indices)
            (text_input_word_indices,
             text_mask) = self.policy_model.text_embedder.pad_and_return_mask(text_indices)

            logger.Log.info("=================\n " + str(data_point) +
                            ": Instruction: " + str(instruction) + "\n=================")

            total_reward_episode = 0
            steps = 0

            # Reinforce requires sampling from Q-function for the future.
            # So we cannot directly add entries to the global replay memory.
            replay_memory_items = []
            rewards = []
            previous_status_code = self.policy_model.null_previous_action

            # Perform a roll out
            while True:
                # Compute the probability of the current state
                block_prob, direction_prob = self.policy_model.evaluate_policy(
                    state, text_input_word_indices_bucket, text_mask_bucket,
                    previous_action=previous_status_code, sess=sess)

                # Sample from the prob. distribution
                block_id = gp.GenericPolicy.sample_action_from_prob(block_prob)
                direction_id = gp.GenericPolicy.sample_action_from_prob(direction_prob)
                action_str = self.agent.message_protocol_kit.encode_action_from_pair(block_id, direction_id)
                prob_action = block_prob[block_id] * direction_prob[direction_id]
                logger.Log.debug("Sending Message: " + action_str +
                                 " with probability " + str(prob_action))
                self.agent.connection.send_message(action_str)

                # receive reward and a new environment as a response on the completion of action
                (status_code, reward, new_env, is_reset) = self.agent.receive_response_and_image()
                logger.Log.debug("Received reward: " + str(reward))

                # add to replay memory
                replay_memory_item = rm.ReplayMemory(text_input_word_indices, text_mask, state,
                                                     (block_id, direction_id), reward, None, None,
                                                     prob_action,
                                                     previous_action_id=previous_status_code)
                replay_memory_items.append(replay_memory_item)
                rewards.append(reward)
                state.append(new_env)

                # Update metric
                total_reward_episode += reward
                steps += 1
                previous_status_code = (direction_id, block_id)

                # Reset episode
                if self.agent.message_protocol_kit.is_reset_message(is_reset):
                    logger.Log.debug("Resetting the episode")
                    self.agent.connection.send_message("Ok-Reset")
                    logger.Log.debug("Now waiting for response")

                    if self.total_reward:
                        # Compute monte carlo q values
                        reward_multiplier = [0] * steps
                        for i in range(0, steps):
                            # Q-value approximated by roll-out
                            reward_multiplier[i] = sum(rewards[i:])
                    else:
                        # Use immediate reward only
                        reward_multiplier = rewards

                    # Define the targets
                    for replay_memory_item, cumm_reward in zip(replay_memory_items, reward_multiplier):
                        replay_memory_item.set_target_retroactively(cumm_reward)

                    # Perform 1 iteration of minibatch SGD using backpropagation
                    loss = self.min_loss(replay_memory_items, sess, train_writer)
                    if np.isnan(loss):
                        logger.Log.error("NaN found. Exiting")
                        exit(0)
                    iteration += 1
                    logger.Log.info("Number of sample " + str(len(replay_memory_items)) +
                                    " loss = " + str(loss))
                    logger.Log.info("Total reward:" + str(total_reward_episode) +
                                    " Steps: " + str(steps))

                    # Print time statistics
                    total_time = time.time() - start
                    logger.Log.info("Total time: " + str(total_time))
                    logger.Log.flush()
                    break

        # Compute validation accuracy
        avg_bisk_metric = self.agent.test(tuning_size)
        logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: " +
                        str(avg_bisk_metric) + "Min was " + str(min_avg_bisk_metric))

        # Save the model
        save_path = saver.save(sess, "./saved/" + str(model_name) + "_epoch_" + str(epoch) + ".ckpt")
        logger.Log.info("Model saved in file: " + str(save_path))

        if avg_bisk_metric >= min_avg_bisk_metric:
            if patience == max_patience:
                logger.Log.info("Max patience reached. Terminating learning after " +
                                str(epoch) + " epochs and " + str(iteration) + " iterations.")
                break
            else:
                logger.Log.info("Tuning accuracy did not improve. Increasing patience to " +
                                str(patience + 1))
                patience += 1
        else:
            logger.Log.info("Resetting patience to 0")
            patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

    logger.Log.close()
        time.sleep(900)

    def stop(self):
        """ Prevents the thread from continuing when called. """
        self.stop = True


if __name__ == '__main__':
    global eta, alpha, number_of_turns, data_from_rm, rm, history, net, counter

    path = os.path.dirname(os.path.abspath(__file__)) + '/saves/'  # must end with "/" on Linux and with "\" on Windows
    history = hs.History(path + 'History')
    rm = rm.ReplayMemory(path + 'ReplayMemory.rm', 42000)

    Lambda = .0
    eta = .0001
    alpha = .7
    input_layer = 543
    output_layer = (8, af.tanh)
    hidden_layers = [(543, af.tanh), (543, af.tanh), (543, af.tanh)]
    number_of_turns = 500
    data_from_rm = 500

    net = ann.Neural_Network(path + 'DATA', input_layer, output_layer, hidden_layers, Lambda)
    Saver(net, rm).start()

    counter = 1
    while True:
        history.setGame(hs.generateName('main_net', 'dummy_net', 1).__next__())
        GAME = PONR(Interface('main net'), Interface('dummy net'))
# Define critic and dual optimizer
if AC:
    critic = critic.Critic(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE,
                           MAX_SEQ_LEN, device=DEVICE).to(DEVICE)
    AC_optimizer = optim.Adagrad([{'params': actor.parameters(), 'lr': ACTOR_LR},
                                  {'params': critic.parameters(), 'lr': CRITIC_LR}])
    memory = replay_memory.ReplayMemory(CAPACITY_RM)
# Use optimizer for baseline DP-GAN
else:
    PG_optimizer = optim.Adagrad(actor.parameters(), ACTOR_LR)

# Adversarial training loop
gen_data_loader = iter(load_data())
gen_data_loader_tf = iter(load_data())
dis_data_loader = iter(load_data())
num_batches = int(len(gen_data_loader) / 2)

N = ADV_TRAIN_EPOCHS * num_batches
M = 1
K = 5
for n in range(N):
    if n % num_batches == 0:
        print('Iteration {}'.format(n))
def run(learning_rate, freeze_interval, num_hidden, reg):
    room_size = 5
    num_rooms = 2
    mdp = mdps.MazeMDP(room_size, num_rooms)
    mdp.compute_states()
    mdp.EXIT_REWARD = 1
    mdp.MOVE_REWARD = -0.01
    discount = 1
    num_actions = len(mdp.get_actions(None))
    batch_size = 100
    print 'building network...'
    network = qnetwork.QNetwork(input_shape=2 * room_size + num_rooms ** 2,
                                batch_size=batch_size,
                                num_hidden_layers=2,
                                num_actions=4,
                                num_hidden=num_hidden,
                                discount=discount,
                                learning_rate=learning_rate,
                                regularization=reg,
                                update_rule='adam',
                                freeze_interval=freeze_interval,
                                rng=None)
    num_epochs = 50
    epoch_length = 2
    test_epoch_length = 0
    max_steps = 4 * (room_size * num_rooms) ** 2
    epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5
    print 'building policy...'
    p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
    print 'building memory...'
    rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
    print 'building logger...'
    log = logger.NeuralLogger(agent_name='QNetwork')
    print 'building state adapter...'
    adapter = state_adapters.CoordinatesToRowColRoomAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
    # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
    print 'building agent...'
    a = agent.NeuralAgent(network=network, policy=p, replay_memory=rm, log=log,
                          state_adapter=adapter)
    run_tests = False
    e = experiment.Experiment(mdp, a, num_epochs, epoch_length, test_epoch_length,
                              max_steps, run_tests, value_logging=True)
    e.run()

    ak = file_utils.load_key('../access_key.key')
    sk = file_utils.load_key('../secret_key.key')
    bucket = 'hierarchical'
    try:
        aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
        aws_util.upload_directory(e.agent.logger.log_dir)
    except Exception as e:
        print 'error uploading to s3: {}'.format(e)
ckpt_path = "./model/" game = initialize_vizdoom(config_file_path) print("learning rate: %f" % learning_rate) print("discount_factor %f" % discount_factor) print("resolution:", resolution) print("frame_repeat: %d" % frame_repeat) print("capacity:", capacity) print("barch_size: %d" % batch_size) print("screen_format:", game.get_screen_format()) n_actions = game.get_available_buttons_size() actions = np.eye(n_actions, dtype=np.int32).tolist() print("action_size : %d" % (n_actions)) #actions = [list(a) for a in it.product([0,1], repeat=n_actions)] replay_memory = replay_memory.ReplayMemory(resolution, capacity) session = tf.Session() network = network_double.network_simple(session, resolution, n_actions, learning_rate) #network = network.network_simple(session,resolution,n_actions, learning_rate) #network = network_contrib.network_contrib(session,resolution,n_actions,learning_rate) session.run(tf.global_variables_initializer()) for epoch in range(n_epoch): print("Epoch %d \n -----" % (epoch)) print("Training Phase") train_episodes_finished = 0 train_scores = [] total_train_scores = []
def train(self, sess, train_writer, max_epoch=AbstractLearning.max_epochs, model_name="./model"):
    """ Performs supervised learning on the Block World Task. The agent interacts with the
    simulator and performs roll-out followed by supervised learning. """

    start = time.time()
    dataset_size = AbstractLearning.dataset_size
    tuning_size = AbstractLearning.validation_datasize
    train_size = dataset_size - tuning_size
    logger.Log.info("Maximum Likelihood: Max Epoch: " + str(max_epoch) +
                    " Train/Tuning: " + str(train_size) + "/" + str(tuning_size))

    # Saver for logging the model
    saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

    # Iteration is the number of parameter update steps performed in the training
    iteration = 0

    # Validation metric
    avg_bisk_metric = self.agent.test(tuning_size)
    min_avg_bisk_metric = avg_bisk_metric
    patience = 0
    max_patience = AbstractLearning.max_patience
    logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

    for epoch in range(1, max_epoch + 1):
        logger.Log.info("=================\n Starting Epoch: " + str(epoch) + "\n=================")
        for data_point in range(1, train_size + 1):
            # Create a queue to handle history of states
            state = collections.deque([], 5)
            # Add the dummy images
            dummy_images = self.policy_model.image_embedder.get_dummy_images()
            [state.append(v) for v in dummy_images]

            # Receive the instruction and the environment
            (_, _, current_env, instruction, trajectory) = self.agent.receive_instruction_and_image()
            state.append(current_env)
            (text_input_word_indices,
             text_mask) = self.policy_model.text_embedder.get_word_indices_and_mask(instruction)
            logger.Log.info("=================\n " + str(data_point) +
                            ": Instruction: " + str(instruction) + "\n=================")

            traj_ix = 0
            total_reward_episode = 0
            steps = 0
            previous_action = self.policy_model.null_previous_action
            block_id = int(trajectory[0] / 4.0)

            # Perform a roll out
            while True:
                # Sample from the prob. distribution
                action_id = trajectory[traj_ix]
                traj_ix += 1
                action_str = self.agent.message_protocol_kit.encode_action(action_id)
                logger.Log.debug("Sending Message: " + action_str)
                self.agent.connection.send_message(action_str)

                # receive reward and a new environment as a response on the completion of action
                (status_code, reward, new_env, is_reset) = self.agent.receive_response_and_image()
                logger.Log.debug("Received reward: " + str(reward))

                # add to replay memory
                if action_id == 80:
                    direction_id = 4
                else:
                    direction_id = action_id % 4
                replay_memory_item = rm.ReplayMemory(text_input_word_indices, text_mask, state,
                                                     (block_id, direction_id), 1.0, new_env, None,
                                                     previous_action_id=previous_action)
                self.replay_memory.appendleft(replay_memory_item)
                state.append(new_env)

                # Update metric
                total_reward_episode += reward
                steps += 1
                previous_action = (direction_id, block_id)

                # Reset episode
                if self.agent.message_protocol_kit.is_reset_message(is_reset):
                    logger.Log.debug("Resetting the episode")
                    self.agent.connection.send_message("Ok-Reset")
                    logger.Log.debug("Now waiting for response")

                    # Perform minibatch SGD
                    # Pick a sample using prioritized sweeping and perform backpropagation
                    sample = self.ps.sample(self.replay_memory, self.batch_size)
                    loss = self.min_loss(sample, sess, train_writer)
                    if np.isnan(loss):
                        logger.Log.info("NaN found. Exiting")
                        exit(0)
                    iteration += 1
                    logger.Log.info("Number of sample " + str(len(sample)) +
                                    " size of replay memory " + str(len(self.replay_memory)) +
                                    " loss = " + str(loss))
                    logger.Log.info("Total reward:" + str(total_reward_episode) +
                                    " Steps: " + str(steps))

                    # Print time statistics
                    total_time = time.time() - start
                    logger.Log.info("Total time: " + str(total_time))
                    logger.Log.flush()
                    break

        # Compute validation accuracy
        avg_bisk_metric = self.agent.test(tuning_size)
        logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: " +
                        str(avg_bisk_metric) + "Min was " + str(min_avg_bisk_metric))

        # Save the model
        save_path = saver.save(sess, "./saved/" + str(model_name) + "_epoch_" + str(epoch) + ".ckpt")
        logger.Log.info("Model saved in file: " + str(save_path))

        if avg_bisk_metric >= min_avg_bisk_metric:
            if patience == max_patience:
                logger.Log.info("Max patience reached. Terminating learning after " +
                                str(epoch) + " epochs and " + str(iteration) + " iterations.")
                break
            else:
                logger.Log.info("Tuning accuracy did not improve. Increasing patience to " +
                                str(patience + 1))
                patience += 1
        else:
            logger.Log.info("Resetting patience to 0")
            patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

    logger.Log.close()
parser = argparse.ArgumentParser(description='PyTorch DDPG')
parser.add_argument('--env-name', default="Walker2d-v1", metavar='G',
                    help='name of the environment to run')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
args = parser.parse_args()

if __name__ == '__main__':
    env = gym.make(args.env_name)
    mem = replay_memory.ReplayMemory(1000000)
    trainer = train_networks.Training(env.observation_space.shape[0],
                                      env.action_space.shape[0],
                                      env.action_space.high[0], mem)

    # for i_episode in count(1):
    #     num_episodes = 0
    #     num_steps = 0
    #     reward_batch = 0
    #     while num_steps < 1000:
    num_episodes = 0
    reward_batch = 0
    for i in range(no_of_episodes):
        obs = env.reset()