def __init__(self, discount, num_iterations, lamb, animate, kl_target, **kwargs):
    self.env_name = 'RoboschoolHumanoidFlagrun-v1'
    self.env = gym.make(self.env_name)
    gym.spaces.seed(1234)  # for reproducibility
    self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
    self.act_dim = self.env.action_space.shape[0]
    self.discount = discount
    self.num_iterations = num_iterations
    self.lamb = lamb
    self.animate = animate
    self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1,000,000 is the buffer size used in the paper
    self.episodes = 20  # more episodes per batch reduce variance
    self.killer = GracefulKiller()
    self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
    self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
    self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

    if 'show' in kwargs and not kwargs['show']:
        # save copies of the source files
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)
        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True

    print('Observation dimension:', self.obs_dim)
    print('Action dimension:', self.act_dim)

    # The use of a scaler is crucial
    self.scaler = Scaler(self.obs_dim)
    self.init_scaler()
def test_map():
    buf1 = Buffer([1, 2, 3])
    squared = lambda x: x**2
    buf2 = buf1.map(squared)
    assert_equal(buf1.bytes, [1, 2, 3])
    assert_equal(buf2.bytes, [1, 4, 9])
def test_to_file():
    filepath = './tests/resources/buf_out_test.txt'
    buf = Buffer('to file test')
    buf.to_file(filepath, 'hex')
    with open(filepath, 'r') as f:
        data = f.read()
    assert_equal(data, '746f2066696c652074657374')
def parse_message(msg: bytes, conn, dir_):
    size = unpack("<I", msg[:4])[0]
    msg_type = unpack("<I", msg[4:8])[0]
    message_type = message_type_table[msg_type]
    end = -(len(msg) - size - 8)
    if end == 0:
        end = None
    msg = msg[8:end]
    buf = Buffer()
    buf.write(msg)
    message = message_type.unpack(buf)
    logging.info(("\033[36mSEND >>> \033[0m" if dir_ == 1 else "\033[32mRECV <<< \033[0m") + repr(message))
    action = message_action_table.get(msg_type, None)
    if action is not None and conn is not None:
        action(message, conn)
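A quick standalone check of the 8-byte little-endian header this parser expects; the frame values here are made up for illustration (real type ids come from message_type_table):

from struct import pack, unpack

# hypothetical frame: 4-byte payload size, 4-byte message-type id,
# payload, then zero padding as produced by the sender
frame = pack("<II", 5, 5006) + b"hello" + b"\x00" * 3
size = unpack("<I", frame[:4])[0]
end = -(len(frame) - size - 8) or None  # same padding-strip trick as above
assert frame[8:end] == b"hello"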
def __init__(self, action_set, reward_function, feature_extractor,
             hidden_dims=[50, 50], learning_rate=5e-4, buffer_size=50000,
             batch_size=64, num_batches=100, starts_learning=5000,
             final_epsilon=0.02, discount=0.99, target_freq=10, verbose=False,
             print_every=1, test_model_path=None):
    Agent.__init__(self, action_set, reward_function)
    self.feature_extractor = feature_extractor
    self.feature_dim = self.feature_extractor.dimension

    # build the Q network as a multilayer perceptron
    dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]
    self.model = MLP(dims)

    if test_model_path is None:
        self.test_mode = False
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.starts_learning = starts_learning
        self.epsilon = 1.0  # anneals as starts_learning / (starts_learning + t)
        self.final_epsilon = final_epsilon
        self.timestep = 0
        self.discount = discount

        self.buffer = Buffer(self.buffer_size)

        self.target_net = MLP(dims)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.target_freq = target_freq  # target network updated every target_freq episodes
        self.num_episodes = 0

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # for debugging purposes
        self.verbose = verbose
        self.running_loss = 1.
        self.print_every = print_every
    else:
        self.test_mode = True
        self.model.load_state_dict(torch.load(test_model_path))
        self.model.eval()
def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
    self.env_name = env_name
    self.env = gym.make(env_name)
    if env_name == "FetchReach-v0":
        self.env = gym.wrappers.FlattenDictWrapper(
            self.env, ['observation', 'desired_goal', 'achieved_goal'])
    gym.spaces.seed(1234)
    self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
    self.act_dim = self.env.action_space.shape[0]
    self.discount = discount
    self.num_iterations = num_iterations
    self.lamb = lamb
    self.animate = animate
    self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)
    self.episodes = 20
    self.killer = GracefulKiller()
    self.policy = QPropPolicy(self.obs_dim, self.act_dim,
                              self.env.action_space, kl_target, epochs=20)
    self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                      self.discount, OUTPATH)
    # using the MC return would be more helpful
    self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

    if not show:
        # save copies of the source files
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)
        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True

    print('observation dimension:', self.obs_dim)
    print('action dimension:', self.act_dim)

    # Use of a scaler is crucial
    self.scaler = Scaler(self.obs_dim)
    self.init_scaler()
def unpack(msg: Buffer):
    obj = {}
    count = VarInt.unpack(msg)
    for _ in range(count):
        key = String.unpack(msg).data
        index = ord(msg.read(1))
        type_ = Variant.allowed_types[index]
        value = type_.unpack(msg)
        obj[key] = value
    return VariantObject(obj)
def unpack(msg: Buffer):
    value = 0
    i = 0
    while True:
        byte = ord(msg.read(1))
        value += (byte & 0x7f) << (i * 7)
        if byte & 0x80 != 0x80:
            break
        i += 1
    return value
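For reference, a complementary encoder for the same base-128 varint scheme decoded above; this sketch is inferred from the unpack logic (low-order 7-bit groups first, high bit set as a continuation flag) and is not part of the original codebase:

def pack_varint(value: int) -> bytes:
    # emit the low 7 bits of each group; 0x80 marks "more groups follow"
    out = bytearray()
    while True:
        byte = value & 0x7f
        value >>= 7
        if value:
            out.append(byte | 0x80)
        else:
            out.append(byte)
            return bytes(out)

# round trip: 300 encodes to b"\xac\x02", which the unpack above decodes back to 300
assert pack_varint(300) == b"\xac\x02"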
def session_key(self, public_B):
    """Generate a session secret given the other party's public key"""
    raw_secret = gmp.powmod(public_B.to_mpz(), self._secret_key, self._dh_p)
    # Hash the secret to create a key
    h_256 = SHA256.new()
    h_256.update(raw_secret.digits(10))
    raw_key = h_256.digest()
    return Buffer(raw_key)
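A toy sanity check of the Diffie-Hellman relation this method relies on, with deliberately tiny, insecure parameters: both sides compute the same raw secret because (g^a)^b = (g^b)^a (mod p).

# tiny demo modulus/generator; real deployments use large safe primes
p, g = 23, 5
a, b = 6, 15                          # the two private keys
A, B = pow(g, a, p), pow(g, b, p)     # the exchanged public keys
assert pow(B, a, p) == pow(A, b, p)   # both sides derive the same raw secret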
def echo_get_intention(message):
    global small_talk_indicator
    answer, intent, action = detect_intent_texts(project_id, session_id,
                                                 [message.text], language_code)
    if intent:
        bot.send_message(message.chat.id, intent)
    else:
        bot.send_message(message.chat.id, 'no intent')
    messages_list.append(message.text)
    intents_list.append(intent)
    if intent == 'my.make_order':
        small_talk_indicator = False
        bot.send_message(message.chat.id, answer)
    elif small_talk_indicator or intent == 'my.lets_make_conversation' or action == 'smalltalk.appraisal.thank_you':
        small_talk_indicator = True
        # answer, intent = detect_intent_texts(project_id, session_id, [message.text], language_code)
        bot.send_message(message.chat.id, answer)
    else:
        wish = get_item_for_search(message.text)
        if 'fail' not in wish:
            chat_id = message.chat.id
            markup = generate_irec_markup()
            user = Buffer(wish)
            user_dict[chat_id] = user
            bot.send_message(message.chat.id, 'Choose a selection method:',
                             reply_markup=markup)
        else:
            negative = ''.join([
                'You want a lot ', u'\U0001F5FF', '\n',
                'Try phrasing it differently.'
            ])
            bot.reply_to(message, negative)
def test_get():
    buf = Buffer([0, 16, 0])
    assert_equal(buf.get(1), 16)
class Connection:
    def __init__(self, ip, port):
        self.stream = Buffer()
        self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.s.connect((ip, port))
        raw_pk = self.s.recv(33)
        self.pk = GraphenePublicKey(raw_pk.hex())
        sk = PrivateKey()
        point = self.pk.point() * int.from_bytes(bytes(sk), "big")
        x: int = point.x()
        raw_data = x.to_bytes(32, "big")
        self.shared_secret = sha512(raw_data).digest()
        key = sha256(self.shared_secret).digest()
        crc = cityhash.CityHash128(self.shared_secret)
        data = crc.to_bytes(16, "little")
        iv = data[8:16] + data[:8]
        self.s.sendall(bytes.fromhex(repr(sk.pubkey)))
        self.encryptor = AES.new(key, AES.MODE_CBC, iv)
        self.test = AES.new(key, AES.MODE_CBC, iv)
        self.decryptor = AES.new(key, AES.MODE_CBC, iv)
        self.worker_thread = threading.Thread(target=self.worker)
        self.worker_thread.start()
        self.send(
            5006, {
                "user_agent": "Haruka Mock Client",
                "core_protocol_version": 106,
                "inbound_address": "0.0.0.0",
                "inbound_port": 0,
                "outbound_port": 0,
                "node_public_key": sk.pubkey,
                "signed_shared_secret": ecdsa.sign_message(self.shared_secret, str(sk)),
                "chain_id": bytes.fromhex(
                    "4018d7844c78f6a6c41c6a552b898022310fc5dec06da467ee7905a8dad512c8"),
                "user_data": {
                    "platform": String("unknown")
                }
            })

    def send(self, msg_type, data: dict):
        message_type = message_type_table[msg_type]
        message = message_type(data)
        res = message.pack()
        # for name, type_ in definition.items():
        #     res.extend(pack_field(data.get(name, None), type_))
        length = len(res)
        if length % 16 != 8:
            pad_length = (8 - length % 16)
            if pad_length < 0:
                pad_length += 16
            res += b"\x00" * pad_length
        res = pack("<II", length, msg_type) + res
        data = self.encryptor.encrypt(res)
        logging.debug("SEND >>> %s" % res)
        parse_message(res, None, 1)
        self.s.sendall(data)

    def worker(self):
        data = bytearray()
        while True:
            data.extend(self.s.recv(65535))
            if len(data) % 16 == 0:
                msg = self.decryptor.decrypt(bytes(data))
            else:
                continue
            data = bytearray()
            self.stream.write(msg)
            if len(msg) == 0:
                break
            logging.debug("RECV <<< %s" % msg)
            while self.stream.count():
                size = unpack("<I", self.stream.peek(4))[0]
                expect = size + 8 + (16 - (size + 8) % 16) % 16
                logging.debug("expect %s have %s" % (expect, self.stream.count()))
                if expect <= self.stream.count():
                    parse_message(self.stream.read(expect), self, 2)
                else:
                    break
def _init_memory(self):
    self._buffer = Buffer(max_size=max_memory)
def test_init():
    buf = Buffer.init(5, 16)
    assert_equal(buf.bytes, [16, 16, 16, 16, 16])
class ActorCritic:
    def __init__(self, wid, shared_model, model, optimizer, gamma=0.99, n_steps=8):
        # Configuration
        self.worker_id = wid
        self.gamma = gamma
        self.n_steps = n_steps
        # experience buffer
        self.buffer = Buffer(size=n_steps)
        # network-related settings
        self.global_net = shared_model
        self.local_net = model
        self.optimizer = optimizer
        # deep copy, synchronize with the shared model
        self.local_net.load_state_dict(self.global_net.state_dict())

    def select_action(self, obs):
        obs = Variable(torch.from_numpy(obs).float())
        action_prob, value = self.local_net(obs)
        m = Categorical(action_prob)
        action = m.sample()
        return int(action[0].data.numpy()), m.log_prob(action), value

    def learn(self, next_state, done):
        # compute the n-step approximate return G_{t:t+n}
        V_target = Variable(torch.FloatTensor([0.0])) if done else self.local_net(wrap_as_variable(next_state))[1]
        V_target_list = []
        for experience in self.buffer.get_reversed_experience():
            V_target = experience.reward + self.gamma * V_target
            V_target_list.insert(0, V_target)
        states = numpy.stack(self.buffer.get_n_steps_data().state)
        V_estimates = self.local_net(wrap_as_variable(states))[1]
        V_targets = torch.stack(V_target_list)
        # compute critic loss
        td_error = V_targets.detach() - V_estimates
        critic_loss = td_error * td_error
        # compute actor loss
        log_action_probs = torch.stack(self.buffer.get_n_steps_data().log_action_prob)
        actor_loss = -log_action_probs * td_error.detach()
        total_loss = (actor_loss + critic_loss).mean()
        # reset gradient of the local network
        self.optimizer.zero_grad()
        # loss backpropagation
        total_loss.backward()
        torch.nn.utils.clip_grad_norm(self.local_net.parameters(), 3)
        # ensure the gradient is shared with the global net
        ensure_shared_grad(self.global_net, self.local_net)
        # update network parameters
        self.optimizer.step()
        # synchronize the local net with the global net's parameters
        self.local_net.load_state_dict(self.global_net.state_dict())
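A small numeric check, with made-up numbers, of the reversed accumulation in learn(): each target is G_t = r_t + gamma * G_{t+1}, seeded with the bootstrap value V(s_{t+n}) (or 0 at a terminal state).

gamma, bootstrap = 0.99, 10.0
rewards = [1.0, 0.0, 2.0]
G = bootstrap
targets = []
for r in reversed(rewards):
    G = r + gamma * G
    targets.insert(0, G)
# the first target unrolls to r_0 + g*(r_1 + g*(r_2 + g*V))
expected = 1 + 0.99 * (0 + 0.99 * (2 + 0.99 * 10))
assert abs(targets[0] - expected) < 1e-9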
def test_xor():
    buf1 = Buffer('abc')
    buf2 = Buffer(' ')
    xord = buf1.xor(buf2)
    assert_equal(xord.to_string(), 'ABC')
def test_from_b64():
    buf = Buffer.from_b64('aGVsbG8=')
    assert_equal(buf.to_string(), 'hello')
def get_secret(self):
    """Return the private key as a Buffer instance"""
    return Buffer.from_mpz(self._secret_key)
def test_to_bin():
    buf = Buffer('Az')
    assert_equal(buf.to_bin(), '0100000101111010')
def get_public(self):
    """Return the public key as a Buffer instance"""
    return Buffer.from_mpz(self._public_key)
def test_from_bin():
    buf = Buffer.from_bin('0100000101111010')
    assert_equal(buf.to_string(), 'Az')
def test_concat():
    buf1 = Buffer('abc')
    buf2 = Buffer('xyz')
    combined = buf1.concat(buf2)
    assert_equal(combined.to_string(), 'abcxyz')
class Experiment:
    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name.startswith('Fetch'):  # the FetchReach env is a little bit different
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)  # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1,000,000 is the buffer size used in the paper
        self.episodes = 20  # more episodes per batch reduce variance
        self.killer = GracefulKiller()
        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of the source files
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)
            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collect observations from 5 episodes to initialize the Scaler.
        :return: a properly initialized scaler
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name.startswith('Fetch'):
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform the observation and update the scaler on the fly.
        :param obs: raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect a trajectory of (obs, act, reward, obs_next).
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature to the normalized observation
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name.startswith('Fetch'):
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        """
        Discounted sum of return or advantage estimates along a trajectory.
        :param l: a list of values to be discounted and summed
        :param factor: the discount factor (for disc_sum) or discount*lambda (for GAE)
        :return: discounted sum of l with regard to factor
        """
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of trajectory samples.
        :param episodes: size of the batch
        :return: a batch of samples
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {'observes': observes,
                          'actions': actions,
                          'rewards': rewards,
                          'scaled_rewards': rewards * (1 - self.discount)}
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            print('buffer size:', self.buffer.size())
            i += len(trajectories)

            # For E=20, T=50, the total number of samples would be 1000.
            # In the future this needs to account for non-uniform time steps per episode,
            # e.g. in the Hopper-v2 environment not every episode has the same number of steps.
            # E = len(trajectories)
            # num_samples = np.sum([len(t['rewards']) for t in trajectories])
            gradient_steps = np.sum([len(t['rewards']) for t in trajectories])
            if self.env_name.startswith('Fetch'):
                assert (gradient_steps == 20 * 50)

            """train critic"""
            # train on all samples in the buffer, to the extreme
            # self.critic.fit(self.policy, self.buffer, epochs=20, num_samples=self.buffer.size())
            # train on a few minibatches of samples only
            critic_loss_mean, critic_loss_std = self.critic.another_fit_func(self.policy, self.buffer, gradient_steps)

            """calculation of the episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([self.discounted_sum(t['scaled_rewards'], self.discount) for t in trajectories])

            """use the current batch of samples to update the baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['scaled_rewards'] + self.discount * np.append(t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'], self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])

            """normalize advantage estimates, crucial step"""
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            """compute control variate"""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)
            # cv must not be centered
            # cv = (cv - cv.mean()) / (cv.std() + 1e-6)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]

            """center the learning signal"""
            # advantages and CV should both be of size E*T
            # eta controls the on-off of the control variate
            learning_signal = advantages - eta * cv
            # learning_signal = (learning_signal - learning_signal.mean()) / (learning_signal.std() + 1e-6)

            """controlled Taylor evaluation term"""
            ctrl_taylor = np.concatenate([[eta[i] * act] for i, act in enumerate(self.critic.get_taylor_eval(self.policy, observes))])

            """policy update"""
            ppo_loss, ddpg_loss, kl, entropy, beta = self.policy.update(observes, actions, learning_signal, ctrl_taylor)

            avg_rewards = np.sum(np.concatenate([t['rewards'] for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # save training statistics
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['critic_loss'] = critic_loss_mean
            log['policy_ppo_loss'] = ppo_loss
            log['policy_ddpg_loss'] = ddpg_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(log['steps'], log['rewards']))
            for key in ['critic_loss', 'policy_ppo_loss', 'policy_ddpg_loss', 'value_func_loss', 'kl', 'entropy', 'beta']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped early
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))
        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)
        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        """
        Load all function approximators plus a Scaler.
        The replay buffer is not restored, though.
        :param load_from: directory containing the saved weights
        """
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in the checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/')
        self.value_func.load(load_from + 'value_func/')
        self.critic.load(load_from + 'critic/')
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)

    def demonstrate_agent(self, load_from):
        """
        Simply run the policy without training.
        :param load_from: directory containing the saved weights
        """
        self.load_model(load_from)
        while True:
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(ep_steps, ep_rewards))
def test_to_mpz():
    buf = Buffer([255, 255])
    assert_equal(buf.to_mpz(), mpz(65535))
def test_from_file():
    buf = Buffer.from_file('./tests/resources/buf_in_test.txt', 'b64')
    assert_equal(buf.to_string(), 'Buffer from file test')
def test_from_mpz():
    n_mpz = mpz('65535')
    buf = Buffer.from_mpz(n_mpz)
    assert_equal(buf.bytes, [255, 255])
def test_copy():
    buf1 = Buffer([1, 2, 3])
    buf2 = buf1.copy()
    assert_equal(buf2.bytes, [1, 2, 3])
    assert_not_equal(id(buf1), id(buf2))
def train(rank, args, input_model=None, max_iter=100000, step_test=-1, log=False):
    if rank >= 0:
        torch.manual_seed(args.seed + rank)
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args)
    env.seed(args.seed + rank)
    if log:
        log = setup_logger("{0}_{1}_log".format(args.scale_legs, rank),
                           "logs/{0}_{1}_log".format(args.scale_legs, rank))

    # player initialization
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    # load the input model into the player
    if input_model is not None:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(input_model.state_dict())
        else:
            player.model.load_state_dict(input_model.state_dict())

    # initialize the player optimizer
    optimizer = None
    if args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(player.model.dictForOptimizer(), lr=args.lr)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam(player.model.dictForOptimizer(), lr=args.lr)
    else:
        optimizer = optim.SGD(player.model.dictForOptimizer(), lr=args.lr)

    # reset the environment and initialize the player state
    player.state = player.env.reset(args)
    player.state = torch.from_numpy(player.state).float()

    # if on GPU, move the state and model there
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    last_iter = 0
    mean_buf = Buffer(5)

    # start looping over episodes
    for iteration in range(max_iter):
        last_iter += iteration

        # reset cx and hx if the environment is over
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # roll out actions and collect rewards for one episode
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            # reset state
            state = player.env.reset(args)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model((Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        optimizer.step()
        player.clear_actions()

        if step_test > 0 and iteration % step_test == 0:
            tester = Tester(args, player.model)
            score = tester.test(last_iter)
            mean_buf.push(score)
            recent_mean = sum(mean_buf.bf) / mean_buf.current_size
            text = "Iteration {0}, episode reward {1}, recent reward mean {2}".format(
                iteration, score, recent_mean)
            log.info(text)

    tester = Tester(args, player.model)
    fitness = tester.test(last_iter)
    return fitness
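A worked example, with made-up numbers, of the backwards GAE accumulator in the loop above: delta_t = r_t + gamma*V(s_{t+1}) - V(s_t), and gae <- gamma*tau*gae + delta_t.

gamma, tau = 0.99, 1.0
rewards = [1.0, 0.0]
values = [0.5, 0.4, 0.3]   # V(s_0), V(s_1), and the bootstrap V(s_2)
gae = 0.0
for i in reversed(range(len(rewards))):
    delta = rewards[i] + gamma * values[i + 1] - values[i]
    gae = gamma * tau * gae + delta
# gae now estimates the advantage at t=0
expected = (1.0 + 0.99 * 0.4 - 0.5) + 0.99 * (0.0 + 0.99 * 0.3 - 0.4)
assert abs(gae - expected) < 1e-9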
def test_from_hex():
    buf = Buffer.from_hex('68656c6c6f')
    assert_equal(buf.to_string(), 'hello')
def test_to_b64():
    buf = Buffer('base64 test')
    assert_equal(buf.to_b64(), 'YmFzZTY0IHRlc3Q=')
def test_to_hex():
    buf = Buffer('hex test')
    assert_equal(buf.to_hex(), '6865782074657374')
class Agent(object):
    def __init__(self, model, env, args, state):
        self.model = model
        self.env = env
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True
        self.info = None
        self.reward = 0
        self.gpu_id = -1
        self.position_history = Buffer(200)

    def action_train(self):
        if self.args.model == 'CONV':
            self.state = self.state.unsqueeze(0)
        value, mu, sigma, (self.hx, self.cx) = self.model(
            (Variable(self.state), (self.hx, self.cx)))
        mu = torch.clamp(mu, -1.0, 1.0)
        sigma = F.softplus(sigma) + 1e-5
        eps = torch.randn(mu.size())
        pi = np.array([math.pi])
        pi = torch.from_numpy(pi).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                eps = Variable(eps).cuda()
                pi = Variable(pi).cuda()
        else:
            eps = Variable(eps)
            pi = Variable(pi)
        action = (mu + sigma.sqrt() * eps).data
        act = Variable(action)
        prob = normal(act, mu, sigma, self.gpu_id, gpu=self.gpu_id >= 0)
        action = torch.clamp(action, -1.0, 1.0)
        entropy = 0.5 * ((sigma * 2 * pi.expand_as(sigma)).log() + 1)
        self.entropies.append(entropy)
        log_prob = (prob + 1e-6).log()
        self.log_probs.append(log_prob)
        state, reward, self.done, self.info = self.env.step(
            action.cpu().numpy()[0])
        reward = max(min(float(reward), 1.0), -1.0)
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1
        # update position history
        self.position_history.push(self.env.env.hull.position.x)
        # check for stagnation
        if self._is_stagnating():
            self.done = True
            self.reward = -100
        self.done = self.done or self.eps_len >= self.args.max_episode_length
        self.values.append(value)
        self.rewards.append(reward)
        return self

    def action_test(self):
        with torch.no_grad():
            if self.done:
                if self.gpu_id >= 0:
                    with torch.cuda.device(self.gpu_id):
                        self.cx = Variable(torch.zeros(1, 128).cuda())
                        self.hx = Variable(torch.zeros(1, 128).cuda())
                else:
                    self.cx = Variable(torch.zeros(1, 128))
                    self.hx = Variable(torch.zeros(1, 128))
            else:
                self.cx = Variable(self.cx.data)
                self.hx = Variable(self.hx.data)
            if self.args.model == 'CONV':
                self.state = self.state.unsqueeze(0)
            value, mu, sigma, (self.hx, self.cx) = self.model(
                (Variable(self.state), (self.hx, self.cx)))
        mu = torch.clamp(mu.data, -1.0, 1.0)
        action = mu.cpu().numpy()[0]
        # print("action ====================", action)
        state, self.reward, self.done, self.info = self.env.step(action)
        self.state = torch.from_numpy(state).float()
        if self.gpu_id >= 0:
            with torch.cuda.device(self.gpu_id):
                self.state = self.state.cuda()
        self.eps_len += 1
        # update position history
        self.position_history.push(self.env.env.hull.position.x)
        # check for stagnation
        if self._is_stagnating():
            self.done = True
            self.reward = -100
        self.done = self.done or self.eps_len >= self.args.max_episode_length
        return self

    def _is_stagnating(self):
        if self.position_history.is_full():
            pos_past = self.position_history.get(0)
            pos_now = self.position_history.get(-1)
            if pos_now - pos_past == 0:
                return True
        return False

    def clear_actions(self):
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        return self
# %%
# optimizer and loss
LGAN = MSELoss()
LCYC = L1Loss()
LIdentity = L1Loss()
optimizer_G = Adam(itertools.chain(G12.parameters(), G21.parameters()), lr=0.001)
optimizer_D1 = Adam(D1.parameters(), lr=0.001)
optimizer_D2 = Adam(D2.parameters(), lr=0.001)

# %%
# train models
real_label = torch.full((32, ), 1, device="cuda:0")
false_label = torch.full((32, ), 0, device="cuda:0")
bufD1 = Buffer(50)
bufD2 = Buffer(50)
num_epochs = 100
learning_rate = 0.01
for epoch in range(num_epochs):
    for i, (realA, realB) in enumerate(dataloader):
        if torch.cuda.is_available():
            realA = realA.cuda()  # .cuda() is not in-place; keep the returned tensor
            realB = realB.cuda()

        # ------------ Generators 1->2 and 2->1 ------------ #
        optimizer_G.zero_grad()
        fakeB = G12(realA)
        pred_fakeB = D2(fakeB)
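The two 50-slot buffers above follow the CycleGAN trick of training each discriminator on a history of previously generated images rather than only the latest batch. A sketch of what such a buffer might look like; the actual Buffer class here may differ, and the class and method names below are illustrative:

import random
import torch

class ImageHistoryBuffer:
    """Keep up to max_size past fakes; return a mix of old and new images."""
    def __init__(self, max_size=50):
        self.max_size = max_size
        self.data = []

    def push_and_pop(self, images):
        out = []
        for img in images:
            img = img.unsqueeze(0)
            if len(self.data) < self.max_size:
                self.data.append(img)
                out.append(img)
            elif random.random() > 0.5:
                # swap a stored image out and train on it instead
                idx = random.randrange(self.max_size)
                out.append(self.data[idx].clone())
                self.data[idx] = img
            else:
                out.append(img)
        return torch.cat(out)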
def test_set():
    buf = Buffer([0, 0, 0, 0])
    buf.set(1, 16)
    assert_equal(buf.get(1), 16)
def test_properties():
    buf = Buffer([1, 2, 3])
    assert_equal(buf.bytes, [1, 2, 3])
    assert_equal(buf.size, 3)
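For orientation, a minimal Buffer sketch that satisfies the assertions in the byte-buffer tests above; the API (a list of byte values in .bytes plus codec helpers) is inferred from the tests, not taken from the real implementation:

import base64

class Buffer:
    def __init__(self, data):
        # accept a string of characters or an iterable of byte values
        self.bytes = [ord(c) for c in data] if isinstance(data, str) else list(data)

    @classmethod
    def init(cls, size, fill):
        return cls([fill] * size)

    @classmethod
    def from_hex(cls, s):
        return cls(bytes.fromhex(s))

    @classmethod
    def from_b64(cls, s):
        return cls(base64.b64decode(s))

    @classmethod
    def from_bin(cls, s):
        return cls(int(s[i:i + 8], 2) for i in range(0, len(s), 8))

    @property
    def size(self):
        return len(self.bytes)

    def get(self, i):
        return self.bytes[i]

    def set(self, i, value):
        self.bytes[i] = value

    def to_string(self):
        return ''.join(chr(b) for b in self.bytes)

    def to_hex(self):
        return bytes(self.bytes).hex()

    def to_b64(self):
        return base64.b64encode(bytes(self.bytes)).decode()

    def to_bin(self):
        return ''.join(format(b, '08b') for b in self.bytes)

    def xor(self, other):
        # repeat the shorter key across the longer buffer
        return Buffer(b ^ other.bytes[i % other.size] for i, b in enumerate(self.bytes))

    def concat(self, other):
        return Buffer(self.bytes + other.bytes)

    def copy(self):
        return Buffer(self.bytes)

    def map(self, fn):
        return Buffer(fn(b) for b in self.bytes)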
class Experiment:
    def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate
        self.buffer = Buffer(50000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()
        self.policy = QPropPolicy(self.obs_dim, self.act_dim,
                                  self.env.action_space, kl_target, epochs=5)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                          self.discount, OUTPATH)
        # using the MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of the source files
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)
            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()

    def init_scaler(self):
        """
        Collect observations from 5 episodes (empirically determined) to initialize the Scaler.
        """
        print('Fitting scaler')
        observation_samples = []
        for i in range(5):
            observation = []
            obs = self.env.reset()
            observation.append(obs)
            obs = obs.astype(np.float64).reshape((1, -1))
            done = False
            step = 0
            while not done:
                obs = np.append(obs, [[step]], axis=1)  # add time step feature
                action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
                if self.env_name == "FetchReach-v0":
                    obs_new, reward, done, _ = self.env.step(action.reshape(-1))
                else:
                    obs_new, reward, done, _ = self.env.step(action)
                observation.append(obs_new)
                obs = obs_new.astype(np.float64).reshape((1, -1))
                step += 1e-3
            observation_samples.append(observation)
        observation_samples = np.concatenate(observation_samples, axis=0)
        self.scaler.update(observation_samples)

    def normalize_obs(self, obs):
        """
        Transform the observation and update the scaler on the fly.
        :param obs: raw observation
        :return: normalized observation
        """
        scale, offset = self.scaler.get()
        obs_scaled = (obs - offset) * scale
        self.scaler.update(obs.astype(np.float64).reshape((1, -1)))
        return obs_scaled

    def run_one_episode(self):
        """
        Collect data only.
        :return: observations, actions, and rewards for one episode
        """
        obs = self.env.reset()
        observes, actions, rewards = [], [], []
        done = False
        step = 0
        while not done:
            if self.animate:
                self.env.render()
            obs = obs.astype(np.float64).reshape((1, -1))
            obs = self.normalize_obs(obs)
            obs = np.append(obs, [[step]], axis=1)  # add time step feature
            observes.append(obs)
            action = self.policy.get_sample(obs).reshape((1, -1)).astype(np.float64)
            actions.append(action)
            if self.env_name == "FetchReach-v0":
                obs_new, reward, done, _ = self.env.step(action.reshape(-1))
            else:
                obs_new, reward, done, _ = self.env.step(action)
            if not isinstance(reward, float):
                reward = np.asscalar(reward)
            rewards.append(reward)
            obs = obs_new
            step += 0.003
        return np.concatenate(observes), np.concatenate(actions), np.array(rewards)

    def discounted_sum(self, l, factor):
        discounted = []
        sum = 0
        for i in reversed(l):
            discounted.append(factor * sum + i)
            sum = factor * sum + i
        return np.array(list(reversed(discounted)))

    def run_policy(self, episodes):
        """
        Gather a batch of samples.
        :param episodes: size of the batch
        :return: a batch of trajectories
        """
        trajectories = []
        for e in range(episodes):
            observes, actions, rewards = self.run_one_episode()
            trajectory = {
                'observes': observes,
                'actions': actions,
                'rewards': rewards
            }
            trajectories.append(trajectory)
        return trajectories

    def run_expr(self):
        ep_steps = []
        ep_rewards = []
        ep_entropy = []
        i = 0
        while i < self.num_iterations:
            trajectories = self.run_policy(20)
            # add to experience replay buffer
            self.buffer.append(trajectories)
            i += len(trajectories)

            # For E=20, T=50, the total number of samples would be 1000.
            # In the future this needs to account for non-uniform time steps per episode,
            # e.g. in the Hopper-v2 environment not every episode has the same number of steps.
            E = len(trajectories)
            T = trajectories[0]['observes'].shape[0]

            """train critic"""
            self.critic.fit(self.policy, self.buffer, epochs=1,
                            num_samples=E * T)  # take E*T samples, so E*T gradient steps in total

            """calculation of the episodic discounted return only needs rewards"""
            mc_returns = np.concatenate([
                self.discounted_sum(t['rewards'], self.discount)
                for t in trajectories
            ])

            """use the current batch of samples to update the baseline"""
            observes = np.concatenate([t['observes'] for t in trajectories])
            actions = np.concatenate([t['actions'] for t in trajectories])
            value_func_loss = self.value_func.update(observes, mc_returns)

            """compute GAE"""
            for t in trajectories:
                t['values'] = self.value_func.predict(t['observes'])
                # Is it really legitimate to insert 0 at the last obs?
                t['td_residual'] = t['rewards'] + self.discount * np.append(
                    t['values'][1:], 0) - t['values']
                t['gae'] = self.discounted_sum(t['td_residual'],
                                               self.discount * self.lamb)
            advantages = np.concatenate([t['gae'] for t in trajectories])

            """compute control variate"""
            cv = self.critic.get_contorl_variate(self.policy, observes, actions)

            """conservative control variate"""
            eta = [1 if i > 0 else 0 for i in advantages * cv]

            """center the learning signal"""
            # advantages and CV should both be of size E*T
            # eta controls the on-off of the control variate
            learning_signal = advantages - eta * cv

            """controlled Taylor evaluation term"""
            ctrl_taylor = np.concatenate(
                [[eta[i] * act] for i, act in enumerate(
                    self.critic.get_taylor_eval(self.policy, observes))])

            policy_loss, kl, entropy, beta = self.policy.update(
                observes, actions, learning_signal, ctrl_taylor)

            # normalize advantage estimates
            # advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-6)

            avg_rewards = np.sum(
                np.concatenate([t['rewards']
                                for t in trajectories])) / self.episodes
            avg_timesteps = np.average([len(t['rewards']) for t in trajectories])
            log = {}

            # compute statistics such as mean and std
            log['steps'] = avg_timesteps
            log['rewards'] = avg_rewards
            log['policy_loss'] = policy_loss
            log['kl'] = kl
            log['entropy'] = entropy
            log['value_func_loss'] = value_func_loss
            log['beta'] = beta

            # display
            print('episode: ', i)
            print('average steps: {0}, average rewards: {1}'.format(
                log['steps'], log['rewards']))
            for key in ['policy_loss', 'kl', 'entropy', 'beta', 'value_func_loss']:
                print('{:s}: {:.2g}'.format(key, log[key]))
            print('\n')

            ep_steps.append(log['steps'])
            ep_rewards.append(log['rewards'])
            ep_entropy.append(log['entropy'])

            # write to log.csv
            if self.write_header:
                fieldnames = [x for x in log.keys()]
                self.writer = csv.DictWriter(self.log_file, fieldnames=fieldnames)
                self.writer.writeheader()
                self.write_header = False
            self.writer.writerow(log)
            # we want the csv file to preserve information even if the program terminates earlier than scheduled
            self.log_file.flush()

            # save model weights if stopped manually
            if self.killer.kill_now:
                if input('Terminate training (y/[n])? ') == 'y':
                    break
                self.killer.kill_now = False

            # if (i+1) % 20 == 0:
            #     print('episode: ', i+1)
            #     print('average steps', np.average(steps))
            #     print('average rewards', np.average(rewards))

        self.policy.save(OUTPATH)
        self.value_func.save(OUTPATH)
        self.scaler.save(OUTPATH)

        plt.figure(figsize=(12, 9))
        if self.env_name.startswith('Fetch'):
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('policy entropy')
            plt.plot(ep_entropy)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        else:
            ax1 = plt.subplot(121)
            plt.xlabel('episodes')
            plt.ylabel('steps')
            plt.plot(ep_steps)
            scale_x = self.episodes
            ticks_x = ticker.FuncFormatter(
                lambda x, pos: '{0:g}'.format(x * scale_x))
            ax1.xaxis.set_major_formatter(ticks_x)
        ax2 = plt.subplot(122)
        plt.xlabel('episodes')
        plt.ylabel('episodic rewards')
        plt.plot(ep_rewards)
        scale_x = self.episodes
        ticks_x = ticker.FuncFormatter(lambda x, pos: '{0:g}'.format(x * scale_x))
        ax2.xaxis.set_major_formatter(ticks_x)
        plt.savefig(OUTPATH + 'train.png')

    def load_model(self, load_from):
        from tensorflow.python.tools import inspect_checkpoint as chkp
        # # print all tensors in the checkpoint file
        # chkp.print_tensors_in_checkpoint_file(load_from+'policy/policy.pl', tensor_name='', all_tensors=True, all_tensor_names=True)
        self.policy.load(load_from + 'policy/policy.pl')
        self.value_func.load(load_from + 'value_func/value_func.pl')

    def demonstrate_agent(self, load_from):
        self.load_model(load_from)
        with open(load_from + "scaler.pkl", 'rb') as file:
            self.scaler = pickle.load(file)
        self.animate = True
        for i in range(10):
            observes, actions, rewards = self.run_one_episode()
            ep_rewards = np.sum(rewards)
            ep_steps = len(rewards)
            print("Total steps: {0}, total rewards: {1}\n".format(
                ep_steps, ep_rewards))
def main(args):
    torch.manual_seed(args.seed)

    # start simulators
    mp.set_start_method('spawn')
    episode_q = Queue()
    player_qs = []
    simulators = []
    for si in range(args.n_simulators):
        player_qs.append(Queue())
        simulators.append(
            mp.Process(target=simulator,
                       args=(si, player_qs[-1], episode_q, args, False)))
        simulators[-1].start()
    return_q = Queue()
    valid_q = Queue()
    valid_simulator = mp.Process(
        target=simulator,
        args=(args.n_simulators, valid_q, return_q, args, True))
    valid_simulator.start()

    env = gym.make(args.env)
    # env = gym.make('Assault-ram-v0')
    n_frames = args.n_frames

    # initialize replay buffer
    replay_buffer = Buffer(max_items=args.buffer_size,
                           n_frames=n_frames,
                           priority_ratio=args.priority_ratio,
                           store_ratio=args.store_ratio)

    n_iter = args.n_iter
    init_collect = args.init_collect
    n_collect = args.n_collect
    n_value = args.n_value
    n_policy = args.n_policy
    n_hid = args.n_hid
    critic_aware = args.critic_aware
    update_every = args.update_every
    disp_iter = args.disp_iter
    val_iter = args.val_iter
    save_iter = args.save_iter
    max_len = args.max_len
    batch_size = args.batch_size
    max_collected_frames = args.max_collected_frames
    clip_coeff = args.grad_clip
    ent_coeff = args.ent_coeff
    discount_factor = args.discount_factor

    value_loss = -numpy.Inf
    entropy = -numpy.Inf
    valid_ret = -numpy.Inf
    ess = -numpy.Inf
    n_collected_frames = 0
    offset = 0
    return_history = []

    if args.nn == "ff":
        # create a policy
        player = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to(args.device)
        if args.player_coeff > 0.:
            player_old = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to(args.device)
        player_copy = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to('cpu')
        # create a value estimator
        value = ff.Value(n_in=128 * n_frames, n_hid=args.n_hid).to(args.device)
        value_old = ff.Value(n_in=128 * n_frames, n_hid=args.n_hid).to(args.device)
        for m in player.parameters():
            m.data.normal_(0., 0.01)
        for m in value.parameters():
            m.data.normal_(0., 0.01)
    elif args.nn == "conv":
        # create a policy
        player = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to(args.device)
        if args.player_coeff > 0.:
            player_old = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to(args.device)
        player_copy = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to('cpu')
        # create a value estimator
        value = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
        value_old = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
    else:
        raise Exception('Unknown type')

    if args.cont:
        files = glob.glob("{}*th".format(args.saveto))
        iterations = [
            int(".".join(f.split('.')[:-1]).split('_')[-1].strip())
            for f in files
        ]
        last_iter = numpy.max(iterations)
        offset = last_iter - 1
        print('Reloading from {}_{}.th'.format(args.saveto, last_iter))
        checkpoint = torch.load("{}_{}.th".format(args.saveto, last_iter))
        player.load_state_dict(checkpoint['player'])
        value.load_state_dict(checkpoint['value'])
        return_history = checkpoint['return_history']
        n_collected_frames = checkpoint['n_collected_frames']

    copy_params(value, value_old)
    if args.player_coeff > 0.:
        copy_params(player, player_old)

    # push the initial policy to the simulators
    player.to('cpu')
    copy_params(player, player_copy)
    for si in range(args.n_simulators):
        player_qs[si].put(
            [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
            [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    valid_q.put(
        [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
        [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    player.to(args.device)

    if args.device == 'cuda':
        torch.set_num_threads(1)

    initial = True
    pre_filled = 0
    for ni in range(n_iter):
        # re-initialize optimizers
        opt_player = eval(args.optimizer_player)(player.parameters(), lr=args.lr, weight_decay=args.l2)
        opt_value = eval(args.optimizer_value)(value.parameters(), lr=args.lr, weight_decay=args.l2)
        try:
            if not initial:
                lr = args.lr / (1 + (ni - pre_filled + 1) * args.lr_factor)
                ent_coeff = args.ent_coeff / (1 + (ni - pre_filled + 1) * args.ent_factor)
                print('lr', lr, 'ent_coeff', ent_coeff)
                for param_group in opt_player.param_groups:
                    param_group['lr'] = lr
                for param_group in opt_value.param_groups:
                    param_group['lr'] = lr

            if numpy.mod((ni - pre_filled + 1), save_iter) == 0:
                torch.save(
                    {
                        'n_iter': n_iter,
                        'n_collect': n_collect,
                        'n_value': n_value,
                        'n_policy': n_policy,
                        'max_len': max_len,
                        'n_hid': n_hid,
                        'batch_size': batch_size,
                        'player': player.state_dict(),
                        'value': value.state_dict(),
                        'return_history': return_history,
                        'n_collected_frames': n_collected_frames,
                    }, '{}_{}.th'.format(args.saveto, (ni - pre_filled + 1) + offset + 1))

            player.eval()

            ret_ = -numpy.Inf
            while True:
                try:
                    ret_ = return_q.get_nowait()
                except queue.Empty:
                    break
            if ret_ != -numpy.Inf:
                return_history.append(ret_)
                if valid_ret == -numpy.Inf:
                    valid_ret = ret_
                else:
                    valid_ret = 0.9 * valid_ret + 0.1 * ret_
                print('Valid run', ret_, valid_ret)

            # st = time.time()
            player.to('cpu')
            copy_params(player, player_copy)
            for si in range(args.n_simulators):
                while True:
                    try:
                        # empty the queue, as the new one has arrived
                        player_qs[si].get_nowait()
                    except queue.Empty:
                        break
                player_qs[si].put([
                    copy.deepcopy(p.data.numpy())
                    for p in player_copy.parameters()
                ] + [
                    copy.deepcopy(p.data.numpy())
                    for p in player_copy.buffers()
                ])
            while True:
                try:
                    # empty the queue, as the new one has arrived
                    valid_q.get_nowait()
                except queue.Empty:
                    break
            valid_q.put([
                copy.deepcopy(p.data.numpy())
                for p in player_copy.parameters()
            ] + [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
            player.to(args.device)
            # print('model push took', time.time()-st)

            # st = time.time()
            n_collected_frames_ = 0
            while True:
                try:
                    epi = episode_q.get_nowait()
                    replay_buffer.add(epi[0], epi[1], epi[2], epi[3])
                    n_collected_frames_ = n_collected_frames_ + len(epi[0])
                except queue.Empty:
                    break
                if n_collected_frames_ >= max_collected_frames \
                        and (len(replay_buffer.buffer) + len(replay_buffer.priority_buffer)) > 0:
                    break
            n_collected_frames = n_collected_frames + n_collected_frames_

            if len(replay_buffer.buffer) + len(replay_buffer.priority_buffer) < 1:
                continue
            if len(replay_buffer.buffer) + len(replay_buffer.priority_buffer) < args.initial_buffer:
                if initial:
                    print('Pre-filling the buffer...',
                          len(replay_buffer.buffer) + len(replay_buffer.priority_buffer))
                continue
            else:
                if initial:
                    pre_filled = ni
                    initial = False
            # print('collection took', time.time()-st)
            # print('Buffer size', len(replay_buffer.buffer) + len(replay_buffer.priority_buffer))

            # fit a value function with TD(0)
            # st = time.time()
            value.train()
            for vi in range(n_value):
                if numpy.mod(vi, update_every) == 0:
                    # print(vi, 'zeroing gradient')
                    opt_player.zero_grad()
                    opt_value.zero_grad()
                batch = replay_buffer.sample(batch_size)
                batch_x = torch.from_numpy(
                    numpy.stack([ex.current_['obs'] for ex in batch]).astype('float32')).to(args.device)
                batch_r = torch.from_numpy(
                    numpy.stack([ex.current_['rew'] for ex in batch]).astype('float32')).to(args.device)
                batch_xn = torch.from_numpy(
                    numpy.stack([ex.next_['obs'] for ex in batch]).astype('float32')).to(args.device)
                pred_y = value(batch_x)
                pred_next = value_old(batch_xn).clone().detach()
                batch_pi = player(batch_x)
                loss_ = ((batch_r + discount_factor * pred_next.squeeze() - pred_y.squeeze())**2)
                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch]).astype('float32')).to(args.device)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)
                # (clipped) importance weight:
                # the policy may have changed since the tuple was collected.
                log_iw = logp.squeeze().clone().detach() - torch.log(batch_q.squeeze() + 1e-8)
                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                iw = torch.exp(log_iw.clamp(max=0.))
                if args.iw:
                    loss = iw * loss_
                else:
                    loss = loss_
                loss = loss.mean()
                loss.backward()
                if numpy.mod(vi, update_every) == (update_every - 1):
                    # print(vi, 'making an update')
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(value.parameters(), clip_coeff)
                    opt_value.step()
                    copy_params(value, value_old)
                if value_loss < 0.:
                    value_loss = loss_.mean().item()
                else:
                    value_loss = 0.9 * value_loss + 0.1 * loss_.mean().item()

            if numpy.mod((ni - pre_filled + 1), disp_iter) == 0:
                print('# frames', n_collected_frames, 'value_loss', value_loss,
                      'entropy', -entropy, 'ess', ess)
            # print('value update took', time.time()-st)

            # fit a policy
            # st = time.time()
            value.eval()
            player.train()
            if args.player_coeff > 0.:
                player_old.eval()
            for pi in range(n_policy):
                if numpy.mod(pi, update_every) == 0:
                    opt_player.zero_grad()
                    opt_value.zero_grad()
                # st = time.time()
                batch = replay_buffer.sample(batch_size)
                # print('batch collection took', time.time()-st)
                # st = time.time()
                # batch_x = [ex.current_['obs'] for ex in batch]
                # batch_xn = [ex.next_['obs'] for ex in batch]
                # batch_r = [ex.current_['rew'] for ex in batch]
                # print('list construction took', time.time()-st)
                # st = time.time()
                batch_x = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_xn = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_r = numpy.zeros((len(batch)), dtype='float32')[:, None]
                for ei, ex in enumerate(batch):
                    batch_x[ei, :] = ex.current_['obs']
                    batch_xn[ei, :] = ex.next_['obs']
                    batch_r[ei, 0] = ex.current_['rew']
                # batch_x = numpy.stack(batch_x).astype('float32')
                # batch_xn = numpy.stack(batch_xn).astype('float32')
                # batch_r = numpy.stack(batch_r).astype('float32')[:,None]
                # print('batch stack for value took', time.time()-st)
                # st = time.time()
                batch_x = torch.from_numpy(batch_x).to(args.device)
                batch_xn = torch.from_numpy(batch_xn).to(args.device)
                batch_r = torch.from_numpy(batch_r).to(args.device)
                # print('batch push for value took', time.time()-st)
                # st = time.time()
                batch_v = value(batch_x).clone().detach()
                batch_vn = value(batch_xn).clone().detach()
                # print('value forward pass took', time.time()-st)
                # st = time.time()
                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch]).astype('float32')).to(args.device)
                batch_pi = player(batch_x)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)
                if args.player_coeff > 0.:
                    batch_pi_old = player_old(batch_x).clone().detach()
                # print('policy computation took', time.time()-st)
                # st = time.time()

                # entropy regularization
                ent = -(batch_pi * torch.log(batch_pi + 1e-8)).sum(1)
                if entropy == -numpy.Inf:
                    entropy = ent.mean().item()
                else:
                    entropy = 0.9 * entropy + 0.1 * ent.mean().item()
                # print('entropy computation took', time.time()-st)
                # st = time.time()

                # advantage: r(s,a) + \gamma * V(s') - V(s)
                adv = batch_r + discount_factor * batch_vn - batch_v
                # adv = adv / adv.abs().max().clamp(min=1.)
                loss = -(adv * logp).squeeze()
                loss = loss - ent_coeff * ent
                # print('basic loss computation took', time.time()-st)
                # st = time.time()

                # (clipped) importance weight:
                log_iw = logp.squeeze().clone().detach() - torch.log(batch_q + 1e-8)
                iw = torch.exp(log_iw.clamp(max=0.))
                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                if ess == -numpy.Inf:
                    ess = ess_
                else:
                    ess = 0.9 * ess + 0.1 * ess_
                if args.iw:
                    loss = iw * loss
                # print('importance weighting took', time.time()-st)

                if critic_aware:
                    # st = time.time()
                    pred_y = value(batch_x).squeeze()
                    pred_next = value(batch_xn).squeeze()
                    critic_loss_ = -((batch_r.squeeze() + discount_factor * pred_next - pred_y)**2).clone().detach()
                    critic_loss_ = torch.exp(critic_loss_)
                    loss = loss * critic_loss_
                    # print('critic aware weighting took', time.time()-st)

                loss = loss.mean()

                if args.player_coeff > 0.:
                    # st = time.time()
                    loss_old = -(batch_pi_old * torch.log(batch_pi + 1e-8)).sum(1).mean()
                    loss = (1. - args.player_coeff) * loss + args.player_coeff * loss_old
                    # print('player interpolation took', time.time()-st)

                # st = time.time()
                loss.backward()
                if numpy.mod(pi, update_every) == (update_every - 1):
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(player.parameters(), clip_coeff)
                    opt_player.step()
                # print('backward computation and update took', time.time()-st)

            if args.player_coeff > 0.:
                copy_params(player, player_old)
            # print('policy update took', time.time()-st)
        except KeyboardInterrupt:
            print('Terminating...')
            break

    for si in range(args.n_simulators):
        player_qs[si].put("END")
    print('Waiting for the simulators...')
    for si in range(args.n_simulators):
        simulators[si].join()
    print('Done')
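A standalone illustration, with made-up probabilities, of the clipped importance weight and the effective-sample-size statistic tracked in both loops above: the weight is exp(log pi - log q) clamped at 1, and ess_ = exp(-logsumexp(2*log_iw)), which equals 1 / sum(w_i^2) for unclipped weights.

import torch

logp = torch.log(torch.tensor([0.5, 0.2, 0.9]))  # current policy probs of the taken actions
logq = torch.log(torch.tensor([0.4, 0.5, 0.9]))  # behaviour probs stored with the tuples
log_iw = logp - logq
iw = torch.exp(log_iw.clamp(max=0.))             # never upweight a sample
ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
w = torch.exp(log_iw)
assert abs(ess_ - 1.0 / (w ** 2).sum().item()) < 1e-6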