def __init__(self, discount, num_iterations, lamb, animate, kl_target, **kwargs):
    self.env_name = 'RoboschoolHumanoidFlagrun-v1'
    self.env = gym.make(self.env_name)
    gym.spaces.seed(1234)  # for reproducibility
    self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as a feature
    self.act_dim = self.env.action_space.shape[0]
    self.discount = discount
    self.num_iterations = num_iterations
    self.lamb = lamb
    self.animate = animate
    self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1000000 is the size used in the paper
    self.episodes = 20  # more episodes per batch reduces variance
    self.killer = GracefulKiller()
    self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
    self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
    self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

    if 'show' in kwargs and not kwargs['show']:
        # save copies of the source files for this run
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)

        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True

    print('Observation dimension:', self.obs_dim)
    print('Action dimension:', self.act_dim)

    # The use of a scaler is crucial
    self.scaler = Scaler(self.obs_dim)
    self.init_scaler()
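# Hedged sketch of how a running Scaler of this kind is typically consumed per
# observation. get() returning (scale, offset) is an assumption, not something
# the snippet above confirms.
def normalize_obs(scaler, obs):
    scale, offset = scaler.get()  # running estimates of 1/std and mean
    return (obs - offset) * scale  # standardize before feeding the policy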
def test_map():
    buf1 = Buffer([1, 2, 3])
    squared = lambda x: x**2
    buf2 = buf1.map(squared)
    assert_equal(buf1.bytes, [1, 2, 3])
    assert_equal(buf2.bytes, [1, 4, 9])
def test_to_file():
    filepath = './tests/resources/buf_out_test.txt'
    buf = Buffer('to file test')
    buf.to_file(filepath, 'hex')
    with open(filepath, 'r') as f:  # close the handle instead of leaking it
        data = f.read()
    assert_equal(data, '746f2066696c652074657374')
def session_key(self, public_B):
    """Generate a session secret given the other party's public key."""
    raw_secret = gmp.powmod(public_B.to_mpz(), self._secret_key, self._dh_p)
    # Hash the shared secret to derive a fixed-length key
    h_256 = SHA256.new()
    h_256.update(raw_secret.digits(10))
    raw_key = h_256.digest()
    return Buffer(raw_key)
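# Hedged sketch of the Diffie-Hellman symmetry session_key relies on: each
# party raises the other's public value to its own secret and both land on the
# same shared secret. Toy parameters only; note gmpy2's digits(10) returns str,
# so it is encoded here before hashing (the method above assumes Python 2
# semantics, where str is bytes).
import gmpy2 as gmp
from Crypto.Hash import SHA256

p, g = gmp.mpz(23), gmp.mpz(5)                    # toy group, not for real use
a, b = gmp.mpz(6), gmp.mpz(15)                    # the two parties' secret keys
A, B = gmp.powmod(g, a, p), gmp.powmod(g, b, p)   # exchanged public keys
assert gmp.powmod(B, a, p) == gmp.powmod(A, b, p)
key = SHA256.new(gmp.powmod(B, a, p).digits(10).encode()).digest()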
def parse_message(msg: bytes, conn, dir_):
    size = unpack("<I", msg[:4])[0]
    msg_type = unpack("<I", msg[4:8])[0]
    message_type = message_type_table[msg_type]
    end = -(len(msg) - size - 8)
    if end == 0:
        end = None
    msg = msg[8:end]
    buf = Buffer()
    buf.write(msg)
    message = message_type.unpack(buf)
    logging.info(("\033[36mSEND >>> \033[0m" if dir_ == 1
                  else "\033[32mRECV <<< \033[0m") + repr(message))
    action = message_action_table.get(msg_type, None)
    if action is not None and conn is not None:
        action(message, conn)
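# Hedged round-trip sketch: the parser above expects an 8-byte header of two
# little-endian uint32s (payload size, then message type) followed by the
# payload. frame() is a hypothetical helper, not part of the original module.
from struct import pack

def frame(msg_type: int, payload: bytes) -> bytes:
    return pack("<II", len(payload), msg_type) + payload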
def __init__(self, wid, shared_model, model, optimizer, gamma=0.99, n_steps=8):
    # configuration
    self.worker_id = wid
    self.gamma = gamma
    self.n_steps = n_steps
    # experience buffer
    self.buffer = Buffer(size=n_steps)
    # network-related settings
    self.global_net = shared_model
    self.local_net = model
    self.optimizer = optimizer
    # deep copy: synchronize the local net with the shared model
    self.local_net.load_state_dict(self.global_net.state_dict())
def __init__(self, action_set, reward_function, feature_extractor,
             hidden_dims=[50, 50], learning_rate=5e-4, buffer_size=50000,
             batch_size=64, num_batches=100, starts_learning=5000,
             final_epsilon=0.02, discount=0.99, target_freq=10,
             verbose=False, print_every=1, test_model_path=None):
    Agent.__init__(self, action_set, reward_function)
    self.feature_extractor = feature_extractor
    self.feature_dim = self.feature_extractor.dimension

    # build the Q network, a multilayer perceptron
    dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]
    self.model = MLP(dims)

    if test_model_path is None:
        self.test_mode = False
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.starts_learning = starts_learning
        self.epsilon = 1.0  # anneals as starts_learning / (starts_learning + t)
        self.final_epsilon = final_epsilon  # was hard-coded to 0.02, shadowing the argument
        self.timestep = 0
        self.discount = discount

        self.buffer = Buffer(self.buffer_size)

        self.target_net = MLP(dims)
        self.target_net.load_state_dict(self.model.state_dict())
        self.target_net.eval()
        self.target_freq = target_freq  # target net updated every target_freq episodes
        self.num_episodes = 0

        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # for debugging purposes
        self.verbose = verbose
        self.running_loss = 1.
        self.print_every = print_every
    else:
        self.test_mode = True
        self.model.load_state_dict(torch.load(test_model_path))
        self.model.eval()
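# The annealing schedule named in the epsilon comment above, as a standalone
# sketch; the actual update lives elsewhere in the agent, so treat this as an
# assumption about its form.
def annealed_epsilon(starts_learning, t, final_epsilon):
    return max(final_epsilon, starts_learning / (starts_learning + t))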
def __init__(self, model, env, args, state):
    self.model = model
    self.env = env
    self.state = state
    self.hx = None
    self.cx = None
    self.eps_len = 0
    self.args = args
    self.values = []
    self.log_probs = []
    self.rewards = []
    self.entropies = []
    self.done = True
    self.info = None
    self.reward = 0
    self.gpu_id = -1
    self.position_history = Buffer(200)
def __init__(self, env_name, discount, num_iterations, lamb, animate, kl_target, show):
    self.env_name = env_name
    self.env = gym.make(env_name)
    if env_name == "FetchReach-v0":
        self.env = gym.wrappers.FlattenDictWrapper(
            self.env, ['observation', 'desired_goal', 'achieved_goal'])
    gym.spaces.seed(1234)  # for reproducibility
    self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as a feature
    self.act_dim = self.env.action_space.shape[0]
    self.discount = discount
    self.num_iterations = num_iterations
    self.lamb = lamb
    self.animate = animate
    self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)
    self.episodes = 20
    self.killer = GracefulKiller()
    self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
    self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
    # using the MC return would be more helpful
    self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

    if not show:
        # save copies of the source files for this run
        shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
        shutil.copy(inspect.getfile(self.__class__), OUTPATH)

        self.log_file = open(OUTPATH + 'log.csv', 'w')
        self.write_header = True

    print('observation dimension:', self.obs_dim)
    print('action dimension:', self.act_dim)

    # Use of a scaler is crucial
    self.scaler = Scaler(self.obs_dim)
    self.init_scaler()
def __init__(self, ip, port):
    self.stream = Buffer()
    self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    self.s.connect((ip, port))

    # ECDH handshake: receive the server's 33-byte compressed public key and
    # multiply its curve point by our ephemeral secret key
    raw_pk = self.s.recv(33)
    self.pk = GraphenePublicKey(raw_pk.hex())
    sk = PrivateKey()
    point = self.pk.point() * int.from_bytes(bytes(sk), "big")
    x: int = point.x()
    raw_data = x.to_bytes(32, "big")
    self.shared_secret = sha512(raw_data).digest()

    # derive the AES-CBC key and IV from the shared secret
    key = sha256(self.shared_secret).digest()
    crc = cityhash.CityHash128(self.shared_secret)
    data = crc.to_bytes(16, "little")
    iv = data[8:16] + data[:8]
    self.s.sendall(bytes.fromhex(repr(sk.pubkey)))
    self.encryptor = AES.new(key, AES.MODE_CBC, iv)
    self.test = AES.new(key, AES.MODE_CBC, iv)
    self.decryptor = AES.new(key, AES.MODE_CBC, iv)

    self.worker_thread = threading.Thread(target=self.worker)
    self.worker_thread.start()

    self.send(
        5006, {
            "user_agent": "Haruka Mock Client",
            "core_protocol_version": 106,
            "inbound_address": "0.0.0.0",
            "inbound_port": 0,
            "outbound_port": 0,
            "node_public_key": sk.pubkey,
            "signed_shared_secret": ecdsa.sign_message(self.shared_secret, str(sk)),
            "chain_id": bytes.fromhex(
                "4018d7844c78f6a6c41c6a552b898022310fc5dec06da467ee7905a8dad512c8"),
            "user_data": {
                "platform": String("unknown")
            }
        })
def echo_get_intention(message):
    global small_talk_indicator
    answer, intent, action = detect_intent_texts(project_id, session_id,
                                                 [message.text], language_code)
    if intent:
        bot.send_message(message.chat.id, intent)
    else:
        bot.send_message(message.chat.id, 'no intent')
    messages_list.append(message.text)
    intents_list.append(intent)

    if intent == 'my.make_order':
        small_talk_indicator = False
        bot.send_message(message.chat.id, answer)
    elif small_talk_indicator or intent == 'my.lets_make_conversation' \
            or action == 'smalltalk.appraisal.thank_you':
        small_talk_indicator = True
        # answer, intent = detect_intent_texts(project_id, session_id, [message.text], language_code)
        bot.send_message(message.chat.id, answer)
    else:
        wish = get_item_for_search(message.text)
        if 'fail' not in wish:
            chat_id = message.chat.id
            markup = generate_irec_markup()
            user = Buffer(wish)
            user_dict[chat_id] = user
            # "Choose a selection method:"
            bot.send_message(message.chat.id, 'Выбери способ подбора:', reply_markup=markup)
        else:
            # "You want a lot 🗿 / Try phrasing it differently."
            negative = ''.join(['Много хочешь ', u'\U0001F5FF', '\n',
                                'Попробуй написать по-другому.'])
            bot.reply_to(message, negative)
def _init_memory(self):
    self._buffer = Buffer(max_size=max_memory)  # max_memory comes from the enclosing scope
def __init__(self, action_set, reward_function, prior_variance, noise_variance,
             feature_extractor, prior_network, num_ensemble,
             hidden_dims=[10, 10], learning_rate=5e-4, buffer_size=50000,
             batch_size=64, num_batches=100, starts_learning=5000,
             discount=0.99, target_freq=10, verbose=False, print_every=1,
             test_model_path=None):
    Agent.__init__(self, action_set, reward_function)
    self.prior_variance = prior_variance
    self.noise_variance = noise_variance

    self.feature_extractor = feature_extractor
    self.feature_dim = self.feature_extractor.dimension
    dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]

    self.prior_network = prior_network
    self.num_ensemble = num_ensemble  # number of models in the ensemble
    self.index = np.random.randint(self.num_ensemble)

    # build the Q networks, each a multilayer perceptron
    if test_model_path is None:
        self.test_mode = False
        self.learning_rate = learning_rate
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.num_batches = num_batches
        self.starts_learning = starts_learning
        self.discount = discount
        self.timestep = 0

        self.buffer = Buffer(self.buffer_size)

        self.models = []
        for i in range(self.num_ensemble):
            if self.prior_network:
                # The second network is a prior network whose weights are fixed;
                # the first is the difference network, whose weights are learned.
                self.models.append(
                    DQNWithPrior(dims, scale=np.sqrt(self.prior_variance)).to(device))
            else:
                self.models.append(MLP(dims).to(device))
            self.models[i].initialize()

        # The prior networks' weights are immutable, so it is enough to keep
        # the difference networks.
        self.target_nets = []
        for i in range(self.num_ensemble):
            if self.prior_network:
                self.target_nets.append(
                    DQNWithPrior(dims, scale=np.sqrt(self.prior_variance)).to(device))
            else:
                self.target_nets.append(MLP(dims).to(device))
            self.target_nets[i].load_state_dict(self.models[i].state_dict())
            self.target_nets[i].eval()

        self.target_freq = target_freq  # target nets updated every target_freq episodes
        self.num_episodes = 0

        self.optimizer = []
        for i in range(self.num_ensemble):
            self.optimizer.append(
                torch.optim.Adam(self.models[i].parameters(), lr=self.learning_rate))

        # for debugging purposes
        self.verbose = verbose
        self.running_loss = 1.
        self.print_every = print_every
    else:
        self.models = []
        self.test_mode = True
        if self.prior_network:
            # np.sqrt matches the scale used at training time above
            # (the original passed the raw variance here)
            self.models.append(DQNWithPrior(dims, scale=np.sqrt(self.prior_variance)))
        else:
            self.models.append(MLP(dims))
        self.models[0].load_state_dict(torch.load(test_model_path))
        self.models[0].eval()
        self.index = 0
def test_xor():
    buf1 = Buffer('abc')
    buf2 = Buffer(' ')
    xord = buf1.xor(buf2)
    assert_equal(xord.to_string(), 'ABC')
def train(rank, args, input_model=None, max_iter=100000, step_test=-1, log=False):
    if rank >= 0:
        torch.manual_seed(args.seed + rank)  # seeded once; the original repeated this call
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    env = create_env(args)
    env.seed(args.seed + rank)
    if log:
        log = setup_logger("{0}_{1}_log".format(args.scale_legs, rank),
                           "logs/{0}_{1}_log".format(args.scale_legs, rank))

    # player initialization
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    # load the input model into the player
    if input_model is not None:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(input_model.state_dict())
        else:
            player.model.load_state_dict(input_model.state_dict())

    # initialize the player optimizer; the original if/else chain silently
    # replaced an RMSprop choice with SGD, so use elif instead
    if args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(player.model.dictForOptimizer(), lr=args.lr)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam(player.model.dictForOptimizer(), lr=args.lr)
    else:
        optimizer = optim.SGD(player.model.dictForOptimizer(), lr=args.lr)

    # reset the environment and initialize the player state
    player.state = player.env.reset(args)
    player.state = torch.from_numpy(player.state).float()

    # if on GPU, move the state and model there
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    last_iter = 0
    mean_buf = Buffer(5)

    # start looping over episodes
    for iteration in range(max_iter):
        last_iter += iteration
        # reset cx and hx if the environment is over
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # roll out actions and collect rewards for one episode
        for step in range(args.num_steps):
            player.action_train()
            if player.done:
                break

        if player.done:
            player.eps_len = 0
            # reset state
            state = player.env.reset(args)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)
        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model((Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)

        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        optimizer.step()
        player.clear_actions()

        if step_test > 0 and iteration % step_test == 0:
            tester = Tester(args, player.model)
            score = tester.test(last_iter)
            mean_buf.push(score)
            recent_mean = sum(mean_buf.bf) / mean_buf.current_size
            text = "Iteration {0}, episode reward {1}, recent reward mean {2}".format(
                iteration, score, recent_mean)
            log.info(text)

    tester = Tester(args, player.model)
    fitness = tester.test(last_iter)
    return fitness
# critic_model.save_weights(".model/billiard_critic.h5")
# target_actor.save_weights(".model/billiard_target_actor.h5")
# target_critic.save_weights(".model/billiard_target_critic.h5")

if __name__ == "__main__":
    env = gym.make("billiard-v0")
    env_info = {
        "num_states": env.observation_space.shape,
        "num_actions": env.action_space.shape,
        "upper_bound": env.action_space.high[0],
        "lower_bound": env.action_space.low[0],
    }

    actor_model = get_actor(**env_info)
    critic_model = get_critic(**env_info)
    target_actor = get_actor(**env_info)
    target_critic = get_critic(**env_info)

    # learning rates for the actor-critic models
    critic_lr = 0.002
    actor_lr = 0.001
    critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
    actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    buffer = Buffer(actor_model, critic_model, target_actor, target_critic,
                    critic_optimizer, actor_optimizer,
                    buffer_capacity=50000, batch_size=256, **env_info)

    train(env_info, buffer, total_episodes=1000)
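# The script above relies on target networks; a common companion is a Polyak
# soft update after each learning step. This helper is a sketch under that
# assumption: tau and the call site are not shown in the original.
def update_target(target, source, tau=0.005):
    for t_var, s_var in zip(target.variables, source.variables):
        t_var.assign(t_var * (1.0 - tau) + s_var * tau)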
def test_to_mpz():
    buf = Buffer([255, 255])
    assert_equal(buf.to_mpz(), mpz(65535))
def test_to_bin():
    buf = Buffer('Az')
    assert_equal(buf.to_bin(), '0100000101111010')
def test_properties():
    buf = Buffer([1, 2, 3])
    assert_equal(buf.bytes, [1, 2, 3])
    assert_equal(buf.size, 3)
DEBUG_CONNECT = True  # show python-OBD debug info
UPDATE_FREQ = 2.5  # wait 1/freq seconds each frame
PORT = "/dev/rfcomm0"
FONT = "Arial 20"
BUFFER_SIZE = 60  # buffer & graph the last x measurements
GRAPH_Y_MAX = 100

watchlist = [["ELM_VOLTAGE", "RPM", "SPEED", "ENGINE_LOAD"],
             ["FUEL_LEVEL", "COOLANT_TEMP", "OIL_TEMP", "INTAKE_TEMP"],
             ["RUN_TIME", "THROTTLE_POS", "TIMING_ADVANCE", "BAROMETRIC_PRESSURE"]]

history = {
    k: Buffer(maxlen=BUFFER_SIZE)
    for k in itertools.chain.from_iterable(watchlist)
}

labels = ("s1", "s2", "s3", "s4")
# layout: sX_l for label, sX_d for data
layout = [[
    sg.Text("", size=(15, 1), key=f"{label}_l", font=FONT),
    sg.Text("", size=(15, 1), key=f"{label}_d", font=FONT, justification='left'),
    sg.Graph(canvas_size=(170, 120),
             graph_bottom_left=(-BUFFER_SIZE - 1, -GRAPH_Y_MAX * 1.05),
             graph_top_right=(1, GRAPH_Y_MAX * 1.05),
             background_color="white",
def test_to_hex():
    buf = Buffer('hex test')
    assert_equal(buf.to_hex(), '6865782074657374')
"SpaceInvaders-v0", "Seaquest-v0", "LunarLanderV2", "Reacher-v2", "FrozenLake-v0" ] env = gym.make('MountainCar-v0') obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0 epochs = 10 local_steps_per_epoch = 1000 # tf.set_random_seed(22222) agent = PPOAgent(env.observation_space, env.action_space) buffer = Buffer(env.observation_space.shape, env.action_space.shape, size=local_steps_per_epoch) rewards = [0] for epoch in tqdm(range(epochs)): # print("Epoch {} Reward {}".format(epoch, rewards[-1])) for t in range(local_steps_per_epoch): act, v_t, logp_pi = agent.get_action(obs) buffer.store(obs, act, rew, v_t, logp_pi) # Last var is logpi (not used in vpg) obs, rew, done, _ = env.step(act[0]) ep_ret += rew ep_len += 1 if done or (t==local_steps_per_epoch-1): # if not done:
def test_get():
    buf = Buffer([0, 16, 0])
    assert_equal(buf.get(1), 16)
def test_copy():
    buf1 = Buffer([1, 2, 3])
    buf2 = buf1.copy()
    assert_equal(buf2.bytes, [1, 2, 3])
    assert_not_equal(id(buf1), id(buf2))
# %%
# optimizers and losses
LGAN = MSELoss()
LCYC = L1Loss()
LIdentity = L1Loss()
optimizer_G = Adam(itertools.chain(G12.parameters(), G21.parameters()), lr=0.001)
optimizer_D1 = Adam(D1.parameters(), lr=0.001)
optimizer_D2 = Adam(D2.parameters(), lr=0.001)

# %%
# train models
real_label = torch.full((32,), 1, device="cuda:0")
false_label = torch.full((32,), 0, device="cuda:0")
bufD1 = Buffer(50)
bufD2 = Buffer(50)
num_epochs = 100
learning_rate = 0.01
for epoch in range(num_epochs):
    for i, (realA, realB) in enumerate(dataloader):
        if torch.cuda.is_available():
            # .cuda() is not in-place on tensors; rebind the results
            realA = realA.cuda()
            realB = realB.cuda()

        # ------------ Generator 1->2 and 2->1 ------------- #
        optimizer_G.zero_grad()
        fakeB = G12(realA)
        pred_fakeB = D2(fakeB)
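# bufD1/bufD2 above look like CycleGAN-style image pools: the discriminators
# are trained on a history of generated images rather than only the latest
# batch. A minimal sketch of such a pool, assuming that is what Buffer(50)
# implements here:
import random
import torch

class ImagePool:
    def __init__(self, capacity):
        self.capacity = capacity
        self.images = []

    def push_and_pop(self, batch):
        """Store new fakes and return a mix of fresh and historical ones."""
        out = []
        for img in batch:
            img = img.unsqueeze(0)
            if len(self.images) < self.capacity:
                self.images.append(img)
                out.append(img)
            elif random.random() > 0.5:
                idx = random.randrange(self.capacity)
                out.append(self.images[idx].clone())
                self.images[idx] = img
            else:
                out.append(img)
        return torch.cat(out, dim=0)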
for _ in range(parameters["n_simulations"]):
    keras.backend.clear_session()
    alpha = 0.001
    model = Sequential()
    model.add(Dense(parameters["hidden_size"],
                    input_dim=env.observation_space.shape[0],
                    activation='tanh'))
    model.add(Dense(env.action_space.n, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(lr=alpha))

    dqn_learner = Dqn(model=model,
                      buffer=Buffer(parameters["buffer_size"]),
                      env=env,
                      gamma=parameters["gamma"],
                      epsilon=parameters["epsilon"],
                      decayment_rate=parameters["decayment_rate"],
                      episodes=parameters["episodes"],
                      max_steps=parameters["max_steps"],
                      batch_size=parameters["batch_size"])
    dqn_learner.train()
    rewards.append(dqn_learner.rewards_greedy)
    steps.append(dqn_learner.steps_greedy)

rewards = np.array(rewards)
steps = np.array(steps)
def test_set():
    buf = Buffer([0, 0, 0, 0])
    buf.set(1, 16)
    assert_equal(buf.get(1), 16)
def test_concat():
    buf1 = Buffer('abc')
    buf2 = Buffer('xyz')
    combined = buf1.concat(buf2)
    assert_equal(combined.to_string(), 'abcxyz')
def main(args):
    torch.manual_seed(args.seed)

    # start simulators
    mp.set_start_method('spawn')

    episode_q = Queue()
    player_qs = []
    simulators = []
    for si in range(args.n_simulators):
        player_qs.append(Queue())
        simulators.append(
            mp.Process(target=simulator,
                       args=(si, player_qs[-1], episode_q, args, False)))
        simulators[-1].start()

    return_q = Queue()
    valid_q = Queue()
    valid_simulator = mp.Process(target=simulator,
                                 args=(args.n_simulators, valid_q, return_q, args, True))
    valid_simulator.start()

    env = gym.make(args.env)
    # env = gym.make('Assault-ram-v0')

    n_frames = args.n_frames

    # initialize the replay buffer
    replay_buffer = Buffer(max_items=args.buffer_size,
                           n_frames=n_frames,
                           priority_ratio=args.priority_ratio,
                           store_ratio=args.store_ratio)

    n_iter = args.n_iter
    init_collect = args.init_collect
    n_collect = args.n_collect
    n_value = args.n_value
    n_policy = args.n_policy
    n_hid = args.n_hid
    critic_aware = args.critic_aware
    update_every = args.update_every

    disp_iter = args.disp_iter
    val_iter = args.val_iter
    save_iter = args.save_iter

    max_len = args.max_len
    batch_size = args.batch_size
    max_collected_frames = args.max_collected_frames

    clip_coeff = args.grad_clip
    ent_coeff = args.ent_coeff
    discount_factor = args.discount_factor

    value_loss = -numpy.Inf
    entropy = -numpy.Inf
    valid_ret = -numpy.Inf
    ess = -numpy.Inf

    n_collected_frames = 0
    offset = 0
    return_history = []

    if args.nn == "ff":
        # create a policy
        player = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to(args.device)
        if args.player_coeff > 0.:
            player_old = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to(args.device)
        player_copy = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid, n_out=6).to('cpu')

        # create a value estimator
        value = ff.Value(n_in=128 * n_frames, n_hid=args.n_hid).to(args.device)
        value_old = ff.Value(n_in=128 * n_frames, n_hid=args.n_hid).to(args.device)

        for m in player.parameters():
            m.data.normal_(0., 0.01)
        for m in value.parameters():
            m.data.normal_(0., 0.01)
    elif args.nn == "conv":
        # create a policy
        player = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to(args.device)
        if args.player_coeff > 0.:
            player_old = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to(args.device)
        player_copy = conv.Player(n_frames=n_frames, n_hid=args.n_hid).to('cpu')

        # create a value estimator
        value = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
        value_old = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
    else:
        raise Exception('Unknown type')

    if args.cont:
        files = glob.glob("{}*th".format(args.saveto))
        iterations = [
            int(".".join(f.split('.')[:-1]).split('_')[-1].strip()) for f in files
        ]
        last_iter = numpy.max(iterations)
        offset = last_iter - 1
        print('Reloading from {}_{}.th'.format(args.saveto, last_iter))
        checkpoint = torch.load("{}_{}.th".format(args.saveto, last_iter))
        player.load_state_dict(checkpoint['player'])
        value.load_state_dict(checkpoint['value'])
        return_history = checkpoint['return_history']
        n_collected_frames = checkpoint['n_collected_frames']

    copy_params(value, value_old)
    if args.player_coeff > 0.:
        copy_params(player, player_old)

    # push the initial policy to the simulators
    player.to('cpu')
    copy_params(player, player_copy)
    for si in range(args.n_simulators):
        player_qs[si].put(
            [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
            [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    valid_q.put(
        [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
        [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    player.to(args.device)

    if args.device == 'cuda':
        torch.set_num_threads(1)
    initial = True
    pre_filled = 0

    for ni in range(n_iter):
        # re-initialize optimizers
        opt_player = eval(args.optimizer_player)(player.parameters(),
                                                 lr=args.lr, weight_decay=args.l2)
        opt_value = eval(args.optimizer_value)(value.parameters(),
                                               lr=args.lr, weight_decay=args.l2)

        try:
            if not initial:
                lr = args.lr / (1 + (ni - pre_filled + 1) * args.lr_factor)
                ent_coeff = args.ent_coeff / (1 + (ni - pre_filled + 1) * args.ent_factor)
                print('lr', lr, 'ent_coeff', ent_coeff)
                for param_group in opt_player.param_groups:
                    param_group['lr'] = lr
                for param_group in opt_value.param_groups:
                    param_group['lr'] = lr

            if numpy.mod((ni - pre_filled + 1), save_iter) == 0:
                torch.save(
                    {
                        'n_iter': n_iter,
                        'n_collect': n_collect,
                        'n_value': n_value,
                        'n_policy': n_policy,
                        'max_len': max_len,
                        'n_hid': n_hid,
                        'batch_size': batch_size,
                        'player': player.state_dict(),
                        'value': value.state_dict(),
                        'return_history': return_history,
                        'n_collected_frames': n_collected_frames,
                    },
                    '{}_{}.th'.format(args.saveto, (ni - pre_filled + 1) + offset + 1))

            player.eval()

            ret_ = -numpy.Inf
            while True:
                try:
                    ret_ = return_q.get_nowait()
                except queue.Empty:
                    break
            if ret_ != -numpy.Inf:
                return_history.append(ret_)
                if valid_ret == -numpy.Inf:
                    valid_ret = ret_
                else:
                    valid_ret = 0.9 * valid_ret + 0.1 * ret_
                print('Valid run', ret_, valid_ret)

            # push the latest policy to the simulators
            player.to('cpu')
            copy_params(player, player_copy)
            for si in range(args.n_simulators):
                while True:
                    try:
                        # empty the queue, as a new policy has arrived
                        player_qs[si].get_nowait()
                    except queue.Empty:
                        break
                player_qs[si].put(
                    [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
                    [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
            while True:
                try:
                    # empty the queue, as a new policy has arrived
                    valid_q.get_nowait()
                except queue.Empty:
                    break
            valid_q.put(
                [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
                [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
            player.to(args.device)

            # collect episodes from the simulators
            n_collected_frames_ = 0
            while True:
                try:
                    epi = episode_q.get_nowait()
                    replay_buffer.add(epi[0], epi[1], epi[2], epi[3])
                    n_collected_frames_ = n_collected_frames_ + len(epi[0])
                except queue.Empty:
                    break
                if n_collected_frames_ >= max_collected_frames \
                        and (len(replay_buffer.buffer) + len(replay_buffer.priority_buffer)) > 0:
                    break
            n_collected_frames = n_collected_frames + n_collected_frames_

            if len(replay_buffer.buffer) + len(replay_buffer.priority_buffer) < 1:
                continue
            if len(replay_buffer.buffer) + len(replay_buffer.priority_buffer) < args.initial_buffer:
                if initial:
                    print('Pre-filling the buffer...',
                          len(replay_buffer.buffer) + len(replay_buffer.priority_buffer))
                continue
            else:
                if initial:
                    pre_filled = ni
                    initial = False

            # fit a value function with TD(0)
            value.train()
            for vi in range(n_value):
                if numpy.mod(vi, update_every) == 0:
                    opt_player.zero_grad()
                    opt_value.zero_grad()

                batch = replay_buffer.sample(batch_size)

                batch_x = torch.from_numpy(
                    numpy.stack([ex.current_['obs'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                batch_r = torch.from_numpy(
                    numpy.stack([ex.current_['rew'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                batch_xn = torch.from_numpy(
                    numpy.stack([ex.next_['obs'] for ex in batch
                                 ]).astype('float32')).to(args.device)

                pred_y = value(batch_x)
                pred_next = value_old(batch_xn).clone().detach()
                batch_pi = player(batch_x)

                loss_ = ((batch_r + discount_factor * pred_next.squeeze()
                          - pred_y.squeeze()) ** 2)

                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch
                                 ]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)

                # (clipped) importance weight, because the policy may have
                # changed since the tuple was collected
                log_iw = logp.squeeze().clone().detach() - torch.log(batch_q.squeeze() + 1e-8)
                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                iw = torch.exp(log_iw.clamp(max=0.))

                if args.iw:
                    loss = iw * loss_
                else:
                    loss = loss_
                loss = loss.mean()
                loss.backward()

                if numpy.mod(vi, update_every) == (update_every - 1):
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(value.parameters(), clip_coeff)
                    opt_value.step()
                    copy_params(value, value_old)

                if value_loss < 0.:
                    value_loss = loss_.mean().item()
                else:
                    value_loss = 0.9 * value_loss + 0.1 * loss_.mean().item()

            if numpy.mod((ni - pre_filled + 1), disp_iter) == 0:
                print('# frames', n_collected_frames, 'value_loss', value_loss,
                      'entropy', -entropy, 'ess', ess)

            # fit a policy
            value.eval()
            player.train()
            if args.player_coeff > 0.:
                player_old.eval()

            for pi in range(n_policy):
                if numpy.mod(pi, update_every) == 0:
                    opt_player.zero_grad()
                    opt_value.zero_grad()

                batch = replay_buffer.sample(batch_size)

                # build the arrays in place rather than stacking python lists
                batch_x = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_xn = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_r = numpy.zeros((len(batch)), dtype='float32')[:, None]
                for ei, ex in enumerate(batch):
                    batch_x[ei, :] = ex.current_['obs']
                    batch_xn[ei, :] = ex.next_['obs']
                    batch_r[ei, 0] = ex.current_['rew']

                batch_x = torch.from_numpy(batch_x).to(args.device)
                batch_xn = torch.from_numpy(batch_xn).to(args.device)
                batch_r = torch.from_numpy(batch_r).to(args.device)

                batch_v = value(batch_x).clone().detach()
                batch_vn = value(batch_xn).clone().detach()

                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch
                                 ]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch
                                 ]).astype('float32')).to(args.device)

                batch_pi = player(batch_x)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)
                if args.player_coeff > 0.:
                    batch_pi_old = player_old(batch_x).clone().detach()

                # entropy regularization
                ent = -(batch_pi * torch.log(batch_pi + 1e-8)).sum(1)
                if entropy == -numpy.Inf:
                    entropy = ent.mean().item()
                else:
                    entropy = 0.9 * entropy + 0.1 * ent.mean().item()
                # advantage: r(s, a) + gamma * V(s') - V(s)
                adv = batch_r + discount_factor * batch_vn - batch_v
                # adv = adv / adv.abs().max().clamp(min=1.)

                loss = -(adv * logp).squeeze()
                loss = loss - ent_coeff * ent

                # (clipped) importance weight
                log_iw = logp.squeeze().clone().detach() - torch.log(batch_q + 1e-8)
                iw = torch.exp(log_iw.clamp(max=0.))
                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                if ess == -numpy.Inf:
                    ess = ess_
                else:
                    ess = 0.9 * ess + 0.1 * ess_

                if args.iw:
                    loss = iw * loss

                if critic_aware:
                    # down-weight samples on which the critic is inaccurate
                    pred_y = value(batch_x).squeeze()
                    pred_next = value(batch_xn).squeeze()
                    critic_loss_ = -((batch_r.squeeze() + discount_factor * pred_next
                                      - pred_y) ** 2).clone().detach()
                    critic_loss_ = torch.exp(critic_loss_)
                    loss = loss * critic_loss_

                loss = loss.mean()

                if args.player_coeff > 0.:
                    # interpolate with a cross-entropy term against the old policy
                    loss_old = -(batch_pi_old * torch.log(batch_pi + 1e-8)).sum(1).mean()
                    loss = (1. - args.player_coeff) * loss + args.player_coeff * loss_old

                loss.backward()
                if numpy.mod(pi, update_every) == (update_every - 1):
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(player.parameters(), clip_coeff)
                    opt_player.step()

            if args.player_coeff > 0.:
                copy_params(player, player_old)

        except KeyboardInterrupt:
            print('Terminating...')
            break

    for si in range(args.n_simulators):
        player_qs[si].put("END")

    print('Waiting for the simulators...')
    for si in range(args.n_simulators):
        simulators[si].join()  # was simulators[-1].join(), which joined only the last process
    print('Done')
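# The effective-sample-size estimate computed twice in main(), extracted as a
# hypothetical helper. Note it matches ESS = 1 / sum_i w_i^2 only when the log
# importance weights are normalized first; the loop above uses them raw.
import torch

def effective_sample_size(log_iw):
    log_iw = log_iw - torch.logsumexp(log_iw, dim=0)  # normalize the weights
    return torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()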
def test_to_b64():
    buf = Buffer('base64 test')
    assert_equal(buf.to_b64(), 'YmFzZTY0IHRlc3Q=')
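# Taken together, the tests in this section pin down most of Buffer's surface.
# A minimal sketch that would satisfy them (the real class may well differ):
import base64
from gmpy2 import mpz

class Buffer:
    def __init__(self, data):
        # accept either a string or an iterable of byte values
        if isinstance(data, str):
            self.bytes = [ord(c) for c in data]
        else:
            self.bytes = list(data)

    @property
    def size(self):
        return len(self.bytes)

    def get(self, i):
        return self.bytes[i]

    def set(self, i, value):
        self.bytes[i] = value

    def copy(self):
        return Buffer(self.bytes)

    def map(self, fn):
        return Buffer([fn(b) for b in self.bytes])

    def xor(self, other):
        # the shorter buffer is cycled, so Buffer('abc').xor(Buffer(' '))
        # flips case and yields 'ABC'
        return Buffer([b ^ other.bytes[i % other.size]
                       for i, b in enumerate(self.bytes)])

    def concat(self, other):
        return Buffer(self.bytes + other.bytes)

    def to_string(self):
        return ''.join(chr(b) for b in self.bytes)

    def to_hex(self):
        return ''.join('{:02x}'.format(b) for b in self.bytes)

    def to_bin(self):
        return ''.join('{:08b}'.format(b) for b in self.bytes)

    def to_b64(self):
        return base64.b64encode(bytes(self.bytes)).decode()

    def to_mpz(self):
        # big-endian interpretation, so Buffer([255, 255]) -> mpz(65535)
        return mpz(int.from_bytes(bytes(self.bytes), 'big'))

    def to_file(self, path, encoding='hex'):
        with open(path, 'w') as f:
            f.write(self.to_hex() if encoding == 'hex' else self.to_string())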