Example 1
    def __init__(self, discount, num_iterations, lamb, animate, kl_target, **kwargs):
        self.env_name = 'RoboschoolHumanoidFlagrun-v1'
        self.env = gym.make(self.env_name)
        gym.spaces.seed(1234) # for reproducibility
        self.obs_dim = self.env.observation_space.shape[0] + 1 # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)  # 1,000,000 is the replay-buffer size used in the paper
        self.episodes = 20  # collecting more episodes per iteration reduces variance
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim, self.act_dim, self.env.action_space, kl_target, epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim, self.discount, OUTPATH)
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if 'show' in kwargs and not kwargs['show']:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('Observation dimension:', self.obs_dim)
        print('Action dimension:', self.act_dim)

        # The use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()
Example 2
def test_map():
    buf1 = Buffer([1, 2, 3])
    squared = lambda x: x**2
    buf2 = buf1.map(squared)

    assert_equal(buf1.bytes, [1, 2, 3])
    assert_equal(buf2.bytes, [1, 4, 9])
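
For context, here is a minimal sketch of the byte-buffer API that tests like this one assume. `MiniBuffer` is a hypothetical stand-in, not the real Buffer class (which also accepts strings and offers conversions such as to_hex and to_b64):

class MiniBuffer:
    """Hypothetical minimal buffer illustrating the API the tests exercise."""

    def __init__(self, data):
        # Accept a string or an iterable of ints; store bytes as a list of ints.
        self.bytes = [ord(c) for c in data] if isinstance(data, str) else list(data)

    @property
    def size(self):
        return len(self.bytes)

    def get(self, index):
        return self.bytes[index]

    def set(self, index, value):
        self.bytes[index] = value

    def map(self, fn):
        # Return a new buffer; the original instance is left unchanged.
        return MiniBuffer([fn(b) for b in self.bytes])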
Example 3
def test_to_file():
    filepath = './tests/resources/buf_out_test.txt'
    buf = Buffer('to file test')

    buf.to_file(filepath, 'hex')
    with open(filepath, 'r') as f:
        data = f.read()

    assert_equal(data, '746f2066696c652074657374')
Example 4
    def session_key(self, public_B):
        """Generate a session secret given the other party's public key"""
        raw_secret = gmp.powmod(public_B.to_mpz(), self._secret_key,
                                self._dh_p)

        # Hash the secret to create a key
        h_256 = SHA256.new()
        h_256.update(raw_secret.digits(10).encode())  # digits() returns str; the hash needs bytes
        raw_key = h_256.digest()

        return Buffer(raw_key)
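
A hedged usage sketch of this method, assuming a hypothetical DiffieHellman wrapper that exposes its public key as a Buffer alongside session_key; both parties should derive the same 32-byte key:

# Hypothetical usage; DiffieHellman and public_key are assumed names.
alice = DiffieHellman()
bob = DiffieHellman()

key_a = alice.session_key(bob.public_key)  # Buffer wrapping a SHA-256 digest
key_b = bob.session_key(alice.public_key)
assert key_a.bytes == key_b.bytes          # both sides share the same secret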
Example 5
def parse_message(msg: bytes, conn, dir_):
    size = unpack("<I", msg[:4])[0]
    msg_type = unpack("<I", msg[4:8])[0]
    message_type = message_type_table[msg_type]
    end = -(len(msg) - size - 8)
    if end == 0:
        end = None
    msg = msg[8:end]
    buf = Buffer()
    buf.write(msg)
    message = message_type.unpack(buf)
    logging.info(("\033[36mSEND >>> \033[0m" if dir_ ==
                  1 else "\033[32mRECV <<< \033[0m") + repr(message))
    action = message_action_table.get(msg_type, None)
    if action is not None and conn is not None:
        action(message, conn)
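
The inverse operation is not shown; here is a sketch of it, assuming the wire layout implied by the unpack calls above (little-endian uint32 payload size, little-endian uint32 message type, then the payload):

from struct import pack

def build_message(msg_type: int, payload: bytes) -> bytes:
    # Hypothetical counterpart to parse_message: frame = size + type + payload.
    return pack("<I", len(payload)) + pack("<I", msg_type) + payload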
Example 6
    def __init__(self, wid, shared_model, model, optimizer, gamma=0.99, n_steps=8):
        # Configuration
        self.worker_id = wid
        self.gamma = gamma
        self.n_steps = n_steps

        # experience buffer
        self.buffer = Buffer(size=n_steps)
        
        # network related settings
        self.global_net = shared_model
        self.local_net = model
        self.optimizer = optimizer

        # deep copy, synchronize with shared model
        self.local_net.load_state_dict(self.global_net.state_dict())
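
Pulling weights from the shared model is one half of the A3C loop; the other half is pushing the worker's local gradients back before stepping the shared optimizer. A common sketch of that step (hypothetical helper, not part of this snippet):

def ensure_shared_grads(local_net, global_net):
    # Classic A3C pattern: expose the worker's gradients to the shared model
    # so the shared optimizer can step on them.
    for local_p, global_p in zip(local_net.parameters(), global_net.parameters()):
        if global_p.grad is not None:
            return
        global_p._grad = local_p.grad

# typical worker step:
# loss.backward(); ensure_shared_grads(self.local_net, self.global_net); self.optimizer.step()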
Example 7
    def __init__(self, action_set, reward_function, feature_extractor, 
        hidden_dims=[50, 50], learning_rate=5e-4, buffer_size=50000, 
        batch_size=64, num_batches=100, starts_learning=5000, final_epsilon=0.02, 
        discount=0.99, target_freq=10, verbose=False, print_every=1, 
        test_model_path=None):

        Agent.__init__(self, action_set, reward_function)
        self.feature_extractor = feature_extractor
        self.feature_dim = self.feature_extractor.dimension

        # build Q network
        # we use a multilayer perceptron
        dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]
        self.model = MLP(dims)

        if test_model_path is None:
            self.test_mode = False
            self.learning_rate = learning_rate
            self.buffer_size = buffer_size
            self.batch_size = batch_size
            self.num_batches = num_batches
            self.starts_learning = starts_learning
            self.epsilon = 1.0  # annealed as starts_learning / (starts_learning + t)
            self.final_epsilon = final_epsilon
            self.timestep = 0
            self.discount = discount
            
            self.buffer = Buffer(self.buffer_size)

            self.target_net = MLP(dims)
            self.target_net.load_state_dict(self.model.state_dict())
            self.target_net.eval()

            self.target_freq = target_freq # target nn updated every target_freq episodes
            self.num_episodes = 0

            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
            
            # for debugging purposes
            self.verbose = verbose
            self.running_loss = 1.
            self.print_every = print_every

        else:
            self.test_mode = True
            self.model.load_state_dict(torch.load(test_model_path))
            self.model.eval()
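
A sketch of the exploration schedule referenced in the epsilon comment above (hypothetical helper; the decay is floored at final_epsilon):

def annealed_epsilon(starts_learning, timestep, final_epsilon):
    # epsilon decays as starts_learning / (starts_learning + t), never below final_epsilon.
    return max(final_epsilon, starts_learning / (starts_learning + timestep))

# e.g. annealed_epsilon(5000, 10000, 0.02) -> 1/3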
Example 8
    def __init__(self, model, env, args, state):
        self.model = model
        self.env = env
        self.state = state
        self.hx = None
        self.cx = None
        self.eps_len = 0
        self.args = args
        self.values = []
        self.log_probs = []
        self.rewards = []
        self.entropies = []
        self.done = True
        self.info = None
        self.reward = 0
        self.gpu_id = -1
        self.position_history = Buffer(200)
Example 9
    def __init__(self, env_name, discount, num_iterations, lamb, animate,
                 kl_target, show):
        self.env_name = env_name
        self.env = gym.make(env_name)
        if env_name == "FetchReach-v0":
            self.env = gym.wrappers.FlattenDictWrapper(
                self.env, ['observation', 'desired_goal', 'achieved_goal'])
        gym.spaces.seed(1234)
        self.obs_dim = self.env.observation_space.shape[0] + 1  # adding time step as feature
        self.act_dim = self.env.action_space.shape[0]
        self.discount = discount
        self.num_iterations = num_iterations
        self.lamb = lamb
        self.animate = animate

        self.buffer = Buffer(1000000, self.obs_dim, self.act_dim)
        self.episodes = 20
        self.killer = GracefulKiller()

        self.policy = QPropPolicy(self.obs_dim,
                                  self.act_dim,
                                  self.env.action_space,
                                  kl_target,
                                  epochs=20)
        self.critic = DeterministicCritic(self.obs_dim, self.act_dim,
                                          self.discount, OUTPATH)
        # using MC return would be more helpful
        self.value_func = l2TargetValueFunc(self.obs_dim, epochs=10)

        if not show:
            # save copies of file
            shutil.copy(inspect.getfile(self.policy.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.value_func.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.critic.__class__), OUTPATH)
            shutil.copy(inspect.getfile(self.__class__), OUTPATH)

            self.log_file = open(OUTPATH + 'log.csv', 'w')
            self.write_header = True

        print('observation dimension:', self.obs_dim)
        print('action dimension:', self.act_dim)

        # Use of a scaler is crucial
        self.scaler = Scaler(self.obs_dim)
        self.init_scaler()
Example 10
    def __init__(self, ip, port):
        self.stream = Buffer()
        self.s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.s.connect((ip, port))
        raw_pk = self.s.recv(33)
        self.pk = GraphenePublicKey(raw_pk.hex())
        sk = PrivateKey()
        point = self.pk.point() * int.from_bytes(bytes(sk), "big")
        x: int = point.x()
        raw_data = x.to_bytes(32, "big")
        self.shared_secret = sha512(raw_data).digest()
        key = sha256(self.shared_secret).digest()
        crc = cityhash.CityHash128(self.shared_secret)
        data = crc.to_bytes(16, "little")
        iv = data[8:16] + data[:8]
        self.s.sendall(bytes.fromhex(repr(sk.pubkey)))
        self.encryptor = AES.new(key, AES.MODE_CBC, iv)
        self.test = AES.new(key, AES.MODE_CBC, iv)
        self.decryptor = AES.new(key, AES.MODE_CBC, iv)
        self.worker_thread = threading.Thread(target=self.worker)
        self.worker_thread.start()
        self.send(5006, {
            "user_agent": "Haruka Mock Client",
            "core_protocol_version": 106,
            "inbound_address": "0.0.0.0",
            "inbound_port": 0,
            "outbound_port": 0,
            "node_public_key": sk.pubkey,
            "signed_shared_secret": ecdsa.sign_message(self.shared_secret, str(sk)),
            "chain_id": bytes.fromhex(
                "4018d7844c78f6a6c41c6a552b898022310fc5dec06da467ee7905a8dad512c8"),
            "user_data": {"platform": String("unknown")},
        })
Example 11
def echo_get_intention(message):
    global small_talk_indicator
    answer, intent, action = detect_intent_texts(project_id, session_id,
                                                 [message.text], language_code)
    if intent:
        bot.send_message(message.chat.id, intent)
    else:
        bot.send_message(message.chat.id, 'no intent')

    messages_list.append(message.text)
    intents_list.append(intent)
    if intent == 'my.make_order':
        small_talk_indicator = False
        bot.send_message(message.chat.id, answer)

    elif small_talk_indicator or intent == 'my.lets_make_conversation' or action == 'smalltalk.appraisal.thank_you':
        small_talk_indicator = True
        # answer, intent = detect_intent_texts(project_id, session_id, [message.text], language_code)
        bot.send_message(message.chat.id, answer)

    else:
        wish = get_item_for_search(message.text)

        if 'fail' not in wish:
            chat_id = message.chat.id

            markup = generate_irec_markup()

            user = Buffer(wish)

            user_dict[chat_id] = user

            bot.send_message(message.chat.id,
                             'Выбери способ подбора:',
                             reply_markup=markup)
        else:
            negative = ''.join([
                'Много хочешь ', u'\U0001F5FF', '\n',
                'Попробуй написать по-другому.'
            ])
            bot.reply_to(message, negative)
Example 12
    def _init_memory(self):
        self._buffer = Buffer(max_size=max_memory)
Example 13
    def __init__(self,
                 action_set,
                 reward_function,
                 prior_variance,
                 noise_variance,
                 feature_extractor,
                 prior_network,
                 num_ensemble,
                 hidden_dims=[10, 10],
                 learning_rate=5e-4,
                 buffer_size=50000,
                 batch_size=64,
                 num_batches=100,
                 starts_learning=5000,
                 discount=0.99,
                 target_freq=10,
                 verbose=False,
                 print_every=1,
                 test_model_path=None):
        Agent.__init__(self, action_set, reward_function)

        self.prior_variance = prior_variance
        self.noise_variance = noise_variance

        self.feature_extractor = feature_extractor
        self.feature_dim = self.feature_extractor.dimension

        dims = [self.feature_dim] + hidden_dims + [len(self.action_set)]

        self.prior_network = prior_network
        self.num_ensemble = num_ensemble  # number of models in ensemble

        self.index = np.random.randint(self.num_ensemble)

        # build Q network
        # we use a multilayer perceptron

        if test_model_path is None:
            self.test_mode = False
            self.learning_rate = learning_rate
            self.buffer_size = buffer_size
            self.batch_size = batch_size
            self.num_batches = num_batches
            self.starts_learning = starts_learning
            self.discount = discount
            self.timestep = 0

            self.buffer = Buffer(self.buffer_size)
            self.models = []
            for i in range(self.num_ensemble):
                if self.prior_network:
                    '''
                    The second network is a prior network whose weights are fixed;
                    the first network is the difference network that is learned (its weights are mutable).
                    '''
                    self.models.append(
                        DQNWithPrior(dims, scale=np.sqrt(
                            self.prior_variance)).to(device))
                else:
                    self.models.append(MLP(dims).to(device))
                self.models[i].initialize()
            '''
            The prior network's weights are immutable, so it is enough to keep the difference network.
            '''
            self.target_nets = []
            for i in range(self.num_ensemble):
                if self.prior_network:
                    self.target_nets.append(
                        DQNWithPrior(dims, scale=np.sqrt(
                            self.prior_variance)).to(device))
                else:
                    self.target_nets.append(MLP(dims).to(device))
                self.target_nets[i].load_state_dict(
                    self.models[i].state_dict())
                self.target_nets[i].eval()

            self.target_freq = target_freq  #   target nn updated every target_freq episodes
            self.num_episodes = 0

            self.optimizer = []
            for i in range(self.num_ensemble):
                self.optimizer.append(
                    torch.optim.Adam(self.models[i].parameters(),
                                     lr=self.learning_rate))

            # for debugging purposes
            self.verbose = verbose
            self.running_loss = 1.
            self.print_every = print_every

        else:
            self.models = []
            self.test_mode = True
            if self.prior_network:
                self.models.append(
                    DQNWithPrior(dims, scale=self.prior_variance))
            else:
                self.models.append(MLP(dims))
            self.models[0].load_state_dict(torch.load(test_model_path))
            self.models[0].eval()
            self.index = 0
Example 14
def test_xor():
    buf1 = Buffer('abc')
    buf2 = Buffer('   ')
    xord = buf1.xor(buf2)

    assert_equal(xord.to_string(), 'ABC')
Example 15
def train(rank,
          args,
          input_model=None,
          max_iter=100000,
          step_test=-1,
          log=False):
    torch.manual_seed(args.seed + rank)
    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    env = create_env(args)
    env.seed(args.seed + rank)

    if log:
        log = setup_logger("{0}_{1}_log".format(args.scale_legs, rank),
                           "logs/{0}_{1}_log".format(args.scale_legs, rank))

    # player initialization
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    if args.model == 'MLP':
        player.model = A3C_MLP(player.env.observation_space.shape[0],
                               player.env.action_space, args.stack_frames)
    if args.model == 'CONV':
        player.model = A3C_CONV(args.stack_frames, player.env.action_space)

    # load the input model to the player
    if input_model is not None:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(input_model.state_dict())
        else:
            player.model.load_state_dict(input_model.state_dict())

    # initialize the player optimizer
    if args.optimizer == 'RMSprop':
        optimizer = optim.RMSprop(player.model.dictForOptimizer(), lr=args.lr)
    elif args.optimizer == 'Adam':
        optimizer = optim.Adam(player.model.dictForOptimizer(), lr=args.lr)
    else:
        optimizer = optim.SGD(player.model.dictForOptimizer(), lr=args.lr)

    # reset the environment and initialize the player state
    player.state = player.env.reset(args)
    player.state = torch.from_numpy(player.state).float()

    # If on GPU, do as GPU
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()

    player.model.train()

    last_iter = 0

    mean_buf = Buffer(5)
    # Start looping over episodes
    for iteration in range(max_iter):
        last_iter += iteration

        # reset cx and hx if the episode is over
        if player.done:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, 128).cuda())
                    player.hx = Variable(torch.zeros(1, 128).cuda())
            else:
                player.cx = Variable(torch.zeros(1, 128))
                player.hx = Variable(torch.zeros(1, 128))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        # Roll out actions and collect reward for one episode
        for step in range(args.num_steps):
            player.action_train()

            if player.done:
                break

        if player.done:
            player.eps_len = 0
            # reset state
            state = player.env.reset(args)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = torch.zeros(1, 1).cuda()
        else:
            R = torch.zeros(1, 1)

        if not player.done:
            state = player.state
            if args.model == 'CONV':
                state = state.unsqueeze(0)
            value, _, _, _ = player.model(
                (Variable(state), (player.hx, player.cx)))
            R = value.data

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        R = Variable(R)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = torch.zeros(1, 1).cuda()
        else:
            gae = torch.zeros(1, 1)
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = player.rewards[i] + args.gamma * \
                player.values[i + 1].data - player.values[i].data

            gae = gae * args.gamma * args.tau + delta_t

            policy_loss = policy_loss - \
                (player.log_probs[i].sum() * Variable(gae)) - \
                (0.01 * player.entropies[i].sum())

        player.model.zero_grad()
        (policy_loss + 0.5 * value_loss).backward()
        optimizer.step()
        player.clear_actions()

        if step_test > 0 and iteration % step_test == 0:
            tester = Tester(args, player.model)
            score = tester.test(last_iter)
            mean_buf.push(score)
            recent_mean = sum(mean_buf.bf) / mean_buf.current_size
            text = "Iteration {0}, episode reward {1}, recent reward mean {2}".format(
                iteration, score, recent_mean)
            log.info(text)

    tester = Tester(args, player.model)
    fitness = tester.test(last_iter)

    return fitness
Example 16
    # critic_model.save_weights(".model/billiard_critic.h5")
    # target_actor.save_weights(".model/billiard_target_actor.h5")
    # target_critic.save_weights(".model/billiard_target_critic.h5")


if __name__ == "__main__":
    env = gym.make("billiard-v0")
    env_info = {
        "num_states": env.observation_space.shape,
        "num_actions": env.action_space.shape,
        "upper_bound": env.action_space.high[0],
        "lower_bound": env.action_space.low[0],
    }

    actor_model = get_actor(**env_info)
    critic_model = get_critic(**env_info)
    target_actor = get_actor(**env_info)
    target_critic = get_critic(**env_info)

    # Learning rate for actor-critic models
    critic_lr = 0.002
    actor_lr = 0.001
    critic_optimizer = tf.keras.optimizers.Adam(critic_lr)
    actor_optimizer = tf.keras.optimizers.Adam(actor_lr)

    buffer = Buffer(actor_model, critic_model, target_actor, target_critic,
                    critic_optimizer, actor_optimizer,
                    buffer_capacity=50000, batch_size=256, **env_info)

    train(env_info, buffer, total_episodes=1000)
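
One detail this snippet leaves out is how target_actor and target_critic track the online networks; DDPG-style training typically applies a Polyak soft update after each learning step. A hedged sketch (soft_update and tau are assumptions, not part of this code):

def soft_update(target_weights, online_weights, tau=0.005):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    for t, o in zip(target_weights, online_weights):
        t.assign(o * tau + t * (1.0 - tau))

# After each training step, e.g.:
# soft_update(target_actor.variables, actor_model.variables)
# soft_update(target_critic.variables, critic_model.variables)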
Example 17
def test_to_mpz():
    buf = Buffer([255, 255])

    assert_equal(buf.to_mpz(), mpz(65535))
Example 18
def test_to_bin():
    buf = Buffer('Az')

    assert_equal(buf.to_bin(), '0100000101111010')
Example 19
def test_properties():
    buf = Buffer([1, 2, 3])

    assert_equal(buf.bytes, [1, 2, 3])
    assert_equal(buf.size, 3)
Example 20
DEBUG_CONNECT = True  # Show pythonOBD debug info
UPDATE_FREQ = 2.5  # wait 1/freq s each frame
PORT = "/dev/rfcomm0"

FONT = "Arial 20"
BUFFER_SIZE = 60  # buffer & graph last x measurements
GRAPH_Y_MAX = 100

watchlist = [["ELM_VOLTAGE", "RPM", "SPEED", "ENGINE_LOAD"],
             ["FUEL_LEVEL", "COOLANT_TEMP", "OIL_TEMP", "INTAKE_TEMP"],
             [
                 "RUN_TIME", "THROTTLE_POS", "TIMING_ADVANCE",
                 "BAROMETRIC_PRESSURE"
             ]]
history = {
    k: Buffer(maxlen=BUFFER_SIZE)
    for k in itertools.chain.from_iterable(watchlist)
}
labels = ("s1", "s2", "s3", "s4")  # layout: sX_l for label, sX_d for data

layout = [[
    sg.Text("", size=(15, 1), key=f"{label}_l", font=FONT),
    sg.Text("",
            size=(15, 1),
            key=f"{label}_d",
            font=FONT,
            justification='left'),
    sg.Graph(canvas_size=(170, 120),
             graph_bottom_left=(-BUFFER_SIZE - 1, -GRAPH_Y_MAX * 1.05),
             graph_top_right=(1, GRAPH_Y_MAX * 1.05),
             background_color="white",
Example 21
def test_to_hex():
    buf = Buffer('hex test')

    assert_equal(buf.to_hex(), '6865782074657374')
Example 22
    "SpaceInvaders-v0",
    "Seaquest-v0",
    "LunarLanderV2",
    "Reacher-v2",
    "FrozenLake-v0"
]

env = gym.make('MountainCar-v0')
obs, rew, done, ep_ret, ep_len = env.reset(), 0, False, 0, 0

epochs = 10
local_steps_per_epoch = 1000
# tf.set_random_seed(22222)

agent = PPOAgent(env.observation_space, env.action_space)
buffer = Buffer(env.observation_space.shape, env.action_space.shape, size=local_steps_per_epoch)

rewards = [0]
for epoch in tqdm(range(epochs)):
    # print("Epoch {} Reward {}".format(epoch, rewards[-1]))
    for t in range(local_steps_per_epoch):
        act, v_t, logp_pi = agent.get_action(obs)

        buffer.store(obs, act, rew, v_t, logp_pi) # Last var is logpi (not used in vpg)

        obs, rew, done, _ = env.step(act[0])
        ep_ret += rew
        ep_len += 1

        if done or (t==local_steps_per_epoch-1):
            # if not done:
Example 23
def test_get():
    buf = Buffer([0, 16, 0])

    assert_equal(buf.get(1), 16)
Example 24
def test_copy():
    buf1 = Buffer([1, 2, 3])
    buf2 = buf1.copy()

    assert_equal(buf2.bytes, [1, 2, 3])
    assert_not_equal(id(buf1), id(buf2))
Example 25
# %%
# optimizer and loss
LGAN = MSELoss()
LCYC = L1Loss()
LIdentity = L1Loss()

optimizer_G = Adam(itertools.chain(G12.parameters(), G21.parameters()),
                   lr=0.001)
optimizer_D1 = Adam(D1.parameters(), lr=0.001)
optimizer_D2 = Adam(D2.parameters(), lr=0.001)

# %%
# train models
real_label = torch.full((32, ), 1.0, device="cuda:0")
false_label = torch.full((32, ), 0.0, device="cuda:0")
bufD1 = Buffer(50)
bufD2 = Buffer(50)

num_epochs = 100
learning_rate = 0.01
for epoch in range(num_epochs):
    for i, (realA, realB) in enumerate(dataloader):
        if torch.cuda.is_available():
            realA = realA.cuda()  # Tensor.cuda() returns a copy; it is not in-place
            realB = realB.cuda()

        #------------ Generator 1->2 and 2->1 -------------#
        optimizer_G.zero_grad()

        fakeB = G12(realA)
        pred_fakeB = D2(fakeB)
Example 26
for _ in range(parameters["n_simulations"]):

    keras.backend.clear_session()

    alpha = 0.001

    model = Sequential()
    model.add(
        Dense(parameters["hidden_size"],
              input_dim=env.observation_space.shape[0],
              activation='tanh'))
    model.add(Dense(env.action_space.n, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(lr=alpha))

    dqn_learner = Dqn(model=model,
                      buffer=Buffer(parameters["buffer_size"]),
                      env=env,
                      gamma=parameters["gamma"],
                      epsilon=parameters["epsilon"],
                      decayment_rate=parameters["decayment_rate"],
                      episodes=parameters["episodes"],
                      max_steps=parameters["max_steps"],
                      batch_size=parameters["batch_size"])

    dqn_learner.train()

    rewards.append(dqn_learner.rewards_greedy)
    steps.append(dqn_learner.steps_greedy)

rewards = np.array(rewards)
steps = np.array(steps)
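
The Dqn.train loop itself is not shown; as a hedged sketch, the replay update it typically performs with a buffer like this one might look as follows (all names are assumptions, including buffer.sample returning NumPy arrays of states, actions, rewards, next_states, dones):

import numpy as np

def replay_update(model, buffer, batch_size, gamma):
    # Hypothetical DQN update: sample transitions, bootstrap targets, fit the model.
    states, actions, rewards, next_states, dones = buffer.sample(batch_size)
    q_next = model.predict(next_states, verbose=0).max(axis=1)
    targets = model.predict(states, verbose=0)
    targets[np.arange(batch_size), actions] = rewards + gamma * q_next * (1.0 - dones)
    model.fit(states, targets, epochs=1, verbose=0)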
Example 27
def test_set():
    buf = Buffer([0, 0, 0, 0])
    buf.set(1, 16)

    assert_equal(buf.get(1), 16)
Example 28
def test_concat():
    buf1 = Buffer('abc')
    buf2 = Buffer('xyz')
    combined = buf1.concat(buf2)

    assert_equal(combined.to_string(), 'abcxyz')
Example 29
def main(args):

    torch.manual_seed(args.seed)

    # start simulators
    mp.set_start_method('spawn')

    episode_q = Queue()
    player_qs = []
    simulators = []
    for si in range(args.n_simulators):
        player_qs.append(Queue())
        simulators.append(
            mp.Process(target=simulator,
                       args=(
                           si,
                           player_qs[-1],
                           episode_q,
                           args,
                           False,
                       )))
        simulators[-1].start()

    return_q = Queue()
    valid_q = Queue()
    valid_simulator = mp.Process(target=simulator,
                                 args=(
                                     args.n_simulators,
                                     valid_q,
                                     return_q,
                                     args,
                                     True,
                                 ))
    valid_simulator.start()

    env = gym.make(args.env)
    # env = gym.make('Assault-ram-v0')

    n_frames = args.n_frames

    # initialize replay buffer
    replay_buffer = Buffer(max_items=args.buffer_size,
                           n_frames=n_frames,
                           priority_ratio=args.priority_ratio,
                           store_ratio=args.store_ratio)

    n_iter = args.n_iter
    init_collect = args.init_collect
    n_collect = args.n_collect
    n_value = args.n_value
    n_policy = args.n_policy
    n_hid = args.n_hid

    critic_aware = args.critic_aware

    update_every = args.update_every

    disp_iter = args.disp_iter
    val_iter = args.val_iter
    save_iter = args.save_iter

    max_len = args.max_len
    batch_size = args.batch_size
    max_collected_frames = args.max_collected_frames

    clip_coeff = args.grad_clip
    ent_coeff = args.ent_coeff
    discount_factor = args.discount_factor

    value_loss = -numpy.Inf
    entropy = -numpy.Inf
    valid_ret = -numpy.Inf
    ess = -numpy.Inf
    n_collected_frames = 0

    offset = 0

    return_history = []

    if args.nn == "ff":
        # create a policy
        player = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid,
                           n_out=6).to(args.device)
        if args.player_coeff > 0.:
            player_old = ff.Player(n_in=128 * n_frames,
                                   n_hid=args.n_hid,
                                   n_out=6).to(args.device)
        player_copy = ff.Player(n_in=128 * n_frames, n_hid=args.n_hid,
                                n_out=6).to('cpu')

        # create a value estimator
        value = ff.Value(n_in=128 * n_frames, n_hid=args.n_hid).to(args.device)
        value_old = ff.Value(n_in=128 * n_frames,
                             n_hid=args.n_hid).to(args.device)

        for m in player.parameters():
            m.data.normal_(0., 0.01)
        for m in value.parameters():
            m.data.normal_(0., 0.01)
    elif args.nn == "conv":
        # create a policy
        player = conv.Player(n_frames=n_frames,
                             n_hid=args.n_hid).to(args.device)
        if args.player_coeff > 0.:
            player_old = conv.Player(n_frames=n_frames,
                                     n_hid=args.n_hid).to(args.device)
        player_copy = conv.Player(n_frames=n_frames,
                                  n_hid=args.n_hid).to('cpu')

        # create a value estimator
        value = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
        value_old = conv.Value(n_frames, n_hid=args.n_hid).to(args.device)
    else:
        raise Exception('Unknown type')

    if args.cont:
        files = glob.glob("{}*th".format(args.saveto))
        iterations = [
            int(".".join(f.split('.')[:-1]).split('_')[-1].strip())
            for f in files
        ]
        last_iter = numpy.max(iterations)
        offset = last_iter - 1
        print('Reloading from {}_{}.th'.format(args.saveto, last_iter))
        checkpoint = torch.load("{}_{}.th".format(args.saveto, last_iter))
        player.load_state_dict(checkpoint['player'])
        value.load_state_dict(checkpoint['value'])
        return_history = checkpoint['return_history']
        n_collected_frames = checkpoint['n_collected_frames']

    copy_params(value, value_old)
    if args.player_coeff > 0.:
        copy_params(player, player_old)

    # start simulators
    player.to('cpu')
    copy_params(player, player_copy)
    for si in range(args.n_simulators):
        player_qs[si].put(
            [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
            [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    valid_q.put(
        [copy.deepcopy(p.data.numpy()) for p in player_copy.parameters()] +
        [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])
    player.to(args.device)

    if args.device == 'cuda':
        torch.set_num_threads(1)

    initial = True
    pre_filled = 0

    for ni in range(n_iter):
        # re-initialize optimizers
        opt_player = eval(args.optimizer_player)(player.parameters(),
                                                 lr=args.lr,
                                                 weight_decay=args.l2)
        opt_value = eval(args.optimizer_value)(value.parameters(),
                                               lr=args.lr,
                                               weight_decay=args.l2)

        try:
            if not initial:
                lr = args.lr / (1 + (ni - pre_filled + 1) * args.lr_factor)
                ent_coeff = args.ent_coeff / (
                    1 + (ni - pre_filled + 1) * args.ent_factor)
                print('lr', lr, 'ent_coeff', ent_coeff)

                for param_group in opt_player.param_groups:
                    param_group['lr'] = lr
                for param_group in opt_value.param_groups:
                    param_group['lr'] = lr

            if numpy.mod((ni - pre_filled + 1), save_iter) == 0:
                torch.save(
                    {
                        'n_iter': n_iter,
                        'n_collect': n_collect,
                        'n_value': n_value,
                        'n_policy': n_policy,
                        'max_len': max_len,
                        'n_hid': n_hid,
                        'batch_size': batch_size,
                        'player': player.state_dict(),
                        'value': value.state_dict(),
                        'return_history': return_history,
                        'n_collected_frames': n_collected_frames,
                    }, '{}_{}.th'.format(args.saveto,
                                         (ni - pre_filled + 1) + offset + 1))

            player.eval()

            ret_ = -numpy.Inf
            while True:
                try:
                    ret_ = return_q.get_nowait()
                except queue.Empty:
                    break
            if ret_ != -numpy.Inf:
                return_history.append(ret_)
                if valid_ret == -numpy.Inf:
                    valid_ret = ret_
                else:
                    valid_ret = 0.9 * valid_ret + 0.1 * ret_
                print('Valid run', ret_, valid_ret)

            #st = time.time()

            player.to('cpu')
            copy_params(player, player_copy)
            for si in range(args.n_simulators):
                while True:
                    try:
                        # empty the queue, as the new one has arrived
                        player_qs[si].get_nowait()
                    except queue.Empty:
                        break

                player_qs[si].put([
                    copy.deepcopy(p.data.numpy())
                    for p in player_copy.parameters()
                ] + [
                    copy.deepcopy(p.data.numpy())
                    for p in player_copy.buffers()
                ])
            while True:
                try:
                    # empty the queue, as the new one has arrived
                    valid_q.get_nowait()
                except queue.Empty:
                    break
            valid_q.put([
                copy.deepcopy(p.data.numpy())
                for p in player_copy.parameters()
            ] + [copy.deepcopy(p.data.numpy()) for p in player_copy.buffers()])

            player.to(args.device)

            #print('model push took', time.time()-st)

            #st = time.time()

            n_collected_frames_ = 0
            while True:
                try:
                    epi = episode_q.get_nowait()
                    replay_buffer.add(epi[0], epi[1], epi[2], epi[3])
                    n_collected_frames_ = n_collected_frames_ + len(epi[0])
                except queue.Empty:
                    break
                if n_collected_frames_ >= max_collected_frames \
                        and (len(replay_buffer.buffer) + len(replay_buffer.priority_buffer)) > 0:
                    break
            n_collected_frames = n_collected_frames + n_collected_frames_

            if len(replay_buffer.buffer) + len(
                    replay_buffer.priority_buffer) < 1:
                continue

            if len(replay_buffer.buffer) + len(
                    replay_buffer.priority_buffer) < args.initial_buffer:
                if initial:
                    print(
                        'Pre-filling the buffer...',
                        len(replay_buffer.buffer) +
                        len(replay_buffer.priority_buffer))
                    continue
            else:
                if initial:
                    pre_filled = ni
                    initial = False

            #print('collection took', time.time()-st)

            #print('Buffer size', len(replay_buffer.buffer) + len(replay_buffer.priority_buffer))

            # fit a value function
            # TD(0)
            #st = time.time()

            value.train()
            for vi in range(n_value):
                if numpy.mod(vi, update_every) == 0:
                    #print(vi, 'zeroing gradient')
                    opt_player.zero_grad()
                    opt_value.zero_grad()

                batch = replay_buffer.sample(batch_size)

                batch_x = torch.from_numpy(
                    numpy.stack([ex.current_['obs'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                batch_r = torch.from_numpy(
                    numpy.stack([ex.current_['rew'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                batch_xn = torch.from_numpy(
                    numpy.stack([ex.next_['obs'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                pred_y = value(batch_x)
                pred_next = value_old(batch_xn).clone().detach()
                batch_pi = player(batch_x)

                loss_ = ((batch_r + discount_factor * pred_next.squeeze() -
                          pred_y.squeeze())**2)

                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch
                                 ]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch
                                 ]).astype('float32')).to(args.device)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)

                # (clipped) importance weight:
                # because the policy may have changed since the tuple was collected.
                log_iw = logp.squeeze().clone().detach() - torch.log(
                    batch_q.squeeze() + 1e-8)
                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                iw = torch.exp(log_iw.clamp(max=0.))

                if args.iw:
                    loss = iw * loss_
                else:
                    loss = loss_

                loss = loss.mean()

                loss.backward()

                if numpy.mod(vi, update_every) == (update_every - 1):
                    #print(vi, 'making an update')
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(value.parameters(),
                                                 clip_coeff)
                    opt_value.step()

            copy_params(value, value_old)

            if value_loss < 0.:
                value_loss = loss_.mean().item()
            else:
                value_loss = 0.9 * value_loss + 0.1 * loss_.mean().item()

            if numpy.mod((ni - pre_filled + 1), disp_iter) == 0:
                print('# frames', n_collected_frames, 'value_loss', value_loss,
                      'entropy', -entropy, 'ess', ess)

            #print('value update took', time.time()-st)

            # fit a policy
            #st = time.time()

            value.eval()
            player.train()
            if args.player_coeff > 0.:
                player_old.eval()

            for pi in range(n_policy):
                if numpy.mod(pi, update_every) == 0:
                    opt_player.zero_grad()
                    opt_value.zero_grad()

                #st = time.time()

                batch = replay_buffer.sample(batch_size)

                #print('batch collection took', time.time()-st)

                #st = time.time()

                #batch_x = [ex.current_['obs'] for ex in batch]
                #batch_xn = [ex.next_['obs'] for ex in batch]
                #batch_r = [ex.current_['rew'] for ex in batch]

                #print('list construction took', time.time()-st)

                #st = time.time()

                batch_x = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_xn = numpy.zeros(
                    tuple([len(batch)] + list(batch[0].current_['obs'].shape)),
                    dtype='float32')
                batch_r = numpy.zeros((len(batch)), dtype='float32')[:, None]

                for ei, ex in enumerate(batch):
                    batch_x[ei, :] = ex.current_['obs']
                    batch_xn[ei, :] = ex.next_['obs']
                    batch_r[ei, 0] = ex.current_['rew']

                #batch_x = numpy.stack(batch_x).astype('float32')
                #batch_xn = numpy.stack(batch_xn).astype('float32')
                #batch_r = numpy.stack(batch_r).astype('float32')[:,None]

                #print('batch stack for value took', time.time()-st)

                #st = time.time()

                batch_x = torch.from_numpy(batch_x).to(args.device)
                batch_xn = torch.from_numpy(batch_xn).to(args.device)
                batch_r = torch.from_numpy(batch_r).to(args.device)

                #print('batch push for value took', time.time()-st)

                #st = time.time()

                batch_v = value(batch_x).clone().detach()
                batch_vn = value(batch_xn).clone().detach()

                #print('value forward pass took', time.time()-st)

                #st = time.time()

                batch_a = torch.from_numpy(
                    numpy.stack([ex.current_['act'] for ex in batch
                                 ]).astype('float32')[:, None]).to(args.device)
                batch_q = torch.from_numpy(
                    numpy.stack([ex.current_['prob'] for ex in batch
                                 ]).astype('float32')).to(args.device)

                batch_pi = player(batch_x)
                logp = torch.log(batch_pi.gather(1, batch_a.long()) + 1e-8)

                if args.player_coeff > 0.:
                    batch_pi_old = player_old(batch_x).clone().detach()

                #print('policy computation took', time.time()-st)

                #st = time.time()

                # entropy regularization
                ent = -(batch_pi * torch.log(batch_pi + 1e-8)).sum(1)
                if entropy == -numpy.Inf:
                    entropy = ent.mean().item()
                else:
                    entropy = 0.9 * entropy + 0.1 * ent.mean().item()

                #print('entropy computation took', time.time()-st)

                #st = time.time()

                # advantage: r(s,a) + \gamma * V(s') - V(s)
                adv = batch_r + discount_factor * batch_vn - batch_v
                #adv = adv / adv.abs().max().clamp(min=1.)

                loss = -(adv * logp).squeeze()

                loss = loss - ent_coeff * ent

                #print('basic loss computation took', time.time()-st)

                #st = time.time()

                # (clipped) importance weight:
                log_iw = logp.squeeze().clone().detach() - torch.log(batch_q +
                                                                     1e-8)
                iw = torch.exp(log_iw.clamp(max=0.))

                ess_ = torch.exp(-torch.logsumexp(2 * log_iw, dim=0)).item()
                if ess == -numpy.Inf:
                    ess = ess_
                else:
                    ess = 0.9 * ess + 0.1 * ess_

                if args.iw:
                    loss = iw * loss
                else:
                    loss = loss

                #print('importance weighting took', time.time()-st)

                if critic_aware:
                    #st = time.time()

                    pred_y = value(batch_x).squeeze()
                    pred_next = value(batch_xn).squeeze()
                    critic_loss_ = -(
                        (batch_r.squeeze() + discount_factor * pred_next -
                         pred_y)**2).clone().detach()

                    critic_loss_ = torch.exp(critic_loss_)
                    loss = loss * critic_loss_

                    #print('critic aware weighting took', time.time()-st)

                loss = loss.mean()

                if args.player_coeff > 0.:
                    #st = time.time()

                    loss_old = -(batch_pi_old *
                                 torch.log(batch_pi + 1e-8)).sum(1).mean()
                    loss = (1. - args.player_coeff
                            ) * loss + args.player_coeff * loss_old

                    #print('player interpolation took', time.time()-st)

                #st = time.time()
                loss.backward()
                if numpy.mod(pi, update_every) == (update_every - 1):
                    if clip_coeff > 0.:
                        nn.utils.clip_grad_norm_(player.parameters(),
                                                 clip_coeff)
                    opt_player.step()
                #print('backward computation and update took', time.time()-st)

            if args.player_coeff > 0.:
                copy_params(player, player_old)

            ##print('policy update took', time.time()-st)

        except KeyboardInterrupt:
            print('Terminating...')
            break

    for si in range(args.n_simulators):
        player_qs[si].put("END")

    print('Waiting for the simulators...')

    for si in range(args.n_simulators):
        simulators[si].join()

    print('Done')
Example 30
def test_to_b64():
    buf = Buffer('base64 test')

    assert_equal(buf.to_b64(), 'YmFzZTY0IHRlc3Q=')