Example #1
0
    for pos in positions:
        s = dummy_env.get_observation(pos)
        #actions = net.([s]) # [1, num_factors]
        sp = []
        for i in range(net.num_actions):
            dummy_env.set_position(pos)
            sp.append(np.reshape(dummy_env.step(i), [12, 12, 1, 1]))
        sp = np.concatenate(sp, axis=-1)
        s_list.append(s)
        sp_list.append(sp)
    return net.train_step(s_list, sp_list)


# Number of buffered positions required before training starts; until then
# the loop only collects positions produced by random actions.
MIN_BUFFER_SIZE = 1000

# Main loop: each iteration records the current position, takes one random
# action, and (once the buffer is warm) runs a single training step.
# itertools.count() is unbounded, so this runs until the process is stopped.
for i in itertools.count():
    # NOTE(review): the observation is not used anywhere in this loop (only
    # the position is stored). The call is kept in case get_observation() has
    # side effects -- confirm, and delete it if it is a pure getter.
    env.get_observation()
    # Record the position *before* acting; the other four fields are unused
    # here and stored as None.
    buffer.append(env.get_position(), None, None, None, None)
    # Upper bound is exclusive, so a is one of 0, 1, 2, 3.
    # NOTE(review): 4 presumably equals net.num_actions -- confirm.
    a = np.random.randint(0, 4)
    env.step(a)  # advances the environment; the return value is not needed

    # Warm-up phase: keep collecting until enough positions are buffered.
    if buffer.length() < MIN_BUFFER_SIZE:
        continue

    recon_loss, pi_loss = run_training_step(buffer, net)
    LOG.add_line('recon_loss', recon_loss)
    LOG.add_line('pi_loss', pi_loss)
    print(f'{i}: recon_loss: {recon_loss} pi_loss: {pi_loss}')
    if i % visualization_freq == 0:
        visualize_policies(f'./vis/policies/{i}_', net, env)
        visualize_correlation(f'./vis/correlation/{i}_', net, env)
        # NOTE(review): unlike the two calls above, this path carries no
        # iteration prefix, so every call targets the same location --
        # confirm that overwriting is intended.
        visualize_autoencoder('./vis/autoencoder/', net, env)