コード例 #1
0
# --- Training hyperparameters ------------------------------------------------
batch_size = 32  # number of samples handed to the model for each update
gamma = 0.99  # discount applied to future rewards
# NOTE(review): record_idx is not used in this section; presumably a
# recording/logging interval in frames -- confirm against the training loop.
record_idx = 10000

# NOTE(review): replay_initial is not used in this section; presumably the
# number of frames gathered in the buffer before learning begins -- confirm.
replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

# Online network, initialised from the pretrained checkpoint.
# map_location='cpu' makes the checkpoint load onto the CPU no matter which
# device it was saved from.
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))

# Target network: same constructor arguments, then synchronised with `model`
# through the project's copy_from helper.
target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

if USE_CUDA:
    model = model.cuda()  # move both networks to the GPU
    target_model = target_model.cuda()
    print("Using cuda")

# Build the optimizer only after the model has been moved to its final device,
# as recommended by the torch.optim documentation, so the optimizer is
# guaranteed to reference the parameters that are actually trained.
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# --- Epsilon-greedy exploration schedule -------------------------------------
epsilon_start = 1.0  # value of epsilon at frame 0
epsilon_final = 0.01  # value epsilon approaches as frame_idx grows
epsilon_decay = 30000  # time constant (in frames) of the exponential decay


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for the given frame index.

    Computes
    ``epsilon_final + (epsilon_start - epsilon_final) * exp(-frame_idx / epsilon_decay)``,
    i.e. an exponential decay from ``epsilon_start`` (at frame 0) towards
    ``epsilon_final``. The three module-level constants are read at call time.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


# --- Bookkeeping accumulators, all starting empty / at zero --------------------
losses = []
all_rewards = []
episode_reward = 0
コード例 #2
0
# --- Epsilon-greedy exploration schedule parameters --------------------------
epsilon_start = 1.0  # value of epsilon at frame 0
epsilon_final = 0.01  # value epsilon approaches as frame_idx grows
epsilon_decay = 1000000  # time constant (in frames) of the exponential decay

# NOTE(review): replay_initial is not used in this section; presumably the
# number of frames gathered before learning begins -- confirm.
replay_initial = 10000
learning_rate = 1e-5

# Two separate buffers built with the same constructor argument; only the
# training buffer is passed to the networks below.
train_replay_buffer = ReplayBuffer(100000)
analysis_replay_buffer = ReplayBuffer(100000)

# Policy and target networks are constructed identically. The target then
# receives a copy of the policy weights and is put into evaluation mode.
policy_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma,
                        train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())
target_model.eval()

# `device` falls back to the CPU when CUDA is unavailable, so the move below
# is harmless even if USE_CUDA is set on a machine without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if USE_CUDA:
    policy_model = policy_model.to(device)
    target_model = target_model.to(device)

# Build the optimizer only after the model has been moved to its final device,
# as recommended by the torch.optim documentation, so the optimizer is
# guaranteed to reference the parameters that are actually trained.
optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for the given frame index.

    Computes
    ``epsilon_final + (epsilon_start - epsilon_final) * exp(-frame_idx / epsilon_decay)``,
    i.e. an exponential decay from ``epsilon_start`` (at frame 0) towards
    ``epsilon_final``. The three module-level constants are read at call time.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


def play_to_train(num_frames, policy_model, target_model, buffer):
    """Initialise the tracking variables for a training run.

    NOTE(review): only the set-up of the bookkeeping locals is visible in
    this excerpt; the remainder of the body appears to be cut off, so how
    the parameters are used and what is returned cannot be documented here.
    """
    # Accumulators for the run; every list starts empty and the running
    # episode reward starts at zero.
    losses = []
    all_rewards = []
    mean_losses = []
    mean_rewards = []
    episode_reward = 0
コード例 #3
0
# Re-wrap the environment with the project's wrap_pytorch helper.
env = wrap_pytorch(env)

# --- Training hyperparameters ------------------------------------------------
num_frames = 1000000
batch_size = 32
gamma = 0.99
# NOTE(review): record_idx and replay_initial are not used in this section;
# confirm their meaning against the training loop.
record_idx = 10000

replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

# Online network, initialised from the pretrained checkpoint.
# map_location='cpu' makes the checkpoint load onto the CPU no matter which
# device it was saved from.
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))

# Target network: same constructor arguments, then synchronised with `model`
# through the project's copy_from helper.
target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

# Build the optimizer only after the model has been moved to its final device,
# as recommended by the torch.optim documentation, so the optimizer is
# guaranteed to reference the parameters that are actually trained.
optimizer = optim.Adam(model.parameters(), lr=0.00001)

# --- Epsilon-greedy exploration schedule -------------------------------------
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for the given frame index.

    With the constants above this is
    ``0.01 + 0.99 * exp(-frame_idx / 30000)``:

    * at frame 0 the exponential is 1, so epsilon equals ``epsilon_start``;
    * as ``frame_idx`` grows the exponential term shrinks, so the 0.99
      contribution fades and only ``epsilon_final`` remains;
    * the decay is fast relative to the run length: by half of
      ``num_frames`` (500000 frames) the exponential term is about 6e-8,
      so epsilon is already essentially ``epsilon_final``.

    The three module-level constants are read at call time.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)
コード例 #4
0
# --- Training hyperparameters ------------------------------------------------
num_frames = 1000000  # total number of frames to learn from
batch_size = 32  # number of samples provided to the model for each update
gamma = 0.99
# NOTE(review): record_idx is not used in this section; confirm its meaning
# against the training loop.
record_idx = 10000

# NOTE(review): replay_initial is not used in this section; presumably the
# number of frames gathered in the buffer before learning begins -- confirm.
replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

# Online network (QLearner comes from the project's DQN module), initialised
# from the pretrained checkpoint. The .pth file is a saved state dict;
# map_location='cpu' makes it load onto the CPU no matter which device it was
# saved from.
model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))

# Target network: same constructor arguments, then synchronised with `model`
# through the project's copy_from helper.
target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

if USE_CUDA:  # run on the GPU when requested
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

# Build the optimizer only after the model has been moved to its final device,
# as recommended by the torch.optim documentation, so the optimizer is
# guaranteed to reference the parameters that are actually trained.
optimizer = optim.Adam(model.parameters(), lr=0.00001)  # lr == learning rate

# --- Epsilon-greedy exploration schedule -------------------------------------
epsilon_start = 1.0  # value of epsilon at frame 0
epsilon_final = 0.01  # epsilon decays towards this value from above
epsilon_decay = 30000  # time constant (in frames) of the exponential decay


def epsilon_by_frame(frame_idx):
    """Return the exploration rate for the given frame index.

    Computes
    ``epsilon_final + (epsilon_start - epsilon_final) * exp(-frame_idx / epsilon_decay)``,
    i.e. an exponential decay from ``epsilon_start`` (at frame 0) towards
    ``epsilon_final``. The three module-level constants are read at call time.
    """
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(
        -1. * frame_idx / epsilon_decay)


# --- Bookkeeping accumulators, all starting empty / at zero --------------------
losses = []
all_rewards = []
episode_reward = 0