batch_size = 32         # number of samples drawn from the replay buffer for each update
gamma = 0.99            # discount factor applied to future rewards
record_idx = 10000
replay_initial = 10000  # transitions to collect before training updates begin
replay_buffer = ReplayBuffer(100000)

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))  # load the pretrained weights

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)  # initialize the target network from the pretrained model

optimizer = optim.Adam(model.parameters(), lr=0.0001)  # Adam with the chosen learning rate

if USE_CUDA:
    model = model.cuda()  # move both networks to the GPU
    target_model = target_model.cuda()
    print("Using cuda")

epsilon_start = 1.0    # initial exploration rate for epsilon-greedy action selection
epsilon_final = 0.01   # exploration rate the schedule decays toward
epsilon_decay = 30000  # decay constant of the exponential epsilon schedule
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
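# Illustrative sketch (not part of the original script) of how the objects above are
# typically wired together to fill the replay buffer with epsilon-greedy experience.
# The method names model.act(state, epsilon) and replay_buffer.push(...) are
# assumptions about the dqn.py interface; adapt them to the actual class definitions.
state = env.reset()
for frame_idx in range(1, num_frames + 1):
    epsilon = epsilon_by_frame(frame_idx)           # anneal exploration over time
    action = model.act(state, epsilon)              # random action with probability epsilon
    next_state, reward, done, _ = env.step(action)  # old gym API: (obs, reward, done, info)
    replay_buffer.push(state, action, reward, next_state, done)
    state = next_state
    episode_reward += reward
    if done:
        all_rewards.append(episode_reward)
        episode_reward = 0
        state = env.reset()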
epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 1000000
replay_initial = 10000
learning_rate = 1e-5

train_replay_buffer = ReplayBuffer(100000)
analysis_replay_buffer = ReplayBuffer(100000)

policy_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer)
target_model = QLearner(env, train_num_frames, batch_size, gamma, train_replay_buffer)
target_model.load_state_dict(policy_model.state_dict())  # start the target network as a copy of the policy network
target_model.eval()

optimizer = optim.Adam(policy_model.parameters(), lr=learning_rate)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if USE_CUDA:
    policy_model = policy_model.to(device)
    target_model = target_model.to(device)

epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)


def play_to_train(num_frames, policy_model, target_model, buffer):
    losses = []
    all_rewards = []
    mean_losses = []
    mean_rewards = []
    episode_reward = 0
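    # (Illustrative outline, not part of the original code.) From this point the
    # function would typically run the interaction/training loop:
    #   - for each frame_idx up to num_frames, pick an epsilon-greedy action using
    #     epsilon_by_frame(frame_idx), step the environment, and push the transition
    #     into `buffer`;
    #   - once len(buffer) exceeds replay_initial, sample a batch and take an
    #     optimizer step on the TD loss computed against target_model;
    #   - periodically sync target_model from policy_model and append running
    #     loss/reward statistics to the lists initialized above.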
env = wrap_pytorch(env)

num_frames = 1000000
batch_size = 32
gamma = 0.99
record_idx = 10000
replay_initial = 10000
replay_buffer = ReplayBuffer(100000)

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

optimizer = optim.Adam(model.parameters(), lr=0.00001)

if USE_CUDA:
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

epsilon_start = 1.0
epsilon_final = 0.01
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# epsilon(frame_idx) = 0.01 + 0.99 / e^(frame_idx / 30000)
# So epsilon starts at essentially 1. As frame_idx increases, the exponential grows,
# so the 0.99 term shrinks, leaving only the final value.
# By half of num_frames epsilon is already very close to epsilon_final, so it decays quickly.
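# Quick numeric check of the schedule described above (illustrative, not part of the
# original script): print epsilon at a few frame indices.
for f in [0, 30000, 150000, 500000]:
    print(f, round(epsilon_by_frame(f), 4))
# 0      -> 1.0     (start: almost always explore)
# 30000  -> ~0.3742 (one decay constant: the exploratory term has shrunk by a factor of e)
# 150000 -> ~0.0167 (five decay constants: essentially greedy already)
# 500000 -> ~0.01   (half of num_frames: indistinguishable from epsilon_final)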
num_frames = 1000000    # total number of frames to learn from
batch_size = 32         # number of samples provided per update
gamma = 0.99
record_idx = 10000
replay_initial = 10000  # transitions to collect before training updates begin
replay_buffer = ReplayBuffer(100000)

model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)  # defined in dqn.py
model.load_state_dict(torch.load("model_pretrained.pth", map_location='cpu'))  # pretrained weights from the .pth file

target_model = QLearner(env, num_frames, batch_size, gamma, replay_buffer)
target_model.copy_from(model)

optimizer = optim.Adam(model.parameters(), lr=0.00001)  # lr == learning rate

if USE_CUDA:  # use the GPU if available
    model = model.cuda()
    target_model = target_model.cuda()
    print("Using cuda")

epsilon_start = 1.0   # initial exploration rate for epsilon-greedy action selection
epsilon_final = 0.01  # the schedule decays toward this value
epsilon_decay = 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

losses = []
all_rewards = []
episode_reward = 0
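# Illustrative sketch (not part of the original script): the kind of TD-loss update the
# optimizer and target network above are used for. The 5-tuple returned by
# replay_buffer.sample(batch_size) is an assumption about the buffer's interface.
import numpy as np

def compute_td_loss_sketch(batch_size):
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    dev = next(model.parameters()).device  # keep the batch on the same device as the model
    state = torch.as_tensor(np.float32(state), device=dev)
    next_state = torch.as_tensor(np.float32(next_state), device=dev)
    action = torch.as_tensor(action, dtype=torch.int64, device=dev)
    reward = torch.as_tensor(reward, dtype=torch.float32, device=dev)
    done = torch.as_tensor(done, dtype=torch.float32, device=dev)

    # Q(s, a) for the actions that were actually taken
    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bellman target: reward + gamma * max_a Q_target(s', a), zeroed on terminal states
    next_q_value = target_model(next_state).max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value.detach()).pow(2).mean()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss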