def evaluate(idx, target_value_network, lock, counter):
    port = 6000 + idx
    seed = 123 + idx * 10
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()
    thread_counter = 0
    target_value_network.load_state_dict(torch.load('checkpoint/checkpoint'))
    num_episode = 0
    goal_list = []
    while True:
        epsilon = 0.
        print('goal_list\n\n\n', goal_list)
        done = False
        curState = hfoEnv.reset()
        timestep = 0
        while not done and timestep < 500:
            action = epsilon_greedy(curState, epsilon, target_value_network)
            nextState, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])
            curState = nextState
            with lock:
                counter.value += 1
            thread_counter += 1
            timestep += 1
        if status == GOAL:
            goal_list.append(num_episode)
        num_episode += 1
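# The evaluate() worker above (and one of the train() variants below) calls an
# epsilon_greedy helper that is not defined in this section. The sketch below is
# only a plausible minimal version, assuming the value network maps a state
# tensor to one Q-value per discrete action; it is not the original helper.
import random

import torch


def epsilon_greedy(state, epsilon, value_network):
    # Evaluate Q-values once; no gradients are needed for action selection.
    with torch.no_grad():
        q_values = value_network(state)  # assumed shape: (num_actions,) or (1, num_actions)
    # With probability epsilon pick a random action index, otherwise the greedy one.
    if random.random() < epsilon:
        return random.randrange(q_values.shape[-1])
    return int(torch.argmax(q_values).item())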
def train(idx, args, value_network, target_value_network, optimizer, lock, counter):
    port = args.port
    seed = args.seed
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()
    eps = args.epsilon
    I_target = args.I_target
    I_update = args.I_update
    discountFactor = args.discountFactor
    learn_step_counter = 0
    loss_func = nn.MSELoss()
    threads = args.numprocesses

    s = hfoEnv.reset()
    while learn_step_counter < counter.value / threads:
        lock.acquire()
        counter.value += 1
        lock.release()
        learn_step_counter += 1

        # take action
        action = choose_action(s, value_network, eps)
        # nextState, reward, done, status, info based on the action
        s_, r, done, status, info = hfoEnv.step(action)
        act_index = hfoEnv.possibleActions.index(action)

        q_eval = computePrediction(s, act_index, value_network)
        q_target = computeTargets(r, s_, discountFactor, done, target_value_network)
        loss = loss_func(q_eval, q_target)
        loss.backward()

        if learn_step_counter % I_update == 0 or done:
            #lock.acquire()
            optimizer.step()
            optimizer.zero_grad()
            optimizer.share_memory()
            #lock.release()

        # target parameter update
        if counter.value % I_target == 0:
            lock.acquire()
            target_value_network.load_state_dict(value_network.state_dict())
            lock.release()

        if counter.value == 1e6:
            saveModelNetwork(target_value_network, os.getcwd())

        if done:
            s = hfoEnv.reset()
        else:
            s = s_
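# Most of the train() variants in this file call computeTargets and
# computePrediction, which are not shown here. The sketch below captures the
# one-step Q-learning semantics they appear to implement, assuming the networks
# map a state tensor to a row of Q-values; exact shapes and dtypes of the
# original helpers may differ.
import torch


def computeTargets(reward, nextObservation, discountFactor, done, targetNetwork):
    # One-step TD target: r if the episode ended, otherwise
    # r + gamma * max_a' Q_target(s', a'); no gradients flow through the target.
    with torch.no_grad():
        target = torch.tensor(float(reward))
        if not done:
            target = target + discountFactor * targetNetwork(nextObservation).max()
    return target


def computePrediction(state, action, valueNetwork):
    # Q(s, a) for the action that was actually taken; gradients flow through this value.
    q_values = valueNetwork(state)
    return q_values.view(-1)[action]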
def train(idx, args, value_network, target_value_network, optimizer, lock, counter):
    port = 6000 + idx
    seed = 123 + idx * 10
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()
    thread_counter = 0
    criterion = nn.MSELoss()
    optimizer.zero_grad()
    target_value_network.load_state_dict(torch.load('params/params_last'))
    num_episode = 0
    while True:
        epsilon = args.epsilon * (
            (1 - 1 / (1 + np.exp(-thread_counter / 2000))) * 2 * 0.9 + 0.1)
        done = False
        curState = hfoEnv.reset()
        timestep = 0
        while not done and timestep < 500:
            action = epsilon_greedy(curState, epsilon, value_network)
            nextState, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])
            pred_value = computePrediction(curState, action, value_network)
            target_value = computeTargets(reward, nextState, args.discountFactor,
                                          done, target_value_network)
            loss = criterion(pred_value, target_value)
            loss.backward()
            curState = nextState
            with lock:
                counter.value += 1
            thread_counter += 1
            timestep += 1
            if counter.value % args.iterate_target == 0:
                target_value_network.load_state_dict(
                    torch.load('params/params_last'))
            if thread_counter % args.iterate_async == 0 or done:
                optimizer.step()
                optimizer.zero_grad()
                saveModelNetwork(value_network, 'params/params_last')
            if counter.value % 1000000 == 0:
                saveModelNetwork(
                    value_network,
                    'params/params_{0:d}'.format(counter.value // 1000000))
        num_episode += 1
def train(idx, args, value_network, target_value_network, optimizer, lock, counter):
    port = 3020 + idx * 10
    seed = 12 + idx * 10
    counter_v = 0
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()
    loss = nn.MSELoss()

    for episodeNumber in range(1, args.epochs + 1):
        observation = hfoEnv.reset()
        counter.value += 1
        counter_v += 1

        for timestep in range(args.timesteps):
            #observation_t = torch.Tensor(observation)
            #action = greedy_action(observation_t, value_network, args)
            #act = hfoEnv.possibleActions[action]
            action = random.randint(0, 3)
            act = hfoEnv.possibleActions[action]
            newObservation, reward, done, status, info = hfoEnv.step(act)

            optimizer.zero_grad()
            lock.acquire()
            observation_t = torch.Tensor(observation)
            newObservation_t = torch.Tensor(newObservation)
            output = computePrediction(observation_t, action, value_network)
            target = computeTargets(reward, newObservation_t, args.discountFactor,
                                    done, target_value_network)
            lock.release()
            out = loss(output, target)
            out.backward()

            if counter_v % args.target_interval == 0:
                target_value_network.load_state_dict(value_network.state_dict())

            if counter.value % args.predict_interval == 0 or done:
                optimizer.step()
                optimizer.zero_grad()
                strDirectory = "value_network" + str(idx) + ".pth"
                saveModelNetwork(value_network, strDirectory)
                strDirectory_target = "targetNetwork" + str(idx) + ".pth"
                saveModelNetwork(target_value_network, strDirectory_target)

            observation = newObservation
            if done:
                break

        if counter.value >= args.tmax:
            break
def train(idx, args, value_network, target_value_network, optimizer, lock,
          counter, port, seed, I_tar, I_async, name=None):
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed,
                    headless=True)
    hfoEnv.connectToServer()
    if name is None:
        name = str(time())
    #logger = Logger("tb/" + name, flush_secs=5)
    target_value_network.train()

    num_episodes = args.episodes
    gamma = 0.99
    windows = [10, 500]
    goal_buffer = [0] * max(windows)

    # linear epsilon annealing over the first eps_total episodes
    max_epsilon = 0.99
    min_epsilon = 0.01
    eps_total = 200
    epsilon_fn = lambda current: (max_epsilon - current * (max_epsilon - min_epsilon) / eps_total
                                  if current < eps_total else min_epsilon)

    # linear learning-rate annealing over the first lr_total episodes
    max_lr = 0.9
    min_lr = 0.1
    lr_total = 600
    lr_fn = lambda current: (max_lr - current * (max_lr - min_lr) / lr_total
                             if current < lr_total else min_lr)

    loss_func = nn.MSELoss()
    t = 0
    episodeNumber = 0
    episodeReward = 0
    episodeSteps = 0
    state1 = hfoEnv.reset()

    while episodeNumber <= num_episodes:
        if (counter.value + 1) % 1e3 == 0:
            print("##################################################")
            print('t:', t)
            print('counter:', counter.value)
            print("##################################################")
        if (counter.value + 1) % 1e6 == 0:
            saveModelNetwork(value_network,
                             "trained_models/params" + str(int((counter.value + 1) // 1e6)))

        epsilon = epsilon_fn(episodeNumber)
        lr = lr_fn(episodeNumber)

        # epsilon-greedy action selection
        if args.eval or np.random.random() >= epsilon:
            qs = value_network(state1)
            action = torch.argmax(qs)
        else:
            action = random.randint(0, len(hfoEnv.possibleActions) - 1)
        a1 = hfoEnv.possibleActions[action]

        state2, reward, done, status, info = hfoEnv.step(a1)
        episodeReward += reward

        y = computeTargets(reward, state2, gamma, done, target_value_network)
        prediction = computePrediction(state1, action, value_network)
        Y = torch.zeros(4)
        Y[action] = y
        Prediction = torch.zeros(4)
        Prediction[action] = prediction
        loss = loss_func(Y, Prediction)
        loss.backward()

        state1 = state2
        t += 1
        episodeSteps += 1
        with lock:
            counter.value = counter.value + 1

        if done:
            if status == GOAL:
                goal_buffer.append(1)
            else:
                goal_buffer.append(0)
            #logger.log_value('episode/reward', episodeReward, episodeNumber)
            #logger.log_value('episode/length', episodeSteps, episodeNumber)
            #logger.log_value('hyperparameters/epsilon', epsilon, episodeNumber)
            #logger.log_value('hyperparameters/lr', lr, episodeNumber)
            #for window in windows:
            #    logger.log_value(learning_str + "goals/%i" % window,
            #                     np.sum(goal_buffer[-window:]),
            #                     episodeNumber)
            episodeNumber += 1
            episodeReward = 0.0
            episodeSteps = 0
            state1 = hfoEnv.reset()

        if t % I_async == 0 or done or episodeNumber == num_episodes:
            # Async update of value_network using gradients
            with lock:
                # Add grads to value_network
                for param, shared_param in zip(value_network.parameters(),
                                               target_value_network.parameters()):
                    shared_param._grad = param.grad
                #value_network._grad = target_value_network.grad
                # Take a step
                optimizer.step(lr=lr)
                # Clean gradients
                optimizer.zero_grad()
                target_value_network.zero_grad()

        #if counter.value % I_tar == 0 or episodeNumber == num_episodes:
        if t % I_tar == 0 or episodeNumber == num_episodes or done:
            # Update target network
            target_value_network.zero_grad()
            target_value_network.load_state_dict(value_network.state_dict())

    hfoEnv.reset()
    saveModelNetwork(value_network, "trained_models/params_last")
    # Finishing training and showing stats
    hfoEnv.quitGame()
def train(idx, args, value_network, target_value_network, optimizer, lock, counter):
    port = 8000 + 50 * idx
    env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=123)
    env.connectToServer()

    do_opt_step = False
    do_hard_copy = False
    total_reward = 0
    save_counter = 1

    for episode in range(args.numEpisodes):
        timestep = 0
        total_reward = 0
        obs = env.reset()
        done = False
        save_model = False
        do_opt_step = False
        do_hard_copy = False

        while not done and timestep < 500:
            # obs to tensor
            obs_tensor = torch.Tensor(obs).unsqueeze(0)
            # choose action
            act = chooseAction(obs_tensor, value_network, episode, idx)
            # execute action
            act_str = env.possibleActions[act]
            next_obs, rewards, done, status, info = env.step(act_str)
            # update total reward
            total_reward += rewards
            # reward to tensor
            reward_tensor = torch.Tensor([rewards])
            # next obs to tensor
            next_obs_tensor = torch.Tensor(next_obs).unsqueeze(0)

            # update counters and flags
            timestep += 1
            with lock:
                counter.value = counter.value + 1
                if timestep % args.aSyncFreq == 0 or done:
                    do_opt_step = True
                if counter.value % args.copy_freq == 0:
                    do_hard_copy = True
                if counter.value % 1e6 == 0:
                    save_model = True
                current_count = counter.value

            # forward pass for our networks
            predicted_vals = computePrediction(obs_tensor, act, value_network)
            target_vals = computeTargets(reward_tensor, next_obs_tensor,
                                         args.gamma, done, target_value_network)

            # loss function calculation
            loss_function = nn.MSELoss()
            err = loss_function(predicted_vals, target_vals)
            # accumulate gradients
            err.backward()

            # update optimizer
            if do_opt_step:
                with lock:
                    optimizer.step()
                    optimizer.zero_grad()
                do_opt_step = False

            # update global network
            if do_hard_copy:
                with lock:
                    hard_copy(target_value_network, value_network)
                do_hard_copy = False

            # update current state
            obs = next_obs

            if save_model:
                # save model
                saveModelNetwork(value_network, 'params_' + str(save_counter))
                save_counter += 1
                # change learning rate
                change_lr(current_count, optimizer)
                # reset the flag so the model is saved only once per threshold
                save_model = False

    saveModelNetwork(value_network, 'params_latest')
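# hard_copy is called above (and in the next worker) to refresh the target
# network but is not defined in this section. The argument order is taken from
# the call sites, hard_copy(target, source); the body below is an assumption,
# not the original helper.
def hard_copy(target_network, source_network):
    # Overwrite the target network's parameters with the source network's.
    target_network.load_state_dict(source_network.state_dict())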
def train(idx, val_network, target_value_network, optimizer, lock, counter,
          timesteps_per_process, results_dir):
    episodeNumber = 0
    discountFactor = 0.99

    # Each worker writes to a file to check if my network runs correctly
    f = open(os.path.join(results_dir, 'worker_%d.out' % idx), 'w')
    columns = '{0:<10} {1:<8} {2:<10} {3:<12} {4:<20} {5:<15} {6:<15}\n'
    f.write(columns.format('Episode', 'Status', 'Steps', 'Total steps',
                           'Avg steps to goal', 'Total Goals', 'Counter'))

    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1,
                    port=6000 + (idx + 5) * 10, seed=idx)
    hfoEnv.connectToServer()

    async_update = 10
    copy_freq = 10000
    total_goals = 0
    total_steps = 0
    totalEpisodes = timesteps_per_process // 500
    save_model = False
    copy_model = False

    for episodeNumber in range(totalEpisodes):
        done = False
        epsilon = epsilon_annealing(idx, episodeNumber, totalEpisodes)
        state = hfoEnv.reset()
        timesteps = 0

        while timesteps < 500:
            lock.acquire()
            counter.value += 1
            locked_counter = counter.value
            if (locked_counter % 1e6) == 0:
                save_model = True
            if (locked_counter % copy_freq) == 0:
                copy_model = True
            lock.release()

            timesteps += 1
            obs_tensor = torch.Tensor(state).unsqueeze(0)
            Q, act = compute_val(val_network, obs_tensor, idx, epsilon)
            act = hfoEnv.possibleActions[act]
            newObservation, reward, done, status, info = hfoEnv.step(act)

            Q_target = computeTargets(reward,
                                      torch.Tensor(newObservation).unsqueeze(0),
                                      discountFactor, done, target_value_network)
            loss_function = nn.MSELoss()
            loss = loss_function(Q_target, Q)
            loss.backward()

            if timesteps % async_update == 0 or done:
                with lock:
                    optimizer.step()
                    optimizer.zero_grad()

            if copy_model:
                hard_copy(target_value_network, val_network)
                copy_model = False

            if save_model:
                updating_learning_rate(optimizer, locked_counter)
                saveModelNetwork(target_value_network,
                                 os.path.join(results_dir,
                                              'params_%d' % int(locked_counter / 1e6)))
                save_model = False

            state = newObservation
            if done:
                break

        if status == 1:
            total_steps += timesteps
            total_goals += 1
        else:
            total_steps += 500

        f.write(columns.format(episodeNumber, status, timesteps, total_steps,
                               '%.1f' % (total_steps / (episodeNumber + 1)),
                               total_goals, locked_counter))
        f.flush()
def runTrain(idx, args, valueNetwork, targetNetwork, optimizer, lock, counter):
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    port = 6006 + idx * 9
    seed = 2019 + idx * 46
    hfoEnv = HFOEnv(args.reward_opt, numTeammates=0, numOpponents=1,
                    port=port, seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    numTakenActions = 0
    numTakenActionCKPT = 0
    discountFactor = args.discountFactor
    totalWorkLoad = args.t_max // args.n_jobs

    steps_to_ball = []     # number of steps spent to approach the ball in each episode
    steps_in_episode = []  # number of steps in each episode
    status_lst = []
    log = {
        'steps_to_ball': steps_to_ball,
        'steps_in_episode': steps_in_episode,
        'status_lst': status_lst
    }
    step_to_ball = None
    firstRecord = True

    optimizer.zero_grad()
    state = torch.tensor(hfoEnv.reset()).to(device)  # get initial state

    while True:
        # take action based on eps-greedy policy
        action = select_action(state, valueNetwork, episodeNumber,
                               numTakenActions, args)  # action -> int
        act = hfoEnv.possibleActions[action]
        newObservation, reward, done, status, info = hfoEnv.step(act)

        if info['kickable'] and firstRecord:
            step_to_ball = numTakenActions - numTakenActionCKPT
            firstRecord = False

        next_state = torch.tensor(hfoEnv.preprocessState(newObservation)).to(device)
        target = computeTargets(reward, next_state, discountFactor, done,
                                targetNetwork, device)          # target -> tensor
        pred = computePrediction(state, action, valueNetwork, device)  # pred -> tensor
        loss = F.mse_loss(pred, target)  # compute loss
        loss.backward()                  # accumulate loss

        lock.acquire()
        counter.value += 1
        counterValue = counter.value
        lock.release()
        numTakenActions += 1

        # move on to the next state (overwritten by reset below if the episode ended)
        state = next_state

        if done:
            firstRecord = True
            status_lst.append(status)
            steps_in_episode.append(numTakenActions - numTakenActionCKPT)
            if step_to_ball is None:
                steps_to_ball.append(numTakenActions - numTakenActionCKPT)
            else:
                steps_to_ball.append(step_to_ball)
            step_to_ball = None
            episodeNumber += 1
            numTakenActionCKPT = numTakenActions
            # hfoEnv already calls 'preprocessState' in 'reset'
            state = torch.tensor(hfoEnv.reset()).to(device)

        if done or numTakenActions % args.i_async_update == 0:
            optimizer.step()       # apply grads
            optimizer.zero_grad()  # clear all cached grad

        if counterValue % args.i_target == 0:
            targetNetwork.load_state_dict(valueNetwork.state_dict())  # update target network

        if counterValue % args.ckpt_interval == 0:
            ckpt_path = os.path.join(args.log_dir, 'ckpt')
            if not os.path.exists(ckpt_path):
                os.mkdir(ckpt_path)
            filename = os.path.join(
                ckpt_path, 'params_%d' % (counterValue // args.ckpt_interval))
            lock.acquire()
            saveModelNetwork(valueNetwork, filename)
            lock.release()

        if numTakenActions > totalWorkLoad:
            filename = os.path.join(args.log_dir, 'log_worker_%d.pkl' % idx)
            saveLog(log, filename)
            return
def runEval(args, valueNetwork):
    port = 6050
    seed = 2019
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    hfoEnv = HFOEnv(args.reward_opt, numTeammates=0, numOpponents=1,
                    port=port, seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    numTakenActions = 0
    numTakenActionCKPT = 0
    state = torch.tensor(hfoEnv.reset()).to(device)  # get initial state

    steps_to_ball = []
    steps_in_episode = []
    status_lst = []
    log = {
        'steps_to_ball': steps_to_ball,
        'steps_in_episode': steps_in_episode,
        'status_lst': status_lst
    }
    step_to_ball = None
    firstRecord = True

    while numTakenActions < args.t_max:
        # take action based on eps-greedy policy
        action = select_action(state, valueNetwork, episodeNumber,
                               numTakenActions, args)  # action -> int
        act = hfoEnv.possibleActions[action]
        newObservation, reward, done, status, info = hfoEnv.step(act)

        if info['kickable'] and firstRecord:
            step_to_ball = numTakenActions - numTakenActionCKPT
            firstRecord = False

        numTakenActions += 1

        # move on to the next state (overwritten by reset below if the episode ended)
        state = torch.tensor(hfoEnv.preprocessState(newObservation)).to(device)

        if done:
            firstRecord = True
            status_lst.append(status)
            steps_in_episode.append(numTakenActions - numTakenActionCKPT)
            if step_to_ball is None:
                steps_to_ball.append(numTakenActions - numTakenActionCKPT)
            else:
                steps_to_ball.append(step_to_ball)
            step_to_ball = None
            episodeNumber += 1
            numTakenActionCKPT = numTakenActions
            # hfoEnv already calls 'preprocessState' in 'reset'
            state = torch.tensor(hfoEnv.reset()).to(device)

    filename = os.path.join(args.log_dir, 'log_eval.pkl')
    saveLog(log, filename)
def train(idx, args, learning_network, target_network, optimizer, lock, counter):
    # init port & seed for the thread based on idx
    port = 8100 + 10 * idx
    seed = idx * 113 + 923
    torch.manual_seed(seed)

    # local copy of the shared network
    worker_network = ValueNetwork(15, 4)
    worker_network.load_state_dict(learning_network.state_dict())

    # init env
    hfo_env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfo_env.connectToServer()

    episode_num = 0
    eps = random.sample(epsilon_list, 1)[0]
    worker_timestep = 0
    mse_loss = nn.MSELoss()
    max_worker_steps = args.max_steps / args.num_processes
    can_continue = True
    goal = 0
    to_goal = []

    while can_continue:
        # run episode
        obs_tensor = hfo_env.reset()
        done = False
        loss = 0
        reward_ep = 0
        ep_steps = 0
        upd_steps = 0

        while not done:
            # select action based on greedy policy
            action_idx = select_action(obs_tensor, worker_network, worker_timestep,
                                       max_worker_steps, args, eps)
            action = hfo_env.possibleActions[action_idx]

            # observe next
            next_obs_tensor, reward, done, status, info = hfo_env.step(action)
            y = computeTargets(reward, next_obs_tensor, args.discount, done,
                               target_network)
            # compute predictions again for the best action. Here we change params
            q_next = computePrediction(obs_tensor, action_idx, worker_network)

            # put new state
            obs_tensor = next_obs_tensor

            # update episode stats
            loss += mse_loss(y, q_next)
            reward_ep += reward
            upd_steps += 1
            ep_steps += 1
            worker_timestep += 1

            if status == 1:
                goal += 1
                to_goal.append(ep_steps)

            with lock:
                counter.value += 1
                if counter.value % args.checkpoint_time == 0:
                    saveModelNetwork(learning_network,
                                     args.checkpoint_dir + '_{}'.format(counter.value))

            # if terminal or time to update network
            if done or worker_timestep % args.val_net_update_freq == 0:
                worker_network.zero_grad()
                optimizer.zero_grad()
                # take mean loss
                loss /= upd_steps
                loss.backward()
                sync_grad(learning_network, worker_network)
                optimizer.step()
                worker_network.load_state_dict(learning_network.state_dict())
                loss = 0
                upd_steps = 0

            # perform update of target network
            if counter.value % args.tgt_net_update_freq == 0:
                target_network.load_state_dict(learning_network.state_dict())

            if counter.value % args.checkpoint_time == 0:
                saveModelNetwork(learning_network,
                                 args.checkpoint_dir + '_{}'.format(counter.value))

        episode_num += 1

        # if time is exceeded -> break the loop
        can_continue = counter.value <= args.max_steps \
            and worker_timestep <= max_worker_steps \
            and status != SERVER_DOWN \
            and episode_num <= args.num_episodes  # let's run just 8K episodes

    # finish the game
    hfo_env.quitGame()
    # save the network it stopped with
    saveModelNetwork(learning_network,
                     args.checkpoint_dir + '_{}_final'.format(counter.value))
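# sync_grad, used above to push the worker's accumulated gradients into the
# shared learning network before optimizer.step(), is not defined in this
# section. The sketch below follows the usual shared-gradient pattern for
# asynchronous workers; the name and argument order come from the call site,
# the body is an assumption.
def sync_grad(shared_network, worker_network):
    # Point each shared parameter's gradient at the corresponding local gradient,
    # so the shared optimizer steps with the worker's gradients.
    for shared_param, worker_param in zip(shared_network.parameters(),
                                          worker_network.parameters()):
        shared_param._grad = worker_param.grad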
import argparse
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp

#from Networks import ValueNetwork
from SharedAdam import SharedAdam
from Environment import HFOEnv
#from Worker import train

port = random.randint(0, 9999)
hfoEnv = HFOEnv(numTeammates=1, numOpponents=1, port=port, seed=123)
hfoEnv.connectToServer()

obs = hfoEnv.reset()
aa = hfoEnv.possibleActions
obsn = hfoEnv.step(aa[0])
obsn = hfoEnv.step(aa[1])[0]
obsn = torch.Tensor(obsn).unsqueeze(0)
obsn = torch.cat((obsn, torch.Tensor([0]).unsqueeze(0)), 1)
print('obs1', obsn, obsn.shape)
def train(idx, port, target_network, value_network, lock, counter,
          num_episodes=16000, name=""):
    print("Starting a worker {}".format(port))
    # port = 2207
    seed = 2207
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    epsilon = 1
    discountFactor = 0.99
    I_async_update = 5
    I_target = 10000
    goals = 0
    paramSaves = 0
    lastSaved = 0

    hard_update(target_network, value_network)
    optimizer = optim.Adam(value_network.parameters(), lr=1e-5)
    optimizer.zero_grad()

    t = 0  # local timestep counter
    if idx == 0:
        # first thread keeps track of stats
        stats = []

    # run for a certain number of timesteps
    while t < num_episodes * 500:
        timesteps_to_goal = 0  # for measuring performance
        total_reward = 0       # accumulated reward (without discounting)
        status = 0
        observation = hfoEnv.reset()

        # linearly decrease epsilon
        epsilon = max(0.0, (22000 - episodeNumber) / 22000)

        while status == 0:
            # EPSILON GREEDY - TAKE AN ACTION
            if np.random.rand() < epsilon:
                # choose a random action
                action = np.random.choice(range(len(hfoEnv.possibleActions)))
            else:
                # choose greedy action
                lock.acquire()
                values = value_network(torch.Tensor(observation)).detach().numpy()
                action = np.argmax(values)
                lock.release()

            newObservation, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])
            total_reward += reward

            # keep track of goals scored
            if reward >= 50.0:
                goals += 1

            # COMPUTE TARGET VALUE
            lock.acquire()
            target_value = computeTargets(reward, [newObservation], discountFactor,
                                          done, target_network)
            prediction = computePrediction([observation], action, value_network)
            loss = 0.5 * (prediction - target_value.detach()) ** 2
            # accumulate gradient
            loss.backward()
            lock.release()

            observation = newObservation

            # update local counter t
            t += 1
            timesteps_to_goal += 1

            # update global counter T
            lock.acquire()
            counter.value += 1

            # update target network
            if counter.value % I_target == 0:
                hard_update(target_network, value_network)

            # only the first worker saves the model (every 1 mil)
            if idx == 0 and counter.value >= 1000000 + lastSaved:
                lastSaved = counter.value
                print("saving model")
                paramSaves += 1
                path = "{}_params_{}".format(name, paramSaves)
                saveModelNetwork(value_network, path)

            # update value network and zero gradients
            if t % I_async_update == 0 or done:
                print("Doing async update")
                optimizer.step()
                optimizer.zero_grad()

            lock.release()

        if done:
            if idx == 0:
                timesteps_to_goal = timesteps_to_goal if status == 1 else 500
                stats.append(timesteps_to_goal)
                mean = np.mean(stats)  # mean episode length
                # output things to a csv for monitoring
                print("{}, {}, {}, {}, {}, {}, {}, {}".format(
                    episodeNumber, t, mean, goals, epsilon, timesteps_to_goal,
                    status, total_reward),
                    file=open("{}experiment.csv".format(name), "a"))

        episodeNumber += 1
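# saveModelNetwork and hard_update are called by most of the workers above but
# are not shown in this section. The sketches below reflect the most common
# implementations of such helpers and are assumptions, not the original code.
import torch


def saveModelNetwork(model, strDirectory):
    # Persist only the parameters; they can be restored later with load_state_dict.
    torch.save(model.state_dict(), strDirectory)


def hard_update(target_network, source_network):
    # Make the target network an exact copy of the source network.
    target_network.load_state_dict(source_network.state_dict())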