def evaluate(idx, target_value_network, lock, counter):
    port = 6000 + idx
    seed = 123 + idx * 10
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()

    thread_counter = 0
    target_value_network.load_state_dict(torch.load('checkpoint/checkpoint'))
    num_episode = 0
    goal_list = []

    while True:
        epsilon = 0.
        print('Episodes ending in a goal:', goal_list)
        done = False
        curState = hfoEnv.reset()
        timestep = 0

        while not done and timestep < 500:
            action = epsilon_greedy(curState, epsilon, target_value_network)
            nextState, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])
            curState = nextState

            with lock:
                counter.value += 1
            thread_counter += 1
            timestep += 1

            if status == GOAL:
                goal_list.append(num_episode)

        num_episode += 1
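
The evaluate worker above relies on an epsilon_greedy helper that is not shown in this listing. A minimal sketch consistent with how it is called (state in, action index out) might look like the following; the tensor conversion and the four-action default are assumptions, not the original implementation.

import random
import torch

def epsilon_greedy(state, epsilon, network, num_actions=4):
    # With probability epsilon pick a random action index; otherwise pick the
    # index of the highest predicted Q-value for this state.
    if random.random() < epsilon:
        return random.randint(0, num_actions - 1)
    with torch.no_grad():
        q_values = network(torch.Tensor(state).unsqueeze(0))
    return int(q_values.argmax())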
Example #2
def train(idx, args, value_network, target_value_network, optimizer, lock,
          counter):

    port = args.port
    seed = args.seed
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()

    eps = args.epsilon
    I_target = args.I_target
    I_update = args.I_update
    discountFactor = args.discountFactor
    learn_step_counter = 0
    loss_func = nn.MSELoss()
    threads = args.numprocesses

    # Interact with the environment and accumulate gradients for the shared optimizer
    s = hfoEnv.reset()
    while learn_step_counter < counter.value / threads:
        lock.acquire()
        counter.value += 1
        lock.release()
        learn_step_counter += 1
        # take action
        action = choose_action(s, value_network, eps)
        # nextState, reward, done, status, info based on the action
        s_, r, done, status, info = hfoEnv.step(action)

        act_index = hfoEnv.possibleActions.index(action)
        q_eval = computePrediction(s, act_index, value_network)
        q_target = computeTargets(r, s_, discountFactor, done,
                                  target_value_network)
        loss = loss_func(q_eval, q_target)

        loss.backward()
        if learn_step_counter % I_update == 0 or done:
            #lock.acquire()
            optimizer.step()
            optimizer.zero_grad()
            optimizer.share_memory()
            #lock.release()

        # target parameter update
        if counter.value % I_target == 0:
            lock.acquire()
            target_value_network.load_state_dict(value_network.state_dict())
            lock.release()

        if counter.value == 1e6:
            saveModelNetwork(target_value_network, os.getcwd())

        if done:
            s = hfoEnv.reset()
        else:
            s = s_
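
computeTargets and computePrediction are used by most of these workers but never defined in this listing. A rough sketch of the usual one-step Q-learning versions, with signatures inferred from the call sites (states assumed convertible with torch.Tensor), could be:

import torch

def computeTargets(reward, nextObservation, discountFactor, done, targetNetwork):
    # One-step Q-learning target: r for terminal transitions,
    # otherwise r + gamma * max_a' Q_target(s', a'). No gradient flows here.
    with torch.no_grad():
        if done:
            return torch.Tensor([float(reward)])
        next_q = targetNetwork(torch.Tensor(nextObservation))
        return (float(reward) + discountFactor * next_q.max()).reshape(1)

def computePrediction(state, action, valueNetwork):
    # Q(s, a) for the action actually taken; gradients flow through valueNetwork.
    q_values = valueNetwork(torch.Tensor(state))
    return q_values.view(-1)[action].reshape(1)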
def train(idx, args, value_network, target_value_network, optimizer, lock,
          counter):
    port = 6000 + idx
    seed = 123 + idx * 10
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()

    thread_counter = 0
    criterion = nn.MSELoss()
    optimizer.zero_grad()
    target_value_network.load_state_dict(torch.load('params/params_last'))
    num_episode = 0

    while True:
        epsilon = args.epsilon * (
            (1 - 1 / (1 + np.exp(-thread_counter / 2000))) * 2 * 0.9 + 0.1)
        done = False
        curState = hfoEnv.reset()
        timestep = 0

        while not done and timestep < 500:
            action = epsilon_greedy(curState, epsilon, value_network)
            nextState, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])

            pred_value = computePrediction(curState, action, value_network)
            target_value = computeTargets(reward, nextState,
                                          args.discountFactor, done,
                                          target_value_network)
            loss = criterion(pred_value, target_value)
            loss.backward()

            curState = nextState

            with lock:
                counter.value += 1
            thread_counter += 1
            timestep += 1

            if counter.value % args.iterate_target == 0:
                target_value_network.load_state_dict(
                    torch.load('params/params_last'))

            if thread_counter % args.iterate_async == 0 or done:
                optimizer.step()
                optimizer.zero_grad()
                saveModelNetwork(value_network, 'params/params_last')

            if counter.value % 1000000 == 0:
                saveModelNetwork(
                    value_network,
                    'params/params_{0:d}'.format(counter.value // 1000000))

        num_episode += 1
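
saveModelNetwork is assumed throughout to write the network weights to disk so that other workers (or a later evaluation run) can reload them with load_state_dict, as the worker above does with 'params/params_last'. A plausible minimal version:

import torch

def saveModelNetwork(model, strDirectory):
    # Persist only the parameters; callers reload them with load_state_dict.
    torch.save(model.state_dict(), strDirectory)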
def train(idx, args, value_network, target_value_network, optimizer, lock, counter):
	port = 3020+idx*10
	seed = 12+idx*10
	counter_v = 0
	hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
	hfoEnv.connectToServer()
	loss = nn.MSELoss()
	for episodeNumber in range(1,args.epochs+1):
		observation = hfoEnv.reset()
		counter.value += 1
		counter_v += 1

		for timestep in range(args.timesteps):
			# Greedy action selection is disabled; this worker currently acts at random:
			# observation_t = torch.Tensor(observation)
			# action = greedy_action(observation_t, value_network, args)
			# act = hfoEnv.possibleActions[action]
			action = random.randint(0, 3)
			act = hfoEnv.possibleActions[action]
			newObservation, reward, done, status, info = hfoEnv.step(act)
			lock.acquire()

			observation_t = torch.Tensor(observation)
			newObservation_t = torch.Tensor(newObservation)
			output = computePrediction(observation_t,action,value_network)
			target = computeTargets(reward,newObservation_t,args.discountFactor,done,target_value_network)
			lock.release()
			out = loss(output,target)
			out.backward()
			if counter_v % args.target_interval == 0:
				target_value_network.load_state_dict(value_network.state_dict())
				
			if counter.value % args.predict_interval == 0 or done:
				optimizer.step()
				optimizer.zero_grad()
				strDirectory = "value_network"+str(idx)+".pth"
				saveModelNetwork(value_network,strDirectory)
				strDirectory_target = "targetNetwork"+str(idx)+".pth"
				saveModelNetwork(target_value_network,strDirectory_target)
			observation = newObservation
			if done:
				break
			if counter.value >= args.tmax:
				break
Example #5
def train(idx, args, value_network, target_value_network, optimizer, lock, counter, 
                port, seed, I_tar, I_async, name=None):

        hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed, headless=True)
        hfoEnv.connectToServer()

        if name is None:
                name = str(time())
        #logger = Logger("tb/" + name, flush_secs=5)

        target_value_network.train()
        num_episodes = args.episodes

        gamma = 0.99
        windows = [10, 500]
        goal_buffer = [0]*max(windows)

        max_epsilon = 0.99
        min_epsilon = 0.01
        total = 200
        epsilon_fn = lambda current : max_epsilon - current*(max_epsilon - min_epsilon)/total if current < total else min_epsilon

        max_lr = 0.9
        min_lr = 0.1
        total = 600
        lr_fn = lambda current: max_lr - current*(max_lr - min_lr)/total if current < total else min_lr


        loss_func = nn.MSELoss()
        t = 0
        episodeNumber = 0
        episodeReward = 0
        episodeSteps  = 0
        state1 = hfoEnv.reset()
        
        while episodeNumber <= num_episodes:

                if (counter.value+1) % 1e3 == 0: 
                    print("##################################################")
                    print('t:',t)
                    print('counter:',counter.value)
                    print("##################################################")
                if (counter.value+1) % 1e6 == 0:
                    saveModelNetwork(value_network,"trained_models/params"+str(int((counter.value+1) // 1e6)))


                #train_epoch(epoch, args, model, device, train_loader, optimizer)
                epsilon = epsilon_fn(episodeNumber)
                lr = lr_fn(episodeNumber)

                if args.eval or np.random.random() >= epsilon:
                        qs = value_network(state1)
                        action = int(torch.argmax(qs))
                else:
                        action = random.randint(0,len(hfoEnv.possibleActions)-1)

                a1 = hfoEnv.possibleActions[action]

                state2, reward, done, status, info = hfoEnv.step(a1)
                episodeReward += reward
                
                y = computeTargets(reward, state2, gamma, done, target_value_network)

                prediction = computePrediction(state1,action,value_network)

                Y = torch.zeros(4)
                Y[action] = y
                Prediction = torch.zeros(4)
                Prediction[action] = prediction

                loss = loss_func(Y,Prediction)
                loss.backward()

                state1 = state2
                t += 1
                episodeSteps += 1
                with lock:
                        counter.value = counter.value + 1

                if done:

                        if status == GOAL:
                                goal_buffer.append(1)
                        else:
                                goal_buffer.append(0)

                        #logger.log_value('episode/reward',episodeReward, episodeNumber)
                        #logger.log_value('episode/length',episodeSteps, episodeNumber)
                        #logger.log_value('hyperparameters/epsilon', epsilon, episodeNumber)
                        #logger.log_value('hyperparameters/lr', lr, episodeNumber)
                        #for window in windows:
                        #        logger.log_value(learning_str + "goals/%i" % window,
                        #                        np.sum(goal_buffer[-window:]),
                        #                        episodeNumber)
                        episodeNumber += 1
                        episodeReward = 0.0
                        episodeSteps  = 0
                        state1 = hfoEnv.reset()

                if t % I_async == 0 or done or episodeNumber == num_episodes:
                        # Async update of value_network using gradients
                        with lock:
                                # Add grads to value_network
                                for param, shared_param in zip(
                                            value_network.parameters(), 
                                            target_value_network.parameters()):
                                        shared_param._grad = param.grad
                                        #value_network._grad = target_value_network.grad
                                # Take a step
                                optimizer.step(lr=lr)
                                # Clean gradients
                                optimizer.zero_grad()
                        target_value_network.zero_grad()

                #if counter.value % I_tar == 0 or episodeNumber == num_episodes:
                if t % I_tar == 0 or episodeNumber == num_episodes or done:
                        # Update target network
                        target_value_network.zero_grad()
                        target_value_network.load_state_dict(value_network.state_dict())


        saveModelNetwork(value_network,"trained_models/params_last")
        # Finishing training and showing stats
        hfoEnv.quitGame()
Example #6
def train(idx, args, value_network, target_value_network, optimizer, lock,
          counter):
    port = 8000 + 50 * idx
    env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=123)
    env.connectToServer()
    do_opt_step = False
    do_hard_copy = False

    total_reward = 0
    save_counter = 1
    for episode in range(args.numEpisodes):

        timestep = 0
        total_reward = 0
        obs = env.reset()
        done = False
        save_model = False
        do_opt_step = False
        do_hard_copy = False
        while not done and timestep < 500:

            # obs to tensor
            obs_tensor = torch.Tensor(obs).unsqueeze(0)
            # choose action
            act = chooseAction(obs_tensor, value_network, episode, idx)
            # execute action
            act_str = env.possibleActions[act]
            next_obs, rewards, done, status, info = env.step(act_str)
            # update total reward
            total_reward += rewards
            # reward to tensor
            reward_tensor = torch.Tensor([rewards])
            # next obs to tensor
            next_obs_tensor = torch.Tensor(next_obs).unsqueeze(0)
            # update counters and flags
            timestep += 1
            with lock:
                counter.value = counter.value + 1
                if timestep % args.aSyncFreq == 0 or done:
                    do_opt_step = True
                if counter.value % args.copy_freq == 0:
                    do_hard_copy = True
                if counter.value % 1e6 == 0:
                    save_model = True
            current_count = counter.value
            #forward pass for our networks
            predicted_vals = computePrediction(obs_tensor, act, value_network)
            target_vals = computeTargets(reward_tensor, next_obs_tensor,
                                         args.gamma, done,
                                         target_value_network)
            # loss function calculation
            loss_function = nn.MSELoss()
            err = loss_function(predicted_vals, target_vals)
            # accumulate gradients
            err.backward()

            # update optimizer
            if do_opt_step:
                with lock:
                    optimizer.step()
                    optimizer.zero_grad()
                do_opt_step = False
            #update global network
            if do_hard_copy:
                with lock:
                    hard_copy(target_value_network, value_network)
                do_hard_copy = False
            # update current state
            obs = next_obs

        if save_model:
            #save model

            saveModelNetwork(value_network, 'params_' + str(save_counter))
            save_counter += 1
            #change learning rate
            change_lr(current_count, optimizer)

    saveModelNetwork(value_network, 'params_latest')
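
hard_copy and change_lr are called above but not defined in this listing. hard_copy presumably overwrites the target network with the current value network; change_lr is sketched here as a hypothetical linear decay of the learning rate with the global step count (the initial rate and total step budget are invented parameters).

def hard_copy(target_network, source_network):
    # Overwrite the target network's parameters with the source network's.
    target_network.load_state_dict(source_network.state_dict())


def change_lr(current_count, optimizer, initial_lr=1e-4, total_steps=32e6):
    # Hypothetical schedule: decay the learning rate linearly in the global
    # step count, never going below a small floor.
    new_lr = max(initial_lr * (1.0 - current_count / total_steps), 1e-6)
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr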
def train(idx, val_network, target_value_network, optimizer, lock, counter,timesteps_per_process,results_dir):
	
	# This runs a random agent

	episodeNumber = 0
	discountFactor = 0.99
	f = open(os.path.join(results_dir, 'worker_%d.out'%idx), 'w')
	#Each worker writes to a file to check if my network runs correctly
	columns = '{0:<10} {1:<8} {2:<10} {3:<12} {4:<20} {5:<15} {6:<15}\n'
	f.write(columns.format('Episode','Status','Steps','Total steps','Avg steps to goal','Total Goals','Counter'))
	
	hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=6000+(idx+5)*10, seed=idx)
	hfoEnv.connectToServer()
	async_update = 10
	copy_freq = 10000
	total_goals = 0
	total_steps = 0
	totalEpisodes = timesteps_per_process // 500
	save_model = False
	copy_model = False
	for episodeNumber in range(totalEpisodes):
		done = False
		epsilon = epsilon_annealing(idx,episodeNumber,totalEpisodes)
		state = hfoEnv.reset()
		timesteps = 0 
		while timesteps<500:
			lock.acquire()
			counter.value += 1
			locked_counter = counter.value
			if (locked_counter % 1e6) == 0:
				save_model = True
			if (locked_counter % copy_freq) == 0:
				copy_model = True
			lock.release()
			timesteps+=1	

			obs_tensor = torch.Tensor(state).unsqueeze(0)
			Q, act = compute_val(val_network, obs_tensor,idx,epsilon)
			act = hfoEnv.possibleActions[act]
			newObservation, reward, done, status, info = hfoEnv.step(act)

			Q_target = computeTargets(reward,torch.Tensor(newObservation).unsqueeze(0),discountFactor,done,target_value_network)

			loss_function = nn.MSELoss()
			loss = loss_function(Q_target,Q)
			loss.backward()
			
			if timesteps % async_update == 0 or done:
				with lock:
					optimizer.step()
					optimizer.zero_grad()		

			if copy_model:
				hard_copy(target_value_network, val_network)
				copy_model = False
			if save_model:
				updating_learning_rate(optimizer,locked_counter)
				saveModelNetwork(target_value_network,os.path.join(results_dir, 'params_%d' % int(locked_counter / 1e6)))
				save_model = False
			state = newObservation
			if done:
				break
		if status==1:
			total_steps += timesteps
			total_goals += 1
		else:
			total_steps += 500
		f.write(columns.format(episodeNumber, status, timesteps, total_steps, '%.1f'%(total_steps/(episodeNumber+1)), total_goals, locked_counter))
		f.flush()
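
compute_val above is expected to return both the predicted Q-value of the chosen action (so the worker can build its MSE loss against computeTargets) and the action index. Below is a hedged sketch of that helper; epsilon_annealing and updating_learning_rate are simple schedules of the same speculative kind and are not reproduced here.

import random
import torch

def compute_val(val_network, obs_tensor, idx, epsilon):
    # Epsilon-greedy selection that also returns Q(s, a) for the chosen action.
    # Gradients must flow through this value, so no torch.no_grad() here.
    # idx is kept only to match the call site above.
    q_values = val_network(obs_tensor)
    if random.random() < epsilon:
        act = random.randint(0, q_values.shape[-1] - 1)
    else:
        act = int(q_values.argmax())
    return q_values.view(-1)[act].reshape(1), act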
Example #8
def runTrain(idx, args, valueNetwork, targetNetwork, optimizer, lock, counter):
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    port = 6006 + idx * 9
    seed = 2019 + idx * 46

    hfoEnv = HFOEnv(args.reward_opt,
                    numTeammates=0,
                    numOpponents=1,
                    port=port,
                    seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    numTakenActions = 0
    numTakenActionCKPT = 0
    discountFactor = args.discountFactor

    totalWorkLoad = args.t_max // args.n_jobs

    steps_to_ball = [
    ]  # number of steps spent to approach the ball in each episode
    steps_in_episode = []  # number of steps in each episode
    status_lst = []

    log = {
        'steps_to_ball': steps_to_ball,
        'steps_in_episode': steps_in_episode,
        'status_lst': status_lst
    }

    step_to_ball = None
    firstRecord = True

    optimizer.zero_grad()
    state = torch.tensor(hfoEnv.reset()).to(device)  # get initial state

    while True:
        # take action based on eps-greedy policy
        action = select_action(state, valueNetwork, episodeNumber,
                               numTakenActions, args)  # action -> int

        act = hfoEnv.possibleActions[action]
        newObservation, reward, done, status, info = hfoEnv.step(act)

        if info['kickable'] and firstRecord:
            step_to_ball = numTakenActions - numTakenActionCKPT
            firstRecord = False

        next_state = torch.tensor(
            hfoEnv.preprocessState(newObservation)).to(device)

        target = computeTargets(reward, next_state, discountFactor, done,
                                targetNetwork, device)  # target -> tensor
        pred = computePrediction(state, action, valueNetwork,
                                 device)  # pred -> tensor

        loss = F.mse_loss(pred, target)  # compute loss
        loss.backward()  # accumulate loss

        lock.acquire()
        counter.value += 1
        counterValue = counter.value
        lock.release()

        numTakenActions += 1

        if done:
            firstRecord = True
            status_lst.append(status)
            steps_in_episode.append(numTakenActions - numTakenActionCKPT)

            if step_to_ball is None:
                steps_to_ball.append(numTakenActions - numTakenActionCKPT)
            else:
                steps_to_ball.append(step_to_ball)
                step_to_ball = None

            episodeNumber += 1
            numTakenActionCKPT = numTakenActions

            # hfoEnv already calls 'preprocessState' in 'reset'
            state = torch.tensor(hfoEnv.reset()).to(device)

        if done or numTakenActions % args.i_async_update == 0:
            optimizer.step()  # apply grads
            optimizer.zero_grad()  # clear all cached grad

        if counterValue % args.i_target == 0:
            targetNetwork.load_state_dict(
                valueNetwork.state_dict())  # update target network

        if counterValue % args.ckpt_interval == 0:
            ckpt_path = os.path.join(args.log_dir, 'ckpt')
            os.makedirs(ckpt_path, exist_ok=True)  # safe when several workers create it at once

            filename = os.path.join(
                ckpt_path, 'params_%d' % (counterValue // args.ckpt_interval))

            lock.acquire()
            saveModelNetwork(valueNetwork, filename)
            lock.release()

        if numTakenActions > totalWorkLoad:
            filename = os.path.join(args.log_dir, 'log_worker_%d.pkl' % idx)
            saveLog(log, filename)
            return
Example #9
def runEval(args, valueNetwork):
    port = 6050
    seed = 2019

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")

    hfoEnv = HFOEnv(args.reward_opt,
                    numTeammates=0,
                    numOpponents=1,
                    port=port,
                    seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    numTakenActions = 0
    numTakenActionCKPT = 0

    state = torch.tensor(hfoEnv.reset()).to(device)  # get initial state

    steps_to_ball = []
    steps_in_episode = []
    status_lst = []

    log = {
        'steps_to_ball': steps_to_ball,
        'steps_in_episode': steps_in_episode,
        'status_lst': status_lst
    }
    step_to_ball = None
    firstRecord = True

    while numTakenActions < args.t_max:
        # take action based on eps-greedy policy
        action = select_action(state, valueNetwork, episodeNumber,
                               numTakenActions, args)  # action -> int

        act = hfoEnv.possibleActions[action]
        newObservation, reward, done, status, info = hfoEnv.step(act)

        if info['kickable'] and firstRecord:
            step_to_ball = numTakenActions - numTakenActionCKPT
            firstRecord = False

        # all updates on the parameters should acquire the lock
        numTakenActions += 1

        if done:
            firstRecord = True
            status_lst.append(status)
            steps_in_episode.append(numTakenActions - numTakenActionCKPT)

            if step_to_ball is None:
                steps_to_ball.append(numTakenActions - numTakenActionCKPT)
            else:
                steps_to_ball.append(step_to_ball)
                step_to_ball = None

            episodeNumber += 1
            numTakenActionCKPT = numTakenActions

            # hfoEnv already calls 'preprocessState' in 'reset'
            state = torch.tensor(hfoEnv.reset()).to(device)

    filename = os.path.join(args.log_dir, 'log_eval.pkl')
    saveLog(log, filename)
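
saveLog above writes the per-worker statistics dictionary to a .pkl file; a minimal implementation consistent with that extension (an assumption, not the original code) would simply pickle the dict:

import pickle

def saveLog(log, filename):
    # Dump the statistics dictionary to disk for offline analysis.
    with open(filename, 'wb') as f:
        pickle.dump(log, f)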
Example #10
def train(idx, args, learning_network, target_network, optimizer, lock,
          counter):
    # init port & seed for the thread based on id val
    port = 8100 + 10 * idx  # init
    seed = idx * 113 + 923
    torch.manual_seed(seed)
    worker_network = ValueNetwork(15, 4)
    worker_network.load_state_dict(learning_network.state_dict())
    # change init
    # init env
    hfo_env = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfo_env.connectToServer()
    episode_num = 0
    eps = random.sample(epsilon_list, 1)[0]
    worker_timestep = 0
    mse_loss = nn.MSELoss()
    max_worker_steps = args.max_steps / args.num_processes
    can_continue = True
    goal = 0
    to_goal = []
    while can_continue:
        # run episode
        obs_tensor = hfo_env.reset()
        done = False
        loss = 0
        reward_ep = 0
        ep_steps = 0
        upd_steps = 0
        while not done:
            # select action based on greedy policy
            action_idx = select_action(obs_tensor, worker_network,
                                       worker_timestep, max_worker_steps, args,
                                       eps)
            action = hfo_env.possibleActions[action_idx]
            # observe next
            next_obs_tensor, reward, done, status, info = hfo_env.step(action)
            y = computeTargets(reward, next_obs_tensor, args.discount, done,
                               target_network)
            # compute predictions again for the best action. Here we change params
            q_next = computePrediction(obs_tensor, action_idx, worker_network)
            # put new state
            obs_tensor = next_obs_tensor
            # update episode stats
            loss += mse_loss(y, q_next)
            reward_ep += reward
            upd_steps += 1
            ep_steps += 1
            worker_timestep += 1
            if status == 1:
                goal += 1
                to_goal.append(ep_steps)
            with lock:
                counter.value += 1
                if counter.value % args.checkpoint_time == 0:
                    saveModelNetwork(
                        learning_network,
                        args.checkpoint_dir + '_{}'.format(counter.value))
            # if terminal or time to update network
            if done or worker_timestep % args.val_net_update_freq == 0:
                worker_network.zero_grad()
                optimizer.zero_grad()
                # take mean loss
                loss /= upd_steps
                loss.backward()
                sync_grad(learning_network, worker_network)
                optimizer.step()
                worker_network.load_state_dict(learning_network.state_dict())
                loss = 0
                upd_steps = 0
            # perform update of target network
            if counter.value % args.tgt_net_update_freq == 0:
                target_network.load_state_dict(learning_network.state_dict())

            if counter.value % args.checkpoint_time == 0:
                saveModelNetwork(
                    learning_network,
                    args.checkpoint_dir + '_{}'.format(counter.value))
        episode_num += 1

        # if time is exceeded -> break the loop
        can_continue = counter.value <= args.max_steps \
                       and worker_timestep <= max_worker_steps \
                       and status != SERVER_DOWN \
                       and episode_num <= args.num_episodes  # let's run just 8K episodes
    # finish the game
    hfo_env.quitGame()
    # save the network it stopped with
    saveModelNetwork(learning_network,
                     args.checkpoint_dir + '_{}_final'.format(counter.value))
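
sync_grad, used by the worker above after each local backward pass, is assumed to transfer the worker's gradients onto the shared learning network so the shared optimizer can apply them (the usual A3C-style gradient hand-off). A sketch:

def sync_grad(shared_network, worker_network):
    # Point each shared parameter's .grad at the worker's freshly computed
    # gradient; the shared optimizer then steps with these gradients.
    for shared_param, worker_param in zip(shared_network.parameters(),
                                          worker_network.parameters()):
        shared_param._grad = worker_param.grad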
Example #11
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.multiprocessing as mp
#from Networks import ValueNetwork
from SharedAdam import SharedAdam
from Environment import HFOEnv
#from Worker import train
import random

port = random.randint(0,9999)
hfoEnv = HFOEnv(numTeammates=1, numOpponents=1, port=port, seed=123)
hfoEnv.connectToServer()
obs = hfoEnv.reset()
aa = hfoEnv.possibleActions

obsn = hfoEnv.step(aa[0])
obsn = hfoEnv.step(aa[1])[0]
obsn = torch.Tensor(obsn).unsqueeze(0)

obsn = torch.cat((obsn,torch.Tensor([0]).unsqueeze(0)), 1)
print('obs1',obsn,obsn.shape)
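
The script above only exercises the environment; it never launches the train workers. Below is a sketch of how these workers are typically started with torch.multiprocessing. It assumes the commented-out ValueNetwork and train imports are re-enabled, that ValueNetwork(15, 4) matches the state and action sizes used elsewhere in these examples, and that args comes from argparse.

def launch_workers(args, num_workers=8):
    # Shared networks: every worker reads and updates the same parameters.
    value_network = ValueNetwork(15, 4)
    target_value_network = ValueNetwork(15, 4)
    target_value_network.load_state_dict(value_network.state_dict())
    value_network.share_memory()
    target_value_network.share_memory()

    # Shared optimizer, global step counter, and the lock protecting it.
    optimizer = SharedAdam(value_network.parameters(), lr=1e-4)
    counter = mp.Value('i', 0)
    lock = mp.Lock()

    processes = []
    for idx in range(num_workers):
        p = mp.Process(target=train,
                       args=(idx, args, value_network, target_value_network,
                             optimizer, lock, counter))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()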
def train(idx,
          port,
          target_network,
          value_network,
          lock,
          counter,
          num_episodes=16000,
          name=""):

    print("Starting a worker {}".format(port))

    # port = 2207
    seed = 2207
    hfoEnv = HFOEnv(numTeammates=0, numOpponents=1, port=port, seed=seed)
    hfoEnv.connectToServer()

    episodeNumber = 0
    epsilon = 1
    discountFactor = 0.99

    I_async_update = 5
    I_target = 10000

    goals = 0
    paramSaves = 0
    lastSaved = 0

    hard_update(target_network, value_network)

    optimizer = optim.Adam(value_network.parameters(), lr=1e-5)
    optimizer.zero_grad()

    t = 0  # local timestep counter

    if idx == 0:  # first thread keeps track of stats
        stats = []

    # run for certain number of timesteps
    while t < num_episodes * 500:

        timesteps_to_goal = 0  # for measuring performance
        total_reward = 0  # accumulated reward (without discounting)

        status = 0
        observation = hfoEnv.reset()
        #        print(observation.shape)

        # linearly decrease epsilon
        epsilon = max(0.0, (22000 - episodeNumber) / 22000)

        while status == 0:

            # EPSILON GREEDY - TAKE AN ACTION

            if np.random.rand() < epsilon:
                # choose a random action
                action = np.random.choice(range(len(hfoEnv.possibleActions)))
            else:
                # choose greedy action
                lock.acquire()
                values = value_network(
                    torch.Tensor(observation)).detach().numpy()
                action = np.argmax(values)
                lock.release()

            newObservation, reward, done, status, info = hfoEnv.step(
                hfoEnv.possibleActions[action])

            total_reward += reward

            # keep track of goals scored
            if reward >= 50.0:
                goals += 1

            # COMPUTE TARGET VALUE
            lock.acquire()
            target_value = computeTargets(reward, [newObservation],
                                          discountFactor, done, target_network)

            prediction = computePrediction([observation], action,
                                           value_network)

            loss = 0.5 * (prediction - target_value.detach())**2

            # accummulate gradient
            loss.backward()
            lock.release()

            observation = newObservation

            # update local counter t
            t += 1
            timesteps_to_goal += 1

            # update global counter T
            lock.acquire()
            counter.value += 1

            # update target network
            if counter.value % I_target == 0:
                hard_update(target_network, value_network)

            # only the first worker saves the model (every 1 mil)
            if idx == 0 and counter.value >= 1000000 + lastSaved:
                lastSaved = counter.value
                print("saving model")
                paramSaves += 1
                path = "{}_params_{}".format(name, paramSaves)
                saveModelNetwork(value_network, path)

            # update value network and zero gradients
            if t % I_async_update == 0 or done:
                print("Doing async update")
                optimizer.step()
                optimizer.zero_grad()

            lock.release()

            if done:
                if idx == 0:
                    timesteps_to_goal = timesteps_to_goal if status == 1 else 500
                    stats.append(timesteps_to_goal)
                    mean = np.mean(stats)  # mean ep length
                    # output things to a csv for monitoring
                    print("{}, {}, {}, {}, {}, {}, {}, {}".format(
                        episodeNumber, t, mean, goals, epsilon,
                        timesteps_to_goal, status, total_reward),
                          file=open("{}experiment.csv".format(name), "a"))
                episodeNumber += 1