def setUpClass(cls):
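    """Builds the class-level test environment selected by the command-line options."""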
    environment = None
    if _options.environment_type == 'k8s':
      import k8s_environment  # pylint: disable=g-import-not-at-top
      environment = k8s_environment.K8sEnvironment
    elif _options.environment_type == 'local':
      import local_environment  # pylint: disable=g-import-not-at-top
      environment = local_environment.LocalEnvironment
    elif _options.environment_type == 'aws':
      import aws_environment  # pylint: disable=g-import-not-at-top
      environment = aws_environment.AwsEnvironment

    if not environment:
      logging.fatal('No environment type selected')

    if _options.name:
      cls.env = environment()
      cls.env.use_named(_options.name)
    elif _options.environment_params:
      cls.env = environment()
      environment_params = dict((k, v) for k, v in (
        param.split('=') for param in _options.environment_params.split(',')))
      cls.env.create(**environment_params)
    else:
      cls.create_default_local_environment()
    if not cls.env:
      logging.fatal('No test environment exists to test against!')
    cls.test_params = dict((k, v) for k, v in (
        param.split('=') for param in _options.test_params.split(',')))
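
For reference, the "key=value,key=value" parsing used above can be factored into a small helper (a minimal sketch; parse_params is a hypothetical name, and values are kept as strings):

def parse_params(raw):
    # "size=small,region=us" -> {'size': 'small', 'region': 'us'}
    return dict(item.split('=', 1) for item in raw.split(','))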
Example #2
def run():
    # Run function, to be called below.
    # Set up the Excel link; first pin the working directory explicitly to
    # work around the working-directory problems introduced by xlwings.
    exec_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(exec_dir)    
    
    # set up the links
    xl_filename = os.path.join(os.getcwd(), 'dapper_control.xlsm')
    xl_control = xl_link(xl_filename)
    
    # grab parameters from the xl file
    input_topkml = xl_control.topkml
    input_bottomkml = xl_control.bottomkml
    outputkml = xl_control.oputkml
    yearlist = xl_control.grab_time()
    oputschedule = xl_control.grab_oputschedule()
    debug_dapper = xl_control.debug
    
    welcome()

    # (1) initialize environment for simulation
    t_env = environment(xl_control)
    
    # (2) initialize dune field from input
    t_dunefield = dunefield(input_topkml, input_bottomkml)
    
    # (3) run the dune field forward in time
    for t in range(0, t_env.timesteps):
        t_dunefield.advance(t_env, t)
        
        # call debug code to push debug outputs
        if debug_dapper:
            t_dunefield.debug(t_env, t)
        
        # check output schedule to see if its time to output interim file
        if oputschedule[t]:
            interim_oputfile = xl_control.interim_prefix + str(yearlist[t]) + '.kml'
            t_dunefield.output(interim_oputfile)
        
        # message the user
        user_message = 'Timestep completed: ' + str(yearlist[t])
        print(user_message)
        xl_control.message(user_message)
    
    # (4) output final dune field crest position 
    t_dunefield.output(outputkml)
    print('Simulation finished!')
    xl_control.message('Simulation finished!')
    
    return
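
The "to be called below" comment suggests run() is invoked at module level further down the original script (not shown here); a minimal, hypothetical entry point would be:

if __name__ == '__main__':
    run()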
Example #3
        uid = sessions[session]
        did_update = update_account_name(client, uid, name)
        return json.dumps({"msg": str(did_update)}), 200 if did_update else 500
    else:
        return json.dumps({"err": "Bad method 405"}), 405


@app.route('/api/account/picture/<u_id>', methods=['GET'])
@cross_origin()
def getUserPicture(u_id):
    """Find a user picture given a supplied u_id"""
    image_string = read_pictureFromUID(client, u_id)
    return image_string


@app.route('/api/account/thumbnail/<u_id>', methods=['GET'])
@cross_origin()
def getUserThumbnail(u_id):
    """Find a user thumbnail given a supplied u_id"""
    image_string = read_thumbnailFromUID(client, u_id)
    return image_string


if __name__ == "__main__":
    """Configuration"""
    env = environment("app.env")
    url = env.get_env("url").format(env.get_env("password"))
    client = mongoClient(url).getClient()

    app.run(debug=True, host='0.0.0.0', port=int(os.environ.get('PORT', 8080)))
Example #4
from environment import *
import time, random, math

if __name__ == "__main__":
    a = time.time()
    env = environment()
    env.reset()
    # env.elasticity = 1

    p0 = env.robots[0]
    p1 = env.robots[1]

    p0.pos(50, 50)
    p0.diameter = 50
    p0.raio = 25
    p0.m = 1
    p0.angle = math.pi * 6 / 4
    p0.saveState()

    p1.pos(800, 500)
    p1.diameter = 50
    p1.raio = 25
    p1.m = 1
    p1.angle = 0
    p1.saveState()

    # Elapsed cycle
    # Example: 50 steps of (1/50) s  -> 1 s elapsed
    # Example: 100 steps of (1/50) s -> 2 s elapsed

    # env.step([[0,0],[5000,0]])
Example #5
def main():
    start_time = time.time()
    print("Started...")
    Result2 = []
    Result1 = []
    Henkel1 = []
    Henkel2 = []
    environmen = environment([0, 1, 2, 3], [0, 1, 2, 3, 4, 5, 6, 7], rank=3)
    t = TestGenerator(3, environmen)
    testsdata = t.generate()
    File = "logNewExperiment.txt"  #for storing detail information
    deleteContent(File)
    File2 = "ResultNewExperiment.txt"  #for storing results
    deleteContent(File2)
    with open(File, "a") as myfile:
        myfile.write("started .... " + '\n')
    with open(File2, "a") as myfile2:
        myfile2.write("started .... " + '\n')
    ObservationProbability = observation(Bias=5, Numberobservation=4)
    print(ObservationProbability)
    # c=ConstructEnvironment(10,TestData=l,SizeOfTraining=100,stepSize=0.01,lambda1=10,numIteration=2,FileName=File)
    for step in [0.001, 0.01, 0.1, 0.5, 1]:
        for lamda1 in [10, 3, 1, 0.1, 0.01, 0.005]:
            for i in [50, 100, 500, 1000, 3000]:  # size of Training data
                start_time = time.time()
                L1 = []
                L2 = []
                H1 = []
                H2 = []
                for k in range(4):
                    A = ConstructEnvironment(
                        observationProbabilty=ObservationProbability,
                        sizeOfCone=4,
                        SizeOfTraining=i,
                        TestData=testsdata,
                        stepSize=step,
                        lambda1=lamda1,
                        numIteration=150,
                        FileName=File)
                    L1.append(A.R1)
                    L2.append(A.R2)
                    H1.append(A.E1)
                    H2.append(A.E2)
                Result1.append(L1)
                Result2.append(L2)
                Henkel1.append(H1)
                Henkel2.append(H2)
            with open(File2, "a") as myfile2:
                myfile2.write('Results For step size equal to: ' + str(step) +
                              ' and lamda1: ' + str(lamda1) + '\n')
                myfile2.write(
                    "Error Algorithm without Denoising On Test Data: " +
                    str(Result1) + '\n')
                myfile2.write("Error Algorithm with Denoising On Test Data: " +
                              str(Result2) + '\n')
                myfile2.write(
                    "Error Henkel Matrix of Algorithm without Denoising: " +
                    str(Henkel1) + '\n')
                myfile2.write(
                    "Error Henkel Matrix of Algorithm with Denoising: " +
                    str(Henkel2) + '\n')
                myfile2.write("Running time: " + str(time.time() - start_time))
            print('Results For step size equal to: ', str(step),
                  ' and lamda1: ', str(lamda1), '\n')
            print("Error Algorithm without Denoising On Test Data: ", Result1)
            print("Error Algorithm with Denoising On Test Data: ", Result2)
            print("Error Henkel Matrix of Algorithm without Denoising: ",
                  Henkel1)
            print("Error Henkel Matrix of Algorithm with Denoising: ", Henkel2)
            print("Running Time : ", time.time() - start_time)
Example #6
parser.add_argument(
    '--model',
    help="select specific model to test, sac-stg2, sac-stg1 or sac-wos")
args = parser.parse_args()
model = args.model

if __name__ == "__main__":

    if model != 'sac-stg2' and model != 'sac-stg1' and model != 'sac-wos':
        print('Wrong model name :( ')
    else:
        pygame.init()
        pygame.font.init()

        if model == 'sac-stg2' or model == 'sac-stg1':
            env = environment(traj_num=6, model='sac')
        else:
            env = environment(traj_num=6, model='sac-wos')

        action_dim = 2  # steer, throttle
        state = env.getState()
        state_dim = len(state)
        print('action_dimension:', action_dim, ' --- state_dimension:',
              state_dim)

        # Initializing the Agent for SAC and load the trained weights
        actor = SAC_Actor(state_dim=state_dim,
                          action_dim=action_dim).to(device)
        if model == 'sac-stg2':
            model_path = '../weights/sac-stg2/policy_net_1280.pth'
        if model == 'sac-stg1':
Example #7
def listas_iguales(lista, lista2):
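    """Return True when both lists have the same length and equal elements at every index."""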
    if len(lista) == len(lista2):
        for i in range(len(lista)):
            if lista[i] != lista2[i]:

                return False
        return True
    else:
        return False


if __name__ == "__main__":
    env_params = {"base_pose": [200, 300], "nBots": 50, "r_rad": 20}

    env = environment(env_params)
    gap = env.robots[0].radius * 0.8

    pathplanner = formation_planner(env, gap=gap)

    pathplanner.set_robots_active(n_bots_formation)
    pathplanner.update_actives()

    goal = np.array([(gap * n_bots_formation) / 2, 0])
    print('New goal:', goal)

    pathplanner.create_formation()
    pathplanner.move_formation_goal(np.array([20, 20]))
    count = 0
    set_goal = 0
    same_robots = True
Example #8
import pygame
import numpy as np
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense
import csv
import os
from tools import getHeading, bool2num
np.random.seed(1234)

if __name__ == "__main__":

    pygame.init()
    pygame.font.init()
    env = environment(
        4, 9, traj_num=6,
        model='dqn')  #Block number of throttle and the steering angle
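    # The discrete action space is the Cartesian product of throttle bins and steering bins.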
    action_num = env.tStateNum * env.sStateNum
    state = env.getState()
    states_num = len(state)
    print('action_num: ', action_num, ' --- ', 'states_num: ', states_num)

    # Initializing the Agent for DQN and load the trained weights
    model = Sequential()
    model.add(Dense(48, input_dim=42, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(48, activation='relu'))
    model.add(Dense(50, activation='linear'))
    model.load_weights('../weights/dqn/weights_eposide_1330.h5')

    destinationFlag = False
Example #9
def run_MC(initialQV=None, train=True, random=False):
    epochs = 100000 if train else 1
    epsilon = 1. if train else 0  # E-greedy

    maze_size = 15
    walls = [(3, 4), (3, 5), (3, 6)]
    dim = compress_dim2()
    qv = initialQV if initialQV is not None else np.zeros(
        dim)  #This creates our Q-value look-up table
    sa_count = np.zeros(
        dim)  #Record how many times we've seen a given state-action pair.
    returnSum = 0
    stepSum = 0
    gameplay = []
    stats = np.zeros((int(epochs / 100), 2))

    for i in range(epochs):
        env = environment.random(maze_size,
                                 walls=walls) if random else environment(
                                     maze_size=maze_size, walls=walls)
        state = env.state
        ended = False
        episodeReward = 0
        max_epoch_length = 100

        ds = []  #we keep track of all the "decision states"

        # Run until the episode ends (for random mazes, also cap the episode length)
        while not ended and (not random or max_epoch_length > 0):
            d = compress2(env,
                          state)  #we "compress" the state to make it smaller

            if not train:
                gameplay.append(env.maze_string())

            # E-greedy policy
            if (np.random.random() < epsilon
                    or np.count_nonzero(qv[d[0], d[1], d[2], d[3], :]) == 0):
                act = np.random.randint(0, len(actions))
            else:
                act = np.argmax(qv[d[0], d[1], d[2],
                                   d[3], :])  #select the best action

            d = tuple(list(d) +
                      [act])  #append the chosen action to the decision

            sa_count[d] += 1
            ds.append(d)

            state, reward, ended = env.step(act)
            episodeReward += reward
            max_epoch_length -= 1

        epsilon = epsilon * 0.9999

        # Update Q values of the visited states
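        # Incremental every-visit Monte Carlo estimate: Q(s,a) += (G - Q(s,a)) / N(s,a)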
        for d in ds:
            qv[d] = qv[d] + (1. / sa_count[d]) * (episodeReward - qv[d])

        returnSum += episodeReward
        stepSum += len(ds)
        if (i % 100 == 0 and i > 0):
            print("Episode: ", i, "Average Return: ", returnSum / 100.0,
                  "Average Steps: ", stepSum / 100.0)
            stats[int(i / 100) - 1, 0] = returnSum / 100.0
            stats[int(i / 100) - 1, 1] = stepSum / 100.0
            returnSum = 0
            stepSum = 0

    if train:
        return qv, stats
    else:
        return qv, gameplay
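
A hypothetical way to drive run_MC (assuming the module-level actions, compress2 and compress_dim2 referenced above are available):

qv, stats = run_MC(train=True)                 # learn the tabular Q-values
_, frames = run_MC(initialQV=qv, train=False)  # replay one greedy episode for inspection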
Example #10
import environment
from environment import *
from mpl_toolkits import mplot3d
import numpy as np
import matplotlib.pyplot as plt

qfunc = {}
envir = environment()
N = 100


def monteControl(s):
    qfunc.setdefault((s.dealer_first, s.sum_player, 0), {
        'value': 0,
        'count': 0
    })
    qfunc.setdefault((s.dealer_first, s.sum_player, 1), {
        'value': 0,
        'count': 0
    })

    if qfunc[(s.dealer_first, s.sum_player,
              0)]['value'] > qfunc[(s.dealer_first, s.sum_player, 1)]['value']:
        maxi = 0
    elif qfunc[(s.dealer_first, s.sum_player, 0)]['value'] < qfunc[(
            s.dealer_first, s.sum_player, 1)]['value']:
        maxi = 1
    else:
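        # Both action values are equal: break the tie at random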
        maxi = np.random.choice([0, 1], p=[0.5, 0.5])

    other = np.random.choice([0, 1], p=[0.5, 0.5])
Example #11
def sarsa( num_episodes, time_steps, max_negative_reward, actions, discount_factor, num_agents, threshold ):

	"""
	1. actions: 
			0 -> Do not warn the driver
			1 -> Warn the driver
	2. no_features:
		Around 40
	"""
	# Correct
	## Changing this parameter: the number of agents considered is 5.
	## Only the nearest 5 agents are considered for feature extraction.

	no_features  = num_agents*4*len(actions)
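	# 4 features per tracked agent, replicated per action (e.g. 5 agents x 4 x 2 actions = 40 features)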

	returns_episodes = []
	weights = np.zeros(( no_features ))
	epsilon = 0.8

	# Correct
	for i in range(num_episodes):

		if(i%100==0):
			print ( "episode: ",i)	

		# Correct
		car    =  Automobile ( -100, 0, 1, 0)
		env    =  environment( num_agents )
		agents = env.get_agents()


		if(i%100==0):
			epsilon = epsilon/2

		agent_warning = warning( weights, no_features, num_agents,  car, env, epsilon )

		agent_reward = 0

		terminate = False
		count = 0

		while( not terminate and agent_reward > max_negative_reward and count < time_steps ):

			# print "Count: ",count
			count += 1
			"""
			Can update the feature vector here for the warning agent. 
			"""
			# This step is done. 
			env.take_one_step()

			agents_list = env.get_agents()

			# 2. 
			car_action  = car.get_action( threshold , agents_list )
			## Till here everything is correct. The environment is working perfectly. 

			if( car_action == 1 ):
				terminate = True

			if( not car.goal):
				car.action(car_action)
				agent_Rew     = agent_warning.update()
				agent_reward += agent_Rew
			else:
				break

			points = []
			points.append( [car.posX, car.posY])

			for j in range(len(agents)):
				points.append([agents[j].posX, agents[j].posY])

			points = np.array(points)

			# plt.scatter(points[ 0, 0],points[ 0, 1], color='green', linewidths = 3)
			# plt.scatter(points[ 1:, 0], points[ 1:, 1], color = 'blue', linewidths = 3)
			# plt.title("Environment. Green - Car with the warning agent, Blue - Other agents(Cars)")
			# plt.xlim( -100, 100)
			# plt.ylim( -100, 100)
			# plt.show()


		returns_episodes.append(agent_reward)
		weights = agent_warning.weights

		all_actions = agent_warning.all_actions

		# print "Number of 0's: ",all_actions.count(0)
		# print "Number of 1's: ",all_actions.count(1)
		# plt.hist(all_actions)
		# plt.ylim(0,200)
		# plt.show()

	return returns_episodes
Example #12
    parser.add_argument('--num_hidden_layers', default=2, type=int)
    parser.add_argument('--num_hidden_units_per_layer', default=256, type=int)
    parser.add_argument('--sample_frequency', default=256, type=int)
    parser.add_argument('--activation', default='Relu', type=str)
    parser.add_argument('--render', default=False, type=bool)  # show UI or not
    parser.add_argument('--log_interval', default=50, type=int)  #
    parser.add_argument('--load', default=False, type=bool)  # load model

    args = parser.parse_args()

    print(1)
    pygame.init()
    print(2)
    pygame.font.init()
    print(3)
    env = environment(traj_num=1)

    action_dim = 2
    state = env.getState()
    state_dim = len(state)
    print('action_dimension:', action_dim, ' & state_dimension:', state_dim)

    destinationFlag = False
    collisionFlag = False
    awayFlag = False
    carla_startFlag = False

    agent = SACAgent(state_dim=state_dim, action_dim=action_dim)

    if args.load: agent.load(epoch=60, capacity=1)
Example #13
assert (numIndividuals > 0) and (numTimesteps > 0) and (numNearestNeighbours > 0) and (numIndividuals > numNearestNeighbours), \
    "invalid arguments: numTimesteps={}!>0, numIndividuals={}!>numNearestNeighbours={}!>0".format(numTimesteps, numIndividuals, numNearestNeighbours)

### Define Korali Problem
import korali
k = korali.Engine()
e = korali.Experiment()

### Define results folder and load previous results, if any
resultFolder = '_result_vracer/'
found = e.loadState(resultFolder + '/latest')
if found:
	print("[Korali] Continuing execution from previous run...\n")

### Define Problem Configuration
e["Problem"]["Type"] = "Reinforcement Learning / Continuous"
e["Problem"]["Environment Function"] = lambda x : environment( args, x )
e["Problem"]["Agents Per Environment"] = numIndividuals

### Define Agent Configuration 
e["Solver"]["Type"] = "Agent / Continuous / VRACER"
e["Solver"]["Mode"] = "Training"
e["Solver"]["Episodes Per Generation"] = 10
e["Solver"]["Experiences Between Policy Updates"] = 1
e["Solver"]["Learning Rate"] = 0.0001
e["Solver"]["Discount Factor"] = 0.995
e["Solver"]["Mini Batch"]["Size"] = 256

### Define Variables
# States (distance and angle to nearest neighbours)
for i in range(numNearestNeighbours):
  e["Variables"][i]["Name"] = "Distance " + str(i)
Example #14
def experiment(variant):
    args = getArgs()
    expl_env = NormalizedBoxEnv(environment(args, 'sac'))
    eval_env = NormalizedBoxEnv(environment(args, 'sac'))
    obs_dim = expl_env.observation_space.low.size  #
    action_dim = expl_env.action_space.low.size  #

    M = variant['layer_size']
    qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf1 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_qf2 = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )

    trainedfile = '/home/yujr/rlkit/data/SAC/3.0/params.pkl'
    data = torch.load(trainedfile)
    print("data loaded", data['evaluation/policy'])
    policy = data['evaluation/policy'].stochastic_policy

    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(env=eval_env,
                         policy=policy,
                         qf1=qf1,
                         qf2=qf2,
                         target_qf1=target_qf1,
                         target_qf2=target_qf2,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
Example #15
#? 4.

player = agent(2, "player")
scorer = {}

#* Player gets "batch_size" tries with the same parameters.
#* After "batch_size" tries, the environment is reset and the agent is updated.
for _ in range(episodes):
    #* Initial parameters
    player_distance_from_hoop = np.random.randint(
        100, hoop_pole_position[0] - hoop_width - player_width)
    player_position = (hoop_pole_position[0] - player_distance_from_hoop,
                       screen_height - ground_thickness - player_height)

    for count in range(batch_size):
        env = environment(player_position, player_distance_from_hoop)
        (power, angle) = (player.predict([player_distance_from_hoop])[0][0],
                          player.predict([player_distance_from_hoop])[0][1])
        original_power = power
        original_angle = angle
        power *= 2000
        angle *= 90
        score = env.throw(power, angle)
        print("Power: ", power, "Angle: ", angle, "Score: ", score)
        scorer[(player_distance_from_hoop, original_power,
                original_angle)] = score

    for index, value in enumerate(scorer.values()):
        if value > 0:
            for _ in range(data_loop):
                player.train(list(scorer.keys())[index])
Example #16

if __name__ == "__main__":
    # Initialization of the connector to the V-REP simulator
    clientID = vrep.simxStart("127.0.0.1", 19997, 1, 1, 2000, 5)
    res, objs = vrep.simxGetObjects(clientID, vrep.sim_handle_all,
                                    vrep.simx_opmode_blocking)
    if clientID > -1:
        print("Connect to Remote API server!")
    else:
        print('Failed connecting to remote API server')
        sys.exit()
    #Initializing the Robot information
    #Initializing the Learning Agent for DQN
    env = environment(
        10,
        10)  #Block number of linear velocity and the grad of angular velocity
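    # The discrete action space is the Cartesian product of linear-velocity bins and angular-velocity bins.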
    action_num = env.vStateNum * env.aStateNum
    states_num = len(env.getState())
    print(action_num, ' --- ', states_num)
    agent = DQNAgent(states_num, action_num)
    # Start Training
    done = False
    batch_size = 32
    for e in range(EPISODES):
        print("------------------------->  ", e)
        print(agent.epsilon)
        env.reset(clientID)
        env.setCtrl(INIT_CORR_NUM)
        time.sleep(1)
        # Collecting the status information of  mobile robot
Example #17
from environment import *
import matplotlib.pyplot as plt
import numpy as np
from robot import *
from bicycleRobot import *

world = environment()
world.print()
feature = [[1, ii] for ii in range(world.length - 1)]
world.addFeature(feature)
world.addGoal([9, 7])
rb = robot(0, 0, 0, world)
world.addRobot(rb)
world.print()
world.moveRobot([1, 1], 0)
world.moveRobot([0, 1], 0)
world.prettyPrint()
hueristic = []
world.prettyPrint(hueristic)
for y in range(world.length):
    hueristic.append([])
    for x in range(world.width):
        hueristic[y].append(
            math.sqrt((world.goalX - x)**2 + (world.goalY - y)**2))
pathPlan = world.planAStar(hueristic)
world.prettyPrint(pathPlan)

#Test DP algorithm on left turn scenario world
costFxn = [1, 1, 1, 10]
driveWorld = environment(6, 6, 1)
driveWorld.addGoal([0, 3], 270)
Example #18
def run_QL(initialQV=None, train=True, random=False):
    epochs = 100000 if train else 1
    epsilon = 1. if train else 0  # E-greedy
    gamma = 0.1
    alpha = 0.1

    maze_size = 15
    walls = [(3, 4), (3, 5), (3, 6)]
    dim = compress_dim2()
    qv = initialQV if initialQV is not None else np.zeros(
        dim)  #This creates our Q-value look-up table
    returnSum = 0
    stepSum = 0
    gameplay = []
    stats = np.zeros((int(epochs / 100), 2))

    for i in range(epochs):
        env = environment.random(maze_size,
                                 walls=walls) if random else environment(
                                     maze_size=maze_size, walls=walls)
        state = env.state
        ended = False
        max_epoch_length = 100

        while not ended and (not random or max_epoch_length > 0):
            if not train:
                gameplay.append(env.maze_string())

            d = compress2(env, state)

            # E-greedy policy
            if (np.random.random() < epsilon
                    or np.count_nonzero(qv[d[0], d[1], d[2], d[3], :]) == 0):
                act = np.random.randint(0, len(actions))
            else:
                act = np.argmax(qv[d[0], d[1], d[2],
                                   d[3], :])  #select the best action

            d = tuple(list(d) +
                      [act])  #append the chosen action to the decision

            state_new, reward, ended = env.step(act)

            # Standard Q-learning target: reward + gamma * max_a Q(next_state, a)
            d_new = compress2(env, state_new)
            q_next = 0 if ended else np.max(qv[d_new[0], d_new[1], d_new[2], d_new[3], :])
            qv[d] += alpha * (reward + gamma * q_next - qv[d])
            state = state_new
            returnSum += reward
            stepSum += 1
            max_epoch_length -= 1

        epsilon = epsilon * 0.9999
        if (i % 100 == 0 and i > 0):
            print("Episode: ", i, "Average Return: ", returnSum / 100.0,
                  "Average Steps: ", stepSum / 100.0)
            stats[int(i / 100) - 1, 0] = returnSum / 100.0
            stats[int(i / 100) - 1, 1] = stepSum / 100.0
            returnSum = 0
            stepSum = 0

    if train:
        return qv, stats
    else:
        return qv, gameplay