Example #1
class Test:
    def __init__(self):
        self.sample_batch_size = 32
        self.episodes = 10
        self.env = PivitEnv(6)

        self.state_size = 6  #self.env.observation_space.shape[0]
        self.action_size = 6  #self.env.action_space.n
        self.dqnagent = DQNAgent(self.state_size)
        self.minimax = Bot()
Example #2
def selectAgentIndex(index, functionAproxModel):
    iToA = {0: rAgent.rAgent(),
            1: TDLA.TDLambdaAgent(functionAproxModel),
            2: TDLAN.TDLambdaAgent(functionAproxModel),
            3: DQN.DQNAgent(functionAproxModel)}

    return iToA[index]
Example #3
def setup_env_agent(display_screen, frame_skip, force_fps, reward_shaping,
                    frame_stack, train):
    game = FlappyBird()
    ple_flappy = PLE(game,
                     fps=30,
                     display_screen=display_screen,
                     frame_skip=frame_skip,
                     force_fps=force_fps)
    if reward_shaping and train:
        z = ple_flappy.game.rewards
        z['tick'] = 0.1
        ple_flappy.game.adjustRewards(z)
    ple_flappy.init()
    agent = DQNAgent(ple_flappy.getActionSet(), frame_stack=frame_stack)

    return ple_flappy, agent
Example #4
def run_experiment(args):
    parameters = Parameters.processArguments(args, __doc__)

    #If nnFile is a directory, check for a previous experiment run in it and resume from there:
    #load its parameters, append to its results file, and open its highest-numbered network file.
    #If nnFile is None, create an experiment directory, create a results file, and save the parameters and network files there.

    experimentDirectory = parameters.rom + "_" + time.strftime(
        "%d-%m-%Y-%H-%M") + "/"
    resultsFileName = experimentDirectory + "results.csv"
    startingEpoch = 1
    if parameters.nnFile is None or parameters.nnFile.endswith(".pkl"):
        #Create your experiment directory, results file, save parameters
        if not os.path.isdir(experimentDirectory):
            os.mkdir(experimentDirectory)

        resultsFile = open(resultsFileName, "a")
        resultsFile.write("Epoch,\tAverageReward,\tMean Q Value\n")
        resultsFile.close()

        parametersFile = open(experimentDirectory + "parameters.pkl", 'wb', -1)
        cPickle.dump(parameters, parametersFile)
        parametersFile.close()

    if parameters.nnFile is not None and os.path.isdir(parameters.nnFile):
        #Found an experiment directory
        if not parameters.nnFile.endswith("/"):
            parameters.nnFile += "/"

        experimentDirectory = parameters.nnFile
        resultsFileName = experimentDirectory + "results.csv"

        if os.path.exists(experimentDirectory + "parameters.pkl"):
            parametersFile = open(experimentDirectory + "parameters.pkl", 'rb')
            parameters = cPickle.load(parametersFile)
            parametersFile.close()
        else:
            parametersFile = open(experimentDirectory + "parameters.pkl", 'wb',
                                  -1)
            cPickle.dump(parameters, parametersFile)
            parametersFile.close()

        contents = os.listdir(experimentDirectory)
        networkFiles = []
        for handle in contents:
            if handle.startswith("network") and handle.endswith(".pkl"):
                networkFiles.append(handle)

        if len(networkFiles) == 0:
            #Found a premature experiment that didn't finish a single training epoch
            parameters.nnFile = None
        else:
            #Found a previous experiment's network files, now find the highest epoch number
            highestNNFile = networkFiles[0]
            highestNetworkEpochNumber = int(
                highestNNFile[highestNNFile.index("_") +
                              1:highestNNFile.index(".")])
            for networkFile in networkFiles:
                networkEpochNumber = int(networkFile[networkFile.index("_") +
                                                     1:networkFile.index(".")])
                if networkEpochNumber > highestNetworkEpochNumber:
                    highestNNFile = networkFile
                    highestNetworkEpochNumber = networkEpochNumber

            startingEpoch = highestNetworkEpochNumber + 1
            #Don't use full exploration; it's not a good way to fill the replay memory when we already have a decent policy
            if startingEpoch > 1:
                parameters.epsilonStart = parameters.epsilonEnd

            parameters.nnFile = experimentDirectory + highestNNFile
            print "Loaded experiment: " + experimentDirectory + "\nLoaded network file:" + highestNNFile

    sys.setrecursionlimit(10000)
    ale = ALEInterface()

    Environment.initializeALEParameters(ale, parameters.seed,
                                        parameters.frameSkip,
                                        parameters.repeatActionProbability,
                                        parameters.displayScreen)
    ale.loadROM(parameters.fullRomPath)
    minimalActions = ale.getMinimalActionSet()

    agent = DQNAgent.DQNAgent(
        minimalActions, parameters.croppedHeight, parameters.croppedWidth,
        parameters.batchSize, parameters.phiLength, parameters.nnFile,
        parameters.loadWeightsFlipped, parameters.updateFrequency,
        parameters.replayMemorySize, parameters.replayStartSize,
        parameters.networkType, parameters.updateRule,
        parameters.batchAccumulator, parameters.networkUpdateDelay,
        parameters.discountRate, parameters.learningRate, parameters.rmsRho,
        parameters.rmsEpsilon, parameters.momentum, parameters.epsilonStart,
        parameters.epsilonEnd, parameters.epsilonDecaySteps,
        parameters.evalEpsilon, parameters.useSARSAUpdate,
        parameters.kReturnLength)

    for epoch in xrange(startingEpoch, parameters.epochs + 1):
        agent.startTrainingEpoch(epoch)
        runTrainingEpoch(ale, agent, epoch, parameters.stepsPerEpoch)
        agent.endTrainingEpoch(epoch)

        networkFileName = experimentDirectory + "network_" + str(
            epoch) + ".pkl"
        DeepNetworks.saveNetworkParams(agent.network.qValueNetwork,
                                       networkFileName)

        if parameters.stepsPerTest > 0 and epoch % parameters.evaluationFrequency == 0:
            agent.startEvaluationEpoch(epoch)
            avgReward = runEvaluationEpoch(ale, agent, epoch,
                                           parameters.stepsPerTest)
            holdoutQVals = agent.computeHoldoutQValues(3200)

            resultsFile = open(resultsFileName, 'a')
            resultsFile.write(
                str(epoch) + ",\t" + str(round(avgReward, 4)) + ",\t\t" +
                str(round(holdoutQVals, 4)) + "\n")
            resultsFile.close()

            agent.endEvaluationEpoch(epoch)

    agent.agentCleanup()
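Side note (not part of the collected example): the resume logic above finds the latest checkpoint by slicing each file name between "_" and ".". A minimal, self-contained sketch of the same idea, with an illustrative helper name:

import os
import re

def latest_network_file(directory):
    """Return (epoch, filename) for the highest-numbered network_<epoch>.pkl, or None."""
    pattern = re.compile(r"^network_(\d+)\.pkl$")
    candidates = [(int(m.group(1)), name)
                  for name in os.listdir(directory)
                  for m in [pattern.match(name)] if m]
    return max(candidates) if candidates else None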
Example #5
def main():
    # reward every episode
    waiting_time_plot = []
    total_reward_plot = []
    episode_plot = []
    E_reward = np.load('array_plot/array_total_reward_fix_10000_40.npy')[0]
    version = 0
    E_reward_33 = np.load('array_plot/array_total_reward_fix_10000_33.npy')[0]
    array_plot_reward_40 = []
    array_plot_reward_33 = []
    print('E_reward: ', str(E_reward))
    # Control code here
    memory_size = constants.memory_size  # replay memory size
    mini_batch_size = constants.mini_batch_size  # minibatch size
    a_dec = constants.a_dec  # m/s^2
    num_of_phase = constants.num_of_phase  # 2 phases
    action_space_size = num_of_phase * 2 + 1  # 5 actions
    action_policy = constants.action_policy
    tentative_action = [
        np.asarray([1, 1, 1, 1, 1]).reshape(1, action_space_size),
        np.asarray([1, 1, 0, 0, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 1, 0, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 0, 1, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 0, 0, 1]).reshape(1, action_space_size)
    ]

    # global count_action_dif_default
    I = np.full((action_space_size, action_space_size),
                0.5).reshape(1, action_space_size, action_space_size)
    idLightControl = constants.idLightControl

    numb_of_cycle = 0

    # new Agent.
    agent = DQNAgent.DQNAgent(memory_size, action_space_size, mini_batch_size)
    try:
        agent.load('Models/reinf_traf_control_v14_loss_real_time.h5')
    except Exception:
        print('No models found')
    # agent.start_epsilon = 0
    # new Sumo Intersection
    sumo_int = SumoIntersection.SumoIntersection()

    # 2000 episodes
    episodes = 2000

    # command to run SUMO
    sumo_cmd = [sumoBinary, "-c", sumoConfig, '--no-warnings']

    # run 2000 episodes
    for e in range(episodes):
        waiting_time_t = 0
        total_reward = 0
        waiting_time = 0
        waiting_time_t_v2 = 0
        waiting_time_average = []
        # start sumo simulation.
        traci.start(sumo_cmd)

        # init action.
        action = 0

        # time for each phase
        action_time = [33, 33]

        state, tentative_act_dec = sumo_int.getState(I, action,
                                                     tentative_action)

        # run a cycle.
        while (traci.simulation.getMinExpectedNumber() > 0):

            # run a step on SUMO (~ 1 second).
            traci.simulationStep()

            # Update the agent's training progress (e.g. 'Training') and select the next action.
            agent.progress = agent.get_progress()
            action = agent.select_action_v2(state, tentative_act_dec)

            #  ============================================================ Perform action ======================
            for j in range(num_of_phase):
                action_time[j] += action_policy[action][j]
                if action_time[j] < 0:
                    action_time[j] = 0
                elif action_time[j] > 60:
                    action_time[j] = 60
            for j in range(action_time[0]):
                traci.trafficlight.setPhase(idLightControl, 0)
                traci.simulationStep()
                waiting_time_average.append(cal_waiting_time_average())
                waiting_time += cal_waiting_time_v2()
            yellow_time1 = sumo_int.cal_yellow_phase(['gneE21', 'gneE89'],
                                                     a_dec)
            for j in range(yellow_time1):
                traci.trafficlight.setPhase(idLightControl, 1)
                traci.simulationStep()
                waiting_time_average.append(cal_waiting_time_average())
                waiting_time += cal_waiting_time_v2()
            for j in range(action_time[1]):
                traci.trafficlight.setPhase(idLightControl, 2)
                traci.simulationStep()
                waiting_time_average.append(cal_waiting_time_average())
                waiting_time += cal_waiting_time_v2()
            yellow_time2 = sumo_int.cal_yellow_phase(['gneE86', 'gneE85'],
                                                     a_dec)
            for j in range(yellow_time2):
                traci.trafficlight.setPhase(idLightControl, 3)
                traci.simulationStep()
                waiting_time_average.append(cal_waiting_time_average())
                waiting_time += cal_waiting_time_v2()
            #  ============================================================ Finish action ======================:

            # calculate REWARD V2
            waiting_time_t1_v2 = waiting_time
            reward_t_v2 = waiting_time_t_v2 - waiting_time_t1_v2
            waiting_time_t_v2 = waiting_time_t1_v2
            total_reward += reward_t_v2

            # calculate REWARD
            waiting_time_t1 = cal_waiting_time()
            reward_t = waiting_time_t - waiting_time_t1
            waiting_time_t = waiting_time_t1

            # get NewState by selected-action
            new_state, tentative_act_dec = sumo_int.getState(
                I, action, tentative_action)

            # Case 1: Experience Replay (store tuple) + store TD_error
            agent.store_priority(state, action, reward_t, new_state)

            # Case 2: stored EXP/Tuple
            # agent.remember(state, action, reward_t, new_state, False)

            # reassign
            state = new_state
            numb_of_cycle += 1
            agent.step += 1
            print('------------------------- step: ', numb_of_cycle,
                  ' - total_reward: ', total_reward, ' - action time:',
                  action_time, ' --------------------')

            if agent.progress == 'Training':
                # step 1: if agent.step % 100 == 0 then update weights of target_network.
                # ......... thinking ....................

                # step 2: get mini_batch?
                # minibatch, w_batch, batch_index  = agent.get_prioritized_minibatch()

                # step 3: train.
                agent.replay_priority()
                # agent.replay_random_sample()

                # step 4: update epsilon:
                agent.start_epsilon -= agent.epsilon_decay

        agent.save('Models/reinf_traf_control_v14_loss_real_time.h5')
        traci.close(wait=False)

        if (E_reward < total_reward):
            version += 1
            agent.save('Models_max/reinf_traf_control_v17_reward_max_v' +
                       str(version) + '_e_' + str(e) + '.h5')

        average_waiting_time = (-total_reward) / constants.count_vehicle
        waiting_time_plot.append(average_waiting_time)
        total_reward_plot.append(total_reward)
        array_plot_reward_40.append(E_reward)
        array_plot_reward_33.append(E_reward_33)
        episode_plot.append(e)
        np.save('array_plot/array_waiting_time_average.npy', waiting_time_plot)
        np.save('array_plot/array_total_reward.npy', total_reward_plot)
        np.save('array_plot/array_episode.npy', episode_plot)
        plot_durations(total_reward_plot, array_plot_reward_40,
                       array_plot_reward_33)

    plt.ioff()
    plt.show()
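Side note (not part of the collected example): both reward signals above are differences between consecutive waiting-time measurements, so the agent is rewarded when waiting time decreases. A minimal, self-contained sketch of that bookkeeping, with made-up names:

class WaitingTimeReward:
    """Reward = previous waiting-time measurement minus the new one."""

    def __init__(self):
        self.previous = 0.0

    def __call__(self, new_measurement):
        reward = self.previous - new_measurement
        self.previous = new_measurement
        return reward

# Waiting time goes 0 -> 12 -> 9 seconds, so the rewards are -12 and then +3.
r = WaitingTimeReward()
assert r(12.0) == -12.0 and r(9.0) == 3.0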
Example #6
class Test:
    def __init__(self):
        self.sample_batch_size = 32
        self.episodes = 10
        self.env = PivitEnv(6)

        self.state_size = 6  #self.env.observation_space.shape[0]
        self.action_size = 6  #self.env.action_space.n
        self.dqnagent = DQNAgent(self.state_size)
        self.minimax = Bot()

    def testDQN(self):
        """Testar DQN jogando contra si"""
        dqn_points = 0
        for index_episode in range(self.episodes):
            state = self.env.reset()
            #state = np.reshape(state, [1, self.state_size])
            done = False
            index = 0
            while not done:
                action = self.dqnagent.act(state, index % 2)
                if action is None:
                    break
                next_state, reward, done, _ = self.env.step(action)
                if reward >= 0:
                    print("DQNAgent made a valid move")
                    reward = 1
                self.dqnagent.memorize(change_colors(state), action, reward,
                                       change_colors(next_state), done)
                dqn_points += reward
                state = next_state
                index += 1
                if index == 100:
                    break
        self.dqnagent.replay(self.sample_batch_size)
        return dqn_points

    def testMinimaxDQN(self):
        """Testar DQN jogando contra MinMax"""
        dqn_points = 0
        minmax_points = 0
        for index_episode in range(self.episodes):
            state = self.env.reset()
            #state = np.reshape(state, [1, self.state_size])
            done = False
            index = 0
            while not done:
                action = self.minimax.act(box_to_board(state), 1, 0)
                if action is None:
                    break
                next_state, reward, done, _ = self.env.step(action)
                self.dqnagent.memorize(state, action, reward + 1, next_state,
                                       done)
                minmax_points += reward
                state = next_state
                action = self.dqnagent.act(state, 1)
                if action is None:
                    break
                next_state, reward, done, _ = self.env.step(action)
                if reward >= 0:
                    print("DQNAgent made a valid move")
                    reward = 1
                self.dqnagent.memorize(change_colors(state), action, reward,
                                       change_colors(next_state), done)
                dqn_points += reward
                state = next_state

                index += 1
                if index == 100:
                    break
        self.dqnagent.replay(self.sample_batch_size)
        return minmax_points, dqn_points

    def testMinimax(self):
        try:
            for index_episode in range(self.episodes):
                state = self.env.reset()
                done = False
                index = 0
                while not done:
                    action = self.minimax.act(box_to_board(state), 1,
                                              index % 2)
                    if action is None:
                        break
                    next_state, reward, done, _ = self.env.step(action)

                    self.dqnagent.memorize(state, action, reward, next_state,
                                           done)
                    state = next_state
                    index += 1
                    if index == 100:
                        break
                print("Episode", index_episode, "Number of moves:", index + 1)
                self.dqnagent.replay(self.sample_batch_size)
        finally:
            self.dqnagent.save_model()
Example #7
import DQNAgent
import torch

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
screen_width = 256
screen_height = 240
num_episodes = 50

agent = DQNAgent.DQNAgent(BATCH_SIZE, GAMMA, EPS_START, EPS_END, EPS_DECAY,
                          TARGET_UPDATE, num_episodes, screen_height,
                          screen_width)
agent.train()

tens = torch.randn((1, 1, 240, 256))
#print(tens)

replace = torch.ones(240, 256)

tens[0][0] = replace

#print(tens[0][0])
#print(tens)
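Side note (not part of the collected example): the chained assignment tens[0][0] = replace works because tens[0] is a view of the original tensor; tens[0, 0] = replace is the more common single-step spelling. A quick check, assuming only PyTorch:

import torch

tens = torch.randn((1, 1, 240, 256))
replace = torch.ones(240, 256)

tens[0, 0] = replace                     # writes the (240, 256) slice in place
assert torch.equal(tens[0][0], replace)  # chained indexing sees the same data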
Example #8
if __name__ == "__main__":
    # Initialization of the connection to the V-REP simulator
    clientID = vrep.simxStart("127.0.0.1", 19997, 1, 1, 2000, 5)
    res, objs = vrep.simxGetObjects(clientID, vrep.sim_handle_all,
                                    vrep.simx_opmode_blocking)
    if clientID > -1:
        print("Connect to Remote API server!")
    else:
        print('Failed connecting to remote API server')
        sys.exit()

    env = environment(10, 10)
    action_num = env.vStateNum * env.aStateNum
    states_num = len(env.getState())
    print(action_num, ' --- ', states_num)
    agent = DQNAgent(states_num, action_num)
    agent.load("./save/dqn_mTT_121.h5")
    done = False

    env.reset(clientID)
    env.setCtrl(INIT_CORR_NUM)
    time.sleep(0.1)
    # Collect the status information of the mobile robot and
    # produce the action ID used to control the robot
    tState = env.getState()
    tState = np.reshape(tState, [1, states_num])
    action = agent.act(tState)
    for tt in range(TEST_STEP):
        print("Action --> ", action)
        next_state, reward, done = env.step(action)
        tState = np.reshape(next_state, [1, states_num])
        action = agent.act(tState)  # re-select the action from the new state
Example #9
def main():

    # start time of the program
    start_time = time.time()

    # pixel/frame data
    env = gym.make(hp['GAME'])

    # set an environment seed
    env.seed(0)
    np.random.seed(0)

    # 4 actions
    # 0: no-op 1: fire 2: right 3: left
    # -> 0: fire (no-op) 1: right 2: left
    action_space = env.action_space.n - 1

    # returns a tuple, (210, 160, 3)
    input_space = env.observation_space.shape[0]

    # create a new 3 dimensional space for a downscaled grayscale image
    agent_input_space = np.array(
        [hp['HEIGHT'], hp['WIDTH'], hp['FRAME_BATCH_SIZE']])

    if hp['DISCRETE_FRAMING']:
        # create a new 3 dimensional space for a downscaled grayscale image, default: (64, 64, 4)
        # uses two discrete memory history frames
        memory_input_space = np.array(
            [hp['HEIGHT'], hp['WIDTH'], hp['FRAME_BATCH_SIZE']])
    else:
        # create a new 3 dimensional space for a downscaled grayscale image, default: (64, 64, 5)
        # uses a sliding memory history frames
        memory_input_space = np.array(
            [hp['HEIGHT'], hp['WIDTH'], hp['FRAME_BATCH_SIZE'] + 1])

    # print the initial state
    print('AGENT FRAME input:', agent_input_space.shape,
          'DISCRETE FRAME SAVING:', hp['DISCRETE_FRAMING'], 'MEMORY input:',
          memory_input_space.shape, 'ACTION output:', action_space)

    # performance
    stats = []

    # create a DQN Agent
    agent = DQNAgent(agent_input_space, action_space)

    # and a target DQN Agent
    target_agent = DQNAgent(agent_input_space, action_space)

    # to load weights
    if (hp['LOAD_WEIGHTS']):
        agent.load_weights(hp['LOAD_WEIGHTS'])

    # create a memory for remembering and replay
    memory = ReplayMemory(hp['MEMORY_SIZE'], memory_input_space, action_space)
    """
    Run the main loop of the game
    """
    if hp['DISCRETE_FRAMING']:
        run_discrete(agent, target_agent, memory, env, stats, start_time)

    else:
        run_frame_sliding(agent, target_agent, memory, env, stats, start_time)

    # end time of the program
    end_time = time.time()

    # total time in seconds
    time_elapsed = end_time - start_time

    # print the final time elapsed
    print('finished training in', time_elapsed, 'seconds')

    # save and quit
    agent.quit(stats)
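Side note (not part of the collected example): the comments above distinguish a discrete frame history from a sliding one. A minimal, self-contained sketch of the sliding variant (sizes are illustrative) keeps the last FRAME_BATCH_SIZE grayscale frames in a deque and stacks them along the channel axis:

import numpy as np
from collections import deque

HEIGHT, WIDTH, FRAME_BATCH_SIZE = 64, 64, 4

# Start from an all-zero history so the stack always has a fixed depth.
frames = deque([np.zeros((HEIGHT, WIDTH), dtype=np.float32)] * FRAME_BATCH_SIZE,
               maxlen=FRAME_BATCH_SIZE)

def push_frame(frame):
    """Append the newest frame and return a (HEIGHT, WIDTH, FRAME_BATCH_SIZE) stack."""
    frames.append(frame)
    return np.stack(frames, axis=-1)

stacked = push_frame(np.ones((HEIGHT, WIDTH), dtype=np.float32))
assert stacked.shape == (HEIGHT, WIDTH, FRAME_BATCH_SIZE)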
Example #10
import distutils
from distutils import util
import sys
import argparse
import snakeClass
import DQNAgent
import json
import config

if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument("--humanplay",
                        nargs='?',
                        type=distutils.util.strtobool,
                        default=False)
    parser.add_argument("--speed", nargs='?', type=int, default=config.SPEED)
    #    parser.add_argument("--help", nargs='?', type=distutils.util.strtobool, default=False)
    args = parser.parse_args()
    config.SPEED = args.speed
    config.LoadScores()

    game = snakeClass.SnakeGameAI()

    if args.humanplay:
        game.humanGame()
    else:
        DQNAgent.train(game)
Example #11
SHOW_PREVIEW = False

# For stats
ep_rewards = [-200]

# For more reproducible results
# random.seed(1)
# np.random.seed(1)
# tf.random.set_seed(1)

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')

env = SnakeEnv()
agent = DQNAgent(env)
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode
    episode_reward = 0
    step = 1
    current_state = env.reset()

    done = False
    while not done:
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)
        episode_reward += reward
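Side note (not part of the collected example): the action selection above is plain epsilon-greedy, i.e. take the arg-max of the predicted Q-values with probability 1 - epsilon and a random action otherwise. A stand-alone sketch using only NumPy:

import numpy as np

def epsilon_greedy(q_values, epsilon):
    """Return the greedy action with probability 1 - epsilon, else a uniform random one."""
    if np.random.random() > epsilon:
        return int(np.argmax(q_values))
    return np.random.randint(len(q_values))

action = epsilon_greedy(np.array([0.1, 0.7, 0.2]), epsilon=0.05)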
Example #12
res, objs = vrep.simxGetObjects(clientID, vrep.sim_handle_all,
                                vrep.simx_opmode_blocking)
if clientID > -1:
    print("Connect to Remote API server!")
else:
    print('Failed connecting to remote API server')
    sys.exit()
#Initializing the Robot information
#Initializing the Learning Agent for DQN
env = environment(
    10,
    10)  # number of bins for linear velocity and angular velocity
action_num = env.vStateNum * env.aStateNum
states_num = len(env.getState())
print(action_num, ' --- ', states_num)
agent = DQNAgent(states_num, action_num)
# Start Training
done = False
batch_size = 32
for e in range(EPISODES):
    print("------------------------->  ", e)
    print(agent.epsilon)
    env.reset(clientID)
    env.setCtrl(INIT_CORR_NUM)
    time.sleep(1)
    # Collect the status information of the mobile robot
    tState = env.getState()
    tState = np.reshape(tState, [1, states_num])
    # Produce the action ID for the current robot state
    action = agent.act(tState)
    for tt in range(TRAIN_STEP):
Example #13
paddle_speed = window_height / 105
ball_x = 0.5 * window_width
ball_y = (0.5 * (window_height - ScoreBarHeight)) + ScoreBarHeight
ball_xspeed = window_width / 160
ball_yspeed = random.uniform(-3, 3) * window_height / 210
playerScore = 0
cpuScore = 0
paddle_shift = 0
paddle_shift_rate = 0.6

myFont = pygame.font.SysFont("Courier New", 20, bold=True)

# instantiate the Deep Q Neural Agent
state_size = 8
action_size = 3
agent = DQNAgent(state_size, action_size)

# kinda large
batch_size = 1000

# total rewards throughout the lifetime of the game
total_reward = 0

# how many clocks until exit
epoch = 0
TOTAL_TICKS = 300000

# deque for the mean of the rewards measured in the matches
mean = deque(maxlen=10000)

print('hackmt pong ai: Training Mode', TRAINING)
Example #14
import os
import os.path
import json

setup_dict = {}

if os.path.isfile('config.json'):
    setup_dict = json.loads(open('config.json').read())
setup_dict['observing_frames'] = 25000
setup_dict['replay_memory_size'] = 25000
setup_dict['learning_rate'] = 1e-4
setup_dict['start_eps'] = 0.7
setup_dict['exploring_frames'] = 2000000
setup_dict['saving_dir'] = "DuelingDDQN_AgentPrioritizedForgettingEpsGreedy_2018_06_14"
setup_dict['log_freq'] = 5
setup_dict['MemoryType'] = 'MemoryPrioritizedForgetting'
#setup_dict['MemoryType'] = 'PrioritizedExperienceReplayMemory'
setup_dict['ExplorationStrategy'] = 'EpsilonGreedyExplorationStrategy'
setup_dict['Agent'] = 'Dueling_DDQN_Agent'
setup_dict['update_freq'] = 2
setup_dict['tau'] = 0.005
#agent = DuelingDDQNAgentPER.Dueling_DDQN_PER_Agent(setup_dict)
agent = None
if setup_dict['Agent'] == 'DQN_Agent':
    agent = DQNAgent.DQN_Agent(setup_dict)
if setup_dict['Agent'] == 'Dueling_DDQN_Agent':
    agent = DuelingDDQNAgent.Dueling_DDQN_Agent(setup_dict)
if setup_dict['Agent'] == 'DDQN_Agent':
    agent = DDQNAgent.DDQN_Agent(setup_dict)
agent.train()
Example #15
def trainAgentNEW(agent, nEpisodes, seed, qg, fIndex, initEpisode=1):
    random.seed(seed)
    arpm = RP.AgentMemory(100)
    rpm = RP.ReplayMemoryDQN(1000)

    print(agent)
    if str(agent) == 'TDLambdaAgentNEW':
        envAgent = TDLAN.TDLambdaAgent(functionIndex(fIndex))
        envAgent.lparams = agent.lparams.copy()
    elif str(agent) == 'DQNAgent':
        envAgent = DQN.DQNAgent(functionIndex(fIndex))
        envAgent.lparams = agent.lparams.copy()


    arpm.push(envAgent)
    # print(f'AGENT LPARAMS: {agent.lparams}')
    placementPiece = None
    print(f'TYPE OF AGENT: {agent}')
    eTimeBegin = time.time()
    eTimeEnd = 0
    optimizer = optim.SGD(agent.currentNN.parameters(), lr=agent.lparams['alpha'], momentum=0.9)
    avg_loss = []
    for episode in range(initEpisode, nEpisodes):
        qG = None  # qg.GameBoard()  # REMEMBER to switch back to the randomized initial state!

        while True:
            qG = qg.GameBoard()
            placementPiece = qG.randomInitOfGame()
            if not qG.isDone:
                break

        #print(f'GAME INIT: {torch.stack([qG.boardRep, qG.piecePoolRep, qG.pickedPieceRep])}')

        envAgent = arpm.sample()
        envAgent.setBoard(qG)
        envAgent.lparams['epsilon'] = torch.tensor([0.0])
        agent.setBoard(qG)
        e = agent.lparams['epsilon']
        # print(f'agent EPSILON: {e}')
        # rpm = RP.ReplayMemory(1000)
        #print(f'TYPE OF ADVERSARY: {envAgent}')


        #result = playTrainingGameNEW(agent, envAgent, qG, placementPiece)
        result = playTrainingGameDQN(agent, envAgent, qG, placementPiece, rpm) # DQN TEST


        loss = trainNetworkDQN(agent, qG, rpm, optimizer, batch_size=50)
        if loss is not None:
            avg_loss.append(loss)
        '''
        if result == -1:
            print(f'Game was a draw')
            # break
        elif result == 0:
            print(f'ADVERSARY WON!')
        elif result == 1:
            print(f'{agent} WON!')
        else:
            raise Exception('INVALID RESULT')
        '''

        if str(agent) == 'TDLambdaAgentNEW':
            newAgent = TDLAN.TDLambdaAgent(functionIndex(fIndex))
            newAgent.lparams = agent.lparams.copy()
            newAgent.currentNN.load_state_dict(agent.currentNN.state_dict().copy())
            arpm.push(newAgent)
        elif str(agent) == 'DQNAgent':
            newAgent = DQN.DQNAgent(functionIndex(fIndex))
            newAgent.lparams = agent.lparams.copy()
            newAgent.currentNN.load_state_dict(agent.currentNN.state_dict().copy())
            arpm.push(newAgent)



        #print(f'Lparams: {agent.lparams}')
        if episode % 50 == 0:
            av_loss = torch.mean(torch.stack(avg_loss))
            print(f'avg_loss {av_loss}')
            agent.targetNN.load_state_dict(agent.currentNN.state_dict().copy())
            agent.targetNN.eval()
            print(f'ROUND {episode}')
            eTimeEnd = time.time() - eTimeBegin
            print(f'eTimeEnd: {eTimeEnd}')
            torch.save({
                'eTime': eTimeEnd,
                'target_state_dict': agent.targetNN.state_dict(),
                'current_state_dict': agent.currentNN.state_dict(),
                'episode': episode,
                'avg_loss': av_loss,
                'agent_lparams': agent.lparams.copy(),
                'checkpoint_number': episode / 50}, f"./modelTargets/DQN/un_restricted_1_h/agent{int(episode / 50)}.tar")
            eTimeBegin = time.time()

            avg_loss = []


        if agent.lparams['alpha'] > 0.1:
            agent.lparams['alpha'] *= agent.lparams['alpha_decay']

        if agent.lparams['epsilon'] > 0.1:
            agent.lparams['epsilon'] *= agent.lparams['epsilon_decay']
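Side note (not part of the collected example): every 50 episodes the loop above copies the online network's weights into the target network (a hard update) before checkpointing. A minimal PyTorch sketch of that update, independent of the agent classes used here:

import torch
import torch.nn as nn

online = nn.Linear(4, 2)   # stand-ins for agent.currentNN / agent.targetNN
target = nn.Linear(4, 2)

target.load_state_dict(online.state_dict())  # hard update: copy the weights verbatim
target.eval()

assert all(torch.equal(p, q)
           for p, q in zip(online.state_dict().values(),
                           target.state_dict().values()))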
Example #16
import gym
import numpy as np
import matplotlib.pyplot as plt
from DQNAgent import *

env = gym.make('CartPole-v1')  # initializing the game environment
state_size = env.observation_space.shape[
    0]  # how many inputs the NN should take in
action_size = env.action_space.n  # how many outputs the NN should give out
cartpoleAI = DQNAgent(
    state_size, action_size
)  # generating an agent to play the game with the right amount of inputs and outputs

episodes = 100  # how many episodes to play
batch_size = 32  # how much memory the AI needs to have before it starts learning from past episodes

fin = open('finalproject.txt', 'a')  # opening a file to save good weights
scores = []  # making a list to hold the scores

for e in range(episodes):
    state = env.reset()  # generate a new game state for each episode
    state = np.reshape(state, [1, state_size])  # formatting to 2d array

    done = False  # used to tell when to stop episode
    time = 0  # counts how many timesteps the episode has been running, used for score

    while not done:
        env.render()
        time += 1  # increment time

        action = cartpoleAI.act(state)  # determine which action to take
Example #17
def main():
    log = open('Logs_result/log-model.txt', 'w')
    time_plot = []
    time_for_waiting_time_plot = []
    average_waiting_time_plot = []
    waiting_time_plot = []
    reward_t_plot = []
    time_reward_t_plot = []

    # Control code here
    memory_size = 20000  # replay memory size
    mini_batch_size = 64  # minibatch size
    a_dec = 4.5  # m/s^2
    num_of_phase = 2  # 2 phases
    action_space_size = num_of_phase * 2 + 1  # 5 actions
    action_policy = [[0, 0], [5, 0], [-5, 0], [0, 5], [0, -5]]
    tentative_action = [
        np.asarray([1, 1, 1, 1, 1]).reshape(1, action_space_size),
        np.asarray([1, 1, 0, 0, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 1, 0, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 0, 1, 0]).reshape(1, action_space_size),
        np.asarray([1, 0, 0, 0, 1]).reshape(1, action_space_size)
    ]
    # tentative_action = [np.asarray([1, 1, 1, 1, 1]).reshape(1, action_space_size),
    #                     np.asarray([1, 0, 0, 0, 0]).reshape(1, action_space_size),
    #                     np.asarray([1, 0, 0, 0, 0]).reshape(1, action_space_size),
    #                     np.asarray([1, 0, 0, 0, 0]).reshape(1, action_space_size),
    #                     np.asarray([1, 0, 0, 0, 0]).reshape(1, action_space_size)]

    # global count_action_dif_default
    I = np.full((action_space_size, action_space_size),
                0.5).reshape(1, action_space_size, action_space_size)
    idLightControl = constants.idLightControl
    waiting_time_t = 0
    numb_of_cycle = 0

    # new Agent.
    agent = DQNAgent.DQNAgent(memory_size, action_space_size, mini_batch_size)
    try:
        agent.load('Models_max/reinf_traf_control_v13_random_sample.h5')
    except Exception:
        print('No models found')
    agent.start_epsilon = 0
    # new Sumo Intersection
    sumo_int = SumoIntersection.SumoIntersection()

    # 2000 episodes
    episodes = 2000

    # command to run SUMO
    sumo_cmd = [sumoBinary, "-c", sumoConfig, '--start']

    # run 2000 episodes
    for e in range(episodes):
        # start sumo simulation.
        type = 1
        traci.start(sumo_cmd)

        # init action.
        action = 0

        # time for each phase
        action_time = [33, 33]

        # getState by action.
        state, tentative_act_dec = sumo_int.getState(I, action,
                                                     tentative_action)

        waiting_time = 0
        # run a cycle.
        while traci.simulation.getMinExpectedNumber() > 0:
            traci.simulationStep()

            # get action.
            action = agent.select_action_v2(state, tentative_act_dec)

            #  ============================================================ Perform action ======================:
            for j in range(num_of_phase):
                action_time[j] += action_policy[action][j]
                if action_time[j] < 0:
                    action_time[j] = 0
                elif action_time[j] > 60:
                    action_time[j] = 60
            for j in range(action_time[0]):
                traci.trafficlight.setPhase(idLightControl, 0)
                waiting_time += cal_waiting_time_v2()
                traci.simulationStep()
                time_plot.append(traci.simulation.getTime())
                waiting_time_plot.append(cal_waiting_time())
            yellow_time1 = sumo_int.cal_yellow_phase(['gneE21', 'gneE89'],
                                                     a_dec)
            for j in range(yellow_time1):
                traci.trafficlight.setPhase(idLightControl, 1)
                waiting_time += cal_waiting_time_v2()
                traci.simulationStep()
                time_plot.append(traci.simulation.getTime())
                waiting_time_plot.append(cal_waiting_time())
            for j in range(action_time[1]):
                traci.trafficlight.setPhase(idLightControl, 2)
                waiting_time += cal_waiting_time_v2()
                traci.simulationStep()
                time_plot.append(traci.simulation.getTime())
                waiting_time_plot.append(cal_waiting_time())
            yellow_time2 = sumo_int.cal_yellow_phase(['gneE86', 'gneE85'],
                                                     a_dec)
            for j in range(yellow_time2):
                traci.trafficlight.setPhase(idLightControl, 3)
                waiting_time += cal_waiting_time_v2()
                traci.simulationStep()
                time_plot.append(traci.simulation.getTime())
                waiting_time_plot.append(cal_waiting_time())
            #  ============================================================ Finish action ======================:

            # calculate REWARD
            waiting_time_t1 = waiting_time
            reward_t = waiting_time_t - waiting_time_t1
            reward_t_plot.append(reward_t)
            time_reward_t_plot.append(traci.simulation.getTime())
            waiting_time_t = waiting_time_t1

            # get NewState by selected-action
            new_state, tentative_act_dec = sumo_int.getState(
                I, action, tentative_action)

            # reassign
            state = new_state
            numb_of_cycle += 1
            waiting_time_average = cal_waiting_time_average()
            print('action - ' + '(' + str(action_time[0]) + ',' +
                  str(yellow_time1) + ',' + str(action_time[1]) + ',' +
                  str(yellow_time2) + ')')
            log.write('action - ' + str(numb_of_cycle) +
                      ', total waiting time - ' + str(waiting_time_average) +
                      ', action - ' + '(' + str(action_time[0]) + ',' +
                      str(yellow_time1) + ',' + str(action_time[1]) + ',' +
                      str(yellow_time2) + ')' + ', reward - ' + str(reward_t) +
                      '\n')

        traci.close(wait=False)
        log.close()
        key = ''
        # time_plot # average_waiting_time_plot
        average_waiting_time = waiting_time / constants.count_vehicle[type]
        print('average waiting time', average_waiting_time)
        np.save('array_plot/array_waiting_time_average' + key + '.npy',
                [average_waiting_time])
        np.save('array_plot/array_time' + key + '.npy', time_plot)
        np.save('array_plot/array_waiting_time' + key + '.npy',
                waiting_time_plot)

        np.save('array_plot/reward_t_plot' + key + '.npy', reward_t_plot)
        np.save('array_plot/time_reward_t_plot' + key + '.npy',
                time_reward_t_plot)
        break
Example #18
if __name__ == '__main__':
    main = main()

    if play:
        pg.init()
        window = pg.display.set_mode((30, 30))
    else:
        config = ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 1.0
        config.gpu_options.allow_growth = False
        session = InteractiveSession(config=config)

        input_shape = np.zeros((size[0], size[1]))
        input_shape = np.expand_dims(input_shape, -1)
        DQNA = DQNAgent(input_shape)

    for e in tqdm(range(1, n_episodes + 1), ascii=True, unit='episodes'):
        background = main.background_head()
        main.spawn_snake()
        main.spawn_apple()
        background = main.update(background)

        ep_reward = 0
        done = False
        main.old_snake = []
        step = 0
        while not done:
            state = main.creating_state()
            if play:
                action = main.get_inputs()
Example #19
                zero_idx] + "_dqn/plot_no_" + options_list[
                    zero_idx] + "_dqn.png"
    else:
        PATH = "Plots/" + folder + "/plot_{}".format(folder) + ".png"

    plt.savefig(PATH)
    plt.close()


####################################################################################################
############################################ MAIN FILE #############################################
####################################################################################################

if __name__ == "__main__":

    qnet_agent = DQNAgent.DQN(options,
                              resume_previous_train)  # Define the RL agent

    # Initialize the episode
    frames_total = 0
    done = 0
    action = 0
    counter = 0

    gui = 0  # Set to 1 if you would like to see the episode steps in the SUMO graphical interface

    # Determine the folder to save the data and plots
    if RL:
        folder = "RL"
    if Fixed_time:
        folder = "Fixed Time"
    elif SOTL: