def policy_visualize(Q, env, decks):
    Q = rl.convert_to_sum_states(Q, env)
    Q_ = q_with_optimalaction(Q)
    optQ = rl.fill_missing_sum_states(rl.filter_states(Q_), default_value=0.5)
    data = pd.DataFrame(list(optQ.items()))
    for i in data[0]:
        if i == data[0][0]:
            x = np.array(i[0])
            y = np.array(i[1])
            z = np.array(i[2])
        else:
            x = np.append(x, i[0])
            y = np.append(y, i[1])
            z = np.append(z, i[2])
    data["player_hand"] = x
    data["show_card"] = y
    data["use_ace"] = z
    data.drop(0, axis=1, inplace=True)
    use_ace_set = data[data["use_ace"] == True]
    nouse_ace_set = data[data["use_ace"] == False]
    use_ace_set = use_ace_set.pivot(index="player_hand", columns="show_card",
                                    values=1).sort_index(ascending=False)
    nouse_ace_set = nouse_ace_set.pivot(index="player_hand", columns="show_card",
                                        values=1).sort_index(ascending=False)
    """ax1, ax2 = plt.axes()
    ax1.set_title("Optimal Policy with use ace")
    ax2.set_title("Optimal Policy without use ace")
    fig1 = sns.heatmap(use_ace_set, ax=ax1).get_figure()
    fig2 = sns.heatmap(nouse_ace_set, ax=ax2).get_figure()
    fig1.savefig("figures/Optimal Policy with use ace in {}deck.jpg".format(decks))
    fig2.savefig("figures/Optimal Policy without use ace in {}decks.jpg".format(decks))"""
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle("optimal policy in {}decks".format(decks), fontsize=16)
    ax[0].set_title("with use ace")
    ax[1].set_title("without use ace")
    color = ["k", "w", "g"]
    cmap = sns.color_palette(color, n_colors=3)
    sns.heatmap(use_ace_set, ax=ax[0], cmap=cmap, linewidths=.5,
                linecolor="lightgray", cbar_kws={"ticks": [0., 0.5, 1.]})
    sns.heatmap(nouse_ace_set, ax=ax[1], cmap=cmap, linewidths=.5,
                linecolor="lightgray", cbar_kws={"ticks": [0., 0.5, 1.]})
    fig.savefig("figures/Optimal Policy in {}deck.jpg".format(decks))

def traffic():
    for i in range(100):
        observation = env.reset()
        t_reward = 0
        step = 0
        r1 = rnd
        r2 = rnd
        r1.seed(1)
        r2.seed(2)
        while True:
            step += 1
            # time.sleep(0.1)
            cars(r1, r2)
            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()
            observation = observation_
            if done:
                print(t_reward)
                break

def __init__(self, graph_path='models/simpleDQN.pb', reload_every=60 * 60):
    self.graph_path = graph_path
    #self.sess = None
    #self.load_graph()
    self.reload_every = reload_every
    self.counter = 0
    self.simple_controller = ssbm.SimpleControllerState()
    RL.restore()

def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()

def get_action(self, state):
    scores = RL.scoreActions(state)
    score, best_action = max(zip(scores, ssbm.simpleControllerStates),
                             key=lambda x: x[0])
    #print(score, best_action)
    self.epsilon = RL.getEpsilon()
    if flip(self.epsilon):
        self.simple_controller = ssbm.SimpleControllerState.randomValue()
    else:
        self.simple_controller = best_action

def advance(self, state, pad):
    self.counter += 1
    if self.counter >= self.reload_every:
        #self.load_graph()
        print("RL.restore()")
        RL.restore()
        self.counter = 0
    self.get_action(state)
    if self.counter % 60 == 0:
        print("Frame %d of recording." % self.counter)
        print(self.simple_controller)
        print(self.epsilon)
    pad.send_controller(self.simple_controller.realController())

def test_rl(model_name: str, trained_model_name: str) -> dict:
    """
    Tests the RL agent.

    Note that the trained and the tested RL agent need to use the same values for
    these parameters:
    - antigens included
    - max age
    - state type
    - obs method

    :param model_name: name of the model to be stored
    :param trained_model_name: name of the trained agent to be evaluated
    :return: dict containing all evaluation metrics
    """
    model_name = model_name + "_RL"
    print('- start testing RL model')
    results = RL.solve(supply_distribution=supply_distribution,
                       demand_distribution=demand_distribution,
                       model_name=model_name,
                       export_model='results/model/' + trained_model_name + '/best_model',
                       max_age=parameters['max_age'],
                       demand=parameters['demand'],
                       doi=parameters['doi'],
                       n_warm_start_days=parameters['n_warm_start_days'],
                       n_days=parameters['n_days'],
                       obs_method=parameters['rl']['obs_method'],
                       state_type=parameters['rl']['state_type'])
    print('- complete testing RL model')
    return results[0]

def play():
    board = Tic.Tic(size)
    nn = RL.RL([squ, 10 * squ, 10 * squ, 10 * squ, squ])
    sess = tf.Session()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(path)
    saver.restore(sess, ckpt.model_checkpoint_path)
    done = False
    i = 0
    ai = 0  # 0 is x, 1 is o
    agent = -1 * (ai * 2 - 1)
    winner = 0
    while not done:
        loc = [-1, -1]
        if i % 2 == ai:
            a, m = sess.run([nn.predict, nn.out],
                            feed_dict={nn.input: board.state(agent)})
            a = a[0]
            print(m)
            loc[0] = int(a / size)
            loc[1] = a % size
            board.play(agent, loc)
        else:
            board.print()
            validPlay = False
            while not validPlay:
                text = input("Please enter play position 'row,column': ")
                loc = text.split(',')
                loc[0] = int(loc[0])
                loc[1] = int(loc[1])
                validPlay = board.valid(loc)
                if not validPlay:
                    print("INVALID PLAY: Please choose another position")
            board.play(-1 * agent, loc)
        output = board.done()
        done = output[0]
        winner = output[1]
        i += 1
    board.print()
    if winner == 0:
        print("Tie!")
    elif winner == agent:
        print("Computer Wins")
    else:
        print("Human Wins")

def make_user_features(userId, prodId, date, ratings, recommend, review_words, review_text):
    user_features = pd.DataFrame()
    unique_user = list(np.unique(userId))
    user_features.insert(0, "userId", unique_user)
    # 1. MNR
    user_features.insert(1, "mnr", MNR(userId, date))
    # 2. PR
    user_features.insert(2, "PR", PR_NR(userId, ratings, "PR"))
    # 3. NR
    user_features.insert(3, "NR", PR_NR(userId, ratings, "NR"))
    # 4. avgRD
    user_features.insert(4, "avgRD", avgRD(userId, prodId, ratings, us_pr="user"))
    # 5. WRD - not implemented
    # 6. BST
    user_features.insert(5, "BST", BST_user(userId, date))
    # 7. ERD - not implemented
    # 8. ETG - not implemented
    # 9. RL - uses review_text; remove the [0:3000] slice later...
    user_features.insert(6, "RL", RL(userId, review_words[0:3000]))
    # Use the review content to compute TF-IDF. max_features keeps only the
    # 2000 most frequent words/bigrams across the text documents.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000,
                                 stop_words='english')
    TFIDF = vectorizer.fit_transform(review_text)
    # 10. ACS
    user_features.insert(7, "ACS", ACS(userId, TFIDF))
    # 11. MCS
    user_features.insert(8, "MCS", MCS(userId, TFIDF))
    # Write to a csv file and exit the function
    user_features.to_csv('/Users/anaghakaranam/Desktop/Opinion_Spam/coding-playground/feature_csvs/user_features.csv',
                         index=None, header=True)

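# Minimal sketch of the TF-IDF step used in make_user_features above: with
# ngram_range=(1, 2) and max_features, only the most frequent unigrams/bigrams
# across all documents are kept as columns. The toy documents and the small
# max_features value here are illustrative assumptions, not taken from the dataset.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["great phone, great battery",
        "terrible phone, battery died fast",
        "battery life is great"]
vec = TfidfVectorizer(ngram_range=(1, 2), max_features=5, stop_words='english')
X = vec.fit_transform(docs)
print(vec.get_feature_names_out())  # at most 5 retained terms
print(X.shape)                      # (3 documents, <=5 features)
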
def __init__(self, agent, memory):
    self.agent = agent
    self.directions = [
        np.array([1, 0]),
        np.array([-1, 0]),
        np.array([0, -1]),
        np.array([0, 1])]
    self.senses = []
    self.brain = RL.QLearn(numActions=len(self.directions), memory=memory)
    self.scheduledAction = None
    self.learningModule = None

def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)
            RL.store_transition(observation, action, reward, False)
            if done:
                RL.store_transition(observation, action, 0.0, True)
                RL.learn()
                break
            observation = observation_
        end_time = time.time()
        plot_.plot_graph((end_time - start_time), i)
    env.close()
    RL.store_net()

def __init__(self, model=None, path=None, reload_every=60 * 60,
             swap=False, memory=0, delay=0, **kwargs):
    self.model = RL.Model(model, path, swap=swap, mode=RL.Mode.PLAY,
                          memory=memory, **kwargs)
    self.reload_every = reload_every
    self.counter = 0
    self.action = 0
    self.actions = util.CircularQueue(delay + 1, 0)
    self.memory = util.CircularQueue(array=((memory + 1) * ssbm.SimpleStateAction)())
    self.model.restore()

def train_rl(model_name: str) -> str:
    """
    Trains the model using the parameters defined.

    :param model_name: name of the model to save
    :return: str, name of the stored model
    """
    # model name
    model_name = model_name + "_RL"
    print('- start training RL model')
    trained_model_name = RL.train(
        supply_distribution=supply_distribution,  # global
        demand_distribution=demand_distribution,  # global
        model_name=model_name,  # in loop
        max_age=parameters['max_age'],
        demand=parameters['demand'],
        max_day=parameters['rl']['max_day'],
        obs_method=parameters['rl']['obs_method'],
        doi=parameters['doi'],
        training_timesteps_list=parameters['rl']['training_interval'],
        tblog=parameters['rl']['tb_log'])
    print('- complete training RL model')
    return trained_model_name

def __initAI__(self):
    self.efficiencyPlot = view.Plot()
    self.aiCollection = ai.AICollection()
    self.positionMonitors = []
    self.trainedAI = ai.TrainedAI(goalId='e', wallId='#',
                                  statisticsPlot=self.efficiencyPlot)
    self.eyesight = ai.Eyesight(1)
    self.smell = ai.Smell('e')
    self.memory = RL.QMemory()
    self.savedMemory = None
    self.savedMemoryNo = -1
    self.timer = utils.Timer(0.5)
    self.efficiencyPlot.show()

    def updatePositionMonitors():
        for monitor in self.positionMonitors:
            monitor.update(self.scene)

    self.timer.addToTick(lambda: self.aiCollection.think(self.scene))
    self.timer.addToTick(updatePositionMonitors)

def PlotValueFunction(AI):
    if hasattr(AI, 'QueryQBestAction') and callable(getattr(AI, 'QueryQBestAction')):
        # Update plot of the optimal value function (position and velocity only)
        X, Y = np.meshgrid(range(0, int(BASEY + 30), 20), range(-10, 10, 1))
        Z = np.zeros(X.shape)
        for yy in xrange(X.shape[0]):
            for xx in xrange(X.shape[1]):
                Z[yy, xx] = AI.QueryQBestAction(
                    RL.FB_GS(0, X[yy, xx], 0, Y[yy, xx],
                             [{'x': 0, 'y': 0}, {'x': 0, 'y': 0}],
                             [{'x': 0, 'y': 0}, {'x': 0, 'y': 0}]))
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,
                        linewidth=0, antialiased=False)
        plt.savefig('optimalQ.png')
        plt.close(fig)

import RL
import os

RL.init()
# RL.restore()

def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()

#RL.writeGraph()

while True:
    sweep()

Pl[5, 0, 5] = 0.1
Pl[6, 0, 6] = 1
Pl[0, 1, 0] = 1
Pl[1, 1, 1] = 0
Pl[1, 1, 0] = 1
Pl[2, 1, 1] = 1
Pl[3, 1, 2] = 1
Pl[4, 1, 3] = 1
Pl[5, 1, 4] = 1
Pl[6, 1, 5] = 1

Rl = np.zeros((7, 2))
Rl[[0, 6], :] = 1

absorv = np.zeros((7, 1))
absorv[[0, 6]] = 1

fmdp = RL.finiteMDP(7, 2, 0.9, Pl, Rl, absorv)

J, traj = fmdp.runPolicy(10000, 3, poltype="exploration")  # choose this value

data = np.load("Q1.npz")
Qr = fmdp.traces2Q(traj)
if np.sqrt(sum(sum((data['Q1'] - Qr)**2))) < 1:
    print("Approximation of Q within the expected range. OK\n")
else:
    print("Approximation of Q outside the expected range. FAILED\n")

J, traj = fmdp.runPolicy(3, 3, poltype="exploitation", polpar=Qr)
if np.sqrt(sum(sum((data['traj2'] - traj)**2))) < 1:
    print("Optimal trajectory. OK\n")
else:
    print("Non-optimal trajectory. FAILED\n")

def experiment(device, reward_system, PIPEGAP, BATCH_SIZE, learning_rate,
               MEMORY_SIZE, GAMMA, EPS_START, EPS_END, EPS_DECAY, OBSERVE,
               FRAME_PER_ACTION, TARGET_UPDATE, num_episodes,
               save_model=False, load_model=False, load_model_path_prefix=None):
    expected_q_value = 0
    policy_net = RL.DQN().to(device)
    target_net = RL.DQN().to(device)
    if load_model:
        policy_net.load_state_dict(torch.load(load_model_path_prefix + "_policy_net.mdl"))
        target_net.load_state_dict(torch.load(load_model_path_prefix + "_target_net.mdl"))
    else:
        target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = RL.ReplayMemory(MEMORY_SIZE)

    # Set up game environment
    game = FlappyBird.FlappyBird(pipe_gap=PIPEGAP)
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              reward_values=reward_system)

    # Set up plot
    RLplot.plot_init()
    episode_durations = []

    # Main part with game execution
    env.init()
    steps_done = 0
    infinity = False
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset_game()
        state = env.getScreenRGB()
        state = RLip.BCHW_format(state)
        frames = (state, state, state, state)
        state = RLip.last_4_frames(state, frames[1], frames[2], frames[3])
        for t in count():
            # Select an action
            action, steps_done = RL.select_action(state, policy_net, steps_done,
                                                  device, EPS_START, EPS_END,
                                                  EPS_DECAY, OBSERVE)
            if steps_done % FRAME_PER_ACTION != 0:
                action = torch.tensor([[1]], device=device, dtype=torch.long)
            # Perform an action
            reward = env.act(env.getActionSet()[action[0, 0]])
            next_state = env.getScreenRGB()
            done = env.game_over()
            reward = torch.tensor([reward], device=device)
            # Format the next state for the network
            if not done:
                next_state = RLip.BCHW_format(next_state)
                frames = (next_state, frames[0], frames[1], frames[2])
                next_state = RLip.last_4_frames(next_state, frames[1], frames[2], frames[3])
            else:
                next_state = None
            # Store the transition in memory
            memory.push(state, action, next_state, reward)  # edit
            # Move to the next state
            state = next_state
            # Print log of training info
            if steps_done <= OBSERVE:
                state_of_training = "observe"
            elif steps_done > OBSERVE and steps_done <= OBSERVE + EPS_DECAY:
                state_of_training = "explore"
            else:
                state_of_training = "train"
            print("TIMESTEP", steps_done, "/ STATE", state_of_training,
                  "/ ACTION", action[0, 0].data, "/ REWARD", reward[0].data,
                  "/ Expected_Q", expected_q_value)
            # Perform one step of the optimization (on the target network)
            if steps_done > OBSERVE:
                RL.optimize_model(policy_net, target_net, memory, optimizer,
                                  device, BATCH_SIZE, GAMMA)
                if done:
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
                if t > 10000:
                    infinity = True
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
            else:
                if done:
                    break
        # Update the target network
        if i_episode % TARGET_UPDATE == 0 and steps_done > OBSERVE:
            target_net.load_state_dict(policy_net.state_dict())
        if infinity:
            break
    # End of training process

    # Save experiment result
    data = {"data": episode_durations, 'pipe_gap': PIPEGAP,
            'reward_values': reward_system, 'BATCH_SIZE': BATCH_SIZE,
            'learning_rate': learning_rate, 'MEMORY_SIZE': MEMORY_SIZE,
            'GAMMA': GAMMA, 'EPS_START': EPS_START, 'EPS_END': EPS_END,
            'EPS_DECAY': EPS_DECAY, 'OBSERVE': OBSERVE,
            'FRAME_PER_ACTION': FRAME_PER_ACTION,
            'TARGET_UPDATE': TARGET_UPDATE, 'num_episodes': num_episodes}
    filenameprefix = './result/Expe_' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    filename = filenameprefix + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    # Save model if said so
    if save_model:
        torch.save(policy_net.state_dict(), filenameprefix + '_policy_net.mdl')
        torch.save(target_net.state_dict(), filenameprefix + '_target_net.mdl')

    # Save plot figure
    plotname = filenameprefix + '.png'
    RLplot.plot_end(plotname)

os.system("sbatch " + slurmfile) #os.system("sbatch -N 1 -c 2 --mem=8000 --time=6-23:00:00 slurm_scripts/" + jobname + ".slurm &") init = False init = True if dry_run: print("NOT starting jobs:") else: print("Starting jobs:") # init model for the first time if init: import RL model = RL.Model(mode=RL.Mode.TRAIN, gpu=False, **job_dicts['train']) model.init() model.save() train_name = "trainer_" + exp_name train_command = "python3 -u train.py" + job_flags['train'] slurm_script(train_name, train_command, gpu=True) #sys.exit() agent_count = 0 agent_command = "python3 -u run.py" + job_flags['agent'] for c1 in characters: for c2 in characters: command = agent_command + " --p1 %s --p2 %s" % (c1, c2)
import RL
import random

env = RL.Env()
jernej = RL.Player()

for i in range(env.STEPS):
    action = [jernej.move_o(), jernej.move_p()]
    #action = [random.choice([-5, 5]), random.choice([-.05, .05])]
    env.step(action)
    env.render()
    if env.done:
        print(f'Crashed in episode step: {env.episode_step}')
        env.reset()

import gym
import RL
from draw_graph import Plot
import time

env = gym.make('CartPole-v0')
env = env.unwrapped
plot_ = Plot()

MAX_EPISODES = 2000
MAX_STEP_EPISODES = 5000

RL = RL.PolicyGradient(n_actions=env.action_space.n,
                       n_features=env.observation_space.shape[0],
                       n_hidden=10,
                       learning_rate=0.01,
                       reward_decay=0.99,
                       epsilon=0.90)

def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)

start_time_expanded = time.time()
Q, avg_reward, state_action_count = rl.learn_Q(
    env, n_sims, gamma=1, omega=omega, epsilon=epsilon,
    init_val=init_val, episode_file=path_fun("hand_state"), warmup=warmup)
print("Number of explored states: " + str(len(Q)))
print("Cumulative avg. reward = " + str(avg_reward))
time_to_completion_expanded = time.time() - start_time_expanded

"""
print("----- Starting Q-learning for sum-based state space -----")
# Q-learning with player sum state representation
start_time_sum = time.time()
sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
    sum_env, n_sims, omega=omega, epsilon=epsilon, init_val=init_val,
    episode_file=path_fun("sum_state"), warmup=warmup)
time_to_completion_sum = time.time() - start_time_sum
print("Number of explored states (sum states): " + str(len(sumQ)))
print("Cumulative avg. reward = " + str(sum_avg_reward))
"""

print("Training time: \n " +
      "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
          time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))

                    type=int,
                    default=0,
                    help="how many frames to remember")

args = parser.parse_args()

if args.name is None:
    args.name = args.model

if args.path is None:
    args.path = "saves/%s/" % args.name

experience_dir = args.path + 'experience/'
os.makedirs(experience_dir, exist_ok=True)

model = RL.Model(mode=RL.Mode.TRAIN, **args.__dict__)

# do this in RL?
if args.init:
    model.init()
    model.save()
else:
    model.restore()

import numpy as np

def sweep(data_dir='experience/'):
    i = 0
    start_time = time.time()
    files = os.listdir(data_dir)

def reinfrocement_neural_network_control(load_weights=None, run_only=False,
                                         track_select='SS', random_seed=None,
                                         rl_prams=None):
    run = run_only
    weights_save_dir = "./weights/"
    if not os.path.exists(weights_save_dir):
        os.makedirs(weights_save_dir)
    Environment.track_generator(track, track_select=track_select)
    env = Environment.Environment(track, rl_parameters['max_steps'])
    gui = GUI.GUI(track, cars, trace=True)
    car_objects = [Environment.Car(c) for c in cars]
    rl = RL.QLearning_NN(rl_prams, weights_save_dir=weights_save_dir)
    rl.generate_nn()
    if load_weights is not None:
        if load_weights == 'all':
            run = True
        else:
            rl.load_weights(load_weights)
    if random_seed is not None:
        rl.random_seed(random_seed)
    weight_names = sorted([name for name in glob.glob(weights_save_dir + '*')])
    weight_names_index = 0

    def initialize(run_state):
        env.compute_interaction(car_objects)
        for car in car_objects:
            car.reset()
            car.get_sensor_reading()
        if run_state == True:
            env.set_max_steps(1500)
            gui.remove_traces()
            gui.disable_trace()
            gui.set_run_select(gui.runs[1])
            gui.update_debug_info('[Testing]\n' + 'Currently learned weights loaded')
        else:
            env.set_max_steps(rl_prams['max_steps'])
            gui.enable_trace()
            gui.set_run_select(gui.runs[0])
            gui.update_debug_info('[Training]\n')

    def check_run_button(current_state):
        if gui.get_run_select() == gui.runs[0] and current_state == True:
            print '\n\n\nLearning\n'
            initialize(run_state=False)
            return False
        if gui.get_run_select() == gui.runs[1] and run == False:
            print '\n\n\nRun only\n'
            initialize(run_state=True)
            return True
        return None

    initialize(run_state=run)
    while (1):
        new_run_state = check_run_button(current_state=run)
        if new_run_state is not None:
            run = new_run_state
        if run == True:
            for i, car in enumerate(car_objects):
                terminal = rl.run_step(car, env, dt)
                if terminal is not None:
                    print 'Car', i, ':', terminal
                    if i == 0:
                        if load_weights == 'all' and weight_names_index < len(weight_names):
                            rl.load_weights(weight_names[weight_names_index])
                            gui.update_debug_info('[Testing]\n' + 'Weights loaded:\n' +
                                                  weight_names[weight_names_index])
                            weight_names_index += 1
                gui.update(i, car.get_state())
            env.compute_interaction(car_objects)
            gui.refresh()
        else:
            terminal, debug, epoch, avg_loss, final_score, cross_score = rl.learn_step(
                car_objects[0], env, dt)
            if terminal is not None:
                if debug is not None:
                    gui.update_debug_info(debug)
                    gui.update_graph(epoch, avg_loss, gui.graphs[0])
                    gui.update_graph(epoch, final_score, gui.graphs[1])
                    gui.update_graph(epoch, cross_score, gui.graphs[2])
                    gui.refresh()
                gui.update(0, terminal, draw_car=False, force_end_line=True)
                gui.refresh()
            if rl.epoch % 100 == 0:
                gui.update(0, car_objects[0].get_state(), draw_car=True)
                gui.refresh()
            else:
                gui.update(0, car_objects[0].get_state(), draw_car=False)

            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()
            observation = observation_
            if done:
                print(t_reward)
                break


if __name__ == "__main__":
    env = map_env.Map()
    mode = sys.argv[1]
    env.after(100, traffic_baseline())
    env.destroy()
    if mode == 'RL':
        env = map_env.Map()
        RL = RL.QLearningTable(env.action_space)
    elif mode == 'DQN':
        env = map_env.Map()
        RL = RL.DeepQNetwork(num_actions=2, num_features=6, actions=['y', 'n'])
    env.after(100, traffic())

# Q-learning with expanded state representation
start_time_expanded = time.time()
Q, avg_reward, state_action_count = rl.learn_Q(
    env, n_sims, gamma=1, omega=omega, epsilon=epsilon,
    init_val=init_val, episode_file=path_fun("hand_state"), warmup=warmup)
print("Number of explored states: " + str(len(Q)))
print("Cumulative avg. reward = " + str(avg_reward))
time_to_completion_expanded = time.time() - start_time_expanded"""

print("----- Starting Q-learning for sum-based state space -----")
# Q-learning with player sum state representation
start_time_sum = time.time()
sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
    sum_env, n_sims, omega=omega, epsilon=epsilon, init_val=init_val,
    episode_file=path_fun("sum_state"), warmup=warmup)
time_to_completion_sum = time.time() - start_time_sum
print("Number of explored states (sum states): " + str(len(sumQ)))
print("Cumulative avg. reward = " + str(sum_avg_reward))

"""print("Training time: \n " +
      "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
          time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))

# Convert Q (extended state) to sum state representation and make 3D plots
# Extended state MC-learning
Q_conv_MC = rl.convert_to_sum_states(Q_MC, env)
V_conv_MC = rl.convert_to_value_function(Q_conv_MC)

    data_size = round((2 + args.input_moment_order + 1) * args.input_moment_order / 2)
elif args.input == 'wavefunction':
    data_size = 2 * (x_n - 10 * 2)

# we do not plot when we do parallelized computation
#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#                    reward_multiply=reward_multiply, read_length=read_length, controls_per_unit_time=controls_per_unit_time)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval, t_max=t_max,
                      F_max=args.F_max, failing_reward=failing_reward)
################################## end learning setting

# Below is the worker function for subprocesses, which carries out the control simulations
# and pushes the experiences and records to queues that are collected and handled by other
# processes. (Quantum simulation is implemented in a compiled C module.)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid
# using CUDA in these workers. Instead, these workers ask a manager process when they want
# to evaluate the neural network, and only the manager process is allowed to use CUDA to
# evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random = np.random.RandomState(seed)
    simulation.set_seed(random.randint(0, 2**31 - 1))
    # preparing pipes
    MemoryQueue, ResultsQueue, ActionPipe, EndEvent, PauseEvent = pipes
    state_data_to_manager = np.frombuffer(shared_buffer, dtype='float32')

                              num_of_data_per_time_unit)  # 3600
read_control_step_length = control_interval // coarse_grain
data_size = 2 * read_length
shape_measurement_data = (2, read_length)

# we do not plot when we do parallelized computation
#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#                    reward_multiply=reward_multiply, read_length=read_length, controls_per_half_period=controls_per_half_period)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval, t_max=t_max, F_max=args.F_max)
    if args.input == 'measurements':
        RL.set_parameters(read_step_length=read_control_step_length)
################################## end learning setting

# Below is the worker function for subprocesses, which carries out the control simulations
# and pushes the experiences and records to queues that are collected and handled by other
# processes. (Quantum simulation is implemented in a compiled C module.)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid
# using CUDA in these workers. Instead, these workers ask a manager process when they want
# to evaluate the neural network, and only the manager process is allowed to use CUDA to
# evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random.seed(seed)
    np.random.seed(seed)
    simulation.set_seed(seed)

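# The comment above describes a worker/manager split: workers never touch CUDA and
# instead send the current state over a pipe to a single manager process, which
# evaluates the network and sends the chosen control back. Below is a minimal,
# self-contained sketch of that request/response pattern; the names (worker,
# manager, fake_policy) and the use of multiprocessing.Pipe are illustrative
# assumptions, not the project's actual protocol.
import numpy as np
from multiprocessing import Process, Pipe

def fake_policy(state):
    # stands in for the network that only the manager process evaluates
    return float(np.tanh(state.sum()))

def worker(conn, n_steps, seed):
    rng = np.random.RandomState(seed)
    for _ in range(n_steps):
        state = rng.randn(4).astype('float32')
        conn.send(state)          # ask the manager for a control
        action = conn.recv()      # block until the manager answers
    conn.send(None)               # signal that this worker is done

def manager(conn):
    while True:
        state = conn.recv()
        if state is None:
            break
        conn.send(fake_policy(state))

if __name__ == '__main__':
    parent, child = Pipe()
    p = Process(target=worker, args=(child, 10, 0))
    p.start()
    manager(parent)
    p.join()
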
import numpy as np
import MDP
import RL

''' Construct simple MDP as described in Lecture 2a Slides 13-14 '''
T = np.array([[[0.5, 0.5, 0, 0], [0, 1, 0, 0], [0.5, 0.5, 0, 0], [0, 1, 0, 0]],
              [[1, 0, 0, 0], [0.5, 0, 0, 0.5], [0.5, 0, 0.5, 0], [0, 0, 0.5, 0.5]]])
R = np.array([[0, 0, 10, 10], [0, 0, 10, 10]])
discount = 0.9
mdp = MDP.MDP(T, R, discount)
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
[Q, policy] = rlProblem.qLearning(s0=0,
                                  initialQ=np.zeros([mdp.nActions, mdp.nStates]),
                                  nEpisodes=1000, nSteps=100, epsilon=0.3)
print("\nQ-learning results")
print(Q)
print(policy)

# import numpy as np
# import MDP
# import RL
#
#
# ''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
# T = np.array([[[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]],[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]]])
# R = np.array([[0,0,10,10],[0,0,10,10]])

for decks in [1, 2, 6, 8, inf]:
    print("----- deck number equal to {} -----".format(decks))
    # set seed
    seed = 31233
    # init envs
    env = bjk.BlackjackEnvExtend(decks=decks, seed=seed)
    sum_env = bjk_base.BlackjackEnvBase(decks=decks, seed=seed)

    print("----- Starting MC training on expanded state space -----")
    # MC-learning with expanded state representation
    start_time_MC = time.time()
    Q_MC, MC_avg_reward, state_action_count = rl.learn_MC(
        env, n_sims, gamma=1, epsilon=epsilon, init_val=init_val,
        episode_file=path_fun("hand_MC_state"), warmup=warmup)
    print("Number of explored states: " + str(len(Q_MC)))
    print("Cumulative avg. reward = " + str(MC_avg_reward))
    time_to_completion_MC = time.time() - start_time_MC

    print("----- Starting Q-learning on expanded state space -----")
    # Q-learning with expanded state representation
    start_time_expanded = time.time()
    Q, avg_reward, state_action_count = rl.learn_Q(
        env, n_sims, gamma=1, omega=omega,