Example no. 1
def policy_visualize(Q, env, decks):
    Q = rl.convert_to_sum_states(Q, env)
    Q_ = q_with_optimalaction(Q)
    optQ = rl.fill_missing_sum_states(rl.filter_states(Q_), default_value=0.5)

    data = pd.DataFrame(list(optQ.items()))
    for i in data[0]:
        if i == data[0][0]:
            x = np.array(i[0])
            y = np.array(i[1])
            z = np.array(i[2])
        else:
            x = np.append(x, i[0])
            y = np.append(y, i[1])
            z = np.append(z, i[2])
    data["player_hand"] = x
    data["show_card"] = y
    data["use_ace"] = z
    data.drop(0, axis=1, inplace=True)

    use_ace_set = data[data["use_ace"] == True]
    nouse_ace_set = data[data["use_ace"] == False]

    use_ace_set = use_ace_set.pivot(index="player_hand",
                                    columns="show_card",
                                    values=1).sort_index(ascending=False)
    nouse_ace_set = nouse_ace_set.pivot(index="player_hand",
                                        columns="show_card",
                                        values=1).sort_index(ascending=False)
    """ax1, ax2 = plt.axes()
    ax1.set_title("Optimal Policy with use ace")
    ax2.set_title("Optimal Policy without use ace")

    fig1 = sns.heatmap(use_ace_set, ax = ax1).get_figure()
    fig2 = sns.heatmap(nouse_ace_set, ax = ax2).get_figure()

    fig1.savefig("figures/Optimal Policy with use ace in {}deck.jpg".format(decks))
    fig2.savefig("figures/Optimal Policy without use ace in {}decks.jpg".format(decks))"""

    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle("optimal policy in {}decks".format(decks), fontsize=16)
    ax[0].set_title("with use ace")
    ax[1].set_title("without use ace")
    color = ["k", "w", "g"]
    cmap = sns.color_palette(color, n_colors=3)

    sns.heatmap(use_ace_set,
                ax=ax[0],
                cmap=cmap,
                linewidths=.5,
                linecolor="lightgray",
                cbar_kws={"ticks": [0., 0.5, 1.]})
    sns.heatmap(nouse_ace_set,
                ax=ax[1],
                cmap=cmap,
                linewidths=.5,
                linecolor="lightgray",
                cbar_kws={"ticks": [0., 0.5, 1.]})

    fig.savefig("figures/Optimal Policy in {}deck.jpg".format(decks))
Example no. 2
def traffic():
    for i in range(100):
        observation = env.reset()
        t_reward = 0
        step = 0
        # NOTE: r1 and r2 are bound to the same object, so the second seed() call
        # overrides the first; independent streams would need separate RNG instances.
        r1 = rnd
        r2 = rnd
        r1.seed(1)
        r2.seed(2)
        while True:
            step += 1
            # time.sleep(0.1)
            cars(r1, r2)
            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()

            observation = observation_
            if done:
                print(t_reward)
                break
Example no. 3
 def __init__(self, graph_path='models/simpleDQN.pb', reload_every=60 * 60):
     self.graph_path = graph_path
     #self.sess = None
     #self.load_graph()
     self.reload_every = reload_every
     self.counter = 0
     self.simple_controller = ssbm.SimpleControllerState()
     RL.restore()
Example no. 4
 def __init__(self, graph_path='models/simpleDQN.pb', reload_every=60*60):
     self.graph_path = graph_path
     #self.sess = None
     #self.load_graph()
     self.reload_every = reload_every
     self.counter = 0
     self.simple_controller = ssbm.SimpleControllerState()
     RL.restore()
Example no. 5
def sweep(data_dir='experience/'):
  # for f in ["2"]:
  for f in os.listdir(data_dir):
    if f.isdigit():
        filename = data_dir + f
        print("Training on " + filename)
        RL.train(filename)
    else:
        print("Not training on file:", f)
  RL.save()
Example no. 6
def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()
Example no. 7
 def get_action(self, state):
     scores = RL.scoreActions(state)
     
     score, best_action = max(zip(scores, ssbm.simpleControllerStates), key=lambda x: x[0])
     #print(score, best_action)
     
     self.epsilon = RL.getEpsilon()
     
     if flip(self.epsilon):
       self.simple_controller = ssbm.SimpleControllerState.randomValue()
     else:
       self.simple_controller = best_action
Example no. 8
    def get_action(self, state):
        scores = RL.scoreActions(state)

        score, best_action = max(zip(scores, ssbm.simpleControllerStates),
                                 key=lambda x: x[0])
        #print(score, best_action)

        self.epsilon = RL.getEpsilon()

        if flip(self.epsilon):
            self.simple_controller = ssbm.SimpleControllerState.randomValue()
        else:
            self.simple_controller = best_action
Example no. 9
    def advance(self, state, pad):
        self.counter += 1

        if self.counter >= self.reload_every:
            #self.load_graph()
            print("RL.restore()")
            RL.restore()
            self.counter = 0

        self.get_action(state)
        if self.counter % 60 == 0:
            print("Frame %d of recording." % self.counter)
            print(self.simple_controller)
            print(self.epsilon)
        pad.send_controller(self.simple_controller.realController())
Example no. 10
    def advance(self, state, pad):
        self.counter += 1

        if self.counter >= self.reload_every:
            #self.load_graph()
            print("RL.restore()")
            RL.restore()
            self.counter = 0

        self.get_action(state)
        if self.counter % 60 == 0:
            print("Frame %d of recording." % self.counter)
            print(self.simple_controller)
            print(self.epsilon)
        pad.send_controller(self.simple_controller.realController())
Example no. 11
def test_rl(model_name: str, trained_model_name: str) -> dict:
    """
    Tests the RL agent
    Note that the parameters of the trained and tested RL agent need to be the same for these parameters:
    - Antigens included
    - Max age
    - state type
    - obs method
    :param model_name: name of the model to be stored
    :param trained_model_name: name of the agent trained to be evaluated
    :return: dict containing all evaluation metrics
    """
    model_name = model_name + "_RL"

    print('- start testing RL model')
    results = RL.solve(supply_distribution=supply_distribution,
                       demand_distribution=demand_distribution,
                       model_name=model_name,
                       export_model='results/model/' + trained_model_name +
                       '/best_model',
                       max_age=parameters['max_age'],
                       demand=parameters['demand'],
                       doi=parameters['doi'],
                       n_warm_start_days=parameters['n_warm_start_days'],
                       n_days=parameters['n_days'],
                       obs_method=parameters['rl']['obs_method'],
                       state_type=parameters['rl']['state_type'])
    print('- complete testing RL model')
    return results[0]
Example no. 12
def play():
    board = Tic.Tic(size)

    nn = RL.RL([squ, 10 * squ, 10 * squ, 10 * squ, squ])

    sess = tf.Session()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    done = False

    i = 0

    ai = 0  #0 is x, 1 is o

    agent = -1 * (ai * 2 - 1)

    winner = 0

    while not done:
        loc = [-1, -1]
        if i % 2 == ai:
            a, m = sess.run([nn.predict, nn.out],
                            feed_dict={nn.input: board.state(agent)})
            a = a[0]
            print(m)
            loc[0] = int(a / size)
            loc[1] = a % size
            board.play(agent, loc)
        else:
            board.print()
            validPlay = False
            while not validPlay:
                text = input("Please enter play position 'row,column': ")
                loc = text.split(',')
                loc[0] = int(loc[0])
                loc[1] = int(loc[1])
                validPlay = board.valid(loc)
                if not validPlay:
                    print("INVALID PLAY: Please choose another position")
            board.play(-1 * agent, loc)

        output = board.done()
        done = output[0]
        winner = output[1]

        i += 1

    board.print()

    if winner == 0:
        print("Tie!")
    elif winner == agent:
        print("Computer Wins")
    else:
        print("Human Wins")
Example no. 13
def make_user_features(userId, prodId, date, ratings, recommend, review_words,
                       review_text):
    user_features = pd.DataFrame()

    unique_user = list(np.unique(userId))
    user_features.insert(0, "userId", unique_user)
    # 1. MNR
    user_features.insert(1, "mnr", MNR(userId, date))
    #2. PR
    user_features.insert(2, "PR", PR_NR(userId, ratings, "PR"))
    #3. NR
    user_features.insert(3, "NR", PR_NR(userId, ratings, "NR"))

    #4. avgRD
    user_features.insert(4, "avgRD",
                         avgRD(userId, prodId, ratings, us_pr="user"))

    #5. WRD
    #did not do.

    #6. BST
    user_features.insert(5, "BST", BST_user(userId, date))

    #7. ERD
    #did not do.

    #8. ETG
    #did not do.

    #9. RL
    #use review_text
    #remove 0:2000 later...

    user_features.insert(6, "RL", RL(userId, review_words[0:3000]))

    #uses review content to find TFIDF.

    vectorizer = TfidfVectorizer(ngram_range=(1, 2),
                                 max_features=2000,
                                 stop_words='english')

    # max_features=2000 means the feature matrix is built from the 2000 most
    # frequent words across the text documents.
    TFIDF = vectorizer.fit_transform(review_text)

    #10. ACS
    user_features.insert(7, "ACS", ACS(userId, TFIDF))

    #11. MCS
    user_features.insert(8, "MCS", MCS(userId, TFIDF))

    #write to a csv file and exit the function
    user_features.to_csv(
        '/Users/anaghakaranam/Desktop/Opinion_Spam/coding-playground/feature_csvs/user_features.csv',
        index=None,
        header=True)
Example no. 14
   def __init__(self, agent, memory):

      self.agent = agent

      self.directions = [
         np.array([1, 0]),
         np.array([-1, 0]),
         np.array([0, -1]), 
         np.array([0, 1])]

      self.senses = []

      self.brain = RL.QLearn(numActions = len(self.directions), memory = memory)
      self.scheduledAction = None
      self.learningModule = None
Example no. 15
def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)
            RL.store_transition(observation, action, reward, False)

            if done:
                RL.store_transition(observation, action, 0.0, True)
                RL.learn()
                break
            observation = observation_

        end_time = time.time()
        plot_.plot_graph((end_time - start_time), i)
    env.close()
    RL.store_net()
Example no. 16
 def __init__(self,
              model=None,
              path=None,
              reload_every=60 * 60,
              swap=False,
              memory=0,
              delay=0,
              **kwargs):
     self.model = RL.Model(model,
                           path,
                           swap=swap,
                           mode=RL.Mode.PLAY,
                           memory=memory,
                           **kwargs)
     self.reload_every = reload_every
     self.counter = 0
     self.action = 0
     self.actions = util.CircularQueue(delay + 1, 0)
     self.memory = util.CircularQueue(array=((memory + 1) *
                                             ssbm.SimpleStateAction)())
     self.model.restore()
Example no. 17
def train_rl(model_name: str) -> str:
    """
    Trains the model using the parameters defined
    :param model_name: name of the model to save
    :return: str, name of the stored model
    """
    # model name
    model_name = model_name + "_RL"

    print('- start training RL model')
    trained_model_name = RL.train(
        supply_distribution=supply_distribution,  # global
        demand_distribution=demand_distribution,  # global
        model_name=model_name,  # in loop
        max_age=parameters['max_age'],
        demand=parameters['demand'],
        max_day=parameters['rl']['max_day'],
        obs_method=parameters['rl']['obs_method'],
        doi=parameters['doi'],
        training_timesteps_list=parameters['rl']['training_interval'],
        tblog=parameters['rl']['tb_log'])
    print('- Complete training RL model')
    return trained_model_name
Example no. 18
   def __initAI__(self):    
      self.efficiencyPlot = view.Plot()

      self.aiCollection = ai.AICollection()
      self.positionMonitors = []
      self.trainedAI = ai.TrainedAI(goalId = 'e', wallId = '#', statisticsPlot = self.efficiencyPlot)      

      self.eyesight = ai.Eyesight(1)
      self.smell = ai.Smell('e')

      self.memory = RL.QMemory()
      self.savedMemory = None
      self.savedMemoryNo = -1
      
      self.timer = utils.Timer(0.5)
      self.efficiencyPlot.show()

      def updatePositionMonitors():
         for monitor in self.positionMonitors:
            monitor.update(self.scene)

      self.timer.addToTick(lambda: self.aiCollection.think(self.scene))
      self.timer.addToTick(updatePositionMonitors)
Example no. 19
def PlotValueFunction(AI):
    if hasattr(AI, 'QueryQBestAction') and callable(
            getattr(AI, 'QueryQBestAction')):
        # Update plot of optimal value function (only of position and velocity)
        X, Y = np.meshgrid(range(0, int(BASEY + 30), 20), range(-10, 10, 1))

        Z = np.zeros(X.shape)
        for yy in xrange(X.shape[0]):
            for xx in xrange(X.shape[1]):
                Z[yy, xx] = AI.QueryQBestAction(
                    RL.FB_GS(0, X[yy, xx], 0, Y[yy, xx], [{
                        'x': 0,
                        'y': 0
                    }, {
                        'x': 0,
                        'y': 0
                    }], [{
                        'x': 0,
                        'y': 0
                    }, {
                        'x': 0,
                        'y': 0
                    }]))

        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(X,
                        Y,
                        Z,
                        rstride=1,
                        cstride=1,
                        cmap=cm.coolwarm,
                        linewidth=0,
                        antialiased=False)
        plt.savefig('optimalQ.png')
        plt.close(fig)
Example no. 20
import RL
import os

RL.init()
# RL.restore()


def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()
    #RL.writeGraph()


while True:
    sweep()
Example no. 21
Pl[5,0,5]=0.1
Pl[6,0,6]=1
Pl[0,1,0]=1
Pl[1,1,1]=0
Pl[1,1,0]=1
Pl[2,1,1]=1
Pl[3,1,2]=1
Pl[4,1,3]=1
Pl[5,1,4]=1    
Pl[6,1,5]=1
   
Rl = np.zeros((7,2))
Rl[[0,6],:]=1
absorv = np.zeros((7,1))
absorv[[0,6]]=1
fmdp = RL.finiteMDP(7,2,0.9,Pl,Rl,absorv)

J,traj = fmdp.runPolicy(10000,3,poltype = "exploration") #choose this value
data = np.load("Q1.npz")
Qr = fmdp.traces2Q(traj)
if np.sqrt(sum(sum((data['Q1']-Qr)**2)))<1:
    print("Aproximação de Q dentro do previsto. OK\n")
else:
    print("Aproximação de Q fora do previsto. FAILED\n")

J,traj = fmdp.runPolicy(3,3,poltype = "exploitation", polpar = Qr)
if np.sqrt(sum(sum((data['traj2']-traj)**2)))<1:
    print("Trajectória óptima. OK\n")
else:
    print("Trajectória não óptima. FAILED\n")
    
Example no. 22
def experiment(device,
               reward_system,
               PIPEGAP,
               BATCH_SIZE,
               learning_rate,
               MEMORY_SIZE,
               GAMMA,
               EPS_START,
               EPS_END,
               EPS_DECAY,
               OBSERVE,
               FRAME_PER_ACTION,
               TARGET_UPDATE,
               num_episodes,
               save_model=False,
               load_model=False,
               load_model_path_prefix=None):
    expected_q_value = 0

    policy_net = RL.DQN().to(device)
    target_net = RL.DQN().to(device)
    if load_model:
        policy_net.load_state_dict(
            torch.load(load_model_path_prefix + "_policy_net.mdl"))
        target_net.load_state_dict(
            torch.load(load_model_path_prefix + "_target_net.mdl"))
    else:
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = RL.ReplayMemory(MEMORY_SIZE)

    #Setup Game environment
    game = FlappyBird.FlappyBird(pipe_gap=PIPEGAP)
    env = PLE(game,
              fps=30,
              display_screen=True,
              force_fps=True,
              reward_values=reward_system)

    #Setup plot
    RLplot.plot_init()
    episode_durations = []

    # Main part with game execution

    env.init()
    steps_done = 0
    infinity = False

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset_game()
        state = env.getScreenRGB()
        state = RLip.BCHW_format(state)
        frames = (state, state, state, state)
        state = RLip.last_4_frames(state, frames[1], frames[2], frames[3])

        for t in count():
            # Select an action
            action, steps_done = RL.select_action(state, policy_net,
                                                  steps_done, device,
                                                  EPS_START, EPS_END,
                                                  EPS_DECAY, OBSERVE)
            if steps_done % FRAME_PER_ACTION != 0:
                action = torch.tensor([[1]], device=device, dtype=torch.long)

            # Perform an action
            reward = env.act(env.getActionSet()[action[0, 0]])
            next_state = env.getScreenRGB()
            done = env.game_over()
            reward = torch.tensor([reward], device=device)

            # Formatting next state for network
            if not done:
                next_state = RLip.BCHW_format(next_state)
                frames = (next_state, frames[0], frames[1], frames[2])
                next_state = RLip.last_4_frames(next_state, frames[1],
                                                frames[2], frames[3])
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)  # edit

            # Move to the next state
            state = next_state

            # Print Log of training info
            if steps_done <= OBSERVE:
                state_of_training = "observe"
            elif steps_done > OBSERVE and steps_done <= OBSERVE + EPS_DECAY:
                state_of_training = "explore"
            else:
                state_of_training = "train"
            print("TIMESTEP", steps_done, "/ STATE", state_of_training,\
                 "/ ACTION", action[0,0].data,"/ REWARD", reward[0].data,"/ Expected_Q",expected_q_value)

            # Perform one step of the optimization (on the target network)
            if steps_done > OBSERVE:
                RL.optimize_model(policy_net, target_net, memory, optimizer,
                                  device, BATCH_SIZE, GAMMA)
                if done:
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
                if t > 10000:
                    infinity = True
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
            else:
                if done:
                    break

        # Update the target network
        if i_episode % TARGET_UPDATE == 0 and steps_done > OBSERVE:
            target_net.load_state_dict(policy_net.state_dict())
        if infinity:
            break
    # End training process
    # Save experiment result
    data = {
        "data": episode_durations,
        'pipe_gap': PIPEGAP,
        'reward_values': reward_system,
        'BATCH_SIZE': BATCH_SIZE,
        'learning_rate': learning_rate,
        'MEMORY_SIZE': MEMORY_SIZE,
        'GAMMA': GAMMA,
        'EPS_START': EPS_START,
        'EPS_END': EPS_END,
        'EPS_DECAY': EPS_DECAY,
        'OBSERVE': OBSERVE,
        'FRAME_PER_ACTION': FRAME_PER_ACTION,
        'TARGET_UPDATE': TARGET_UPDATE,
        'num_episodes': num_episodes
    }
    filenameprefix = './result/Expe_' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S')
    filename = filenameprefix + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
    # Save model if said so
    if save_model:
        torch.save(policy_net.state_dict(), filenameprefix + '_policy_net.mdl')
        torch.save(target_net.state_dict(), filenameprefix + '_target_net.mdl')

    # Save plot figure
    plotname = filenameprefix + '.png'
    RLplot.plot_end(plotname)
Example no. 23
        os.system("sbatch " + slurmfile)
        #os.system("sbatch -N 1 -c 2 --mem=8000 --time=6-23:00:00 slurm_scripts/" + jobname + ".slurm &")


init = False
init = True

if dry_run:
    print("NOT starting jobs:")
else:
    print("Starting jobs:")

    # init model for the first time
    if init:
        import RL
        model = RL.Model(mode=RL.Mode.TRAIN, gpu=False, **job_dicts['train'])
        model.init()
        model.save()

train_name = "trainer_" + exp_name
train_command = "python3 -u train.py" + job_flags['train']

slurm_script(train_name, train_command, gpu=True)

#sys.exit()

agent_count = 0
agent_command = "python3 -u run.py" + job_flags['agent']
for c1 in characters:
    for c2 in characters:
        command = agent_command + " --p1 %s --p2 %s" % (c1, c2)
Example no. 24
import RL
import random

env = RL.Env()
jernej = RL.Player()

for i in range(env.STEPS):
    action = [jernej.move_o(), jernej.move_p()]
    #action = [random.choice([-5, 5]), random.choice([-.05, .05])]
    env.step(action)
    env.render()
    if env.done:
        print(f'Crashed in episode step: {env.episode_step}')
        env.reset()
Example no. 25
import gym
import RL
from draw_graph import Plot
import time

env = gym.make('CartPole-v0')
env = env.unwrapped

plot_ = Plot()

MAX_EPISODES = 2000
MAX_STEP_EPISODES = 5000

RL = RL.PolicyGradient(n_actions=env.action_space.n,
                       n_features=env.observation_space.shape[0],
                       n_hidden=10,
                       learning_rate=0.01,
                       reward_decay=0.99,
                       epsilon=0.90)


def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)
Example no. 26
        start_time_expanded = time.time()
        Q, avg_reward, state_action_count = rl.learn_Q(
            env, n_sims, gamma = 1, omega = omega, epsilon = epsilon, init_val = init_val,
            episode_file=path_fun("hand_state"), warmup=warmup)
        print("Number of explored states: " + str(len(Q)))
        print("Cumulative avg. reward = " + str(avg_reward))
        time_to_completion_expanded = time.time() - start_time_expanded
        """

        print("----- Starting Q-learning for sum-based state space -----")
        # Q-learning with player sum state representation
        start_time_sum = time.time()
        sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
            sum_env,
            n_sims,
            omega=omega,
            epsilon=epsilon,
            init_val=init_val,
            episode_file=path_fun("sum_state"),
            warmup=warmup)

        time_to_completion_sum = time.time() - start_time_sum

        print("Number of explored states (sum states): " + str(len(sumQ)))
        print("Cumulative avg. reward = " + str(sum_avg_reward))
        """
        print("Training time: \n " +
              "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
                 time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))
        
    
        
Example no. 27
                    type=int,
                    default=0,
                    help="how many frames to remember")

args = parser.parse_args()

if args.name is None:
    args.name = args.model

if args.path is None:
    args.path = "saves/%s/" % args.name

experience_dir = args.path + 'experience/'
os.makedirs(experience_dir, exist_ok=True)

model = RL.Model(mode=RL.Mode.TRAIN, **args.__dict__)

# do this in RL?
if args.init:
    model.init()
    model.save()
else:
    model.restore()

import numpy as np


def sweep(data_dir='experience/'):
    i = 0
    start_time = time.time()
    files = os.listdir(data_dir)
Example no. 28
def reinfrocement_neural_network_control(load_weights=None,
                                         run_only=False,
                                         track_select='SS',
                                         random_seed=None,
                                         rl_prams=None):
    run = run_only
    weights_save_dir = "./weights/"
    if not os.path.exists(weights_save_dir): os.makedirs(weights_save_dir)
    Environment.track_generator(track, track_select=track_select)
    env = Environment.Environment(track, rl_parameters['max_steps'])
    gui = GUI.GUI(track, cars, trace=True)
    car_objects = [Environment.Car(c) for c in cars]
    rl = RL.QLearning_NN(rl_prams, weights_save_dir=weights_save_dir)
    rl.generate_nn()
    if load_weights is not None:
        if load_weights == 'all':
            run = True
        else:
            rl.load_weights(load_weights)
    if random_seed is not None: rl.random_seed(random_seed)
    weight_names = sorted([name for name in glob.glob(weights_save_dir + '*')])
    weight_names_index = 0

    def initialize(run_state):
        env.compute_interaction(car_objects)
        for car in car_objects:
            car.reset()
            car.get_sensor_reading()
        if run_state == True:
            env.set_max_steps(1500)
            gui.remove_traces()
            gui.disable_trace()
            gui.set_run_select(gui.runs[1])
            gui.update_debug_info('[Testing]\n' +
                                  'Currently learned weights loaded')
        else:
            env.set_max_steps(rl_prams['max_steps'])
            gui.enable_trace()
            gui.set_run_select(gui.runs[0])
            gui.update_debug_info('[Training]\n')

    def check_run_button(current_state):
        if gui.get_run_select() == gui.runs[0] and current_state == True:
            print '\n\n\nLearning\n'
            initialize(run_state=False)
            return False
        if gui.get_run_select() == gui.runs[1] and run == False:
            print '\n\n\nRun only\n'
            initialize(run_state=True)
            return True
        return None

    initialize(run_state=run)
    while (1):
        new_run_state = check_run_button(current_state=run)
        if new_run_state is not None: run = new_run_state
        if run == True:
            for i, car in enumerate(car_objects):
                terminal = rl.run_step(car, env, dt)
                if terminal is not None:
                    print 'Car', i, ':', terminal
                    if i == 0:
                        if load_weights == 'all' and weight_names_index < len(
                                weight_names):
                            rl.load_weights(weight_names[weight_names_index])
                            gui.update_debug_info(
                                '[Testing]\n' + 'Weights loaded:\n' +
                                weight_names[weight_names_index])
                            weight_names_index += 1
                gui.update(i, car.get_state())
            env.compute_interaction(car_objects)
            gui.refresh()
        else:
            terminal, debug, epoch, avg_loss, final_score, cross_score = rl.learn_step(
                car_objects[0], env, dt)
            if terminal is not None:
                if debug is not None:
                    gui.update_debug_info(debug)
                    gui.update_graph(epoch, avg_loss, gui.graphs[0])
                    gui.update_graph(epoch, final_score, gui.graphs[1])
                    gui.update_graph(epoch, cross_score, gui.graphs[2])
                    gui.refresh()
                gui.update(0, terminal, draw_car=False, force_end_line=True)
                gui.refresh()
            if rl.epoch % 100 == 0:
                gui.update(0, car_objects[0].get_state(), draw_car=True)
                gui.refresh()
            else:
                gui.update(0, car_objects[0].get_state(), draw_car=False)
Example no. 29
            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()

            observation = observation_
            if done:
                print(t_reward)
                break


if __name__ == "__main__":
    env = map_env.Map()
    mode = sys.argv[1]
    env.after(100, traffic_baseline())
    env.destroy()
    if mode == 'RL':
        env = map_env.Map()
        RL = RL.QLearningTable(env.action_space)

    elif mode == 'DQN':
        env = map_env.Map()
        RL = RL.DeepQNetwork(num_actions=2, num_features=6, actions=['y', 'n'])
    env.after(100, traffic())
Example no. 30
import RL
import os

RL.init()
# RL.restore()

def sweep(data_dir='experience/'):
  # for f in ["2"]:
  for f in os.listdir(data_dir):
    if f.isdigit():
        filename = data_dir + f
        print("Training on " + filename)
        RL.train(filename)
    else:
        print("Not training on file:", f)
  RL.save()
  #RL.writeGraph()

while True:
  sweep()
Example no. 31
        # Q-learning with expanded state representation
        start_time_expanded = time.time()
        Q, avg_reward, state_action_count = rl.learn_Q(
            env, n_sims, gamma = 1, omega = omega, epsilon = epsilon, init_val = init_val,
            episode_file=path_fun("hand_state"), warmup=warmup)
        print("Number of explored states: " + str(len(Q)))
        print("Cumulative avg. reward = " + str(avg_reward))
        time_to_completion_expanded = time.time() - start_time_expanded"""

        print("----- Starting Q-learning for sum-based state space -----")
        # Q-learning with player sum state representation
        start_time_sum = time.time()
        sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
            sum_env,
            n_sims,
            omega=omega,
            epsilon=epsilon,
            init_val=init_val,
            episode_file=path_fun("sum_state"),
            warmup=warmup)
        time_to_completion_sum = time.time() - start_time_sum
        print("Number of explored states (sum states): " + str(len(sumQ)))
        print("Cumulative avg. reward = " + str(sum_avg_reward))
        """print("Training time: \n " +
              "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
                 time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))


        # Convert Q (extended state) to sum state representation and make 3D plots
        # Extended state MC-learning
        Q_conv_MC = rl.convert_to_sum_states(Q_MC, env)
        V_conv_MC = rl.convert_to_value_function(Q_conv_MC)
Example no. 32
    data_size = round(
        (2 + args.input_moment_order + 1) * args.input_moment_order / 2)
elif args.input == 'wavefunction':
    data_size = 2 * (x_n - 10 * 2)

# we do not plot when we do parallelized computation

#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#    reward_multiply=reward_multiply, read_length=read_length, controls_per_unit_time=controls_per_unit_time)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval,
                      t_max=t_max,
                      F_max=args.F_max,
                      failing_reward=failing_reward)

################################## end learning setting


# Below is the worker function for subprocesses, which carries out the control simulations and pushes the experiences and records to queues that are collected and handled by other processes. (Quantum simulation is implemented in a compiled C module)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid using CUDA in these workers. Instead, these workers ask a manager process when they want to evaluate the neural network, and only the manager process is allowed to use CUDA to evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random = np.random.RandomState(seed)
    simulation.set_seed(random.randint(0, 2**31 - 1))
    # preparing pipes
    MemoryQueue, ResultsQueue, ActionPipe, EndEvent, PauseEvent = pipes
    state_data_to_manager = np.frombuffer(shared_buffer, dtype='float32')
Example no. 33
                        num_of_data_per_time_unit)  # 3600
    read_control_step_length = control_interval // coarse_grain
    data_size = 2 * read_length
    shape_measurement_data = (2, read_length)

# we do not plot when we do parallelized computation

#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#    reward_multiply=reward_multiply, read_length=read_length, controls_per_half_period=controls_per_half_period)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval,
                      t_max=t_max,
                      F_max=args.F_max)
    if args.input == 'measurements':
        RL.set_parameters(read_step_length=read_control_step_length)

################################## end learning setting


# Below is the worker function for subprocesses, which carries out the control simulations and pushes the experiences and records to queues that are collected and handled by other processes. (Quantum simulation is implemented in a compiled C module)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid using CUDA in these workers. Instead, these workers ask a manager process when they want to evaluate the neural network, and only the manager process is allowed to use CUDA to evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random.seed(seed)
    np.random.seed(seed)
    simulation.set_seed(seed)
Example no. 34
import numpy as np
import MDP
import RL
''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
T = np.array([[[0.5, 0.5, 0, 0], [0, 1, 0, 0], [0.5, 0.5, 0, 0], [0, 1, 0, 0]],
              [[1, 0, 0, 0], [0.5, 0, 0, 0.5], [0.5, 0, 0.5, 0],
               [0, 0, 0.5, 0.5]]])
R = np.array([[0, 0, 10, 10], [0, 0, 10, 10]])
discount = 0.9
mdp = MDP.MDP(T, R, discount)
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
[Q,
 policy] = rlProblem.qLearning(s0=0,
                               initialQ=np.zeros([mdp.nActions, mdp.nStates]),
                               nEpisodes=1000,
                               nSteps=100,
                               epsilon=0.3)
print("\nQ-learning results")
print(Q)
print(policy)

# import numpy as np
# import MDP
# import RL
#
#
# ''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
# T = np.array([[[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]],[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]]])
# R = np.array([[0,0,10,10],[0,0,10,10]])
Example no. 35
    for decks in [1, 2, 6, 8, inf]:
        print("----- deck number equal to {} -----".format(decks))
        # set seed
        seed = 31233
        # init envs.
        env = bjk.BlackjackEnvExtend(decks=decks, seed=seed)
        sum_env = bjk_base.BlackjackEnvBase(decks=decks, seed=seed)

        print("----- Starting MC training on expanded state space -----")
        # MC-learning with expanded state representation
        start_time_MC = time.time()
        Q_MC, MC_avg_reward, state_action_count = rl.learn_MC(
            env,
            n_sims,
            gamma=1,
            epsilon=epsilon,
            init_val=init_val,
            episode_file=path_fun("hand_MC_state"),
            warmup=warmup)
        print("Number of explored states: " + str(len(Q_MC)))
        print("Cumulative avg. reward = " + str(MC_avg_reward))
        time_to_completion_MC = time.time() - start_time_MC

        print("----- Starting Q-learning on expanded state space -----")
        # Q-learning with expanded state representation
        start_time_expanded = time.time()
        Q, avg_reward, state_action_count = rl.learn_Q(
            env,
            n_sims,
            gamma=1,
            omega=omega,