def policy_visualize(Q, env, decks):
    Q = rl.convert_to_sum_states(Q, env)
    Q_ = q_with_optimalaction(Q)
    optQ = rl.fill_missing_sum_states(rl.filter_states(Q_), default_value=0.5)
    data = pd.DataFrame(list(optQ.items()))
    for i in data[0]:
        if i == data[0][0]:
            x = np.array(i[0])
            y = np.array(i[1])
            z = np.array(i[2])
        else:
            x = np.append(x, i[0])
            y = np.append(y, i[1])
            z = np.append(z, i[2])
    data["player_hand"] = x
    data["show_card"] = y
    data["use_ace"] = z
    data.drop(0, axis=1, inplace=True)
    use_ace_set = data[data["use_ace"] == True]
    nouse_ace_set = data[data["use_ace"] == False]
    use_ace_set = use_ace_set.pivot(index="player_hand", columns="show_card",
                                    values=1).sort_index(ascending=False)
    nouse_ace_set = nouse_ace_set.pivot(index="player_hand", columns="show_card",
                                        values=1).sort_index(ascending=False)
    """ax1, ax2 = plt.axes()
    ax1.set_title("Optimal Policy with use ace")
    ax2.set_title("Optimal Policy without use ace")
    fig1 = sns.heatmap(use_ace_set, ax=ax1).get_figure()
    fig2 = sns.heatmap(nouse_ace_set, ax=ax2).get_figure()
    fig1.savefig("figures/Optimal Policy with use ace in {}deck.jpg".format(decks))
    fig2.savefig("figures/Optimal Policy without use ace in {}decks.jpg".format(decks))"""
    fig, ax = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle("optimal policy in {}decks".format(decks), fontsize=16)
    ax[0].set_title("with use ace")
    ax[1].set_title("without use ace")
    color = ["k", "w", "g"]
    cmap = sns.color_palette(color, n_colors=3)
    sns.heatmap(use_ace_set, ax=ax[0], cmap=cmap, linewidths=.5,
                linecolor="lightgray", cbar_kws={"ticks": [0., 0.5, 1.]})
    sns.heatmap(nouse_ace_set, ax=ax[1], cmap=cmap, linewidths=.5,
                linecolor="lightgray", cbar_kws={"ticks": [0., 0.5, 1.]})
    fig.savefig("figures/Optimal Policy in {}deck.jpg".format(decks))

def traffic():
    for i in range(100):
        observation = env.reset()
        t_reward = 0
        step = 0
        r1 = rnd
        r2 = rnd
        r1.seed(1)
        r2.seed(2)
        while True:
            step += 1
            # time.sleep(0.1)
            cars(r1, r2)
            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()
            observation = observation_
            if done:
                print(t_reward)
                break

def __init__(self, graph_path='models/simpleDQN.pb', reload_every=60 * 60):
    self.graph_path = graph_path
    #self.sess = None
    #self.load_graph()
    self.reload_every = reload_every
    self.counter = 0
    self.simple_controller = ssbm.SimpleControllerState()
    RL.restore()

def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()

def get_action(self, state):
    scores = RL.scoreActions(state)
    score, best_action = max(zip(scores, ssbm.simpleControllerStates),
                             key=lambda x: x[0])
    #print(score, best_action)
    self.epsilon = RL.getEpsilon()
    if flip(self.epsilon):
        self.simple_controller = ssbm.SimpleControllerState.randomValue()
    else:
        self.simple_controller = best_action

def advance(self, state, pad):
    self.counter += 1
    if self.counter >= self.reload_every:
        #self.load_graph()
        print("RL.restore()")
        RL.restore()
        self.counter = 0
    self.get_action(state)
    if self.counter % 60 == 0:
        print("Frame %d of recording." % self.counter)
        print(self.simple_controller)
        print(self.epsilon)
    pad.send_controller(self.simple_controller.realController())

def test_rl(model_name: str, trained_model_name: str) -> dict:
    """
    Tests the RL agent.

    Note that the trained and the tested RL agent need to use the same values for
    these parameters:
    - antigens included
    - max age
    - state type
    - obs method

    :param model_name: name of the model to be stored
    :param trained_model_name: name of the trained agent to be evaluated
    :return: dict containing all evaluation metrics
    """
    model_name = model_name + "_RL"
    print('- start testing RL model')
    results = RL.solve(supply_distribution=supply_distribution,
                       demand_distribution=demand_distribution,
                       model_name=model_name,
                       export_model='results/model/' + trained_model_name + '/best_model',
                       max_age=parameters['max_age'],
                       demand=parameters['demand'],
                       doi=parameters['doi'],
                       n_warm_start_days=parameters['n_warm_start_days'],
                       n_days=parameters['n_days'],
                       obs_method=parameters['rl']['obs_method'],
                       state_type=parameters['rl']['state_type'])
    print('- complete testing RL model')
    return results[0]

def play():
    board = Tic.Tic(size)
    nn = RL.RL([squ, 10 * squ, 10 * squ, 10 * squ, squ])
    sess = tf.Session()
    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(path)
    saver.restore(sess, ckpt.model_checkpoint_path)
    done = False
    i = 0
    ai = 0  # 0 is x, 1 is o
    agent = -1 * (ai * 2 - 1)
    winner = 0
    while not done:
        loc = [-1, -1]
        if i % 2 == ai:
            a, m = sess.run([nn.predict, nn.out],
                            feed_dict={nn.input: board.state(agent)})
            a = a[0]
            print(m)
            loc[0] = int(a / size)
            loc[1] = a % size
            board.play(agent, loc)
        else:
            board.print()
            validPlay = False
            while not validPlay:
                text = input("Please enter play position 'row,column': ")
                loc = text.split(',')
                loc[0] = int(loc[0])
                loc[1] = int(loc[1])
                validPlay = board.valid(loc)
                if not validPlay:
                    print("INVALID PLAY: Please choose another position")
            board.play(-1 * agent, loc)
        output = board.done()
        done = output[0]
        winner = output[1]
        i += 1
    board.print()
    if winner == 0:
        print("Tie!")
    elif winner == agent:
        print("Computer Wins")
    else:
        print("Human Wins")

def make_user_features(userId, prodId, date, ratings, recommend, review_words, review_text):
    user_features = pd.DataFrame()
    unique_user = list(np.unique(userId))
    user_features.insert(0, "userId", unique_user)
    # 1. MNR
    user_features.insert(1, "mnr", MNR(userId, date))
    # 2. PR
    user_features.insert(2, "PR", PR_NR(userId, ratings, "PR"))
    # 3. NR
    user_features.insert(3, "NR", PR_NR(userId, ratings, "NR"))
    # 4. avgRD
    user_features.insert(4, "avgRD", avgRD(userId, prodId, ratings, us_pr="user"))
    # 5. WRD - not implemented
    # 6. BST
    user_features.insert(5, "BST", BST_user(userId, date))
    # 7. ERD - not implemented
    # 8. ETG - not implemented
    # 9. RL - uses review_text; remove the [0:3000] slice later...
    user_features.insert(6, "RL", RL(userId, review_words[0:3000]))
    # Use the review content to compute TF-IDF. max_features keeps only the
    # 2000 most frequent words/bigrams across the text documents.
    vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=2000,
                                 stop_words='english')
    TFIDF = vectorizer.fit_transform(review_text)
    # 10. ACS
    user_features.insert(7, "ACS", ACS(userId, TFIDF))
    # 11. MCS
    user_features.insert(8, "MCS", MCS(userId, TFIDF))
    # Write to a csv file and exit the function
    user_features.to_csv('/Users/anaghakaranam/Desktop/Opinion_Spam/coding-playground/feature_csvs/user_features.csv',
                         index=None, header=True)

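# Minimal sketch of the TF-IDF step used in make_user_features above: with
# ngram_range=(1, 2) and max_features, only the most frequent unigrams/bigrams
# across all documents are kept as columns. The toy documents and the small
# max_features value here are illustrative assumptions, not taken from the dataset.
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["great phone, great battery",
        "terrible phone, battery died fast",
        "battery life is great"]
vec = TfidfVectorizer(ngram_range=(1, 2), max_features=5, stop_words='english')
X = vec.fit_transform(docs)
print(vec.get_feature_names_out())  # at most 5 retained terms
print(X.shape)                      # (3 documents, <=5 features)
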
def __init__(self, agent, memory):
    self.agent = agent
    self.directions = [
        np.array([1, 0]),
        np.array([-1, 0]),
        np.array([0, -1]),
        np.array([0, 1])]
    self.senses = []
    self.brain = RL.QLearn(numActions=len(self.directions), memory=memory)
    self.scheduledAction = None
    self.learningModule = None

def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)
            RL.store_transition(observation, action, reward, False)
            if done:
                RL.store_transition(observation, action, 0.0, True)
                RL.learn()
                break
            observation = observation_
        end_time = time.time()
        plot_.plot_graph((end_time - start_time), i)
    env.close()
    RL.store_net()

def __init__(self, model=None, path=None, reload_every=60 * 60,
             swap=False, memory=0, delay=0, **kwargs):
    self.model = RL.Model(model, path, swap=swap, mode=RL.Mode.PLAY,
                          memory=memory, **kwargs)
    self.reload_every = reload_every
    self.counter = 0
    self.action = 0
    self.actions = util.CircularQueue(delay + 1, 0)
    self.memory = util.CircularQueue(array=((memory + 1) * ssbm.SimpleStateAction)())
    self.model.restore()

def train_rl(model_name: str) -> str:
    """
    Trains the model using the parameters defined.

    :param model_name: name of the model to save
    :return: str, name of the stored model
    """
    # model name
    model_name = model_name + "_RL"
    print('- start training RL model')
    trained_model_name = RL.train(
        supply_distribution=supply_distribution,  # global
        demand_distribution=demand_distribution,  # global
        model_name=model_name,  # in loop
        max_age=parameters['max_age'],
        demand=parameters['demand'],
        max_day=parameters['rl']['max_day'],
        obs_method=parameters['rl']['obs_method'],
        doi=parameters['doi'],
        training_timesteps_list=parameters['rl']['training_interval'],
        tblog=parameters['rl']['tb_log'])
    print('- complete training RL model')
    return trained_model_name

def __initAI__(self):
    self.efficiencyPlot = view.Plot()
    self.aiCollection = ai.AICollection()
    self.positionMonitors = []
    self.trainedAI = ai.TrainedAI(goalId='e', wallId='#',
                                  statisticsPlot=self.efficiencyPlot)
    self.eyesight = ai.Eyesight(1)
    self.smell = ai.Smell('e')
    self.memory = RL.QMemory()
    self.savedMemory = None
    self.savedMemoryNo = -1
    self.timer = utils.Timer(0.5)
    self.efficiencyPlot.show()

    def updatePositionMonitors():
        for monitor in self.positionMonitors:
            monitor.update(self.scene)

    self.timer.addToTick(lambda: self.aiCollection.think(self.scene))
    self.timer.addToTick(updatePositionMonitors)

def PlotValueFunction(AI):
    if hasattr(AI, 'QueryQBestAction') and callable(getattr(AI, 'QueryQBestAction')):
        # Update plot of the optimal value function (position and velocity only)
        X, Y = np.meshgrid(range(0, int(BASEY + 30), 20), range(-10, 10, 1))
        Z = np.zeros(X.shape)
        for yy in xrange(X.shape[0]):
            for xx in xrange(X.shape[1]):
                Z[yy, xx] = AI.QueryQBestAction(
                    RL.FB_GS(0, X[yy, xx], 0, Y[yy, xx],
                             [{'x': 0, 'y': 0}, {'x': 0, 'y': 0}],
                             [{'x': 0, 'y': 0}, {'x': 0, 'y': 0}]))
        fig = plt.figure()
        ax = fig.gca(projection='3d')
        ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,
                        linewidth=0, antialiased=False)
        plt.savefig('optimalQ.png')
        plt.close(fig)

import RL
import os

RL.init()
# RL.restore()

def sweep(data_dir='experience/'):
    # for f in ["2"]:
    for f in os.listdir(data_dir):
        if f.isdigit():
            filename = data_dir + f
            print("Training on " + filename)
            RL.train(filename)
        else:
            print("Not training on file:", f)
    RL.save()

#RL.writeGraph()

while True:
    sweep()

Pl[5, 0, 5] = 0.1
Pl[6, 0, 6] = 1
Pl[0, 1, 0] = 1
Pl[1, 1, 1] = 0
Pl[1, 1, 0] = 1
Pl[2, 1, 1] = 1
Pl[3, 1, 2] = 1
Pl[4, 1, 3] = 1
Pl[5, 1, 4] = 1
Pl[6, 1, 5] = 1

Rl = np.zeros((7, 2))
Rl[[0, 6], :] = 1

absorv = np.zeros((7, 1))
absorv[[0, 6]] = 1

fmdp = RL.finiteMDP(7, 2, 0.9, Pl, Rl, absorv)

J, traj = fmdp.runPolicy(10000, 3, poltype="exploration")  # choose this value

data = np.load("Q1.npz")
Qr = fmdp.traces2Q(traj)
if np.sqrt(sum(sum((data['Q1'] - Qr)**2))) < 1:
    print("Approximation of Q within the expected range. OK\n")
else:
    print("Approximation of Q outside the expected range. FAILED\n")

J, traj = fmdp.runPolicy(3, 3, poltype="exploitation", polpar=Qr)
if np.sqrt(sum(sum((data['traj2'] - traj)**2))) < 1:
    print("Optimal trajectory. OK\n")
else:
    print("Non-optimal trajectory. FAILED\n")

def experiment(device, reward_system, PIPEGAP, BATCH_SIZE, learning_rate,
               MEMORY_SIZE, GAMMA, EPS_START, EPS_END, EPS_DECAY, OBSERVE,
               FRAME_PER_ACTION, TARGET_UPDATE, num_episodes,
               save_model=False, load_model=False, load_model_path_prefix=None):
    expected_q_value = 0
    policy_net = RL.DQN().to(device)
    target_net = RL.DQN().to(device)
    if load_model:
        policy_net.load_state_dict(torch.load(load_model_path_prefix + "_policy_net.mdl"))
        target_net.load_state_dict(torch.load(load_model_path_prefix + "_target_net.mdl"))
    else:
        target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    memory = RL.ReplayMemory(MEMORY_SIZE)

    # Set up game environment
    game = FlappyBird.FlappyBird(pipe_gap=PIPEGAP)
    env = PLE(game, fps=30, display_screen=True, force_fps=True,
              reward_values=reward_system)

    # Set up plot
    RLplot.plot_init()
    episode_durations = []

    # Main part with game execution
    env.init()
    steps_done = 0
    infinity = False
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset_game()
        state = env.getScreenRGB()
        state = RLip.BCHW_format(state)
        frames = (state, state, state, state)
        state = RLip.last_4_frames(state, frames[1], frames[2], frames[3])
        for t in count():
            # Select an action
            action, steps_done = RL.select_action(state, policy_net, steps_done,
                                                  device, EPS_START, EPS_END,
                                                  EPS_DECAY, OBSERVE)
            if steps_done % FRAME_PER_ACTION != 0:
                action = torch.tensor([[1]], device=device, dtype=torch.long)
            # Perform an action
            reward = env.act(env.getActionSet()[action[0, 0]])
            next_state = env.getScreenRGB()
            done = env.game_over()
            reward = torch.tensor([reward], device=device)
            # Format the next state for the network
            if not done:
                next_state = RLip.BCHW_format(next_state)
                frames = (next_state, frames[0], frames[1], frames[2])
                next_state = RLip.last_4_frames(next_state, frames[1], frames[2], frames[3])
            else:
                next_state = None
            # Store the transition in memory
            memory.push(state, action, next_state, reward)  # edit
            # Move to the next state
            state = next_state
            # Print log of training info
            if steps_done <= OBSERVE:
                state_of_training = "observe"
            elif steps_done > OBSERVE and steps_done <= OBSERVE + EPS_DECAY:
                state_of_training = "explore"
            else:
                state_of_training = "train"
            print("TIMESTEP", steps_done, "/ STATE", state_of_training,
                  "/ ACTION", action[0, 0].data, "/ REWARD", reward[0].data,
                  "/ Expected_Q", expected_q_value)
            # Perform one step of the optimization (on the target network)
            if steps_done > OBSERVE:
                RL.optimize_model(policy_net, target_net, memory, optimizer,
                                  device, BATCH_SIZE, GAMMA)
                if done:
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
                if t > 10000:
                    infinity = True
                    episode_durations.append(t + 1)
                    RLplot.plot_durations(episode_durations)
                    break
            else:
                if done:
                    break
        # Update the target network
        if i_episode % TARGET_UPDATE == 0 and steps_done > OBSERVE:
            target_net.load_state_dict(policy_net.state_dict())
        if infinity:
            break
    # End of training process

    # Save experiment result
    data = {"data": episode_durations, 'pipe_gap': PIPEGAP,
            'reward_values': reward_system, 'BATCH_SIZE': BATCH_SIZE,
            'learning_rate': learning_rate, 'MEMORY_SIZE': MEMORY_SIZE,
            'GAMMA': GAMMA, 'EPS_START': EPS_START, 'EPS_END': EPS_END,
            'EPS_DECAY': EPS_DECAY, 'OBSERVE': OBSERVE,
            'FRAME_PER_ACTION': FRAME_PER_ACTION,
            'TARGET_UPDATE': TARGET_UPDATE, 'num_episodes': num_episodes}
    filenameprefix = './result/Expe_' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    filename = filenameprefix + '.pkl'
    with open(filename, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    # Save model if said so
    if save_model:
        torch.save(policy_net.state_dict(), filenameprefix + '_policy_net.mdl')
        torch.save(target_net.state_dict(), filenameprefix + '_target_net.mdl')

    # Save plot figure
    plotname = filenameprefix + '.png'
    RLplot.plot_end(plotname)

os.system("sbatch " + slurmfile) #os.system("sbatch -N 1 -c 2 --mem=8000 --time=6-23:00:00 slurm_scripts/" + jobname + ".slurm &") init = False init = True if dry_run: print("NOT starting jobs:") else: print("Starting jobs:") # init model for the first time if init: import RL model = RL.Model(mode=RL.Mode.TRAIN, gpu=False, **job_dicts['train']) model.init() model.save() train_name = "trainer_" + exp_name train_command = "python3 -u train.py" + job_flags['train'] slurm_script(train_name, train_command, gpu=True) #sys.exit() agent_count = 0 agent_command = "python3 -u run.py" + job_flags['agent'] for c1 in characters: for c2 in characters: command = agent_command + " --p1 %s --p2 %s" % (c1, c2)
import RL
import random

env = RL.Env()
jernej = RL.Player()

for i in range(env.STEPS):
    action = [jernej.move_o(), jernej.move_p()]
    #action = [random.choice([-5, 5]), random.choice([-.05, .05])]
    env.step(action)
    env.render()
    if env.done:
        print(f'Crashed in episode step: {env.episode_step}')
        env.reset()

import gym
import RL
from draw_graph import Plot
import time

env = gym.make('CartPole-v0')
env = env.unwrapped
plot_ = Plot()

MAX_EPISODES = 2000
MAX_STEP_EPISODES = 5000

RL = RL.PolicyGradient(n_actions=env.action_space.n,
                       n_features=env.observation_space.shape[0],
                       n_hidden=10,
                       learning_rate=0.01,
                       reward_decay=0.99,
                       epsilon=0.90)

def main():
    for i in range(1, MAX_EPISODES):
        print(i, "of episodes", end="\n")
        start_time = time.time()
        observation = env.reset()
        for j in range(MAX_STEP_EPISODES):
            env.render()
            action = RL.choose_action(observation)
            if j < 5:
                action = 0
            observation_, reward, done, info = env.step(action)

start_time_expanded = time.time()
Q, avg_reward, state_action_count = rl.learn_Q(
    env, n_sims, gamma=1, omega=omega, epsilon=epsilon,
    init_val=init_val, episode_file=path_fun("hand_state"), warmup=warmup)
print("Number of explored states: " + str(len(Q)))
print("Cumulative avg. reward = " + str(avg_reward))
time_to_completion_expanded = time.time() - start_time_expanded

"""
print("----- Starting Q-learning for sum-based state space -----")
# Q-learning with player sum state representation
start_time_sum = time.time()
sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
    sum_env, n_sims, omega=omega, epsilon=epsilon, init_val=init_val,
    episode_file=path_fun("sum_state"), warmup=warmup)
time_to_completion_sum = time.time() - start_time_sum
print("Number of explored states (sum states): " + str(len(sumQ)))
print("Cumulative avg. reward = " + str(sum_avg_reward))
"""

print("Training time: \n " +
      "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
          time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))

                    type=int,
                    default=0,
                    help="how many frames to remember")

args = parser.parse_args()

if args.name is None:
    args.name = args.model

if args.path is None:
    args.path = "saves/%s/" % args.name

experience_dir = args.path + 'experience/'
os.makedirs(experience_dir, exist_ok=True)

model = RL.Model(mode=RL.Mode.TRAIN, **args.__dict__)

# do this in RL?
if args.init:
    model.init()
    model.save()
else:
    model.restore()

import numpy as np

def sweep(data_dir='experience/'):
    i = 0
    start_time = time.time()
    files = os.listdir(data_dir)

def reinfrocement_neural_network_control(load_weights=None, run_only=False,
                                         track_select='SS', random_seed=None,
                                         rl_prams=None):
    run = run_only
    weights_save_dir = "./weights/"
    if not os.path.exists(weights_save_dir):
        os.makedirs(weights_save_dir)
    Environment.track_generator(track, track_select=track_select)
    env = Environment.Environment(track, rl_parameters['max_steps'])
    gui = GUI.GUI(track, cars, trace=True)
    car_objects = [Environment.Car(c) for c in cars]
    rl = RL.QLearning_NN(rl_prams, weights_save_dir=weights_save_dir)
    rl.generate_nn()
    if load_weights is not None:
        if load_weights == 'all':
            run = True
        else:
            rl.load_weights(load_weights)
    if random_seed is not None:
        rl.random_seed(random_seed)
    weight_names = sorted([name for name in glob.glob(weights_save_dir + '*')])
    weight_names_index = 0

    def initialize(run_state):
        env.compute_interaction(car_objects)
        for car in car_objects:
            car.reset()
            car.get_sensor_reading()
        if run_state == True:
            env.set_max_steps(1500)
            gui.remove_traces()
            gui.disable_trace()
            gui.set_run_select(gui.runs[1])
            gui.update_debug_info('[Testing]\n' + 'Currently learned weights loaded')
        else:
            env.set_max_steps(rl_prams['max_steps'])
            gui.enable_trace()
            gui.set_run_select(gui.runs[0])
            gui.update_debug_info('[Training]\n')

    def check_run_button(current_state):
        if gui.get_run_select() == gui.runs[0] and current_state == True:
            print '\n\n\nLearning\n'
            initialize(run_state=False)
            return False
        if gui.get_run_select() == gui.runs[1] and run == False:
            print '\n\n\nRun only\n'
            initialize(run_state=True)
            return True
        return None

    initialize(run_state=run)
    while (1):
        new_run_state = check_run_button(current_state=run)
        if new_run_state is not None:
            run = new_run_state
        if run == True:
            for i, car in enumerate(car_objects):
                terminal = rl.run_step(car, env, dt)
                if terminal is not None:
                    print 'Car', i, ':', terminal
                    if i == 0:
                        if load_weights == 'all' and weight_names_index < len(weight_names):
                            rl.load_weights(weight_names[weight_names_index])
                            gui.update_debug_info('[Testing]\n' + 'Weights loaded:\n' +
                                                  weight_names[weight_names_index])
                            weight_names_index += 1
                gui.update(i, car.get_state())
            env.compute_interaction(car_objects)
            gui.refresh()
        else:
            terminal, debug, epoch, avg_loss, final_score, cross_score = rl.learn_step(
                car_objects[0], env, dt)
            if terminal is not None:
                if debug is not None:
                    gui.update_debug_info(debug)
                    gui.update_graph(epoch, avg_loss, gui.graphs[0])
                    gui.update_graph(epoch, final_score, gui.graphs[1])
                    gui.update_graph(epoch, cross_score, gui.graphs[2])
                    gui.refresh()
                gui.update(0, terminal, draw_car=False, force_end_line=True)
                gui.refresh()
            if rl.epoch % 100 == 0:
                gui.update(0, car_objects[0].get_state(), draw_car=True)
                gui.refresh()
            else:
                gui.update(0, car_objects[0].get_state(), draw_car=False)

            env.render()
            action = RL.choose_action(observation)
            if int(observation[5]) < 6:
                # print("can not change")
                action = "n"
            # print(action)
            observation_, reward, done = env.switch_light(action)
            t_reward += reward
            RL.save_memory(observation, action, reward, observation_)
            if step > 500 and step % 5 == 0:
                RL.learn()
            observation = observation_
            if done:
                print(t_reward)
                break


if __name__ == "__main__":
    env = map_env.Map()
    mode = sys.argv[1]
    env.after(100, traffic_baseline())
    env.destroy()
    if mode == 'RL':
        env = map_env.Map()
        RL = RL.QLearningTable(env.action_space)
    elif mode == 'DQN':
        env = map_env.Map()
        RL = RL.DeepQNetwork(num_actions=2, num_features=6, actions=['y', 'n'])
    env.after(100, traffic())

# Q-learning with expanded state representation
start_time_expanded = time.time()
Q, avg_reward, state_action_count = rl.learn_Q(
    env, n_sims, gamma=1, omega=omega, epsilon=epsilon,
    init_val=init_val, episode_file=path_fun("hand_state"), warmup=warmup)
print("Number of explored states: " + str(len(Q)))
print("Cumulative avg. reward = " + str(avg_reward))
time_to_completion_expanded = time.time() - start_time_expanded"""

print("----- Starting Q-learning for sum-based state space -----")
# Q-learning with player sum state representation
start_time_sum = time.time()
sumQ, sum_avg_reward, sum_state_action_count = rl.learn_Q(
    sum_env, n_sims, omega=omega, epsilon=epsilon, init_val=init_val,
    episode_file=path_fun("sum_state"), warmup=warmup)
time_to_completion_sum = time.time() - start_time_sum
print("Number of explored states (sum states): " + str(len(sumQ)))
print("Cumulative avg. reward = " + str(sum_avg_reward))

"""print("Training time: \n " +
      "Expanded state space MC: {} \n Expanded state space: {} \n Sum state space: {}".format(
          time_to_completion_MC, time_to_completion_expanded, time_to_completion_sum))

# Convert Q (extended state) to sum state representation and make 3D plots
# Extended state MC-learning
Q_conv_MC = rl.convert_to_sum_states(Q_MC, env)
V_conv_MC = rl.convert_to_value_function(Q_conv_MC)

    data_size = round((2 + args.input_moment_order + 1) * args.input_moment_order / 2)
elif args.input == 'wavefunction':
    data_size = 2 * (x_n - 10 * 2)

# we do not plot when we do parallelized computation
#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#                    reward_multiply=reward_multiply, read_length=read_length, controls_per_unit_time=controls_per_unit_time)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval, t_max=t_max,
                      F_max=args.F_max, failing_reward=failing_reward)
################################## end learning setting

# Below is the worker function for subprocesses, which carries out the control simulations
# and pushes the experiences and records to queues that are collected and handled by other
# processes. (Quantum simulation is implemented in a compiled C module.)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid
# using CUDA in these workers. Instead, these workers ask a manager process when they want
# to evaluate the neural network, and only the manager process is allowed to use CUDA to
# evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random = np.random.RandomState(seed)
    simulation.set_seed(random.randint(0, 2**31 - 1))
    # preparing pipes
    MemoryQueue, ResultsQueue, ActionPipe, EndEvent, PauseEvent = pipes
    state_data_to_manager = np.frombuffer(shared_buffer, dtype='float32')

                              num_of_data_per_time_unit)  # 3600
read_control_step_length = control_interval // coarse_grain
data_size = 2 * read_length
shape_measurement_data = (2, read_length)

# we do not plot when we do parallelized computation
#import plot
#plot.set_parameters(x=x, x_max=x_max, dt=time_step, num_of_episodes=num_of_episodes, probability=probability,
#                    reward_multiply=reward_multiply, read_length=read_length, controls_per_half_period=controls_per_half_period)

# set the reinforcement learning settings
if __name__ == '__main__':
    import RL
    RL.set_parameters(control_interval=control_interval, t_max=t_max, F_max=args.F_max)
    if args.input == 'measurements':
        RL.set_parameters(read_step_length=read_control_step_length)
################################## end learning setting

# Below is the worker function for subprocesses, which carries out the control simulations
# and pushes the experiences and records to queues that are collected and handled by other
# processes. (Quantum simulation is implemented in a compiled C module.)
# Because too many processes using CUDA will occupy a huge amount of GPU memory, we avoid
# using CUDA in these workers. Instead, these workers ask a manager process when they want
# to evaluate the neural network, and only the manager process is allowed to use CUDA to
# evaluate the neural network for the controls.
def Control(net, pipes, shared_buffer, seed, idx):
    simulation = __import__('simulation')
    # seeding
    random.seed(seed)
    np.random.seed(seed)
    simulation.set_seed(seed)

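# The comment above describes a worker/manager split: workers never touch CUDA and
# instead send the current state over a pipe to a single manager process, which
# evaluates the network and sends the chosen control back. Below is a minimal,
# self-contained sketch of that request/response pattern; the names (worker,
# manager, fake_policy) and the use of multiprocessing.Pipe are illustrative
# assumptions, not the project's actual protocol.
import numpy as np
from multiprocessing import Process, Pipe

def fake_policy(state):
    # stands in for the network that only the manager process evaluates
    return float(np.tanh(state.sum()))

def worker(conn, n_steps, seed):
    rng = np.random.RandomState(seed)
    for _ in range(n_steps):
        state = rng.randn(4).astype('float32')
        conn.send(state)          # ask the manager for a control
        action = conn.recv()      # block until the manager answers
    conn.send(None)               # signal that this worker is done

def manager(conn):
    while True:
        state = conn.recv()
        if state is None:
            break
        conn.send(fake_policy(state))

if __name__ == '__main__':
    parent, child = Pipe()
    p = Process(target=worker, args=(child, 10, 0))
    p.start()
    manager(parent)
    p.join()
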
import numpy as np
import MDP
import RL

''' Construct simple MDP as described in Lecture 2a Slides 13-14 '''
T = np.array([[[0.5, 0.5, 0, 0], [0, 1, 0, 0], [0.5, 0.5, 0, 0], [0, 1, 0, 0]],
              [[1, 0, 0, 0], [0.5, 0, 0, 0.5], [0.5, 0, 0.5, 0], [0, 0, 0.5, 0.5]]])
R = np.array([[0, 0, 10, 10], [0, 0, 10, 10]])
discount = 0.9
mdp = MDP.MDP(T, R, discount)
rlProblem = RL.RL(mdp, np.random.normal)

# Test Q-learning
[Q, policy] = rlProblem.qLearning(s0=0,
                                  initialQ=np.zeros([mdp.nActions, mdp.nStates]),
                                  nEpisodes=1000, nSteps=100, epsilon=0.3)
print("\nQ-learning results")
print(Q)
print(policy)

# import numpy as np
# import MDP
# import RL
#
#
# ''' Construct simple MDP as described in Lecture 2a Slides 13-14'''
# T = np.array([[[0.5,0.5,0,0],[0,1,0,0],[0.5,0.5,0,0],[0,1,0,0]],[[1,0,0,0],[0.5,0,0,0.5],[0.5,0,0.5,0],[0,0,0.5,0.5]]])
# R = np.array([[0,0,10,10],[0,0,10,10]])

for decks in [1, 2, 6, 8, inf]:
    print("----- deck number equal to {} -----".format(decks))
    # set seed
    seed = 31233
    # init envs
    env = bjk.BlackjackEnvExtend(decks=decks, seed=seed)
    sum_env = bjk_base.BlackjackEnvBase(decks=decks, seed=seed)

    print("----- Starting MC training on expanded state space -----")
    # MC-learning with expanded state representation
    start_time_MC = time.time()
    Q_MC, MC_avg_reward, state_action_count = rl.learn_MC(
        env, n_sims, gamma=1, epsilon=epsilon, init_val=init_val,
        episode_file=path_fun("hand_MC_state"), warmup=warmup)
    print("Number of explored states: " + str(len(Q_MC)))
    print("Cumulative avg. reward = " + str(MC_avg_reward))
    time_to_completion_MC = time.time() - start_time_MC

    print("----- Starting Q-learning on expanded state space -----")
    # Q-learning with expanded state representation
    start_time_expanded = time.time()
    Q, avg_reward, state_action_count = rl.learn_Q(
        env, n_sims, gamma=1, omega=omega,