def main():
    choice = input("Enter 1 to select question3 agent, enter 2 to select bonus agent: ")
    if choice == '1':
        Q1 = float(input("Enter value of Q1: "))
        alpha = float(input("Enter value of alpha: "))
        epsilon = float(input("Enter value of epsilon: "))
        agent = BanditAgent(Q1, alpha, epsilon)
    if choice == '2':
        Q1 = float(input("Enter value of Q1: "))
        c = float(input("Enter value of c: "))
        alpha = float(input("Enter value of alpha: "))
        agent = Ucb_BanditAgent(Q1, c, alpha)
    name = input("Input output file name: ")

    max_steps = 1000  # max number of steps in an episode
    num_runs = 2000   # number of repetitions of the experiment

    # Create and pass agent and environment objects to RLGlue
    environment = BanditEnv()
    rlglue = RLGlue(environment, agent)
    del agent, environment  # don't use these anymore

    # run the experiment
    optimalAction = BanditExp(rlglue, num_runs, max_steps)
    result = optimalAction / num_runs
    print(result)
    with open(name + '.csv', 'w') as out_file:
        for i in range(max_steps):
            out_file.write("%f\n" % result[i])
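# Hedged sketch, not part of the original script: main() writes one value per step to
# <name>.csv, presumably the fraction of runs in which the optimal action was chosen
# at that step (optimalAction / num_runs). A quick way to inspect a finished run is to
# read that file back and plot it; the file name "q3_output.csv" below is only an example.
import matplotlib.pyplot as plt

with open("q3_output.csv") as in_file:
    optimal_fraction = [float(line) for line in in_file]

plt.plot(range(len(optimal_fraction)), [100 * p for p in optimal_fraction])
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.show()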
def getPlotPoint(actor_type, param_list):
    """Run `runs` independent bandit experiments for one actor type / parameter setting
    and return the mean reward over the last half of the steps, averaged over runs.

    Relies on module-level globals: runs, steps, num_actions, rng, stationary, actor_list.
    """
    reward_hist = np.zeros(steps)
    for run in range(runs):
        env = BanditEnv(rng, num_actions, 4, stationary)
        actor = actor_list[actor_type](env, *param_list)
        rewards, _ = actor.run(steps)
        # incremental average of the per-step rewards across runs
        reward_hist += (rewards - reward_hist) / (run + 1)
    return np.mean(reward_hist[len(reward_hist) // 2:])  # use last half of steps
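# Hedged sketch, not part of the original file: the module-level configuration that
# getPlotPoint assumes is shown below with illustrative values only. The actor_list
# mapping is an assumption; grad.GradientActor is the one actor constructor visible
# in the other scripts in this repo, and its (env, alpha, baseline) signature matches
# the actor_list[actor_type](env, *param_list) call above.
import random
import numpy as np
from bandit_env import BanditEnv
from actors import grad

runs = 2000
steps = 1000
num_actions = 10
stationary = True
rng = random.Random(1234)
actor_list = {"gradient": grad.GradientActor}

if __name__ == "__main__":
    # Example: mean reward over the last half of steps for alpha=0.1 with a baseline.
    print(getPlotPoint("gradient", [0.1, True]))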
def __init__(self):
    super(Sim, self).__init__()
    self.BurstDaGain = float(1)
    self.SetTags("BurstDaGain", 'min:"0" step:"0.1" desc:"strength of dopamine bursts: 1 default -- reduce for PD OFF, increase for PD ON"')
    self.DipDaGain = float(1)
    self.SetTags("DipDaGain", 'min:"0" step:"0.1" desc:"strength of dopamine dips: 1 default -- reduce to simulate D2 agonists"')
    self.Net = pbwm.Network()
    self.SetTags("Net", 'view:"no-inline" desc:"the network -- click to view / edit parameters for layers, prjns, etc"')
    self.TrnEpcLog = etable.Table()
    self.SetTags("TrnEpcLog", 'view:"no-inline" desc:"training epoch-level log data"')
    self.TstEpcLog = etable.Table()
    self.SetTags("TstEpcLog", 'view:"no-inline" desc:"testing epoch-level log data"')
    self.TstTrlLog = etable.Table()
    self.SetTags("TstTrlLog", 'view:"no-inline" desc:"testing trial-level log data"')
    self.MtxInputWts = etensor.Float32()
    self.SetTags("MtxInputWts", 'view:"no-inline" desc:"weights from input to hidden layer"')
    self.RunLog = etable.Table()
    self.SetTags("RunLog", 'view:"no-inline" desc:"summary log of each run"')
    self.RunStats = etable.Table()
    self.SetTags("RunStats", 'view:"no-inline" desc:"aggregate stats on all runs"')
    self.Params = params.Sets()
    self.SetTags("Params", 'view:"no-inline" desc:"full collection of param sets"')
    self.ParamSet = str()
    self.SetTags("ParamSet", 'view:"-" desc:"which set of *additional* parameters to use -- always applies Base and optionally this next if set -- can use multiple names separated by spaces (don\'t put spaces in ParamSet names!)"')
    self.MaxRuns = int(1)
    self.SetTags("MaxRuns", 'desc:"maximum number of model runs to perform"')
    self.MaxEpcs = int(30)
    self.SetTags("MaxEpcs", 'desc:"maximum number of epochs to run per model run"')
    self.MaxTrls = int(100)
    self.SetTags("MaxTrls", 'desc:"maximum number of training trials per epoch"')
    self.TrainEnv = BanditEnv()
    self.SetTags("TrainEnv", 'desc:"Training environment -- bandit environment"')
    self.Time = leabra.Time()
    self.SetTags("Time", 'desc:"leabra timing parameters and state"')
    self.ViewOn = True
    self.SetTags("ViewOn", 'desc:"whether to update the network view while running"')
    self.TrainUpdt = leabra.TimeScales.AlphaCycle
    self.SetTags("TrainUpdt", 'desc:"at what time scale to update the display during training? Anything longer than Epoch updates at Epoch in this model"')
    self.TestUpdt = leabra.TimeScales.AlphaCycle
    self.SetTags("TestUpdt", 'desc:"at what time scale to update the display during testing? Anything longer than Epoch updates at Epoch in this model"')
    self.TstRecLays = go.Slice_string(["MatrixGo", "MatrixNoGo"])
    self.SetTags("TstRecLays", 'desc:"names of layers to record activations etc of during testing"')

    # internal state - view:"-"
    self.Win = 0
    self.SetTags("Win", 'view:"-" desc:"main GUI window"')
    self.NetView = 0
    self.SetTags("NetView", 'view:"-" desc:"the network viewer"')
    self.ToolBar = 0
    self.SetTags("ToolBar", 'view:"-" desc:"the master toolbar"')
    self.WtsGrid = 0
    self.SetTags("WtsGrid", 'view:"-" desc:"the weights grid view"')
    self.TrnEpcPlot = 0
    self.SetTags("TrnEpcPlot", 'view:"-" desc:"the training epoch plot"')
    self.TstEpcPlot = 0
    self.SetTags("TstEpcPlot", 'view:"-" desc:"the testing epoch plot"')
    self.TstTrlPlot = 0
    self.SetTags("TstTrlPlot", 'view:"-" desc:"the test-trial plot"')
    self.RunPlot = 0
    self.SetTags("RunPlot", 'view:"-" desc:"the run plot"')
    self.TrnEpcFile = 0
    self.SetTags("TrnEpcFile", 'view:"-" desc:"log file"')
    self.RunFile = 0
    self.SetTags("RunFile", 'view:"-" desc:"log file"')
    self.ValsTsrs = {}
    self.SetTags("ValsTsrs", 'view:"-" desc:"for holding layer values"')
    self.IsRunning = False
    self.SetTags("IsRunning", 'view:"-" desc:"true if sim is running"')
    self.StopNow = False
    self.SetTags("StopNow", 'view:"-" desc:"flag to stop running"')
    self.NeedsNewRun = False
    self.SetTags("NeedsNewRun", 'view:"-" desc:"flag to initialize NewRun if last one finished"')
    self.RndSeed = int(1)
    self.SetTags("RndSeed", 'view:"-" desc:"the current random seed"')
    self.vp = 0
    self.SetTags("vp", 'view:"-" desc:"viewport"')
# Compare gradient-bandit performance with and without a reward baseline for step sizes
# alpha = 0.1 and 0.4, measured as % optimal action averaged over 2000 independent runs
# of 1000 steps on a 10-armed bandit.
import numpy as np
import random
import matplotlib.pyplot as plt

from bandit_env import BanditEnv
from actors import grad

runs = 2000
steps = 1000
num_actions = 10
rng = random.Random(1234)

for baseline in [False, True]:
    for alpha in [0.1, 0.4]:
        percent_correct_action = np.zeros(steps)
        for run in range(runs):
            env = BanditEnv(rng, num_actions, 4)
            actor = grad.GradientActor(env, alpha, baseline)
            _, correct_actions = actor.run(steps)
            percent_correct_action += correct_actions
        plt.plot(range(steps), percent_correct_action / runs * 100,
                 label="baseline=" + str(baseline) + ", alpha=" + str(alpha))

plt.ylim([0, 100])
plt.ylabel("% Optimal action")
plt.xlabel("Steps")
plt.legend()
plt.show()
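# Hedged sketch (an assumption about actors/grad.py, which is not shown in this excerpt):
# a gradient-bandit actor in the standard sense keeps a preference H(a) per action,
# samples actions from softmax(H), and after each reward R updates
#     H(a) += alpha * (R - baseline) * (1{a == A} - pi(a)),
# where the baseline is the running average reward when enabled and 0 otherwise.
# This is what the `baseline` flag in the script above toggles; the helper below is
# illustrative only and is not the repo's implementation.
import numpy as np

def gradient_bandit_step(H, action, reward, avg_reward, alpha, use_baseline):
    """One preference update of the standard gradient-bandit algorithm."""
    pi = np.exp(H - H.max())
    pi /= pi.sum()                       # softmax action probabilities
    baseline = avg_reward if use_baseline else 0.0
    one_hot = np.zeros_like(H)
    one_hot[action] = 1.0
    return H + alpha * (reward - baseline) * (one_hot - pi)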