    sensorList = [item['state'] for item in state if item is not None]
    nonFinalState = torch.tensor(sensorList, dtype=torch.float32, device=device)
    return nonFinalState, nonFinalMask


env = CooperativeSimpleMazeTwoD(config=config)
N_S = env.stateDim
N_A = env.nbActions

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
targetNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A,
                 stateProcessor=stateProcessor)
agent.train()

nTraj = 100
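# The fragment above shows only the tail of the stateProcessor helper handed to
# DQNAgent.  A minimal sketch of what the full helper presumably looks like,
# assuming the agent passes it a list of per-sample state dicts with None
# marking terminal states (the function name matches the call above; the device
# argument and mask construction are assumptions):
import torch

def stateProcessor(state, device='cpu'):
    # Boolean mask of the entries that are non-terminal (i.e. not None).
    nonFinalMask = torch.tensor([item is not None for item in state],
                                device=device, dtype=torch.bool)
    sensorList = [item['state'] for item in state if item is not None]
    nonFinalState = torch.tensor(sensorList, dtype=torch.float32, device=device)
    return nonFinalState, nonFinalMask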
config['logFlag'] = True
config['logFileName'] = 'SimpleMazeLog/DoubleQtraj' + mapName
config['logFrequency'] = 50
config['netUpdateOption'] = 'doubleQ'

env = SimpleMazeTwoD(mapName)
N_S = env.stateDim
N_A = env.nbActions

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
print(policyNet.state_dict())

targetNet = deepcopy(policyNet)

optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer, torch.nn.MSELoss(), N_S, N_A,
                 config=config)  # closing argument assumed; the original snippet breaks off here
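# The 'doubleQ' option selects the double Q-learning target: the policy network
# picks the next action and the target network evaluates it.  The sketch below
# is illustrative only and is not the repository's exact update code; tensor
# shapes and the done convention are assumptions.
import torch

def double_q_target(policyNet, targetNet, reward, nextState, done, gamma):
    # reward, done: shape (batch,); nextState: shape (batch, n_feature)
    with torch.no_grad():
        nextAction = policyNet(nextState).argmax(dim=1, keepdim=True)
        nextQ = targetNet(nextState).gather(1, nextAction).squeeze(1)
    return reward + gamma * nextQ * (1.0 - done)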
config['device'] = 'cpu'

# Get the environment and extract the number of actions.
# env = CartPoleEnvCustom()
trainEnv = gym.make("CartPole-v0")
testEnv = gym.make("CartPole-v0")
N_S = trainEnv.observation_space.shape[0]
N_A = trainEnv.action_space.n

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [40, 40]
netParameter['n_output'] = N_A

actorNet = MultiLayerNetLogSoftmax(netParameter['n_feature'],
                                   netParameter['n_hidden'],
                                   N_A)
criticNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    1)

optimizer1 = optim.Adam(actorNet.parameters(), lr=config['learningRate'])
optimizer2 = optim.Adam(criticNet.parameters(), lr=config['learningRate'])

agent = ActorCriticTwoNet(actorNet, criticNet, [trainEnv, testEnv],
                          [optimizer1, optimizer2], torch.nn.MSELoss(), N_A, config)

agent.train()
agent.test(100)
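# The CartPole script above assumes a config dict populated earlier.  A minimal
# sketch containing only the keys visible in this fragment; the learning rate
# value is an assumption, and ActorCriticTwoNet may require further keys that
# are not shown here.
config = dict()
config['learningRate'] = 3e-4  # assumed value for illustration
config['device'] = 'cpu'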
    finalState = torch.tensor(sensorList, device=device, dtype=torch.float32)
    return nonFinalState, nonFinalMask, finalState, finalMask


env = CooperativeSimpleMazeTwoD(config)
N_S = env.stateDim
N_A = env.nbActions

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [128]
netParameter['n_output'] = N_A

nPeriods = config['numStages']

policyNets = [MultiLayerNetRegression(netParameter['n_feature'],
                                      netParameter['n_hidden'],
                                      netParameter['n_output'])
              for _ in range(nPeriods)]
targetNets = [MultiLayerNetRegression(netParameter['n_feature'],
                                      netParameter['n_hidden'],
                                      netParameter['n_output'])
              for _ in range(nPeriods)]
optimizers = [optim.Adam(net.parameters(), lr=config['learningRate'])
              for net in policyNets]

agent = StackedDQNAgent(config, policyNets, targetNets, env, optimizers,
                        torch.nn.MSELoss(reduction='none'), N_A,
                        stateProcessor=stateProcessor)

policyFlag = True
if policyFlag:
    for n in range(nPeriods):
        pass  # the original snippet is truncated here
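# The loop above is cut off in the source.  As a hypothetical illustration of a
# per-stage policy readout (not the repository's code), each stage network can
# be queried for its greedy action over a batch of states:
import torch

def greedy_actions(policyNet, states):
    # states: float32 tensor of shape (numStates, n_feature); returns the
    # arg-max action index for every state under the given stage network.
    with torch.no_grad():
        qValues = policyNet(states)
    return torch.argmax(qValues, dim=1)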
config['trainBatchSize'] = 32
config['gamma'] = 0.9
config['learningRate'] = 0.001
config['netGradClip'] = 1
config['logFlag'] = False
config['logFrequency'] = 100
config['priorityMemoryOption'] = False
config['netUpdateOption'] = 'doubleQ'
config['netUpdateFrequency'] = 1
config['priorityMemory_absErrUpper'] = 5
config['numWorkers'] = 4

env = StablizerOneD()
N_S = env.stateDim
N_A = env.nbActions

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [4]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])

optimizer = SharedAdam(policyNet.parameters(), lr=1.0)

agent = DQNA3CMaster(config, policyNet, env, optimizer,
                     torch.nn.MSELoss(reduction='none'), N_A)
agent.test_multiProcess()
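# DQNA3CMaster launches worker processes, so scripts like this are normally run
# behind a main guard, with the shared network's tensors placed in shared
# memory first.  Whether the repository does this inside the agent or in the
# script is an assumption; the sketch only shows the usual PyTorch boilerplate.
if __name__ == '__main__':
    policyNet.share_memory()  # let every worker process update the same parameters
    agent.test_multiProcess()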
config['mapHeight'] = 6
config['numAgents'] = 2

env = TwoAgentCooperativeTransport(config)
N_S = env.stateDim
N_A = env.nbActions
numAgents = env.numAgents

netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [128]
netParameter['n_output'] = N_A

policyNets = [MultiLayerNetRegression(N_S[n], netParameter['n_hidden'], N_A[n])
              for n in range(numAgents)]
targetNets = [MultiLayerNetRegression(N_S[n], netParameter['n_hidden'], N_A[n])
              for n in range(numAgents)]
optimizers = [optim.Adam(net.parameters(), lr=config['learningRate'])
              for net in policyNets]

agent = MADQNAgent(config, policyNets, targetNets, env, optimizers,
                   torch.nn.MSELoss(reduction='none'), N_A)
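# Hypothetical usage sketch (not part of the original script): independent
# greedy action selection for the two agents, assuming the environment returns
# one observation vector per agent with sizes matching the entries of N_S.
import torch

def select_greedy_actions(policyNets, observations):
    actions = []
    for net, obs in zip(policyNets, observations):
        with torch.no_grad():
            q = net(torch.as_tensor(obs, dtype=torch.float32))
        actions.append(int(torch.argmax(q).item()))
    return actions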