def __init__(self, config, localNet, env, globalNets, globalOptimizer, netLossFunc, nbAction,
             rank, globalEpisodeCount, globalEpisodeReward, globalRunningAvgReward,
             resultQueue, logFolder, stateProcessor=None, lock=None):
    """Worker process constructor for asynchronous DQN training.

    Keeps a local copy of the policy network and coordinates with shared
    global networks through a shared optimizer and shared counters.

    Args:
        config: dict of hyper-parameters (keys read here: updateGlobalFrequency,
            randomSeed, nStepForward, targetNetUpdateEpisode, synchLock, device).
        localNet: this worker's local network.
        env: environment instance (stored by the base class; not used directly here).
        globalNets: pair [globalPolicyNet, globalTargetNet] shared across workers.
        globalOptimizer: optimizer over the global network parameters.
        netLossFunc: loss function for TD-error computation.
        nbAction: number of discrete actions.
        rank: worker index; used to decorrelate random seeds.
        globalEpisodeCount / globalEpisodeReward / globalRunningAvgReward:
            shared (multiprocessing) scalars for progress tracking.
        resultQueue: shared queue for reporting episode results.
        logFolder: output directory for logs.
        stateProcessor: optional callable to preprocess states.
        lock: optional multiprocessing lock used when synchLock is enabled.
    """
    self.globalPolicyNet = globalNets[0]
    self.globalTargetNet = globalNets[1]
    self.rank = rank
    self.globalOptimizer = globalOptimizer
    self.localNet = localNet

    mp.Process.__init__(self)
    # BUG FIX: original read "DQNAgent.__init__(self. config, ...)" — the period
    # made the first argument the attribute lookup self.config instead of
    # passing (self, config), so the base class was initialized on the wrong
    # object. Corrected to pass self and config separately.
    DQNAgent.__init__(self, config, localNet, None, None, netLossFunc, nbAction,
                      stateProcessor)

    self.totalStep = 0

    # How often (in steps) to push local gradients to the global net.
    self.updateGlobalFrequency = 10
    if 'updateGlobalFrequency' in self.config:
        self.updateGlobalFrequency = self.config['updateGlobalFrequency']

    self.globalEpisodeCount = globalEpisodeCount
    self.globalEpisodeReward = globalEpisodeReward
    self.globalRunningAvgReward = globalRunningAvgReward
    self.resultQueue = resultQueue
    self.dirName = logFolder

    # Offset the seed by rank so each worker explores differently.
    self.randomSeed = 1 + self.rank
    if 'randomSeed' in self.config:
        self.randomSeed = self.config['randomSeed'] + self.rank
    torch.manual_seed(self.randomSeed)

    self.nStepForward = 1
    if 'nStepForward' in self.config:
        self.nStepForward = self.config['nStepForward']

    self.targetNetUpdateEpisode = 10
    if 'targetNetUpdateEpisode' in self.config:
        self.targetNetUpdateEpisode = self.config['targetNetUpdateEpisode']

    self.nStepBuffer = []

    # only use vanilla replay memory
    # NOTE(review): self.memoryCapacity is assumed to be set by DQNAgent.__init__
    # from config — confirm against the base class.
    self.memory = ReplayMemory(self.memoryCapacity)
    self.priorityMemoryOption = False

    # use synthetic lock or not
    self.synchLock = False
    if 'synchLock' in self.config:
        self.synchLock = self.config['synchLock']
    self.lock = lock

    self.device = 'cpu'
    if 'device' in self.config and torch.cuda.is_available():
        # NOTE(review): original was one collapsed line; the seed/cuda moves are
        # grouped under this branch since they only make sense when CUDA is used.
        self.device = self.config['device']
        torch.cuda.manual_seed(self.randomSeed)
        self.localNet = self.localNet.cuda()
# Build the multi-map dynamic maze environment and a conv-net DQN agent for it.
env = DynamicMazeMultiMap(config)
N_S = env.stateDim[0]   # state dimension (first entry) — input size for the net
N_A = env.nbActions     # number of discrete actions

policyNet = MulChanConvNet(N_S, 128, N_A)
targetNet = deepcopy(policyNet)  # target net starts as an exact copy of the policy net
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

# reduction='none' keeps per-sample losses (presumably for per-experience
# weighting — confirm in DQNAgent).
agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A,
                 stateProcessor=stateProcessor,
                 experienceProcessor=experienceProcessor)

trainFlag = True
testFlag = True

if trainFlag:
    # Optionally resume model and optimizer state from a saved checkpoint file.
    if config['loadExistingModel']:
        checkpoint = torch.load(config['saveModelFile'])
        agent.policyNet.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if config['loadCheckpointFlag']:
        # NOTE(review): the body of this branch lies beyond this chunk — left
        # incomplete here; see the full original script.
# netParameter is created earlier in the script (before this chunk).
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
print(policyNet.state_dict())

targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer, torch.nn.MSELoss(),
                 N_S, N_A, config=config)

# Record the greedy policy over every maze cell before training.
# NOTE(review): here cells with map value 0 get the -1 marker, while other
# scripts in this codebase mark value 1 — confirm the map convention.
policy = deepcopy(env.map)
for i in range(policy.shape[0]):
    for j in range(policy.shape[1]):
        if env.map[i, j] == 0:
            policy[i, j] = -1
        else:
            policy[i, j] = agent.getPolicy(np.array([i, j]))
# NOTE(review): this call is truncated at the chunk boundary (trailing comma) —
# remaining arguments lie beyond this view.
np.savetxt('DoubleQSimpleMazePolicyBeforeTrain' + mapName + '.txt', policy, fmt='%d',
# Per-run hyper-parameter overrides.
config.update({
    'logFrequency': 100,
    'priorityMemoryOption': False,
    'netUpdateOption': 'doubleQ',
    'netUpdateFrequency': 1,
    'priorityMemory_absErrUpper': 5,
})

import gym
from pybullet_envs.bullet.racecarGymEnv import RacecarGymEnv

# Rendered racecar environment with a discrete action space.
env = RacecarGymEnv(renders=True, isDiscrete=True)
N_S = env.observation_space.shape[0]
N_A = env.action_space.n

netParameter = {
    'n_feature': N_S,
    'n_hidden': [100],
    'n_output': N_A,
}

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

# reduction='none' keeps the loss per sample rather than averaging.
agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A)
agent.train()
netParameter = dict()
netParameter['n_feature'] = N_S
netParameter['n_hidden'] = [100]
netParameter['n_output'] = N_A

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
# NOTE(review): target net is freshly initialized here rather than deep-copied
# from policyNet (other scripts use deepcopy) — confirm this is intended.
targetNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
optimizers = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizers,
                 torch.nn.MSELoss(reduction='none'), N_A,
                 stateProcessor=stateProcessor)
agent.train()

nTraj = 100   # number of evaluation trajectories
nSteps = 80   # steps per trajectory (presumably used later in the loop body)

# test for starting from second stage
for i in range(nTraj):
    state = agent.env.reset()
    # Force the environment into its second stage before rolling out.
    agent.env.stageID = 1
    state['stageID'] = agent.env.stageID
    # NOTE(review): the loop body continues beyond this chunk — truncated here.
N_A = env.nbActions

netParameter = {
    'n_feature': N_S,
    'n_hidden': [100],
    'n_output': N_A,
}

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(config, policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A)

# Snapshot the greedy action over a 1-D grid of states before training.
xSet = np.linspace(-1, 1, 100)
policy = np.zeros_like(xSet)
for idx, xVal in enumerate(xSet):
    policy[idx] = agent.getPolicy(np.array([xVal]))
np.savetxt('StabilizerPolicyBeforeTrain.txt', policy, fmt='%d')

agent.train()
agent.testPolicyNet(100)
env.reset()
N_S = env.stateDim[0]   # state dimension (first entry)
N_A = env.nbActions

policyNet = ConvNet(N_S, N_A)
targetNet = deepcopy(policyNet)  # target net starts as an exact copy
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer, torch.nn.MSELoss(),
                 N_A, config=config)

# Dump the greedy action at every map cell before training; cells where
# mapMat == 1 (presumably obstacles — confirm) are marked -1.
policy = deepcopy(env.mapMat)
for i in range(policy.shape[0]):
    for j in range(policy.shape[1]):
        if env.mapMat[i, j] == 1:
            policy[i, j] = -1
        else:
            sensorInfo = env.agent.getSensorInfoFromPos(np.array([i, j]))
            policy[i, j] = agent.getPolicy(sensorInfo)
# NOTE(review): this call is truncated at the chunk boundary — remaining
# arguments lie beyond this view.
np.savetxt('DynamicMazePolicyBeforeTrain' + mapName + '.txt', policy,
env = DynamicMaze(config)
env.reset()
N_S = env.stateDim[0]
N_A = env.nbActions

policyNet = MulChanConvNet(N_S, 100, N_A)
targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer, torch.nn.MSELoss(), N_A,
                 stateProcessor=stateProcessor, config=config)

# Record the greedy action at every map cell before training;
# cells where mapMat == 1 are marked with -1.
policy = deepcopy(env.mapMat)
nRows, nCols = policy.shape
for row in range(nRows):
    for col in range(nCols):
        if env.mapMat[row, col] == 1:
            policy[row, col] = -1
            continue
        sensorInfo = env.agent.getSensorInfoFromPos(np.array([row, col]))
        distance = np.array([1, 1]) - np.array([row, col])
        state = {'sensor': sensorInfo, 'target': distance}
        policy[row, col] = agent.getPolicy(state)
np.savetxt('DynamicMazePolicyBeforeTrain' + mapName + '.txt', policy,
           fmt='%d', delimiter='\t')
# Populate the pre-existing netParameter dict (created earlier in the script);
# update() keeps the same dict object alive rather than rebinding.
netParameter.update(n_feature=N_S, n_hidden=[100], n_output=N_A)

policyNet = MultiLayerNetRegression(netParameter['n_feature'],
                                    netParameter['n_hidden'],
                                    netParameter['n_output'])
targetNet = deepcopy(policyNet)
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A, config=config)

# Greedy action over a 1-D state grid before training.
xSet = np.linspace(-1, 1, 100)
policy = np.zeros_like(xSet)
for idx, xVal in enumerate(xSet):
    policy[idx] = agent.getPolicy(np.array([xVal]))
np.savetxt('StabilizerPolicyBeforeTrain.txt', policy, fmt='%d')

agent.train()

# Keep the transitions from the evaluation run for later inspection.
storeMemory = ReplayMemory(100000)
agent.testPolicyNet(100, storeMemory)
N_S = env.stateDim[0]   # state dimension (first entry)
N_A = env.nbActions

policyNet = MulChanConvNet(N_S, 100, N_A)
targetNet = deepcopy(policyNet)  # target net starts as an exact copy
optimizer = optim.Adam(policyNet.parameters(), lr=config['learningRate'])

agent = DQNAgent(policyNet, targetNet, env, optimizer,
                 torch.nn.MSELoss(reduction='none'), N_A,
                 stateProcessor=stateProcessor, config=config)

trainFlag = True
testFlag = True

if trainFlag:
    # Resume model and optimizer state from a saved file when requested.
    if config['loadExistingModel']:
        checkpoint = torch.load(config['saveModelFile'])
        agent.policyNet.load_state_dict(checkpoint['model_state_dict'])
        agent.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# NOTE(review): the original is a single collapsed line, so the nesting of the
# following statement relative to the `if trainFlag:` block is ambiguous —
# placed at top level here; confirm against the original script.
plotPolicyFlag = True