def _init_model(self):
    """init model from parameters"""
    self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
        self.env_id)

    # seeding
    torch.manual_seed(self.seed)
    self.env.seed(self.seed)

    self.policy_net = Policy(self.num_states, self.num_actions).to(device)
    self.value_net = Value(self.num_states).to(device)
    self.running_state = ZFilter((self.num_states,), clip=5)

    if self.model_path:
        print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
            self.env_id, self.model_path, self.env_id))
        data = pickle.load(
            open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
        self.policy_net, self.value_net, self.running_state = \
            data.policy_net, data.value_net, data.running_state

    self.collector = MemoryCollector(self.env, self.policy_net,
                                     render=self.render,
                                     running_state=self.running_state,
                                     num_process=self.num_process)

    self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
    self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)
def __init__(self, eps, lr, gamma, batch_size, tau, max_memory,
             lambda_1, lambda_2, lambda_3, n_steps, l_margin):
    # Input parameters
    self.eps = eps                # eps-greedy exploration rate
    self.gamma = gamma            # discount factor
    self.batch_size = batch_size
    self.tau = tau                # frequency of target replacement
    self.ed = 0.005               # bonus for demonstration  # TODO: currently unused
    self.ea = 0.001               # TODO: currently unused
    self.l_margin = l_margin
    self.n_steps = n_steps
    self.lambda1 = lambda_1       # n-step return weight
    self.lambda2 = lambda_2       # supervised loss weight
    self.lambda3 = lambda_3       # L2 regularization weight
    self.counter = 0              # target replacement counter  # TODO: rename to iter_counter

    self.replay = Memory(capacity=max_memory)
    self.loss = nn.MSELoss()
    self.policy = Policy()        # TODO: avoid having to pass the architecture
    self.opt = optim.Adam(self.policy.predictNet.parameters(), lr=lr,
                          weight_decay=lambda_3)
    self.replay.e = 0
    self.demoReplay = ddict(list)

    self.noisy = hasattr(self.policy.predictNet, "sample")
def NetworkBase_timeStep(self, time, supportDepressionImpact,
                         concealDiscriminateImpact, discriminateConcealImpact,
                         discriminateDepressionImpact, concealDepressionImpact,
                         support=None, conceal=None, discrimination=None,
                         attitude=None, depression=None, policyScore=None,
                         bias=0):
    ONLY_NON_DISCRIMINATORY = 1
    ONLY_DISCRIMINATORY = 2

    # "Natural gap" between passing of enforced policies
    TIME_GAP = 5

    # Considers the cases where the type of policy is externally
    # enforced (not proposed at random in the simulation)
    if (policyScore or bias) and time % TIME_GAP == 0:
        newPolicy = Policy(time, score=policyScore, biasPass=bias)

        # Converts the numerical bias into a boolean indicating whether
        # the scores are biased towards discriminatory or supportive policies
        if bias == ONLY_NON_DISCRIMINATORY:
            onlyDisc = False
        else:
            onlyDisc = True

        self.NetworkBase_enforcePolicy(time, score=policyScore,
                                       onlyDisc=onlyDisc)
    else:
        newPolicy = Policy(time)
        newPolicy.Policy_considerPolicy(self, time, self.policyCap)

    self.NetworkBase_updatePolicyScore(time)

    for agentID in self.Agents:
        self.Agents[agentID].Agent_updateAgent(
            time, supportDepressionImpact, concealDiscriminateImpact,
            discriminateConcealImpact, discriminateDepressionImpact,
            concealDepressionImpact, support, conceal, discrimination,
            attitude, depression)
def JN(domain: Domain, policy: Policy.Policy, N):
    """Return the expected value after N turns with a policy in a domain."""
    if N == 0:
        return 0
    else:
        R = domain.reward(domain.state, policy.action(domain.state))
        domain.moves(policy.action(domain.state))
        return R + domain.gamma * JN(domain, policy, N - 1)
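# In equation form, the recursion above computes, under the (assumed) convention
# that mu is the stationary policy followed by the agent:
#
#   J_0(s) = 0,    J_N(s) = r(s, mu(s)) + gamma * J_{N-1}(s')
#
# where s' is the state reached after domain.moves(mu(s)); note that the call
# advances domain.state as a side effect.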
def __init__(self, world_space, **kwargs):
    self.init_var = kwargs.pop("init_variance")
    Policy.__init__(self, world_space, **kwargs)

    if self.type == Policy.STATE_VALUES:
        self.vals = np.random.normal(loc=0, scale=self.init_var,
                                     size=self._s_dim)
    elif self.type == Policy.ACTION_STATE_VALUES:
        self.vals = np.random.normal(loc=0, scale=self.init_var,
                                     size=np.append(self._s_dim, self._num_a))
    else:
        raise ValueError("kwarg value_type is invalid")
def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
    self.FILE = FILE
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')

    self.policy = Policy().to(self.device)
    self.policy.load_state_dict(torch.load(self.FILE))
    self.policy.eval()

    self.criterion = nn.CrossEntropyLoss()
    self.learning_rate = learning_rate
    self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                      lr=self.learning_rate)
def MatrixJN(domain: Domain, policy: Policy.Policy, N):
    """Return the list of matrices of expected values after N turns with a policy in a domain."""
    L = [np.array([[0. for k in range(domain.n)] for l in range(domain.m)])]
    for h in range(1, N):
        L.append(np.array([[0. for k in range(domain.n)] for l in range(domain.m)]))
        for i in range(domain.n):
            for j in range(domain.m):
                # Successor cell, clipped to the grid boundaries
                next_j = min(max(j + policy.action([i, j])[1], 0), domain.m - 1)
                next_i = min(max(i + policy.action([i, j])[0], 0), domain.n - 1)
                L[-1][j][i] = domain.reward([i, j], policy.action([i, j]))
                L[-1][j][i] += domain.gamma * (1 - domain.beta) * L[-2][next_j][next_i]
                L[-1][j][i] += domain.gamma * domain.beta * L[-2][0][0]
    return L
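# The nested loops above implement the same recursion as JN, tabulated over the whole
# grid and including the stochastic reset used in this domain (with probability beta
# the agent is sent back to cell (0, 0)); in equation form:
#
#   J_0(s) = 0
#   J_N(s) = r(s, mu(s)) + gamma * [ (1 - beta) * J_{N-1}(f(s, mu(s))) + beta * J_{N-1}((0, 0)) ]
#
# where f clips the move to the grid boundaries, matching the min/max indexing above.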
def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    if softmax:
        ret = Policy({s: find_softmax_action_probs(v)
                      for s, v in qf_dict.items()})
    else:
        ret = Policy({s: find_epsilon_action_probs(v, epsilon)
                      for s, v in qf_dict.items()})
    return ret
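# The two helpers referenced above are not part of this snippet; a minimal sketch
# under the usual definitions (softmax over the Q-values of a state, epsilon-greedy
# mass around the argmax action) could look like the following. The Dict[str, float]
# value type and the function bodies are assumptions, not the original implementation.
import math
from typing import Dict


def find_softmax_action_probs(action_values: Dict[str, float]) -> Dict[str, float]:
    # Numerically stable softmax over one state's action values.
    max_v = max(action_values.values())
    exp_vals = {a: math.exp(v - max_v) for a, v in action_values.items()}
    total = sum(exp_vals.values())
    return {a: e / total for a, e in exp_vals.items()}


def find_epsilon_action_probs(action_values: Dict[str, float],
                              epsilon: float) -> Dict[str, float]:
    # Epsilon-greedy: spread epsilon uniformly, put the rest on the greedy action.
    greedy_action = max(action_values, key=action_values.get)
    probs = {a: epsilon / len(action_values) for a in action_values}
    probs[greedy_action] += 1.0 - epsilon
    return probs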
def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0
        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1
            action = model(
                torch.tensor(state, dtype=torch.float32,
                             device=device).unsqueeze(0)).argmax()
            state, reward, done = env.step(action)
            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
    env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")
        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
def load_weights(self, load_from):
    checkpoint = torch.load(load_from)
    qnet_params = checkpoint['critic_params']
    policy_params = checkpoint['actor_params']

    self.actor_local = Policy(policy_params['actor_params'])
    self.actor_local.load_state_dict(
        checkpoint['actor_params']['state_dict'])

    self.critic_local = QNetwork(qnet_params['critic_params'])
    self.critic_local.load_state_dict(
        checkpoint['critic_params']['state_dict'])
    return self
def __init__(self, env="LunarLanderContinuous-v2", gamma=0.99):
    self.policy = Policy(env_id=env)
    self.env = gym.make(env)
    self.runs = Runs(gamma=gamma)
    self.plotter = VisdomPlotter(env_name=env)

    self.device_cpu = torch.device("cpu")
    if torch.cuda.is_available():
        self.use_gpu = True
        self.device = torch.device("cuda")
    else:
        self.use_gpu = False
        self.device = torch.device("cpu")
def __init__(self, element, ns, loggerParentName=None, debug=False):
    if loggerParentName:
        loggerName = loggerParentName + ".ReplicationPolicy"
    else:
        loggerName = "ReplicationPolicy"
    self.logger = logging.getLogger(loggerName)
    if debug:
        self.logger.setLevel(logging.DEBUG)
    else:
        self.logger.setLevel(logging.INFO)

    Policy.__init__(self, element, ns, loggerName, debug)
    self.dataset = None
    self.actions = []
def NetworkBase_enforcePolicy(self, time, score=None, onlyDisc=False):
    ONLY_NON_DISCRIMINATORY = 1
    ONLY_DISCRIMINATORY = 2

    if self.policyScore + score > self.policyCap:
        return

    if score:
        enforcedPolicy = Policy(time=time, score=score)
    else:
        # Maps from the boolean value to the ints specified above
        biasType = int(onlyDisc) + 1
        enforcedPolicy = Policy(time=time, biasPass=biasType)

    self.NetworkBase_addToPolicies(enforcedPolicy, time)
def one_step_lookahead(self, V, pe=0):
    new_policy_mat = [[[None for y in range(self.length)]
                       for x in range(self.width)]
                      for dir in range(self.num_dirs)]

    for state in self.env.states:
        adj_states = self.env.getAdjStates(state)
        max_action_value = float("-inf")
        best_action = None
        for action_tuple in action_space:
            move, rotate = action_tuple
            action = Action(move, rotate)
            action_value = 0
            for nxt_state in adj_states:
                nxt_x, nxt_y, nxt_dir = nxt_state.getState()
                action_value += self.get_trans_prob(
                    pe, state, action, nxt_state) * V[nxt_dir][nxt_x][nxt_y]
            if action_value > max_action_value:
                max_action_value = action_value
                best_action = action

        cur_x, cur_y, cur_dir = state.getState()
        new_policy_mat[cur_dir][cur_x][cur_y] = best_action

    new_policy = Policy(new_policy_mat)
    return new_policy
def get_value_func_dict(self, pol: Policy):
    sa_dict = self.mdp_rep.state_action_dict
    vf_dict = {s: 0.0 for s in sa_dict.keys()}
    act_gen_dict = {
        s: get_rv_gen_func_single(pol.get_state_probabilities(s))
        for s in sa_dict.keys()
    }
    episodes = 0
    updates = 0

    while episodes < self.num_episodes:
        state = self.mdp_rep.init_state_gen()
        steps = 0
        terminate = False

        while not terminate:
            action = act_gen_dict[state]()
            next_state, reward = \
                self.mdp_rep.state_reward_gen_dict[state][action]()
            vf_dict[state] += self.learning_rate * \
                (updates / self.learning_rate_decay + 1) ** -0.5 * \
                (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                 vf_dict[state])
            updates += 1
            steps += 1
            terminate = steps >= self.max_steps or \
                state in self.mdp_rep.terminal_states
            state = next_state

        episodes += 1

    return vf_dict
def create_greedy_worker(networks, counter, config):
    logger = Logger("output_{0}_greedy.out".format(config['experiment']))
    environment = HFOEnv(port=6321, seed=86868686, numOpponents=1)
    environment.connectToServer()
    w_args = (100000, networks["learning"], environment,
              Policy(logger=logger), logger, counter)
    return mp.Process(target=policy_worker.run, args=w_args)
def run(self):
    if not self.ready:
        raise Exception("StOP Parameters have not been filled")
    d_star = ceil(log(6 / ((1 - self.gamma) * self.epsilon)) / log(1 / self.gamma))
    self.state_list[self.init.value] = self.init
    lpi = 0  # last policy id

    # Must be corrected: the action list is known in the state
    for action in self.generator.get_actions(self.init):
        pol = Policy()
        pol.set_parameters(lpi, 1)
        pol.add_state(self.init, action, 0)  # is it really 0? I think so...
        lpi = self.add_policy(pol, lb, ub, lpi)
        d = self.delta / (d_star * self.generator.bf(self.init))
        # s_u is a couple (state, action)
        self.sample_eff(pol, self.init, action, self.m(1, d))

    while True:
        candidate_policies = []
        for action in self.generator.get_actions(self.init):
            value_tr(self.init, action)  # does something, not entirely sure what...
            # here the best policy for this action should be extracted
        # here the two best actions and their associated policies
        # p1, p2, a1, a2 should be extracted
        if p1.lb + self.epsilon >= p2.ub:
            return [p1, a1]
        if p2.depth >= p1.depth:
            a = a1
            p = p1
        else:
            a = a2
            p = p2

        # calculation of K:
        K = 1
        for i in range(0, p.depth):  # should be checked with respect to the indices
            # multiplying by the branching factor...
            K *= self.generator.pessimistic_action_number(self.init, i) ** \
                self.generator.pessimistic_children_number(self.init, i)
        d = self.delta / (d_star * K)  # ???
def toString(self):
    s = 'Policy:'
    s += Policy.toString(self)
    s += self.dataset.toString('\t')
    s += '\tActions:\n'
    for action in self.actions:
        s += action.toString('\t\t')
    return s
def main():
    address_old = 'localhost'
    port_old = 27017
    address_new = '123.56.65.17'
    port_new = 27017

    Area.insert_area(address_old, port_old, address_new, port_new)
    WeatherTranslation.insert_weather_translation(address_new, port_new)
    Policy.insert_policy(address_new, port_new)
    # RecommendHistory.insert_recommend_history(address_new, port_new)
    PolicyMap.insert_policy_map(address_new, port_new)
    News.insert_news(address_new, port_new)
    print("OK")
def __init__(self, params):
    self.params = params
    self.__state_dim = params['state_dim']
    self.__action_dim = params['action_dim']
    self.__buffer_size = params['buffer_size']
    self.__batch_size = params['batch_size']
    self.__gamma = params['gamma']
    self.__tau = params['tau']
    self.__lr = params['lr']
    self.__update_every = params['update_every']
    eps = params['eps']
    eps_decay = params['eps_decay']
    min_eps = params['min_eps']
    seed = params['seed']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Q-Network
    critic_params = dict()
    critic_params['seed'] = seed
    critic_params['arch_params'] = params['arch_params_critic']
    self.critic_local = QNetwork(critic_params).to(device)
    self.critic_target = QNetwork(critic_params).to(device)
    self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                       lr=self.__lr)

    # Policy
    actor_params = dict()
    actor_params['seed'] = seed
    actor_params['arch_params'] = params['arch_params_actor']
    actor_params['noise_type'] = params['noise_type']
    actor_params['eps'] = eps
    actor_params['eps_decay'] = eps_decay
    actor_params['min_eps'] = min_eps
    self.actor_local = Policy(actor_params).to(device)
    self.actor_target = Policy(actor_params).to(device)
    self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                      lr=self.__lr)

    self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
    self.__t_step = 0
def main():
    env = gym.make('CartPole-v1')
    pi = Policy(LEARNING_RATE, GAMMA)
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False

        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r, prob[a]))
            s = s_prime
            score += r

        pi.train_net()

        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score: {}".format(
                n_epi, score / print_interval))
            score = 0.0

    env.close()
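# The Policy class used by the loop above is not shown here; a minimal sketch of a
# compatible REINFORCE policy network, following the standard pattern of storing
# (reward, action-probability) pairs during an episode and updating at episode end.
# Layer sizes and method bodies are assumptions chosen for CartPole-v1.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Policy(nn.Module):
    def __init__(self, learning_rate, gamma):
        super().__init__()
        self.gamma = gamma
        self.data = []                  # (reward, prob of taken action) per step
        self.fc1 = nn.Linear(4, 128)    # CartPole observation is 4-dimensional
        self.fc2 = nn.Linear(128, 2)    # two discrete actions
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=0)

    def put_data(self, item):
        self.data.append(item)

    def train_net(self):
        # REINFORCE: walk the episode backwards, accumulate discounted returns,
        # and ascend the log-probability of the actions actually taken.
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + self.gamma * R
            loss = -torch.log(prob) * R
            loss.backward()
        self.optimizer.step()
        self.data = []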
def __init__(
        self,
        _NAUDIO_COMMANDS,         # scalar, number of possible audio commands
        _EEG_INPUT_SHAPE,         # shape, (ntimepoints, nchan, nfreqs)
        _LOGDIR,                  # directory to write summaries and whatnot
        _POLICY_LR=1e-4,          # scalar, policy learning rate
        _VALUE_LR=1e-3,           # scalar, value learning rate
        _REWARD_MA_LEN=100,       # scalar
        _LSTM_CELLS=[30, 30, 30]  # lstm dimensions, (cell0_size, cell1_size, ...);
                                  # total length is the number of cells
):
    # These should not be changed by the user but may change later in the architecture
    self._InputShape = list(_EEG_INPUT_SHAPE)
    self._LSTMCells = list(_LSTM_CELLS)
    self._LSTMUnrollLength = 1
    self._ValueDiscount = 1.0

    self._Policy = Policy(_LEARNING_RATE=_POLICY_LR,
                          _ACTIONSPACE_SIZE=_NAUDIO_COMMANDS)
    self._Value = Value(_LEARNING_RATE=_VALUE_LR,
                        _DISCOUNT_RATE=self._ValueDiscount)
    self._Reward = Reward(_INPUT_SHAPE=_EEG_INPUT_SHAPE,
                          _MA_LENGTH=_REWARD_MA_LEN)
    self._Shared = Shared(_CELLS=_LSTM_CELLS,
                          _UNROLL_LENGTH=self._LSTMUnrollLength)

    # We store a version of the hidden state which we pass in every iteration
    self._HiddenStateShape = (len(_LSTM_CELLS), 2, self._LSTMUnrollLength,
                              _LSTM_CELLS[-1])
    self._LocalHiddenState = np.zeros(self._HiddenStateShape)

    # Save the logdir
    self.mLogdir = _LOGDIR

    self._buildModel()
    self._buildSummaries()
    self._buildFeedDicts()
    self._initSession()
def __evaluate_general_award(self, battle_info, card):
    free_pos = battle_info.field.get_empty_pos(self_side_flag=True)
    best_pos = -1
    best_award = -1
    for pos in free_pos:
        award = self.get_policy_award(battle_info, card, pos)
        print("Testing Card " + str(card.id) + " in pos " + str(pos) +
              " Award: " + str(award))
        if award > best_award:
            best_pos = pos
            best_award = award
    return Policy(card, best_pos, best_award)
def setUp(self):
    self.ss = (7, 9)
    self.a_map = OrderedDict()
    self.a_map['U'] = (1, 1)
    self.a_map['D'] = (-1, -1)
    self.a_map['R'] = (1, 0)
    self.a_map['L'] = (1, -2)
    self.ws = WorldSpace(self.ss, self.a_map)

    self.p_kw = {}
    self.p_kw['discount_factor'] = 1
    self.p_kw['exploration_factor'] = 0.95
    self.p_kw['is_static'] = False
    self.p_kw['learn_rate'] = 0.001
    self.policy = Policy(self.ws, **self.p_kw)

    self.p_kw['value_type'] = Policy.STATE_VALUES
    self.p_kw['init_variance'] = 0.01
    self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
def policy_iteration(self, tol=1e-4) -> DetPolicy:
    '''Find the optimal policy using policy iteration'''
    pol = Policy({
        s: {a: 1. / len(v) for a in v}
        for s, v in self.state_action_dict.items()
    })
    vf = self.find_value_func_dict(pol)
    epsilon = tol * 1e4
    while epsilon >= tol:
        pol = self.find_improved_policy(pol)
        new_vf = self.find_value_func_dict(pol)
        epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
        vf = new_vf
    return pol
class TestTabularPolicy(unittest.TestCase):

    def setUp(self):
        self.ss = (7, 9)
        self.a_map = OrderedDict()
        self.a_map['U'] = (1, 1)
        self.a_map['D'] = (-1, -1)
        self.a_map['R'] = (1, 0)
        self.a_map['L'] = (1, -2)
        self.ws = WorldSpace(self.ss, self.a_map)

        self.p_kw = {}
        self.p_kw['discount_factor'] = 1
        self.p_kw['exploration_factor'] = 0.95
        self.p_kw['is_static'] = False
        self.p_kw['learn_rate'] = 0.001
        self.policy = Policy(self.ws, **self.p_kw)

        self.p_kw['value_type'] = Policy.STATE_VALUES
        self.p_kw['init_variance'] = 0.01
        self.tab_pol = TabularPolicy(self.ws, **self.p_kw)

    # Test that the TabularPolicy enforces discrete states and creates a
    # value matrix for the state_space
    def test_TabularPolicy(self):
        self.assertTrue(np.all(self.tab_pol.vals.shape == self.ss))
        self.assertTrue(self.tab_pol.IsValidState((0, 0)))
        self.assertTrue(self.tab_pol.IsValidState((1, 3)))
        self.assertFalse(self.tab_pol.IsValidState((0.1, 0)))  # non-integer states are invalid for TabularPolicy
        self.assertFalse(self.tab_pol.IsValidState((3, 2.3)))  # non-integer states are invalid for TabularPolicy
        self.assertFalse(self.tab_pol.IsValidState(2.3))        # make sure there's no index error
        self.assertFalse(self.tab_pol.IsValidState('a'))        # make sure there's no index error or type error

        self.assertTrue(self.tab_pol.IsValidStateAction((1, 3), 0))
        self.assertFalse(self.tab_pol.IsValidStateAction((1, 3.3), 0))  # non-integer states are invalid for TabularPolicy
        self.assertTrue(self.policy.IsValidStateAction((1, 3.3), 0))    # non-integer states are valid for Policy

        self.p_kw['value_type'] = Policy.ACTION_STATE_VALUES
        self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
        self.assertTrue(np.all(self.tab_pol.vals.shape ==
                               np.append(self.ss, len(self.a_map))))
def create_workers(config, logger):
    counter = mp.Value('i', 0)

    learning_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    target_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    learning_network.load_state_dict(target_network.state_dict())

    optimizer = SharedAdam(learning_network.parameters(),
                           lr=config["learning_rate"])
    optimizer.share_memory()

    workers = []
    for idx in range(0, config["n_workers"]):
        networks = {"learning": learning_network, "target": target_network}
        # environment = create_environment(idx)
        policy = Policy(epsilon=config["startingEpsilons"][idx],
                        numUpdates=config["numPolicyUpdates"],
                        minEpsilon=config["minEpsilons"][idx],
                        logger=logger)
        trainingArgs = (idx, networks, optimizer, counter, policy, config,
                        logger)
        p = mp.Process(target=Worker.train, args=trainingArgs)
        logger.log("Starting process: {0}".format(idx))
        p.start()
        logger.log("Process started: {0}".format(idx))
        workers.append(p)
        logger.log("Worker Appended: {0}".format(idx))

    logger.log("Creating the greedy worker")
    p = create_greedy_worker(networks, counter, config)
    p.start()
    workers.append(p)
    logger.log("Greedy worker started and appended")

    return workers, target_network
def value_iter(self, discount, pe=0):
    prev_value = np.zeros((self.num_dirs, self.width, self.length))
    new_policy_matrix = [[[None for y in range(self.length)]
                          for x in range(self.width)]
                         for dir in range(self.num_dirs)]
    converge = False

    while not converge:
        # print("\nValue Iteration ")
        new_value = np.zeros((self.num_dirs, self.width, self.length))
        for cur_state in self.env.states:
            cur_x, cur_y, cur_dir = cur_state.getState()
            adj_states = self.env.getAdjStates(cur_state)
            best_action = None
            max_action_value = float("-inf")
            for action_tuple in action_space:
                move, rotate = action_tuple
                action = Action(move, rotate)
                action_value = 0
                for nxt_state in adj_states:
                    x, y, dir = nxt_state.getState()
                    action_value += self.get_trans_prob(
                        pe, cur_state, action, nxt_state) * \
                        (self.get_reward(cur_state) +
                         discount * prev_value[dir][x][y])
                if action_value > max_action_value:
                    best_action = action
                    max_action_value = action_value

            new_policy_matrix[cur_dir][cur_x][cur_y] = best_action
            new_value[cur_dir][cur_x][cur_y] = max_action_value

        diff = np.sum(np.abs(new_value - prev_value))
        # print("Value diff: ", diff)
        if np.array_equal(new_value, prev_value):
            converge = True
        prev_value = new_value

    new_policy = Policy(new_policy_matrix)
    return new_policy, new_value
def parsePolicy(self, fname):
    width = None
    height = 0
    policy = []
    with open(fname) as file:
        while True:
            line = file.readline()
            if not line:
                break
            if width is not None and (len(line) - 1) != width:
                raise Exception("Input width inconsistent")
            width = len(line) - 1
            height += 1
            rowActions = self.parseLine(line)
            policy.extend(rowActions)
    policy = Policy(policy)
    policy.setWidth(width)
    policy.setHeight(height)
    return policy
# 'policy' or 'dqn' to choose which type of model to evaluate
model_type = 'policy'
# model_type = 'dqn'
model_path = "policies/22-1-2021_13-44/policy0.tar"

env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path))
model.eval()

state = env.get_state()
while not exit_program:
    env.render()
    action = model(
        torch.tensor(state, dtype=torch.float32,
                     device=device).unsqueeze(0)).argmax()
    state, reward, done = env.step(action)
def parse(self):
    Policy.parse(self)
    self.parseDataSets(self.root.findall(self.ns + 'dataset'))
    self.parseActions(self.root.findall(self.ns + 'actions'))
class PPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 ppo_mini_batch_size=64,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)
        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states,), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            data = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
            self.policy_net, self.value_net, self.running_state = \
                data.policy_net, data.value_net, data.running_state

        self.collector = MemoryCollector(self.env, self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action

    def eval(self, i_iter, render=False):
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

        batch, permuted_batch = memory.sample()  # sample all items in memory
        # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = int(
                math.ceil(batch_size / self.ppo_mini_batch_size))

            # update with mini-batch
            for _ in range(self.ppo_epochs):
                index = torch.randperm(batch_size)

                for i in range(mini_batch_num):
                    ind = index[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action, returns, advantages, old_log_pis = \
                        batch_state[ind], batch_action[ind], \
                        batch_return[ind], batch_advantage[ind], \
                        batch_log_prob[ind]

                    alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p,
                                              self.optimizer_v, 1, state,
                                              action, returns, advantages,
                                              old_log_pis, self.clip_epsilon,
                                              1e-3)
        else:
            for _ in range(self.ppo_epochs):
                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, batch_state, batch_action,
                                          batch_return, batch_advantage,
                                          batch_log_prob, self.clip_epsilon,
                                          1e-3)

        return alg_step_stats

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ppo_encoder.p'.format(save_path, self.env_id),
                         'wb'))
def train_policies(self, load_best_policy=False, load_reinforcement=False):
    if load_reinforcement:
        for i in range(0, game_setting.K):
            policy = Policy(self.game_setting)
            file_name = policy.load_reinforcement_model(i)
            self.policies.append([policy, file_name, 0, 0])
        return

    if load_best_policy:
        start = 1
        policy = Policy(self.game_setting)
        nr_of_training_cases = policy.load_best_model()
        self.policies.append([policy, nr_of_training_cases, 0, 0])
    else:
        start = 0

    policy = Policy(self.game_setting)
    max_cases = min(
        policy.import_data_and_train(max_cases=self.max_cases,
                                     test_nr_of_cases=True),
        self.max_cases)

    if self.negative_training_power > 0:
        for i in range(start, self.K):
            nr_of_cases = max(
                0, max_cases // ((i + 1) ** self.negative_training_power))
            if nr_of_cases > 0:
                policy = Policy(self.game_setting)
                actual_nr_of_cases = policy.import_data_and_train(
                    max_cases=nr_of_cases)
            else:
                policy = Policy(self.game_setting, no_model=True)
                actual_nr_of_cases = 0
            self.policies.append([policy, actual_nr_of_cases, 0, 0])
    else:
        for i in range(start, self.K):
            policy = Policy(self.game_setting)
            nr_of_cases = max(
                int(max_cases * (self.K - i - 1) / (self.K - 1)), 0)
            if nr_of_cases > 0:
                actual_nr_of_cases = policy.import_data_and_train(
                    max_cases=nr_of_cases)
                self.policies.append([policy, actual_nr_of_cases, 0, 0])
            else:
                self.policies.append([policy, 0, 0, 0])
import ConfigParser
import json  # needed by MSG_wrapper below
import sys
import traceback

import IRecv_Module as IM

import logger
from BaseThread import BaseThread
from MPI_Wrapper import Client
from MPI_Wrapper import Tags
from Policy import Policy
from Task import SampleTask
from Task import TaskStatus
from WorkerRegistry import WorkerStatus

policy = Policy()
log = logger.getLogger('WorkerAgent')
wlog = None


def MSG_wrapper(**kwd):
    return json.dumps(kwd)


class HeartbeatThread(BaseThread):
    """Ping the master to update status."""

    def __init__(self, client, worker_agent):
        BaseThread.__init__(self, name='HeartbeatThread')
def __init__(self):
    Policy.__init__(self)
def policy(w):
    '''Calculate policy for given world'''
    p = Policy(w)
    p.policyIteration(turbo=True)
    return p