Code example #1
    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)

        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            data = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
            self.policy_net, self.value_net, self.running_state = data.policy_net, data.value_net, data.running_state

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)
Code example #2
File: Agent.py Project: meowatthemoon/DQfD
    def __init__(self, eps, lr, gamma, batch_size, tau, max_memory, lambda_1,
                 lambda_2, lambda_3, n_steps, l_margin):
        # Input Parameters
        self.eps = eps  # eps-greedy
        self.gamma = gamma  # discount factor
        self.batch_size = batch_size
        self.tau = tau  # frequency of target replacement
        self.ed = 0.005  # bonus for demonstration # todo they aren't used
        self.ea = 0.001  # todo they aren't used
        self.l_margin = l_margin
        self.n_steps = n_steps
        self.lambda1 = lambda_1  # n-step return
        self.lambda2 = lambda_2  # supervised loss
        self.lambda3 = lambda_3  # L2

        self.counter = 0  # target replacement counter # todo change to iter_counter
        self.replay = Memory(capacity=max_memory)
        self.loss = nn.MSELoss()
        self.policy = Policy()  # todo change not have to pass architecture
        self.opt = optim.Adam(self.policy.predictNet.parameters(),
                              lr=lr,
                              weight_decay=lambda_3)

        self.replay.e = 0
        self.demoReplay = ddict(list)

        self.noisy = hasattr(self.policy.predictNet, "sample")
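A constructor call consistent with the signature above might look like the sketch below; the hyperparameter values are illustrative and not taken from the DQfD project:

agent = Agent(eps=0.05, lr=1e-4, gamma=0.99, batch_size=32, tau=1000,
              max_memory=50000, lambda_1=1.0, lambda_2=1.0, lambda_3=1e-5,
              n_steps=10, l_margin=0.8)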
Code example #3
    def NetworkBase_timeStep(self, time, supportDepressionImpact, 
        concealDiscriminateImpact, discriminateConcealImpact, 
        discriminateDepressionImpact, concealDepressionImpact,
        support=None, conceal=None, discrimination=None, 
        attitude=None, depression=None, policyScore=None, bias=0): 
        ONLY_NON_DISCRIMINATORY = 1
        ONLY_DISCRIMINATORY = 2

        # "Natural gap" between passing of enforced policies
        TIME_GAP = 5

        # Considers the cases where the type of policy is externally
        # enforced (not proposed at random in simulation)
        if (policyScore or bias) and time % TIME_GAP == 0:
            newPolicy = Policy(time, score=policyScore, biasPass=bias)

            # Converts the numerical bias into a boolean indicating whether
            # the scores are biased towards discriminatory or supportive policies
            if bias == ONLY_NON_DISCRIMINATORY: onlyDisc = False
            else: onlyDisc = True

            self.NetworkBase_enforcePolicy(time, score=policyScore, 
                onlyDisc=onlyDisc)

        else:
            newPolicy = Policy(time)
            newPolicy.Policy_considerPolicy(self, time, self.policyCap)
        
        self.NetworkBase_updatePolicyScore(time)
        for agentID in self.Agents:
            self.Agents[agentID].Agent_updateAgent(time, supportDepressionImpact,
                concealDiscriminateImpact, discriminateConcealImpact, 
                discriminateDepressionImpact, concealDepressionImpact,
                support, conceal, discrimination, attitude, depression)
Code example #4
def JN(domain: Domain, policy: Policy.Policy, N):
    # method to return the Expected value after N turn with a policy in a domain
    if N == 0:
        return 0
    else:
        R = domain.reward(domain.state, policy.action(domain.state))
        domain.moves(policy.action(domain.state))
        return R + domain.gamma * JN(domain, policy, N-1)
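The recursion above accumulates the N-step discounted return of a fixed policy. Below is a self-contained toy version of the same recursion; the chain dynamics and the names GAMMA, toy_reward, toy_step and jn_toy are invented for illustration and do not use the original Domain or Policy classes:

GAMMA = 0.9

def toy_reward(state):
    # reward of 1 only while sitting in state 1
    return 1.0 if state == 1 else 0.0

def toy_step(state):
    # deterministic dynamics: always move to (and stay in) state 1
    return 1

def jn_toy(state, n, gamma=GAMMA):
    # same structure as JN: immediate reward plus the discounted value of the remaining steps
    if n == 0:
        return 0.0
    return toy_reward(state) + gamma * jn_toy(toy_step(state), n - 1, gamma)

print(jn_toy(state=0, n=5))  # 0 + 0.9 + 0.81 + 0.729 + 0.6561 = 3.0951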
Code example #5
File: TabularPolicy.py Project: SwappyG/RL_Practise
	def __init__(self, world_space, **kwargs):
		self.init_var = kwargs.pop("init_variance")
		Policy.__init__(self, world_space, **kwargs)

		if self.type == Policy.STATE_VALUES:
			self.vals = np.random.normal(loc=0, scale=self.init_var, size=self._s_dim)
		elif self.type == Policy.ACTION_STATE_VALUES:
			self.vals = np.random.normal(loc=0, scale=self.init_var, size=np.append(self._s_dim, self._num_a) )
		else:
			raise ValueError("kwarg value_type is invalid")
Code example #6
File: Learner.py Project: APM150/CartPole_v0
 def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
     self.FILE = FILE
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.policy = Policy().to(self.device)
     self.policy.load_state_dict(torch.load(self.FILE))
     self.policy.eval()
     self.criterion = nn.CrossEntropyLoss()
     self.learning_rate = learning_rate
     self.optimizer = torch.optim.Adam(self.policy.parameters(),
                                       lr=self.learning_rate)
Code example #7
def MatrixJN(domain: Domain, policy: Policy.Policy, N):
    # method to return the list of Matrix of Expected value after N turn with a policy in a domain
    L = [np.array([[0. for k in range(domain.n)] for l in range(domain.m)])]
    for h in range(1, N):
        L.append(np.array([[0. for k in range(domain.n)] for l in range(domain.m)]))
        for i in range(domain.n):
            for j in range(domain.m):
                L[-1][j][i] = domain.reward([i, j], policy.action([i, j]))
                L[-1][j][i] += domain.gamma * (1 - domain.beta) * L[-2][min(max(j + policy.action([i, j])[1], 0), domain.m - 1)][min(max(i + policy.action([i, j])[0], 0), domain.n - 1)]
                L[-1][j][i] += domain.gamma * domain.beta * L[-2][0][0]
    return L
Code example #8
def get_soft_policy_from_qf_dict(
    qf_dict: SAf,
    softmax: bool,
    epsilon: float
) -> Policy:
    if softmax:
        ret = Policy({s: find_softmax_action_probs(v) for s, v in
                      qf_dict.items()})
    else:
        ret = Policy({s: find_epsilon_action_probs(v, epsilon) for s, v in
                      qf_dict.items()})
    return ret
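The helpers find_softmax_action_probs and find_epsilon_action_probs are not shown on this page. A plausible sketch of the epsilon-greedy variant, assuming each value in qf_dict is a plain {action: value} mapping (an assumption, not confirmed by the source):

def find_epsilon_action_probs_sketch(action_values, epsilon):
    # the greedy action keeps most of the probability mass, the rest is spread uniformly
    best = max(action_values, key=action_values.get)
    n = len(action_values)
    return {a: epsilon / n + (1.0 - epsilon) * (a == best) for a in action_values}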
Code example #9
def eval(model_type=model_type, model_path=model_path):
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    env = LunarLander()

    if model_type == 'policy':
        model = Policy(env.observation_dim, env.action_dim)
    elif model_type == 'dqn':
        model = Network(env.observation_dim, env.action_dim)
    model.to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()

    episodes = 50
    wins = 0
    frames = []
    fuel_left = []
    for i in range(episodes):
        if i % 10 == 0:
            print(f"On episode {i}")
        frame_count = 0

        env.reset()
        state = env.get_state()
        while True:
            frame_count += 1

            action = model(
                torch.tensor(state, dtype=torch.float32,
                             device=device).unsqueeze(0)).argmax()

            state, reward, done = env.step(action)

            if done:
                if env.won:
                    wins += 1
                    frames.append(frame_count)
                    fuel_left.append(env.rocket.fuel)
                break
        env.close()

    if wins > 0:
        print(f"wins: {wins}")
        print(f"mean frames on wins {np.mean(frames)}")
        print(f"std frames on wins {np.std(frames, ddof=1)}")
        print(f"min frames on wins {np.min(frames)}")
        print(f"max frames on wins {np.max(frames)}")

        print(f"mean fuel on wins {np.mean(fuel_left)}")
        print(f"std fuel on wins {np.std(fuel_left, ddof=1)}")
        print(f"min fuel on wins {np.min(fuel_left)}")
        print(f"max fuel on wins {np.max(fuel_left)}")
    else:
        print("The model had 0 wins. Statistics can't be calculated")
Code example #10
    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']

        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(
            checkpoint['actor_params']['state_dict'])

        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(
            checkpoint['critic_params']['state_dict'])
        return self
Code example #11
File: Looper.py Project: aakash94/RLSamples
    def __init__(self, env="LunarLanderContinuous-v2", gamma=0.99):
        self.policy = Policy(env_id=env)
        self.env = gym.make(env)
        self.runs = Runs(gamma=gamma)

        self.plotter = VisdomPlotter(env_name=env)

        self.device_cpu = torch.device("cpu")
        if torch.cuda.is_available():
            self.use_gpu = True
            self.device = torch.device("cuda")
        else:
            self.use_gpu = False
            self.device = torch.device("cpu")
Code example #12
    def __init__(self, element, ns, loggerParentName=None, debug=False):

        if loggerParentName: loggerName = loggerParentName + ".ReplicationPolicy"
        else: loggerName = "ReplicationPolicy"
        self.logger = logging.getLogger(loggerName)

        if debug:
            self.logger.setLevel(logging.DEBUG)
        else:
            self.logger.setLevel(logging.INFO)        

        Policy.__init__(self, element, ns, loggerName, debug)
        self.dataset = None
        self.actions = []
Code example #13
    def NetworkBase_enforcePolicy(self, time, score=None, onlyDisc=False):
        ONLY_NON_DISCRIMINATORY = 1
        ONLY_DISCRIMINATORY = 2 

        # Treat a missing score as 0 so the cap check doesn't fail when only a bias is passed
        if self.policyScore + (score or 0) > self.policyCap:
            return

        if score:
            enforcedPolicy = Policy(time=time, score=score)
        else:
            # Maps from boolean value to the ints specified above
            biasType = int(onlyDisc) + 1
            enforcedPolicy = Policy(time=time, biasPass=biasType)
            
        self.NetworkBase_addToPolicies(enforcedPolicy, time)
Code example #14
File: Agent.py Project: pillumina/robotics_pset2
    def one_step_lookahead(self, V, pe=0):
        new_policy_mat = [[[None for y in range(self.length)]
                           for x in range(self.width)]
                          for dir in range(self.num_dirs)]
        for state in self.env.states:
            adj_states = self.env.getAdjStates(state)
            max_action_value = float("-inf")
            best_action = None
            for action_tuple in action_space:
                move, rotate = action_tuple
                action = Action(move, rotate)
                action_value = 0
                for nxt_state in adj_states:
                    nxt_x, nxt_y, nxt_dir = nxt_state.getState()
                    action_value += self.get_trans_prob(
                        pe, state, action,
                        nxt_state) * V[nxt_dir][nxt_x][nxt_y]
                if action_value > max_action_value:
                    max_action_value = action_value
                    best_action = action

            cur_x, cur_y, cur_dir = state.getState()
            new_policy_mat[cur_dir][cur_x][cur_y] = best_action

        new_policy = Policy(new_policy_mat)
        return new_policy
Code example #15
File: TD0.py Project: jialongw327/RL-For-Finance
    def get_value_func_dict(self, pol: Policy):
        sa_dict = self.mdp_rep.state_action_dict
        vf_dict = {s: 0.0 for s in sa_dict.keys()}
        act_gen_dict = {
            s: get_rv_gen_func_single(pol.get_state_probabilities(s))
            for s in sa_dict.keys()
        }
        episodes = 0
        updates = 0

        while episodes < self.num_episodes:
            state = self.mdp_rep.init_state_gen()
            steps = 0
            terminate = False

            while not terminate:
                action = act_gen_dict[state]()
                next_state, reward = \
                    self.mdp_rep.state_reward_gen_dict[state][action]()
                vf_dict[state] += self.learning_rate *\
                    (updates / self.learning_rate_decay + 1) ** -0.5 *\
                    (reward + self.mdp_rep.gamma * vf_dict[next_state] -
                     vf_dict[state])
                updates += 1
                steps += 1
                terminate = steps >= self.max_steps or \
                    state in self.mdp_rep.terminal_states
                state = next_state

            episodes += 1

        return vf_dict
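Note that the step size in the TD(0) update above is not constant: it decays roughly as one over the square root of the number of updates performed so far. Isolated as a helper for clarity (a restatement of the expression in the code, not a function from the original file):

def td0_step_size(learning_rate, updates, learning_rate_decay):
    # matches learning_rate * (updates / learning_rate_decay + 1) ** -0.5 used above
    return learning_rate * (updates / learning_rate_decay + 1) ** -0.5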
Code example #16
File: worker_factory.py Project: dzhv/RL-assignment
def create_greedy_worker(networks, counter, config):
    logger = Logger("output_{0}_greedy.out".format(config['experiment']))
    environment = HFOEnv(port=6321, seed=86868686, numOpponents=1)
    environment.connectToServer()

    w_args = (100000, networks["learning"], environment, Policy(logger=logger),
              logger, counter)
    return mp.Process(target=policy_worker.run, args=w_args)
Code example #17
File: Algo.py Project: eltonio450/StOP-Alogrithm
    def run(self):
        if not self.ready:
            raise Exception("StOP Parameters have not been filled")
        
        d_star = ceil(log(6/((1-self.gamma)*self.epsilon))/log(1/self.gamma))

        self.state_list[self.init.value] = self.init
        lpi = 0 #last policy id

        """Must be corrected. Action list is know in the state""" 
        for action in self.generator.get_actions(self.init):
            pol = Policy()
            pol.set_parameters(lpi, 1)
            pol.add_state(self.init, action, 0) #is it really 0 ? I think so... 
            lpi = self.add_policy(pol, lb, ub, lpi)

            d = self.delta/(d_star*self.generator.bf(self.init))
             
            #s_u is a couple (state, action)
            self.sample_eff(pol, self.init, action, self.m(1,d))

        while True:
            candidate_policies = []
            for action in self.generator.get_actions(self.init):
                value_tr(self.init, action) # not entirely sure what this does...
                # here we need to extract the best policy for this action
            # here we need to extract the two best actions and their associated policies
            p1,p2,a1,a2
            
            if p1.lb + self.epsilon >= p2.ub:
                return [p1, a1]

            if p2.depth >= p1.depth:
                a = a1
                p = p1
            else:
                a = a2
                p = p2

            #calculation of K:
            K = 1
            for i in range(0, p.depth): # the indexing here deserves to be double-checked
                K *= self.generator.pessimistic_action_number(self.init, i) ** self.generator.pessimistic_children_number(self.init, i)

            # multiplying by the branching factor...
            d = self.delta/(d_star*K) #???
Code example #18
    def toString(self):

        s = 'Policy:'
        s += Policy.toString(self)
        s += self.dataset.toString('\t')
        s += '\tActions:\n'
        for action in self.actions:
            s += action.toString('\t\t')
        return s
Code example #19
File: mainNew.py Project: hezhensong/MongoConvertor
def main():
    address_old = 'localhost'
    port_old = 27017

    address_new = '123.56.65.17'
    port_new = 27017

    Area.insert_area(address_old, port_old, address_new, port_new)
    
    WeatherTranslation.insert_weather_translation(address_new, port_new)

    Policy.insert_policy(address_new, port_new)

#    RecommendHistory.insert_recommend_history(address_new, port_new)

    PolicyMap.insert_policy_map(address_new, port_new)

    News.insert_news(address_new, port_new)

    print("OK")
Code example #20
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(),
                                           lr=self.__lr)

        #Policy
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        actor_params['arch_params'] = params['arch_params_actor']
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(),
                                          lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0
Code example #21
File: REINFORCE.py Project: sumin123/RLstudy
def main():
    env = gym.make('CartPole-v1')
    pi = Policy(LEARNING_RATE, GAMMA)
    score = 0.0
    print_interval = 20

    for n_epi in range(10000):
        s = env.reset()
        done = False

        while not done:
            prob = pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample()
            s_prime, r, done, info = env.step(a.item())
            pi.put_data((r, prob[a]))
            s = s_prime
            score += r

        pi.train_net()
        if n_epi % print_interval == 0 and n_epi != 0:
            print("# of episode :{}, avg score: {}".format(
                n_epi, score / print_interval))
            score = 0.0
    env.close()
Code example #22
File: Model.py Project: jruel4/Simmie-ARCH
    def __init__(
        self,
        _NAUDIO_COMMANDS,  #scalar, number of possible audio commands
        _EEG_INPUT_SHAPE,  #shape, (ntimepoints, nchan, nfreqs)
        _LOGDIR,  #pass in directory to write summaries and whatnot
        _POLICY_LR=1e-4,  #scalar, policy learning rate
        _VALUE_LR=1e-3,  #scalar, value learning rate
        _REWARD_MA_LEN=100,  #scalar
        _LSTM_CELLS=[
            30, 30, 30
        ]  #lstm dimensions, (cell0_size, cell1_size, ...) where the total length is the number of cells
    ):

        # These should not be changed by the user but may change later in the architecture
        self._InputShape = list(_EEG_INPUT_SHAPE)
        self._LSTMCells = list(_LSTM_CELLS)
        self._LSTMUnrollLength = 1
        self._ValueDiscount = 1.0

        self._Policy = Policy(_LEARNING_RATE=_POLICY_LR,
                              _ACTIONSPACE_SIZE=_NAUDIO_COMMANDS)
        self._Value = Value(_LEARNING_RATE=_VALUE_LR,
                            _DISCOUNT_RATE=self._ValueDiscount)
        self._Reward = Reward(_INPUT_SHAPE=_EEG_INPUT_SHAPE,
                              _MA_LENGTH=_REWARD_MA_LEN)
        self._Shared = Shared(_CELLS=_LSTM_CELLS,
                              _UNROLL_LENGTH=self._LSTMUnrollLength)

        # We store a version of the hidden state which we pass in every iteration
        self._HiddenStateShape = (len(_LSTM_CELLS), 2, self._LSTMUnrollLength,
                                  _LSTM_CELLS[-1])
        self._LocalHiddenState = np.zeros(self._HiddenStateShape)

        # Save the logdir
        self.mLogdir = _LOGDIR

        self._buildModel()
        self._buildSummaries()
        self._buildFeedDicts()
        self._initSession()
Code example #23
    def __evaluate_general_award(self, battle_info, card):
        free_pos = battle_info.field.get_empty_pos(self_side_flag=True)

        best_pos = -1
        best_award = -1
        for pos in free_pos:
            award = self.get_policy_award(battle_info, card, pos)
            print("Testing Card " + str(card.id) + " in pos" + str(pos) +
                  " AWard:" + str(award))
            if award > best_award:
                best_pos = pos
                best_award = award

        return Policy(card, best_pos, best_award)
Code example #24
File: TabularPolicy.py Project: SwappyG/RL_Practise
		def setUp(self):
			self.ss = (7,9)

			self.a_map = OrderedDict()
			self.a_map['U'] = (1,1)
			self.a_map['D'] = (-1,-1)
			self.a_map['R'] = (1,0)
			self.a_map['L'] = (1,-2)

			self.ws = WorldSpace(self.ss, self.a_map)

			self.p_kw = {}

			self.p_kw['discount_factor'] = 1
			self.p_kw['exploration_factor'] = 0.95
			self.p_kw['is_static'] = False
			self.p_kw['learn_rate'] = 0.001

			self.policy = Policy(self.ws, **self.p_kw)

			self.p_kw['value_type'] = Policy.STATE_VALUES
			self.p_kw['init_variance'] = 0.01
			self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
Code example #25
File: MDP.py Project: jialongw327/RL-For-Finance
 def policy_iteration(self, tol=1e-4) -> DetPolicy:
     ''' Find the optimal policy using policy iteration '''
     pol = Policy({
         s: {a: 1. / len(v)
             for a in v}
         for s, v in self.state_action_dict.items()
     })
     vf = self.find_value_func_dict(pol)
     epsilon = tol * 1e4
     while epsilon >= tol:
         pol = self.find_improved_policy(pol)
         new_vf = self.find_value_func_dict(pol)
         epsilon = max(abs(new_vf[s] - v) for s, v in vf.items())
         vf = new_vf
     return pol
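The initial policy above is uniform over the actions available in each state. On a tiny hand-written state_action_dict (illustrative only, not from the project) the same comprehension produces:

state_action_dict = {"s0": {"a", "b"}, "s1": {"a"}}
uniform = {s: {a: 1.0 / len(v) for a in v} for s, v in state_action_dict.items()}
print(uniform)  # e.g. {'s0': {'a': 0.5, 'b': 0.5}, 's1': {'a': 1.0}} (set order may vary)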
Code example #26
File: TabularPolicy.py Project: SwappyG/RL_Practise
	class TestTabularPolicy(unittest.TestCase):

		def setUp(self):
			self.ss = (7,9)

			self.a_map = OrderedDict()
			self.a_map['U'] = (1,1)
			self.a_map['D'] = (-1,-1)
			self.a_map['R'] = (1,0)
			self.a_map['L'] = (1,-2)

			self.ws = WorldSpace(self.ss, self.a_map)

			self.p_kw = {}

			self.p_kw['discount_factor'] = 1
			self.p_kw['exploration_factor'] = 0.95
			self.p_kw['is_static'] = False
			self.p_kw['learn_rate'] = 0.001

			self.policy = Policy(self.ws, **self.p_kw)

			self.p_kw['value_type'] = Policy.STATE_VALUES
			self.p_kw['init_variance'] = 0.01
			self.tab_pol = TabularPolicy(self.ws, **self.p_kw)

		# Test that the TabularPolicy enforces discrete states and creates a value matrix for the state_space
		def test_TabularPolicy(self):

			self.assertTrue( np.all(self.tab_pol.vals.shape == self.ss) )

			self.assertTrue( self.tab_pol.IsValidState((0,0)) )
			self.assertTrue( self.tab_pol.IsValidState((1,3)) )
			self.assertFalse( self.tab_pol.IsValidState((0.1,0)) ) # non integer states are invalid for TabularPolicy
			self.assertFalse( self.tab_pol.IsValidState((3,2.3)) ) # non integer states are invalid for TabularPolicy
			self.assertFalse( self.tab_pol.IsValidState(2.3) ) # Make sure there's no index error
			self.assertFalse( self.tab_pol.IsValidState('a') ) # Make sure there's no index error or type error

			self.assertTrue( self.tab_pol.IsValidStateAction((1,3), 0) )
			self.assertFalse( self.tab_pol.IsValidStateAction((1,3.3), 0) ) # non integer states are invalid for TabularPolicy
			self.assertTrue( self.policy.IsValidStateAction((1,3.3), 0) ) # non integer states are valid for Policy

			self.p_kw['value_type'] = Policy.ACTION_STATE_VALUES
			self.tab_pol = TabularPolicy(self.ws, **self.p_kw)
			self.assertTrue( np.all(self.tab_pol.vals.shape == np.append(self.ss, len(self.a_map)) ) )
Code example #27
File: worker_factory.py Project: dzhv/RL-assignment
def create_workers(config, logger):
    counter = mp.Value('i', 0)

    learning_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    target_network = network_factory.create_network(
        num_layers=config["num_layers"], hidden_size=config["hidden_size"])
    learning_network.load_state_dict(target_network.state_dict())

    optimizer = SharedAdam(learning_network.parameters(),
                           lr=config["learning_rate"])
    optimizer.share_memory()

    workers = []
    for idx in range(0, config["n_workers"]):
        networks = {"learning": learning_network, "target": target_network}

        # environment = create_environment(idx)
        policy = Policy(epsilon=config["startingEpsilons"][idx],
                        numUpdates=config["numPolicyUpdates"],
                        minEpsilon=config["minEpsilons"][idx],
                        logger=logger)
        trainingArgs = (idx, networks, optimizer, counter, policy, config,
                        logger)
        p = mp.Process(target=Worker.train, args=trainingArgs)

        logger.log("Starting process: {0}".format(idx))

        p.start()

        logger.log("Process started: {0}".format(idx))
        workers.append(p)
        logger.log("Worker Appended: {0}".format(idx))

    logger.log("Creating the greedy worker")
    p = create_greedy_worker(networks, counter, config)
    p.start()
    workers.append(p)
    logger.log("Greedy worker started and appended")

    return workers, target_network
Code example #28
File: Agent.py Project: pillumina/robotics_pset2
    def value_iter(self, discount, pe=0):
        prev_value = np.zeros((self.num_dirs, self.width, self.length))
        new_policy_matrix = [[[None for y in range(self.length)]
                              for x in range(self.width)]
                             for dir in range(self.num_dirs)]

        converge = False

        while not converge:
            # print("\nValue Iteration ")
            new_value = np.zeros((self.num_dirs, self.width, self.length))
            for cur_state in self.env.states:
                cur_x, cur_y, cur_dir = cur_state.getState()
                adj_states = self.env.getAdjStates(cur_state)
                best_action = None
                max_action_value = float("-inf")
                for action_tuple in action_space:
                    move, rotate = action_tuple
                    action = Action(move, rotate)
                    action_value = 0
                    for nxt_state in adj_states:
                        x, y, dir = nxt_state.getState()
                        action_value += self.get_trans_prob(
                            pe, cur_state, action,
                            nxt_state) * (self.get_reward(cur_state) +
                                          discount * prev_value[dir][x][y])
                    if action_value > max_action_value:
                        best_action = action
                        max_action_value = action_value

                new_policy_matrix[cur_dir][cur_x][cur_y] = best_action
                new_value[cur_dir][cur_x][cur_y] = max_action_value

            diff = np.sum(np.abs(new_value - prev_value))
            # print("Value diff: ", diff)
            if np.array_equal(new_value, prev_value):
                converge = True
            prev_value = new_value
        new_policy = Policy(new_policy_matrix)
        return new_policy, new_value
Code example #29
 def parsePolicy(self, fname):
     width = None
     height = 0
     policy = []
     with open(fname) as file:
         while True:
             line = file.readline()
             if not line:
                 break
             if width is not None and (len(line) - 1) != width:
                 raise Exception("Input width inconsistent")
             width = len(line) - 1
             height += 1
             rowActions = self.parseLine(line)
             policy.extend(rowActions)
     policy = Policy(policy)
     policy.setWidth(width)
     policy.setHeight(height)
     return policy
Code example #30
# 'policy' or 'dqn' to choose which type of model to evaluate
model_type = 'policy'
# model_type = 'dqn'
model_path = "policies/22-1-2021_13-44/policy0.tar"

env = LunarLander()
env.reset()
exit_program = False

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

if model_type == 'policy':
    model = Policy(env.observation_dim, env.action_dim)
elif model_type == 'dqn':
    model = Network(env.observation_dim, env.action_dim)
model.to(device)
model.load_state_dict(torch.load(model_path))
model.eval()
state = env.get_state()

while not exit_program:
    env.render()
    action = model(
        torch.tensor(state, dtype=torch.float32,
                     device=device).unsqueeze(0)).argmax()

    state, reward, done = env.step(action)
Code example #31
    def parse(self):

        Policy.parse(self)
        self.parseDataSets(self.root.findall(self.ns+'dataset'))
        self.parseActions(self.root.findall(self.ns+'actions'))
Code example #32
class PPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 ppo_mini_batch_size=64,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed

        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)

        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states, ), clip=5)

        if self.model_path:
            print("Loading Saved Model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            data = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))
            self.policy_net, self.value_net, self.running_state = data.policy_net, data.value_net, data.running_state

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(),
                                      lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(),
                                      lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)

        action = action.cpu().numpy()[0]
        return action

    def eval(self, i_iter, render=False):
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)

            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test Reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'],
                          i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

        batch, permuted_batch = memory.sample()  # sample all items in memory
        #  ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = int(
                math.ceil(batch_size / self.ppo_mini_batch_size))

            # update with mini-batch
            for _ in range(self.ppo_epochs):
                index = torch.randperm(batch_size)

                for i in range(mini_batch_num):
                    ind = index[slice(
                        i * self.ppo_mini_batch_size,
                        min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action, returns, advantages, old_log_pis = \
                        batch_state[ind], batch_action[ind], batch_return[ind], \
                        batch_advantage[ind], batch_log_prob[ind]

                    alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p,
                                              self.optimizer_v, 1, state,
                                              action, returns, advantages,
                                              old_log_pis, self.clip_epsilon,
                                              1e-3)
        else:
            for _ in range(self.ppo_epochs):
                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v,
                                          1, batch_state, batch_action,
                                          batch_return, batch_advantage,
                                          batch_log_prob, self.clip_epsilon,
                                          1e-3)

        return alg_step_stats

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ppo_encoder.p'.format(save_path, self.env_id),
                         'wb'))
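A minimal training-loop sketch for the class above, assuming the class and its helpers are importable; the environment id, iteration count and paths are illustrative, not from the source:

from torch.utils.tensorboard import SummaryWriter

agent = PPO(env_id="Hopper-v2", num_process=4, min_batch_size=2048)
writer = SummaryWriter(log_dir="runs/ppo_hopper")
for i_iter in range(1, 501):
    agent.learn(writer, i_iter)
    if i_iter % 50 == 0:
        agent.eval(i_iter)
agent.save("trained_models")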
Code example #33
    def train_policies(self, load_best_policy=False, load_reinforcement=False):
        if load_reinforcement:
            for i in range(0, game_setting.K):
                policy = Policy(self.game_setting)
                file_name = policy.load_reinforcement_model(i)
                self.policies.append([policy, file_name, 0, 0])
            return

        if load_best_policy:
            start = 1
            policy = Policy(self.game_setting)
            nr_of_training_cases = policy.load_best_model()
            self.policies.append([policy, nr_of_training_cases, 0, 0])
        else:
            start = 0
        policy = Policy(self.game_setting)

        max_cases = min(
            policy.import_data_and_train(max_cases=self.max_cases,
                                         test_nr_of_cases=True),
            self.max_cases)

        if self.negative_training_power > 0:
            for i in range(start, self.K):
                nr_of_cases = max(
                    0, max_cases // ((i + 1)**self.negative_training_power))

                if nr_of_cases > 0:
                    policy = Policy(self.game_setting)
                    actual_nr_of_cases = policy.import_data_and_train(
                        max_cases=nr_of_cases)
                else:
                    policy = Policy(self.game_setting, no_model=True)
                    actual_nr_of_cases = 0

                self.policies.append([policy, actual_nr_of_cases, 0, 0])
        else:
            for i in range(start, self.K):
                policy = Policy(self.game_setting)
                nr_of_cases = max(
                    int(max_cases * (self.K - i - 1) / (self.K - 1)), 0)
                if nr_of_cases > 0:
                    actual_nr_of_cases = policy.import_data_and_train(
                        max_cases=nr_of_cases)
                    self.policies.append([policy, actual_nr_of_cases, 0, 0])
                else:
                    self.policies.append([policy, 0, 0, 0])
Code example #34
File: WorkerAgent.py Project: packyzbq/DistJET
import ConfigParser
import json  # used by MSG_wrapper below
import sys
import traceback

import IRecv_Module as IM

import logger
from BaseThread import BaseThread
from MPI_Wrapper import Client
from MPI_Wrapper import Tags
from Policy import Policy
from Task import SampleTask
from Task import TaskStatus
from WorkerRegistry import WorkerStatus

policy = Policy()

log = logger.getLogger('WorkerAgent')
wlog = None


def MSG_wrapper(**kwd):
    return json.dumps(kwd)


class HeartbeatThread(BaseThread):
    """
    ping to master to update status
    """
    def __init__(self, client, worker_agent):
        BaseThread.__init__(self, name='HeartbeatThread')
Code example #35
File: Fifo.py Project: hachedeeme/hacheSO
 def __init__(self):
     Policy.__init__(self)
Code example #36
File: main.py Project: GreatAlexander/Games
def policy(w):
	''' Calculate policy for given world'''
	p = Policy(w)
	p.policyIteration(turbo=True)
	
	return p