Example #1
    def __init__(self, state_size, action_size):
        self.epsilon = 0.8
        self.state_size = state_size
        self.action_size = action_size

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        #         self.exploration_mu = 0
        #         self.exploration_theta = 0.15
        #         self.exploration_sigma = 0.2
        #         self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 20000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.95  # discount factor
        self.tau = 0.002  # for soft update of target parameters

        self.stats = np.array([])
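
Example #1 syncs the target networks with the local networks once at construction and keeps tau = 0.002 for later soft updates. A minimal sketch of that soft-update step, assuming the Keras-style Actor/Critic wrappers expose a .model with get_weights()/set_weights() as above; the helper name is illustrative, not part of the snippet:

def soft_update(local_model, target_model, tau):
    # target <- tau * local + (1 - tau) * target, applied per weight array
    local_weights = local_model.get_weights()
    target_weights = target_model.get_weights()
    new_weights = [tau * lw + (1.0 - tau) * tw
                   for lw, tw in zip(local_weights, target_weights)]
    target_model.set_weights(new_weights)
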
Example #2
    def __init__(self,
                 gamma,
                 memory,
                 s,
                 a,
                 tau,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        # stored mainly for dimensionality bookkeeping
        self.state = s
        self.action = a
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0
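
Example #2 creates PyTorch target copies of the actor and critic and stores tau for Polyak averaging. A minimal sketch of the soft update such an agent typically applies after each optimizer step, assuming the networks are ordinary nn.Module instances; the function is illustrative, not taken from the class above:

def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target
    for target_param, source_param in zip(target_net.parameters(),
                                          source_net.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)
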
Example #3
 def __init__(self, sess, scale_u, params):
     self.sess = sess
     self.scale_u = scale_u
     self.__dict__.update(params)
     # CREATE INPUT PLACEHOLDERS
     self.create_input_placeholders()
     # INITIALIZE ACTOR & CRITIC MODELS
     self.agents = [
         Actor(self.sess, self.inputs, i, **self.actor_params)
         for i in [1, 2, 3]
     ]
     self.critic = Critic(self.sess, self.inputs, **self.critic_params)
     # INITIALIZE EXPLORATION MODEL
     self.noise_params = {
         k: np.fromstring(v, sep=",", dtype="f")
         for k, v in self.noise_params.items()
     }
     self.noise = [Noise(**self.noise_params) for _ in range(3)]
     # INITIALIZE REPLAY BUFFER
     self.memory = Memory(self.memory_size)
     # AVERAGE AGENT POLICIES
     avg_pi = [
         tf.reduce_mean(i, axis=0)
         for i in zip(*[x.pi.net_params for x in self.agents])
     ]
     self.avg_op = [
         tf.assign(i, j) for x in self.agents
         for i, j in zip(x.pi.net_params, avg_pi)
     ]
Example #4
	def __init__(self, state_size, action_size, seed=0):
		'''Initialize the Agent.
		
		Parameters
		----------
		state_size : int
			The dimension of each state
		
		action_size : int
			The dimension of each action
		
		seed : int
			The random seed used to generate random numbers.
		'''
		self.state_size = state_size
		self.action_size = action_size
		random.seed(seed)

		#actor gives the best action for given state
		self.actor_local = Actor(state_size, action_size, seed).to(device)
		self.actor_target = Actor(state_size, action_size, seed).to(device)

		#evaluates the action
		self.critic_local = Critic(state_size, action_size, seed).to(device)
		self.critic_target = Critic(state_size, action_size, seed).to(device)

		self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LEARNING_RATE)
		self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LEARNING_RATE, weight_decay=WEIGHT_DECAY)

		#Replay Memory
		self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

		#Noise
		self.noise = OUNoise(action_size,seed)
		self.t_step = 0
Example #5
def main():
    no_of_wins = 0
    no_of_ties = 0
    no_of_losses = 0
    print(" algorithm gaining experience")

    game_generator = Experiment(BOARD_DIMENSION)
    generalizer = Generalizer(19)  #19 because of 18 features + one constant w0

    critic = Critic(generalizer)
    for i in range(NUMBER_OF_EXaMPLES):
        board = game_generator.generate_board()

        performance_system = PerformanceMeasure(board, generalizer, critic,
                                                game_generator)
        result = performance_system.improve_system()

        examples, values = critic.fetch_training_examples()
        generalizer.set_training_examples(examples)
        generalizer.set_training_values(values)
        generalizer.LMS_weight_update_rule()

        if result == 100: no_of_wins += 1
        if result == -100: no_of_losses += 1
        if result == 0: no_of_ties += 1

    W = generalizer.get_weights()

    print(no_of_wins, no_of_ties, no_of_losses)
    """
        Playing against human...
    """

    while True:
        human_board = GameBoard([
            '-',
            '-',
            '-',
            '-',
            '-',
            '-',
            '-',
            '-',
            '-',
        ])

        vs_human = human_board.victor()
        while vs_human is None:
            x = int(input(" X coordinate: "))
            y = int(input(" Y coordinate: "))

            human_board._board[x * 3 + y] = 'X'

            computer_position = human_board.maximizer(W)
            human_board._board[computer_position] = 'O'

            print(human_board)

            vs_human = human_board.victor()
Example #6
def start(GAME_NAME, MAX_EPISODE):
    env = gym.make(GAME_NAME)  # create environment
    actor = Actor(env.observation_space, env.action_space)  # create actor
    critic = Critic(env.observation_space, env.action_space)  # create critic
    reward_per_epi = []
    durations_per_epi = []
    l_A = []
    l_C = []
    MAX_EPISODE = MAX_EPISODE
    RENDER = False
    MAX_EP_STEPS = 1000
    #DISPLAY_REWARD_THRESHOLD=200

    #print ("begin.\n\n")
    for i_episode in range(MAX_EPISODE):
        s = env.reset()
        critic.reset()
        actor.reset()
        track_r = []
        for t in count():
            if RENDER: env.render()

            a = actor.choose_action(s)

            s_, r, done, info = env.step(a)
            #if done: r = -20             # Penalty if die
            track_r.append(r)

            td_error, abs_error = critic.learn(s, r, s_)  # Critic Learn
            actor.learn(s, a, td_error)  # Actor Learn

            s = s_

            #print ("... in episode (%d) step (%d)" % (i_episode+1,t))
            if is_ipython:
                display.clear_output(wait=True)
                display.display(plt.gcf())

            #env.render()

            if done or t >= MAX_EP_STEPS:  # Episode finished, print results
                ep_rs_sum = sum(track_r)
                #if 'running_reward' not in globals():
                #    running_reward = ep_rs_sum
                #else:
                #    running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
                #if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True   # rendering
                running_reward_avg = ep_rs_sum / float(t)
                reward_per_epi.append(ep_rs_sum)
                durations_per_epi.append(t)
                l_A.append(np.mean(actor._loss_))
                l_C.append(np.mean(critic._loss_))
                #print("episode:", i_episode, "  reward:", ep_rs_sum)
                #plot(reward_per_epi, durations_per_epi, l_A, l_C)

                break

    return reward_per_epi, durations_per_epi, l_A, l_C
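
The loop above asks the critic for a TD error (critic.learn(s, r, s_)) and passes it to actor.learn(s, a, td_error). A plain-Python sketch of the quantity the critic is assumed to estimate for a single transition; the function is illustrative:

def one_step_td_error(reward, gamma, v_s, v_s_next, done):
    # td_error = r + gamma * V(s') - V(s), with no bootstrapping on terminal states
    target = reward if done else reward + gamma * v_s_next
    return target - v_s
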
Example #7
 def __init__(self):
     tf.reset_default_graph()
     self.sess = tf.Session()
     self.actor = Actor(self.sess, \
                     n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                     lr=Config.LEARNING_RATE_START, action_bound=[-math.pi, math.pi])
     self.critic = Critic(self.sess, \
                     n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                     lr=Config.LEARNING_RATE_START)
     self.sess.run(tf.global_variables_initializer())
Example #8
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 10
        self.state_size = state_size
        self.action_size = action_size
        self.random_seed = random_seed
        random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                self.random_seed)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Make sure the Actor Target Network has the same weight values as the Local Network
        for target, local in zip(self.actor_target.parameters(),
                                 self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Critic Network (w/ Target Network)

        self.critic_local = Critic(state_size * num_agents,
                                   action_size * num_agents,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents,
                                    action_size * num_agents,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        """
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)
        """

        # Make sure the Critic Target Network has the same weight values as the Local Network
        for target, local in zip(self.critic_target.parameters(),
                                 self.critic_local.parameters()):
            target.data.copy_(local.data)

        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)
Example #9
    def __init__(self, env, batchSize = 10, bufferSize = 100,
                 gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
                 maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
                 decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH 

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename = logFile,
                            level = logging.INFO,
                            format = '[%(asctime)s] %(message)s',
                            datefmt = '%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing DRPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto() # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate,
                           actorLR, self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate,
                             criticLR)
Example #10
    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)

        critic_params_and_state_dict = checkpoint[
            'critic_params_and_state_dict']
        actor_params_and_state_dict = checkpoint['actor_params_and_state_dict']

        self.actor = Actor(actor_params_and_state_dict['actor_params'])
        self.actor.load_state_dict(actor_params_and_state_dict['state_dict'])

        self.critic = Critic(critic_params_and_state_dict['critic_params'])
        self.critic.load_state_dict(critic_params_and_state_dict['state_dict'])
        return self
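
The load_weights method above expects a checkpoint dict with 'actor_params_and_state_dict' and 'critic_params_and_state_dict' entries. A hedged sketch of the matching save side; the method name and the *_params attributes are assumptions, not taken from the snippet:

    def save_weights(self, save_to):
        checkpoint = {
            'actor_params_and_state_dict': {
                'actor_params': self.actor_params,    # hypothetical attribute
                'state_dict': self.actor.state_dict(),
            },
            'critic_params_and_state_dict': {
                'critic_params': self.critic_params,  # hypothetical attribute
                'state_dict': self.critic.state_dict(),
            },
        }
        torch.save(checkpoint, save_to)
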
Example #11
def main():

    wins = 0; ties = 0; loses = 0;

    print " Playing against itself... Please wait... "

    generator = Generator(BOARD_LENGTH)
    generalizer = Generalizer(19)
    critic = Critic(generalizer)

    for i in xrange(NO_OF_TRAINING_EXAMPLES):
        board = generator.generate_board()

        performance_system = PerformanceSystem(board, generalizer, critic, generator)
        result = performance_system.improve_system()

        examples, values = critic.get_training_examples()
        generalizer.set_training_examples(examples)
        generalizer.set_training_values(values)
        generalizer.gradient_descent()

        if result == 100: wins += 1
        if result == -100: loses += 1
        if result == 0: ties += 1



    W = generalizer.get_weights()

    print wins, ties, loses

    """
        Playing against human...
    """

    while True:
        human_board = Board(['-', '-', '-', '-', '-', '-', '-', '-', '-', ])

        vs_human = human_board.winner()
        while vs_human is None:
            x = int(raw_input(" X coordinate: "))
            y = int(raw_input(" Y coordinate: "))

            human_board._board[x*3 + y] = 'X'

            computer_position = human_board.max_learner_utility(W)
            human_board._board[computer_position] = 'O'

            print human_board

            vs_human = human_board.winner()
Example #12
 def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
              NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
              EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
     self.env = env
     self.sess = sess
     self.observation_space = self.env.observation_space.shape[0]
     self.action_space = self.env.action_space.shape[0]
     self.REWARD_DISCOUNT = REWARD_DISCOUNT
     self.TAU = TAU
     self.BATCH_SIZE = BATCH_SIZE
     self.noise_state = np.zeros(self.action_space)
     self.EXPLORATION_STEPS = EXPLORATION_STEPS
     self.VERBOSE = VERBOSE
     self.LOG_DIR_TF = LOG_DIR_TF
     #check if action_space is symmetric
     if all(env.action_space.high == abs(env.action_space.low)):
         action_scale = env.action_space.high
     else:
         raise ActionSpaceNotSymmetricException
     self.actor = Actor(self.sess, self.observation_space,
                        self.action_space, LEARNING_RATE_ACTOR, NET_SIZE,
                        TAU, action_scale)
     self.critic = Critic(self.sess, self.observation_space,
                          self.action_space, LEARNING_RATE_CRITIC, NET_SIZE,
                          TAU)
     actor_network_variables = self.actor.network.get_variables()
     critic_q_net_variables = self.critic.q_net.get_variables()
     self.actor_target_update = self.actor.target_network.update_variables(
         actor_network_variables)
     self.critic_target_update = self.critic.target_q_net.update_variables(
         critic_q_net_variables)
     self.reward_pl = tf.placeholder(tf.float32, [None, 1],
                                     name='Reward_PL')
     self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
     self.labels = tf.where(
         self.done_pl, self.reward_pl, self.reward_pl +
         tf.multiply(self.REWARD_DISCOUNT, self.critic.target_prediction))
     #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
     self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                       self.observation_space,
                                       self.action_space)
     self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
     self.reward_f = tf.add(0.0, self.log_reward_pl)
     tf.summary.scalar('reward', self.reward_f)
     init = tf.global_variables_initializer()
     self.sess.run(init)
     self.sess.run(self.actor.network.copy_to(self.actor.target_network))
     self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
     self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
     self.merged = tf.summary.merge_all()
Example #13
    def __init__(self, env, sess):
        # Environment
        self.n_state = env.observation_space.shape[0]
        self.n_action = env.action_space.shape[0]

        # Neural Networks
        self.sess = sess
        self.actor = Actor(self.sess, self.n_state, self.n_action)
        self.critic = Critic(self.sess, self.n_state, self.n_action)

        # Replay Buffer
        self.replay_buffer = ReplayBuffer(BUFFER_SIZE)
        # Ornstein-Uhlenbeck Noise
        self.exploration_noise = OUNoise(self.n_action)
Example #14
 def __init__(self, n_features, actions=None, is_continues=None):
     self.actions = actions
     self.is_continues = is_continues
     self.actor_net = Actor(n_features,
                            actions=actions,
                            is_continues=is_continues)
     self.critic_net = Critic(n_features)
     self.load_weights(self.actor_net)
     self.load_weights(self.critic_net)
     # we need a good teacher, so the teacher should learn faster than the actor
     self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(),
                                             Config.LR_ACTOR, (0.9, 0.99))
     self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                              Config.LR_CRITIC, (0.9, 0.99))
     self.gamma = Config.REWARD_DECAY
Example #15
    def __init__(self, state_size, action_size, num_agents, random_seed):
        """
        Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """

        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                   EXPERIENCES_PER_SAMPLING, device,
                                   random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0
Example #16
    def __init__(self):
        self.env = snake_env()
        self.state_dim = (self.env.size, self.env.size)
        self.action_dim = self.env.action_space
        self.actor = Actor(self.state_dim, self.action_dim, args.actor_lr)
        self.critic = Critic(self.state_dim, args.critic_lr)
        self.gamma = args.gamma

        if args.load_weights:
            self.actor.model.load_weights(args.load_weights)

        if args.dist_move_reward:
            self.env.set_reward(move_reward='-dist')

        # initialize video system only
        self.env.reset()
Example #17
def run():
    # build environment using openai gym
    env = gym.make('MountainCar-v0')
    env = env.unwrapped
    sess = tf.Session()
    # create an actor and critic
    actor = Actor(sess, n_actions=n_actions, n_features=n_features, lr=lr_actor)
    critic = Critic(sess, n_features=n_features, lr=lr_critic)
    # build the two networks
    actor.build_net()
    critic.build_net()

    sess.run(tf.global_variables_initializer())

    # tf.summary.FileWriter("",sess.graph)
    # count steps
    step = 0
    # env.render()
    for episode in range(n_episodes):
        s = env.reset()
        # comment the render() to speed up
        # env.render()
        # s returned by gym is a vector, we need to transform it into a matrix
        s = s[np.newaxis, :]
        a = actor.choose_action(s)
        while (True):
            step += 1
            # a new transition
            s_, r, done, info = env.step(a)
            # add a batch dimension to s_ as well
            s_ = s_[np.newaxis, :]
            a_ = actor.choose_action(s_)
            # calculate td_error
            td_error = critic.learn(s, s_)
            actor.learn(s, a, td_error)
            s = s_

            if step % 500 == 0:
                print(step, s_)

            if done:
                print('arrive')
                print(s_)
                break
Example #18
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = copy.deepcopy(self.eval_actor_net)
        self.target_actor_net.eval()

        self.eval_critic_net1 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net1)
        self.eval_critic_net1.train()

        self.eval_critic_net2 = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net2)
        self.eval_critic_net2.train()

        self.target_critic_net1 = copy.deepcopy(self.eval_critic_net1)
        self.target_critic_net1.eval()
        self.target_critic_net2 = copy.deepcopy(self.eval_critic_net2)
        self.target_critic_net2.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(
            self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        # itertools.chain(self.encoder.parameters(), self.decoder.parameters())
        # self.optimizer_critic = \
        #     torch.optim.Adam([{'params': self.eval_critic_net1.parameters()},
        #                       {'params': self.eval_critic_net2.parameters()}], Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic1 = \
            torch.optim.Adam(self.eval_critic_net1.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.optimizer_critic2 = \
            torch.optim.Adam(self.eval_critic_net2.parameters(), Config.LR_CRITIC, (0.9, 0.99))

        self.gamma = Config.REWARD_DECAY
        self.policy_noise_clip = Config.POLICY_NOISE_CLIP
        self.policy_delay = Config.DELAY_POLICY_UPDATE_ITER
        self.learn_iter = 0
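
Example #18 configures twin critics, delayed policy updates, and a clip on the target-policy noise, which is the TD3 recipe. An illustrative computation of the clipped double-Q target those pieces imply; all argument names and tensor shapes are assumptions, not code from the project:

import torch

def clipped_double_q_target(target_actor, target_critic1, target_critic2,
                            next_states, rewards, dones, gamma,
                            policy_noise, noise_clip, action_low, action_high):
    with torch.no_grad():
        mu = target_actor(next_states)
        noise = (torch.randn_like(mu) * policy_noise).clamp(-noise_clip, noise_clip)
        next_actions = (mu + noise).clamp(action_low, action_high)
        q1 = target_critic1(next_states, next_actions)
        q2 = target_critic2(next_states, next_actions)
        # use the smaller of the two critic estimates to curb overestimation
        return rewards + gamma * (1.0 - dones) * torch.min(q1, q2)
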
Example #19
    def __init__(self, state_size, action_size, action_low, action_high):
        # self.task = task
        self.state_size = state_size
        self.action_size = action_size
        self.action_low = action_low
        self.action_high = action_high

        # learning rates
        self.lr_actor = 1e-4
        self.lr_critic = 1e-3

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.lr_actor)
        self.actor_target = Actor(self.state_size, self.action_size, self.lr_actor)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, self.lr_critic)
        self.critic_target = Critic(self.state_size, self.action_size, self.lr_critic)

        # store model architecture of actor and critic locally
        # keras.utils.plot_model(self.actor_local.model, '/home/danie/catkin_ws/src/ddpg/src/actor.png', show_shapes=True)        
        # keras.utils.plot_model(self.critic_local.model, '/home/danie/catkin_ws/src/ddpg/src/critic.png', show_shapes=True)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Initialize OU noise
        self.noise = OUNoise(action_size=self.action_size)

        # Currently testing with Gaussian noise instead of OU. Parameters for Gaussian follow
        self.noise_mean = 0.0
        self.noise_stddev = 0.2

        # Initialize replay buffer
        self.buffer_size = int(1e6)
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for DDPG
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters
Example #20
    def __init__(self, params):
        self.action_size = params['action_size']
        self.state_size = params['state_size']
        self.num_agents = params['num_agents']
        self.buffer_size = params['buffer_size']
        self.batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__update_every = params['update_every']
        self.__save_to = params['save_to']
        self.__memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.__lr = params['lr']
        self.noise_type = params['noise_type']

        actor_params = dict()
        actor_params['arch_params_actor'] = params['arch_params_actor']
        actor_params['action_size'] = self.action_size
        actor_params['state_size'] = self.state_size
        actor_params['eps'] = params['eps']
        actor_params['eps_decay'] = params['eps_decay']
        actor_params['eps_min'] = params['min_eps']
        actor_params['noise_type'] = params['noise_type']
        self.actor = Actor(actor_params)
        self.actor_target = Actor(actor_params)
        self.optimizer_actor = optim.Adam(self.actor.parameters(),
                                          lr=self.__lr)
        self.scheduler_actor = optim.lr_scheduler.StepLR(self.optimizer_actor,
                                                         step_size=100,
                                                         gamma=0.95)

        critic_params = dict()
        critic_params['arch_params_critic'] = params['arch_params_critic']
        critic_params['action_size'] = self.action_size
        critic_params['state_size'] = self.state_size
        self.critic = Critic(critic_params)
        self.critic_target = Critic(critic_params)
        self.optimizer_critic = optim.Adam(self.critic.parameters(),
                                           lr=self.__lr)
        self.scheduler_critic = optim.lr_scheduler.StepLR(self.optimizer_critic,
                                                          step_size=100,
                                                          gamma=0.95)
        self.__t = 0
Example #21
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(
            self.state_size, self.action_size, self.action_low,
            self.action_high)
        self.actor_target = Actor(
            self.state_size, self.action_size, self.action_low,
            self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau_actor = 0.1  # for soft update of target parameters
        self.tau_critic = 0.1
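
Example #21 draws exploration noise from OUNoise(action_size, mu, theta, sigma) with mu = 0, theta = 0.15, sigma = 0.2. A minimal, self-contained sketch of such an Ornstein-Uhlenbeck process; this is an assumed implementation, not the project's own OUNoise class:

import numpy as np

class OUNoiseSketch:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""
    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state
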
Example #22
    def __init__(self, state_dim, action_dim):
        self.name = 'ActorCritic'
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.time_update = 0
        self.sess = tf.InteractiveSession()

        #initialize actor and critic network
        self.actor = Actor(self.sess, state_dim, action_dim)
        self.critic = Critic(self.sess, state_dim, action_dim)

        # explore parameter
        self.sigma = 2
        self.sigma_decay = 0.5
        self.sigma_min = 0.1
        self.sigma_decay_step = 50000
        self.sigma_count = 0

        # save network
        self.saver = tf.train.Saver()
Example #23
class NetworkAC(object):
    """docstring for NetworkAC."""
    def __init__(self):
        tf.reset_default_graph()
        self.sess = tf.Session()
        self.actor = Actor(self.sess, \
                        n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                        lr=Config.LEARNING_RATE_START, action_bound=[-math.pi, math.pi])
        self.critic = Critic(self.sess, \
                        n_features=Config.PLAYER_DIMENSION*(Config.DEFENDER_COUNT+Config.INTRUDER_COUNT), \
                        lr=Config.LEARNING_RATE_START)
        self.sess.run(tf.global_variables_initializer())

    def train(self, x, a, y, r):
        td_error = self.critic.learn(x, r, y)  # gradient = grad[r + gamma * V(y_) - V(x_)]
        self.actor.learn(x, a, td_error)  # true_gradient = grad[logPi(s,a) * td_error]

    def predict(self, state):
        action = self.actor.choose_action(state)
        value = self.critic.predict(state)
        return action, value
Example #24
 def __init__(self,
              state_size=28,
              action_size=2,
              gamma=0.9,
              learning_rate_actor=0.0001,
              learning_rate_critic=0.01,
              tau=0.001,
              action_max=[1000, 2],
              batch_size=32):
     self.state_size = state_size
     self.action_size = action_size
     self.action_max = action_max
     self.batch_size = batch_size
     self.memory = deque(maxlen=5000)
     self.gamma = gamma  # discount rate
     self.learning_rate_actor = learning_rate_actor  # learning rate
     self.learning_rate_critic = learning_rate_critic
     self.tau = tau  # target transfer factor
     self.gpu_options = tf.GPUOptions()
     self.config = tf.ConfigProto(gpu_options=self.gpu_options)
     self.config.gpu_options.allow_growth = True
     self.sess = tf.Session(config=self.config)
     K.set_session(self.sess)
     self.actor = Actor(state_size=self.state_size,
                        action_size=self.action_size,
                        learning_rate=self.learning_rate_actor,
                        tau=self.tau,
                        sess=self.sess,
                        batch_size=self.batch_size,
                        action_max=self.action_max)
     self.critic = Critic(state_size=self.state_size,
                          action_size=self.action_size,
                          learning_rate=self.learning_rate_critic,
                          gamma=self.gamma,
                          tau=self.tau,
                          sess=self.sess,
                          batch_size=self.batch_size)
     self.grad_avg = 0
     self.grad_a = []
     self.critic_loss_a = []
Example #25
    def __init__(self, a_dim, s_dim):
        self.sess = tf.Session()
        self.a_dim, self.s_dim = a_dim, s_dim
        self.LR_A = 0.001
        self.LR_C = 0.001
        self.CAPACITY = 10000
        self.BATCH_SIZE = 32
        self.BATCH_SIZE_g = 24
        self.SETTING = {
            'GAMMA': 0.9,
            'TAU': 0.01,
            'N_D_MAX': 1 / np.sqrt(self.s_dim),
            'N_D_MIN': -1 / np.sqrt(self.s_dim),
            'F_N_D_MAX': 3e-3,
            'F_N_D_MIN': -3e-3,
            'L2_DECAY': 0.01,
        }

        self.S = tf.placeholder(tf.float32,
                                shape=[None, self.s_dim],
                                name='State')
        self.S_ = tf.placeholder(tf.float32,
                                 shape=[None, self.s_dim],
                                 name='State_')
        self.R = tf.placeholder(tf.float32, shape=[None, 1], name='Reward')

        self.actor = Actor(self.sess, self.a_dim, self.s_dim, self.LR_A,
                           self.SETTING, self.S, self.S_)
        self.critic = Critic(self.sess, self.a_dim, self.s_dim, self.LR_C,
                             self.SETTING, self.S, self.S_, self.R,
                             self.actor.action, self.actor.action_)
        self.actor.add_grad_to_graph(self.critic.a_grads)

        self.memory = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1,
                             self.BATCH_SIZE)
        # self.memory_g = Memory(self.CAPACITY, s_dim * 2 + a_dim + 1, self.BATCH_SIZE_g)

        self.sess.run(tf.global_variables_initializer())

        tf.summary.FileWriter('logs/', self.sess.graph)
Example #26
    def __init__(self,
                 gamma,
                 s,
                 a,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.actor = Actor(state=s, actions=a, hidden1=180, hidden2=87)
        self.critic = Critic(state=s, actions=a, hidden1=250, hidden2=100)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        # stored mainly for dimensionality bookkeeping
        self.state = s
        self.action = a
        self.count = 0
Example #27
    def __init__(self, state_size, action_size, num_agents):
        """
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 1

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random_seed
        random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE,
                                random_seed)

        # Noise process
        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)
Example #28
    def __init__(self):
        self.sess = tf.Session()

        self.critic1 = Critic()

        self.tfs = tf.placeholder(tf.float32, [None, configs.S_DIM], 'state')

        pi, pi_params = self._build_anet('pi', trainable=True)
        oldpi, oldpi_params = self._build_anet('oldpi', trainable=False)

        with tf.variable_scope('sample_action'):
            self.sample_op = tf.squeeze(pi.sample(1),
                                        axis=0)  # choosing action
        with tf.variable_scope('update_oldpi'):
            self.update_oldpi_op = [
                oldp.assign(p) for p, oldp in zip(pi_params, oldpi_params)
            ]

        self.tfa = tf.placeholder(tf.float32, [None, configs.A_DIM], 'action')
        self.tfadv = tf.placeholder(tf.float32, [None, 1], 'advantage')

        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                #ratio = pi.prob(self.tfa) / (oldpi.prob(self.tfa) + 1e-5)
                ratio = tf.exp(
                    pi.log_prob(self.tfa) - oldpi.log_prob(self.tfa))
                surr = ratio * self.tfadv  #IMPORTANT !!!

            self.aloss = -tf.reduce_mean(
                tf.minimum(
                    surr,
                    tf.clip_by_value(ratio, 1. - configs.epsilon,
                                     1. + configs.epsilon) * self.tfadv))

        with tf.variable_scope('atrain'):
            self.atrain_op = tf.train.AdamOptimizer(configs.A_LR).minimize(
                self.aloss)

        self.sess.run(tf.global_variables_initializer())
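
Example #28 builds the PPO clipped surrogate: the probability ratio pi/pi_old is multiplied by the advantage, a clipped copy is formed with epsilon, and the minimum of the two terms is averaged and negated. A tiny NumPy illustration of that objective with made-up numbers; epsilon, the ratios, and the advantages are assumptions standing in for configs.epsilon, pi/oldpi, and self.tfadv:

import numpy as np

epsilon = 0.2
ratio = np.array([0.7, 1.0, 1.4])        # assumed pi(a|s) / pi_old(a|s)
advantage = np.array([1.0, -0.5, 2.0])   # assumed advantage estimates
surr = ratio * advantage
clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantage
loss = -np.mean(np.minimum(surr, clipped))  # mirrors -tf.reduce_mean(tf.minimum(...))
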
Example #29
    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()
        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # we need a good teacher, so the teacher should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY
Example #30
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0.0
        self.exploration_theta = 0.15 
        self.exploration_sigma = 0.2 
        self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 1000000
        self.batch_size = 128
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001  # for soft update of target parameters; a small value (0.001) makes the target networks track the local networks slowly, which stabilizes learning

        # Score tracker
        self.score = 0.
        self.best_score = -np.inf
Example #31
GAMMA = 0.9     # reward discount in TD error
LR_A = 0.001    # learning rate for actor
LR_C = 0.01     # learning rate for critic


env = gym.make('CartPole-v0')
env.seed(1)  # reproducible
env = env.unwrapped

n_features = env.observation_space.shape[0]
n_actions = env.action_space.n

sess = tf.Session()

actor = Actor(sess, n_features=n_features, n_actions=n_actions, lr=LR_A)
critic = Critic(sess, n_features=n_features, gamma=GAMMA, lr=LR_C)

sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter("logs/", sess.graph)

for i_episode in range(MAX_EPISODE):

    s = env.reset()
    t = 0
    track_r = []
    while True:
        if RENDER: env.render()

        a = actor.choose_action(s)