Example 1
    def __init__(self, agent_filepath=""):
        Player.__init__(self)

        # Create the experience memory database
        if not os.path.exists(REPLAY_MEMORY_FILENAME):
            self.replay_memory = ReplayMemory()
        else:
            self.replay_memory = cPickle.load(open(REPLAY_MEMORY_FILENAME,
                                                   'r'))

        # Initialize the convolutional neural network
        self.network = MinecraftNet(agent_filepath)
        self.ae_network = FeatureNet()

        # Probability of selecting non-random action
        self.epsilon = STARTING_EPSILON

        # The total number of frames this agent has been trained on
        # through all the minibatch training
        self.frames_trained = 0

        # Load old epsilon and frames learned values
        self.load()

        self.cnn_action_map = self.initActionMap()

        # The current and previous sequences of game frames and actions
        self.current_seq = None
        self.previous_seq = None
        self.previous_action = None

        # Event logging
        self.log = LogFile("run.log", True)
Example 2
    def __init__(self,
                 gamma,
                 memory,
                 s,
                 a,
                 tau,
                 learningRate=1e-3,
                 criticpath=None,
                 actorpath=None):
        self.gamma = gamma
        self.memory = ReplayMemory(memory)
        self.actor = Actor(state=s, actions=a)
        self.critic = Critic(state=s, actions=a)
        if criticpath is not None:
            self.critic.load_state_dict(torch.load(criticpath))
        if actorpath is not None:
            self.actor.load_state_dict(torch.load(actorpath))
        self.targetActor = Actor(state=s, actions=a)
        self.targetActor.load_state_dict(self.actor.state_dict())
        self.targetCritic = Critic(state=s, actions=a)
        self.targetCritic.load_state_dict(self.critic.state_dict())
        self.tau = tau

        self.actorOptimizer = optim.Adam(self.actor.parameters(), learningRate)
        self.criticOptimizer = optim.Adam(self.critic.parameters(),
                                          learningRate)
        #more a dimensionality thing
        self.state = s
        self.action = a
        self.OUarray = np.zeros((1000, self.action), dtype="f")
        self.step = 0
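
A minimal sketch, not code from this example, of the soft (Polyak) target update that the tau attribute stored above is normally used for in DDPG-style agents; the names soft_update, target and source are illustrative only.

def soft_update(target, source, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)

# e.g. soft_update(self.targetActor, self.actor, self.tau)
#      soft_update(self.targetCritic, self.critic, self.tau)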
Example 3
    def __init__(self, path, model_path, target_model_path, actor_index):
        self.path = path
        self.model_path = model_path
        self.target_model_path = target_model_path
        self.actor_index = actor_index
        self.lr = 1e-3
        self.gamma = 0.95
        self.epsilon = 0.3
        self.batch_size = 32
        self.initial_exploration = 500
        self.N_STEP = 3
        self.step_reward = 0
        self.qf = DuelingQFunc()
        self.target_qf = DuelingQFunc()
        # model.state_dict(): fetches the model's learned parameters
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        self.env = gym.make('CartPole-v0')
        self.obs_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.obs_queue = queue.Queue()
        self.reward_queue = queue.Queue()
        self.action_queue = queue.Queue()
        self.total_step = 0
        self.ten_step = 0
        self.temporal_memory = ReplayMemory()
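
The N_STEP = 3 field above implies an n-step return target. The helper below is a hypothetical illustration, not code from the example, of folding n consecutive rewards into one discounted return:

def n_step_return(rewards, gamma):
    # rewards = [r_t, r_t+1, ..., r_t+n-1]; result is sum_k gamma**k * r_t+k
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
    return g

# e.g. with gamma = 0.95: n_step_return([1.0, 1.0, 1.0], 0.95) is about 2.8525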
Example 4
    def __init__(self,
                 gamma,
                 memory_size,
                 target_update_counter,
                 batch_size,
                 num_of_states,
                 num_of_actions,
                 ewc_importance=28,
                 si_importance=30):
        self.num_of_states = num_of_states
        self.num_of_actions = num_of_actions

        self.eval_model = Net(self.num_of_states, HIDDEN_SIZE,
                              self.num_of_actions)
        self.target_model = Net(self.num_of_states, HIDDEN_SIZE,
                                self.num_of_actions)

        self.optimizer = torch.optim.Adam(self.eval_model.parameters(),
                                          lr=0.001)
        # self.optimizer = torch.optim.SGD(self.eval_model.parameters(), lr=0.01)
        self.loss_func = nn.MSELoss()
        # self.loss_func = nn.MS
        # self.loss_func = nn.SmoothL1Loss()

        self.memory_size = memory_size
        self.memory = ReplayMemory(memory_size)
        self.old_memory = []
        self.learned_tasks = 0
        self.target_update_counter = target_update_counter
        self.learn_step_counter = 0
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = EPSILON_MAX
        self.ewc_importance = ewc_importance
        self.si_importance = si_importance
Example 5
    def __init__(self, state_num, action_num, device, CONFIG, action_list):
        
        self.action_list = action_list
        self.memory = ReplayMemory(CONFIG.MEMORY_CAPACITY)
        
        #== ENV PARAM ==
        self.state_num = state_num
        self.action_num = action_num
        
        #== PARAM ==
        self.EPSILON = CONFIG.EPSILON
        self.EPS_START = CONFIG.EPSILON
        self.EPS_END = CONFIG.EPSILON_END
        self.EPS_DECAY = CONFIG.MAX_EP_STEPS
        
        self.LR_C = CONFIG.LR_C
        self.LR_C_START = CONFIG.LR_C
        self.LR_C_END = CONFIG.LR_C_END
        self.LR_C_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2
        
        self.BATCH_SIZE = CONFIG.BATCH_SIZE
        self.GAMMA = CONFIG.GAMMA
        self.MAX_MODEL = CONFIG.MAX_MODEL

        #== Target Network Update ==
        self.TAU = CONFIG.TAU
        self.HARD_UPDATE = CONFIG.HARD_UPDATE
        self.SOFT_UPDATE = CONFIG.SOFT_UPDATE
        
        #== DQN ==
        self.double = CONFIG.DOUBLE
        self.device = device
        self.build_network()
Example 6
    def testPushIndexes(self):
        RepMem = ReplayMemory()

        for i in range(0, 200000):
            RepMem.push(1, 1, 1, 1)
            self.assertEqual(RepMem.indx, ((i + 1) % RepMem.size))
        self.assertTrue(RepMem.isFull())
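
A minimal ring-buffer ReplayMemory that would pass the test above; the real class is not shown in this listing, so the default size and the stored tuple layout are assumptions:

import random

class ReplayMemory:
    def __init__(self, size=100000):
        self.size = size          # maximum number of stored transitions
        self.buffer = []
        self.indx = 0             # next write position, wraps around at size

    def push(self, state, action, reward, next_state):
        item = (state, action, reward, next_state)
        if len(self.buffer) < self.size:
            self.buffer.append(item)
        else:
            self.buffer[self.indx] = item   # overwrite the oldest slot
        self.indx = (self.indx + 1) % self.size

    def isFull(self):
        return len(self.buffer) == self.size

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)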
Example 7
    def __init__(self, memory_cap, batch_size, resolution, action_count, session,
                 lr, gamma, epsilon_min, epsilon_decay_steps, epsilon_max, trace_length, hidden_size):

        self.model = Network(session=session, action_count=action_count,
                             resolution=resolution, lr=lr, batch_size=batch_size,
                             trace_length=trace_length, hidden_size=hidden_size, scope='main')

        self.target_model = Network(session=session, action_count=action_count,
                                    resolution=resolution, lr=lr, batch_size=batch_size,
                                    trace_length=trace_length, hidden_size=hidden_size, scope='target')

        self.memory = ReplayMemory(memory_cap=memory_cap, batch_size=batch_size,
                                   resolution=resolution, trace_length=trace_length)

        self.batch_size = batch_size
        self.resolution = resolution
        self.action_count = action_count
        self.gamma = gamma
        self.epsilon_min = epsilon_min
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_max = epsilon_max
        self.hidden_size = hidden_size
        self.trace_length = trace_length
        self.epsilon = epsilon_max

        self.epsilon_decrease = (epsilon_max-epsilon_min)/epsilon_decay_steps

        self.min_buffer_size = batch_size*trace_length

        self.state_in = (np.zeros([1, self.hidden_size]), np.zeros([1, self.hidden_size]))
Example 8
    def __init__(self):
        AbstractPlayer.__init__(self)
        self.movementStrategy = EpsilonStrategy()
        self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
        self.episode = 0

        networkOptions = [
            keras.layers.Dense(state_size,
                               input_dim=state_size,
                               activation='relu'),
            keras.layers.Dense(
                100,
                activation='relu',
                kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(
                100,
                activation='relu',
                kernel_initializer=keras.initializers.he_normal()),
            keras.layers.Dense(NUM_ACTIONS)
        ]

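        # Note: both Sequential models below are built from the same layer
        # instances, so the policy and target networks end up sharing their
        # weights; an independent target network needs freshly constructed
        # layers (for example via keras.models.clone_model plus set_weights).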
        self.policyNetwork = keras.Sequential(networkOptions)
        self.targetNetwork = keras.Sequential(networkOptions)
        self.policyNetwork.compile(
            optimizer=keras.optimizers.Adam(learning_rate=ALPHA),
            loss=keras.losses.mean_squared_error)
        print(self.policyNetwork.summary())
        try:
            self.policyNetwork.load_weights("./network/zelda-ddqn.h5")
            self.movementStrategy.epsilon = 0.01
            print('Model loaded')
        except:
            print('Model file not found')
Example 9
	def __init__(self, training):

		# Create the environment
		self.environment = Environment()

		# Training or testing
		self.training = training

		# Set the initial training epsilon
		self.epsilon = 0.10

		# Get the number of actions for storing memories and Q-values etc.
		total_actions = self.environment.total_actions()
        
		# Training or testing
		if self.training:
			# Training : Set a learning rate
			self.learning_rate = 1e-2

			# Training: Set up the replay memory
			self.replay_memory = ReplayMemory(size=1000, num_actions=total_actions)

		else:
			# Testing: These are not needed
			self.learning_rate = None
			self.replay_memory = None

		# Create the neural network
		self.neural_network = NeuralNetwork(num_actions=total_actions, replay_memory=self.replay_memory)

		# This stores the rewards for each episode
		self.rewards = []
Example 10
    def __init__(self,
                 input_shape,
                 n_actions,
                 optimizer='RMSprop',
                 lr=1e-4,
                 gamma=0.99,
                 C=10000,
                 batch_size=32,
                 min_eps=0.1,
                 max_eps=1,
                 cutoff=1e6,
                 second_cutoff=2.5e6,
                 final_eps=0.01,
                 device='GPU',
                 clip=10):
        super().__init__(input_shape, n_actions, gamma, batch_size, min_eps,
                         max_eps, cutoff, device)

        self.memory = ReplayMemory(input_shape)
        self.policy_network = DuelingDDQN(input_shape, n_actions, self.device)
        self.target_network = DuelingDDQN(input_shape, n_actions, self.device)

        self.optimizer = getattr(torch.optim,
                                 optimizer)(self.policy_network.parameters(),
                                            lr=lr)
        self.criterion = torch.nn.MSELoss()

        self.second_cutoff = second_cutoff
        self.final_eps = final_eps

        self.C = C
        self.C_counter = 0
        self.clip = clip
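
A hypothetical sketch, not taken from the example, of the two-phase linear epsilon schedule suggested by the max_eps, min_eps, cutoff, second_cutoff and final_eps arguments above:

def epsilon_at(step, max_eps=1.0, min_eps=0.1, cutoff=1e6,
               second_cutoff=2.5e6, final_eps=0.01):
    if step < cutoff:                  # phase 1: max_eps -> min_eps
        return max_eps - (max_eps - min_eps) * step / cutoff
    if step < second_cutoff:           # phase 2: min_eps -> final_eps
        frac = (step - cutoff) / (second_cutoff - cutoff)
        return min_eps - (min_eps - final_eps) * frac
    return final_eps                   # afterwards: stay at final_eps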
Example 11
    def __init__(self,
                 num_action,
                 state_size,
                 goal_size,
                 max_eps=0.2,
                 min_eps=0.02,
                 eps_decay=0.95,
                 gamma=0.98,
                 lr=0.001,
                 batch_size=128,
                 buffer_size=1000000,
                 PER=False,
                 init_nn=None):

        self.PER = PER
        # Init Hyperparameters
        self.epsilon = max_eps
        self.min_eps = min_eps
        self.eps_decay = eps_decay
        self.gamma = gamma
        self.num_action = num_action
        self.batch_size = batch_size
        self.memory = ReplayMemory(buffer_size, with_priorities=PER)

        self.state_size = state_size
        self.goal_size = goal_size

        # Initialize neural nets
        self.policy_net = NeuralNet(state_size, num_action, goal_size, lr)
        if init_nn is not None:
            self.policy_net = init_nn

        self.target_net = NeuralNet(state_size, num_action, goal_size, lr)
        self.target_net.set_weights(self.policy_net.get_weights())
Example 12
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Tensorflow Session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & Critic Network
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE,
                                  TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim,
                                    BATCH_SIZE, TAU, LRA)

        # Replay Memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # loading networks. modify as you want
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)
Example 13
    def reset_training(self):
        self.learn_step_counter = 0
        self.epsilon = EPSILON_MAX

        # Save a batch of old memories
        transitions = self.memory.sample(self.batch_size)
        self.old_memory = self.old_memory + transitions

        self.memory = ReplayMemory(self.memory_size)
Example 14
 def __init__(self):
     AbstractPlayer.__init__(self)
     self.movementStrategy = EpsilonStrategy()
     self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
     self.episode = 0
     self.policyNetwork = self._build_compile_model()
     self.targetNetwork = self._build_compile_model()
     if self.episode == 0 and os.path.exists(
             "./celdas/network/zelda.index"):
         self.policyNetwork.load_weights("./celdas/network/zelda")
     print(self.policyNetwork.summary())
Example 15
 def __init__(self, actor, critic, memory, s, a, tau, epsilon=0.5):
     self.memory = ReplayMemory(memory)
     self.targetActor = copy.deepcopy(actor)
     self.targetCritic = copy.deepcopy(critic)
     self.tau = tau
     self.epsilon = epsilon
     #more a dimensionality thing
     self.state = s
     self.action = a
     self.OUarray = np.zeros((1000, self.action), dtype="f")
     self.step = 0
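
The pre-allocated OUarray above is typically filled with Ornstein-Uhlenbeck exploration noise. A minimal generator is sketched below as an assumption; the theta, sigma and dt defaults are illustrative and not values from the example:

import numpy as np

def ou_noise(n_steps, n_actions, theta=0.15, sigma=0.2, dt=1.0):
    # discretised dX = theta * (mu - X) * dt + sigma * sqrt(dt) * N(0, 1), with mu = 0
    x = np.zeros(n_actions, dtype="f")
    out = np.zeros((n_steps, n_actions), dtype="f")
    for t in range(n_steps):
        x = x + theta * (0.0 - x) * dt + sigma * np.sqrt(dt) * np.random.randn(n_actions)
        out[t] = x
    return out

# e.g. self.OUarray = ou_noise(1000, self.action)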
Example 16
    def __init__(self, predictor_func, grid,
                 time_limit=None, player=None, name=""):
        super().__init__(grid, time_limit, player)

        self.name = name

        self.predictor_func = predictor_func

        # TODO: needed?
        # the size of the state
        self.state_rows = grid[0]+grid[1]
        self.state_cols = self.state_rows
        self.state_depth = 1

        self.exploration = INITIAL_EXPLORATION
        self.final_exploration = FINAL_EXPLORATION
        self.expl_update = \
            (self.exploration - self.final_exploration) / EXPLORATION_STEPS

        self.gamma = DISCOUNT_GAMMA
        self.alpha = ALPHA

        self.batch_size = BATCH_SIZE

        self.max_gradient = GRADIENT_CLIPPING_NORM

        self.reg_param = REGULARIZATION_FACTOR

        if PRIORITY_REPLAY_BUFFER:
            self.replay_mem = \
                PriorityReplayMemory(REPLAY_BUFFER_SIZE, PRIORITY_ALPHA)
            self.beta = PRIORITY_BETA_INIT
            self.beta_update = (1.0 - PRIORITY_BETA_INIT) / PRIORITY_BETA_ITERS
        else:
            self.replay_mem = ReplayMemory(REPLAY_BUFFER_SIZE)

        self.update = True

        self.null_state = self.to_state(Board(grid[0], grid[1]))

        self.last_time = time.time()
        self.reset_summary()

        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        with self.graph.as_default():

            self.create_graph()
            self.session.run(tf.global_variables_initializer())
            self.session.run(self.target_set_op)
            self.load()
Example 17
 def __init__(self, env, sess, LEARNING_RATE_ACTOR, LEARNING_RATE_CRITIC,
              NET_SIZE, MEMORY_LEN, REWARD_DISCOUNT, BATCH_SIZE, TAU,
              EXPLORATION_STEPS, VERBOSE, LOG_DIR_TF):
     self.env = env
     self.sess = sess
     self.observation_space = self.env.observation_space.shape[0]
     self.action_space = self.env.action_space.shape[0]
     self.REWARD_DISCOUNT = REWARD_DISCOUNT
     self.TAU = TAU
     self.BATCH_SIZE = BATCH_SIZE
     self.noise_state = np.zeros(self.action_space)
     self.EXPLORATION_STEPS = EXPLORATION_STEPS
     self.VERBOSE = VERBOSE
     self.LOG_DIR_TF = LOG_DIR_TF
     #check if action_space is symmetric
     if all(env.action_space.high == abs(env.action_space.low)):
         action_scale = env.action_space.high
     else:
         raise ActionSpaceNotSymmetricException
     self.actor = Actor(self.sess, self.observation_space,
                        self.action_space, LEARNING_RATE_ACTOR, NET_SIZE,
                        TAU, action_scale)
     self.critic = Critic(self.sess, self.observation_space,
                          self.action_space, LEARNING_RATE_CRITIC, NET_SIZE,
                          TAU)
     actor_network_variables = self.actor.network.get_variables()
     critic_q_net_variables = self.critic.q_net.get_variables()
     self.actor_target_update = self.actor.target_network.update_variables(
         actor_network_variables)
     self.critic_target_update = self.critic.target_q_net.update_variables(
         critic_q_net_variables)
     self.reward_pl = tf.placeholder(tf.float32, [None, 1],
                                     name='Reward_PL')
     self.done_pl = tf.placeholder(tf.bool, [None, 1], name='Done_PL')
     self.labels = tf.where(
         self.done_pl, self.reward_pl, self.reward_pl +
         tf.multiply(self.REWARD_DISCOUNT, self.critic.target_prediction))
     #self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE)
     self.replay_memory = ReplayMemory(MEMORY_LEN, BATCH_SIZE,
                                       self.observation_space,
                                       self.action_space)
     self.log_reward_pl = tf.placeholder(tf.float32, name='Reward_log_pl')
     self.reward_f = tf.add(0.0, self.log_reward_pl)
     tf.summary.scalar('reward', self.reward_f)
     init = tf.global_variables_initializer()
     self.sess.run(init)
     self.sess.run(self.actor.network.copy_to(self.actor.target_network))
     self.sess.run(self.critic.q_net.copy_to(self.critic.target_q_net))
     self.writer = tf.summary.FileWriter(self.LOG_DIR_TF, self.sess.graph)
     self.merged = tf.summary.merge_all()
Example 18
 def __init__(self, state_dim, batch_size, action_dim, H, gamma,
              BATCH_SIZE):
     # The network
     self.action_dim = action_dim
     self.model = torch.nn.Sequential(
         torch.nn.Linear(state_dim, H),
         torch.nn.ReLU(),
         torch.nn.Linear(H, action_dim),
         torch.nn.ReLU(),
     )
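     # Note: the final ReLU clamps the network's Q-value outputs to non-negative values.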
     self.loss_fn = torch.nn.MSELoss(reduction='sum')
     self.gamma = gamma
     self.memory = ReplayMemory(capacity=2000)
     self.BATCH_SIZE = BATCH_SIZE
Example 19
    def __init__(self,
                 env,
                 act_dim,
                 state_dim,
                 goal_dim,
                 act_range,
                 buffer_size=int(1e6),
                 gamma=0.98,
                 lr=0.001,
                 tau=0.95):
        """ Initialization
        """
        # Environment and A2C parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create actor and critic networks
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizer
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        # self.buffer = MemoryBuffer(buffer_size)
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers
        self.goal_normalizer = Normalizer(
            goal_dim, default_clip_range=5)  # Clip between [-5, 5]
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)
Example 20
def train(sess, env, args, actors, critics, noise):
    load_models(actors, critics)

    summary_ops, summary_vars = build_summaries()
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes'])):

        if ep % 100 == 0:
            save_models(actors, critics)

        episode_reward = learn(actors, args, critics, env, ep, noise,
                               replayMemory, sess, summary_ops, summary_vars,
                               writer)
        print('|Reward: {}	| Episode: {:d}'.format(episode_reward, ep))
Example 21
    def __init__(self, agent_filepath=""):
        Player.__init__(self)

        # Create the experience memory database
        if not os.path.exists(REPLAY_MEMORY_FILENAME):
            self.replay_memory = ReplayMemory()
        else:
            self.replay_memory = cPickle.load(open(REPLAY_MEMORY_FILENAME, 'r'))
        
        # Initialize the convolutional neural network
        self.network = MinecraftNet(agent_filepath)   
        self.ae_network = FeatureNet()
        
        # Probability of selecting non-random action
        self.epsilon = STARTING_EPSILON
        
        # The total number of frames this agent has been trained on
        # through all the minibatch training
        self.frames_trained = 0

        # Load old epsilon and frames learned values
        self.load()
            
        self.cnn_action_map = self.initActionMap()
        
        # The current and previous sequences of game frames and actions
        self.current_seq = None
        self.previous_seq = None
        self.previous_action = None
        
        # Event logging
        self.log = LogFile("run.log", True)
Example 22
def main(config: Config):
    print(config)

    # Let's run it!
    for i in range(config.num_experiments):
        experiment_seed = config.seed + i * config.num_episodes
        memory = ReplayMemory(config.replay_memory_size)

        # We will seed the algorithm (for reproducability).
        random.seed(experiment_seed)
        torch.manual_seed(experiment_seed)
        env.seed(experiment_seed)

        q_model = QNetwork(config.device, config.num_hidden_q_model)
        curiousity_model = StatePredictor(2, 3,
                                          config.num_hidden_curiosity_model,
                                          config.device)

        for run_idx in range(20, 29):
            episode_durations, episode_loss = run_episodes(train,
                                                           q_model,
                                                           curiousity_model,
                                                           memory,
                                                           env,
                                                           experiment_seed,
                                                           config,
                                                           experiment_number=run_idx)
        # print(run_idx, episode_durations, episode_loss)
        print("Finished experiment {}/{}".format(i + 1,
                                                 config.num_experiments))
Example 23
 def __init__(self):
     self.movementStrategy = EpsilonStrategy()
     self.replayMemory = ReplayMemory(MEMORY_CAPACITY)
     self.episode = 0
     self.policyNetwork = self._build_compile_model()
     self.targetNetwork = self._build_compile_model()
     if os.path.exists("./network/zelda-ddqn.h5"):
         print('Loading network')
         self.policyNetwork.load_weights("./network/zelda-ddqn.h5")
     print(self.policyNetwork.summary())
     self.exploreNext = False
     self.steps = 0
     self.averageLoss = 0
     self.averageReward = 0
     self.losses = []
     self.rewards = []
Example 24
    def __init__(self, U, random, eps_max_action, **kwargs):
        self.U = U
        if isinstance(self.U, int):
            self.U = list(range(self.U))
        self.envAction2index = {}
        for idx, u in enumerate(self.U):
            self.envAction2index[u] = idx
        self.Q = {}

        self.replay_memory = ReplayMemory(capacity=100, random=random)
        self.steps = 0
        self.random = random
        self.eps_max_action = eps_max_action
        self.sizes = kwargs.get("sizes", None)
        self.step_start = kwargs.get("step_start", 1e2)
        self.max_abs_delta_Q = 0
Example 25
    def __init__(self,
                 batch_size=4,
                 gamma=.999,
                 eps_start=.95,
                 eps_end=.05,
                 eps_decay=200,
                 target_update=10,
                 memory_size=5000):
        self.batch_size, self.gamma = batch_size, gamma
        self.eps_start, self.eps_end = eps_start, eps_end
        self.eps_decay, self.target_update = eps_decay, target_update

        self.steps, self.threshold = 0, eps_start
        self.policy, self.target = DQN(), DQN()
        self.optimizer = optim.RMSprop(self.policy.parameters())
        self.memory = ReplayMemory(memory_size)
        self.training_history = []
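
The eps_start, eps_end, eps_decay and threshold fields above are consistent with the exponential decay schedule popularised by the PyTorch DQN tutorial. The helper below is a hedged sketch of how the threshold is typically updated per step; the example's own action-selection code is not shown here:

import math

def epsilon_threshold(steps, eps_start=0.95, eps_end=0.05, eps_decay=200):
    # decays from eps_start toward eps_end with a time constant of eps_decay steps
    return eps_end + (eps_start - eps_end) * math.exp(-steps / eps_decay)

# e.g. self.threshold = epsilon_threshold(self.steps, self.eps_start,
#                                         self.eps_end, self.eps_decay)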
Example 26
 def __init__(self, path, model_path, target_model_path):
     self.path = path
     self.model_path = model_path
     self.target_model_path = target_model_path
     self.lr = 1e-3
     self.gamma = 0.95
     self.epsilon = 0.3
     self.batch_size = 32
     self.N_STEP = 3
     self.qf = DuelingQFunc()
     self.target_qf = DuelingQFunc()
     # model.state_dict(): fetches the model's learned parameters
     self.target_qf.load_state_dict(self.qf.state_dict())
     self.optimizer = optim.Adam(self.qf.parameters(), lr = self.lr)
     self.criterion = nn.MSELoss()
     self.memory = ReplayMemory()
     self.total_step = 0
Example 27
    def __init__(self):
        #with tf.device('/CPU:0'):
        self.agent = DDPG_Agent([96, 96, 9],
                                3,
                                regularizer_coeff=regularizer_coeff)

        self.cp_managers = []
        for opt, model, name in [[
                self.agent.actor_optimizer, self.agent.actor, "actor"
        ], [self.agent.critic_optimizer, self.agent.critic, "critic"]]:
            checkpoint = tf.train.Checkpoint(optimizer=opt, model=model)
            cp_manager = tf.train.CheckpointManager(
                checkpoint,
                os.path.join(LOG_DIR, name),
                3,
                keep_checkpoint_every_n_hours=4)
            checkpoint.restore(cp_manager.latest_checkpoint)
            self.cp_managers.append(cp_manager)

        self.memory = ReplayMemory(BATCH_SIZE,
                                   30000,
                                   300000,
                                   num_frames=9,
                                   gray_scale=True,
                                   normalize=True)
        #self.memory = ReplayMemory(BATCH_SIZE, 1000, 300000, gray_scale=False, normalize=True)

        self.env = gym.make("CarRacing-v0", verbose=0)

        self.train_writer = tf.summary.create_file_writer(
            os.path.join(LOG_DIR, "train"))
        self.test_writer = tf.summary.create_file_writer(
            os.path.join(LOG_DIR, "test"))

        self.episode_queue = Queue()
        self.parameter_queues = []

        self.do_render = False
        self.render_freq = 10
        self.train_freq = 2
        self.max_iteration = 800
        self.epsilon_max_step = 100000
        self.parameter_send_freq = 1000
Example 28
    def __init__(self, state_dim, action_dim, device, CONFIG):

        #== ENV PARAM ==
        self.state_dim = state_dim
        self.action_dim = action_dim

        #== PARAM ==
        self.LR_C = CONFIG.LR_C
        self.LR_C_START = CONFIG.LR_C
        self.LR_C_END = CONFIG.LR_C_END
        self.LR_C_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2

        self.LR_A = CONFIG.LR_A
        self.LR_A_START = CONFIG.LR_A
        self.LR_A_END = CONFIG.LR_A_END
        self.LR_A_DECAY = CONFIG.MAX_EP_STEPS * CONFIG.MAX_EPISODES / 2

        self.BATCH_SIZE = CONFIG.BATCH_SIZE
        self.GAMMA = CONFIG.GAMMA
        self.MAX_MODEL = CONFIG.MAX_MODEL

        self.SIGMA = CONFIG.SIGMA

        #== CRITIC TARGET UPDATE PARAM ==
        self.double = CONFIG.DOUBLE
        self.TAU = CONFIG.TAU
        self.HARD_UPDATE = CONFIG.HARD_UPDATE
        self.SOFT_UPDATE = CONFIG.SOFT_UPDATE

        #== MODEL PARAM ==
        self.device = device

        #== MEMORY & MODEL ==
        self.memory = ReplayMemory(CONFIG.MEMORY_CAPACITY)
        self.build_network()

        self.random_process = OrnsteinUhlenbeck(action_dim,
                                                sigma=self.SIGMA,
                                                annealLen=CONFIG.MAX_EP_STEPS * 2,
                                                dt=1)
        self.train = True
Example 29
    def __init__(self, num_states, num_actions):

        self.num_states = num_states
        self.num_actions = num_actions
        self.freq_update_target = 5  # set frequency of updating target
        self.count_replay = 0
        self.memory = ReplayMemory(10000)  # set capacity

        # Construct a neural network
        self.model = models.Sequential()
        self.model.add(
            layers.Dense(input_shape=(num_states, ),
                         units=128,
                         activation='relu'))
        self.model.add(layers.Dense(512, activation='relu'))
        self.model.add(layers.Dense(num_actions))
        self.model.summary()
        # Set how to train the model
        self.model.compile(loss='mse', optimizer=optimizers.SGD())
        self._target_model = models.clone_model(self.model)
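        # Note: models.clone_model copies only the architecture and creates
        # freshly initialised weights; keeping the target in sync requires
        # self._target_model.set_weights(self.model.get_weights()).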
Example 30
    def __init__(self,
                 allies,
                 opponents,
                 world_size,
                 n_games,
                 train_batch_size,
                 replay_mem_limit,
                 training_rate=10,
                 update_rate=500,
                 sim_moves_limit=30,
                 exploration_steps=200000,
                 exploration_range=(0.1, 1.0),
                 viz=None,
                 viz_execution=None,
                 train_saving=None):

        self.allies = allies
        self.opponents = opponents
        self.world_size = world_size
        self.moves_limit = sim_moves_limit
        self.training_rate = training_rate
        self.policy_dist_rate = update_rate
        self.exploration_steps = exploration_steps
        self.exploration_range = exploration_range
        self.exploration_step_value = \
            (exploration_range[1]-exploration_range[0])/exploration_steps
        self.experience_replay = ReplayMemory(batch_size=train_batch_size,
                                              table_size=replay_mem_limit)
        self.training_batch_size = train_batch_size
        self.n_games = n_games
        self.replay_mem_limit = replay_mem_limit
        self.environment = Environment(n_rows=world_size[0],
                                       n_cols=world_size[1],
                                       n_agents=allies,
                                       n_opponents=opponents)

        self.metrics = {"reward": list(), "loss": list()}

        self.viz = viz
        self.viz_execution = viz_execution
        self.train_saving = train_saving
Example 31
def train():
    # Create the environment
    game = FlappyBird()
    env_1 = PLE(game, fps=30, display_screen=False)
    env_2 = PLE(game, fps=30, display_screen=True)
    obs_dim = len(env_1.getGameState())
    act_dim = len(env_1.getActionSet())
    print('action set:', env_1.getActionSet())
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Create the experience replay pool
    rpm = ReplayMemory(MEMORY_SIZE)  # DQN experience replay buffer

    # Build the agent on top of the parl framework
    model = Model(act_dim=act_dim)
    algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(algorithm,
                  obs_dim=obs_dim,
                  act_dim=act_dim,
                  e_greed=0.3,
                  e_greed_decrement=1e-6)

    # Load the model
    save_path = './flappybird.ckpt'
    if os.path.exists(save_path):
        agent.restore(save_path)

    # Pre-fill the replay pool so the earliest training batches have enough sample diversity
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(env_1, agent, rpm)

    max_episode = 2000

    # Start training
    episode = 0
    while episode < max_episode:  # train for max_episode episodes; test episodes are not counted
        # train
        for i in range(0, 100):
            total_reward, steps = run_episode(env_1, agent, rpm)
            episode += 1

        # test
        eval_reward, steps = evaluate(env_2, agent)
        logger.info(
            '[episode:{}], e_greed:{:.6f}, steps:{}, test_reward:{}'.format(
                episode, agent.e_greed, steps, eval_reward))
        # Save the model
        ckpt = './models/episode_{}.ckpt'.format(episode)
        agent.save(ckpt)

    # Training finished; save the model
    save_path = './flappybird.ckpt'
    agent.save(save_path)
Example 32
class CNNPlayer(Player):

    def __init__(self, agent_filepath=""):
        Player.__init__(self)

        # Create the experience memory database
        if not os.path.exists(REPLAY_MEMORY_FILENAME):
            self.replay_memory = ReplayMemory()
        else:
            self.replay_memory = cPickle.load(open(REPLAY_MEMORY_FILENAME, 'r'))
        
        # Initialize the convolutional neural network
        self.network = MinecraftNet(agent_filepath)   
        self.ae_network = FeatureNet()
        
        # Probability of selecting non-random action
        self.epsilon = STARTING_EPSILON
        
        # The total number of frames this agent has been trained on
        # through all the minibatch training
        self.frames_trained = 0

        # Load old epsilon and frames learned values
        self.load()
            
        self.cnn_action_map = self.initActionMap()
        
        # The current and previous sequences of game frames and actions
        self.current_seq = None
        self.previous_seq = None
        self.previous_action = None
        
        # Event logging
        self.log = LogFile("run.log", True)
        #self.log.logMessage("INITIAL NETWORK PARAMS: %s" % str(self.network.solver.net.params['ip1'][0].data[...]))

        
        
    # Create a map of all the CNN's legal actions
    # We will be able to pick the best move from this list based on the CNN's output
    def initActionMap(self):
        actions = []
        
        # Populate with all 18 legal actions
        # (break_block, updown_rot, leftright_rot, forwardback, leftright)
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=0.0, forwardback=0, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=0.0, forwardback=1, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=0.0, forwardback=-1, leftright=0))  
        
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=0, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=1, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=-1, leftright=0))

        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=0, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=1, leftright=0))
        actions.append(Action.Action(False, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=-1, leftright=0))    

        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=0.0, forwardback=0, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=0.0, forwardback=1, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=0.0, forwardback=-1, leftright=0))  
        
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=0, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=1, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=AGENT_ROTATION_SPEED, forwardback=-1, leftright=0))

        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=0, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=1, leftright=0))
        actions.append(Action.Action(True, updown_rot=0.0, leftright_rot=-AGENT_ROTATION_SPEED, forwardback=-1, leftright=0))  
        
        return actions
    
    def getActionMapIndex(self, action):
        for i in range(len(self.cnn_action_map)):
            if action == self.cnn_action_map[i]:
                return i
        self.log.logError("ACTION %s NOT FOUND IN ACTION MAP" % str(action))
        sys.exit(1)
    
    
    def sequenceForward(self, seq):
        cnn_input = seq.toCNNInput()
        output = self.network.forward(cnn_input)
        return output
    
    def pickBestAction(self, seq):
        cnn_outputs = self.sequenceForward(seq)
        self.log.logMessage("REINFORCEMENT NET OUTPUT: " + str(cnn_outputs))
        
        max_output_index = 0
        max_output = cnn_outputs[0]
        for i in range(len(cnn_outputs)):
            if cnn_outputs[i] > max_output:
                max_output = cnn_outputs[i]
                max_output_index = i
                
        self.log.logMessage("BEST ACTION CHOSEN: %s" % str(self.cnn_action_map[max_output_index]))
        return self.cnn_action_map[max_output_index]
    
    def pickRandomAction(self):
        return random.choice(self.cnn_action_map)
    
    def load(self):
        if os.path.exists(CNNPLAYER_SAVE_FILENAME):
            f = open(CNNPLAYER_SAVE_FILENAME, 'r')
            tokens = f.read().split()
            self.epsilon, self.frames_trained = float(tokens[0]), int(tokens[1])
            f.close()
    
    
    def save(self):
        # Save the replay memory as a pickled file
        o = open(REPLAY_MEMORY_FILENAME, 'w')
        cPickle.dump(self.replay_memory, o)
        o.close()
        
        o = open(CNNPLAYER_SAVE_FILENAME, 'w')
        o.write("%.8f %d" % (self.epsilon, self.frames_trained))
        o.close()

        # Log the last network weights        
        #self.log.logMessage("FINAL NETWORK PARAMS: %s" % str(self.network.solver.net.params['ip1'][0].data[...]))
        
        
        
    # Train the agent's CNN on a minibatch of Experiences    
    def trainMinibatch(self):
        self.log.logMessage("TRAINING MINIBATCH")
        self.frames_trained += TRAINING_BATCH_SIZE
        experiences = self.replay_memory.get_random(TRAINING_BATCH_SIZE)
        inputs = []
        labels = []
        for experience in experiences:
            cnn_outputs = self.sequenceForward(experience.curr_seq)
            #best_action = self.pickBestAction(experience.curr_seq)
            target_vector = []
            for act in cnn_outputs:
                #act = cnn_outputs[act_id]
                act_target = experience.curr_reward + GAMMA * act
                target_vector.append(act_target)
            #target = experience.curr_reward + GAMMA * best_action_output
            inputs.append(experience.prev_seq)
            labels.append(target_vector)
            #dataset.append((experience.prev_seq, target))
            
        #Do gradient descent to minimize   (target - network.forward(experience.prev_seq)) ^ 2
        # print("INPUTS:", inputs)
        # print("LABELS:", labels)
        #self.network.set_input_data(inputs, labels)
        self.network.set_train_input_data(inputs, labels)
        self.network.train(BATCH_TRAINING_ITERATIONS) # train for BATCH_TRAINING_ITERATIONS iterations

    
    # Receive the agent's reward from its previous Action along with
    # a Frame screenshot of the current game state
    def getDecision(self, current_frame):
        self.log.logMessage("DECISION #%d in GAME FRAME #%d" % (self.actions_performed, self.game.world_counter))
        self.log.logMessage("TRAINED ON %d FRAMES" % (self.frames_trained))
       
        features = self.ae_network.encodeNumpyArray(current_frame.pixels)
        #self.log.logMessage("Current frame yields features: %s" % str(features))

        if self.previous_reward != 0:
            self.log.logMessage("GOT REWARD: %d" % self.previous_reward)
        self.total_score += self.previous_reward
                        
        # First frame of game
        if self.actions_performed == 0:
            self.actions_performed += 1
            self.previous_seq = Sequence(features)
            # print("FRAME SEQUENCE: {0}".format(self.previous_seq))
            curr_action = self.pickRandomAction()
            self.previous_seq = self.previous_seq.createNewSequence(curr_action)
            self.previous_action = curr_action
            # print("FIRST SEQUENCE: {0}".format(self.previous_seq))
            return
        
        
        # Should I make a random move?
        r = random.random()
            
        # Add on the current frame to the current sequence
        self.current_seq = self.previous_seq.createNewSequence(features)

        if r > self.epsilon or self.actions_performed < 4: #not self.current_seq.isFull():
            curr_action = self.pickRandomAction()
        else:
            # Run the CNN and pick the max output action
            curr_action = self.pickBestAction(self.current_seq)
            
        # Finally, add the chosen action to the current sequence
        self.current_seq = self.current_seq.createNewSequence(curr_action)
            
        # Actually perform the action in the game
        self.performAction(curr_action)
            
        new_experience = Experience(self.previous_seq, self.previous_action, self.previous_reward, self.current_seq)
        self.replay_memory.store(new_experience)
        self.previous_seq = self.current_seq

        if self.game.world_counter > STARTING_FRAMES and self.game.world_counter % BATCH_TRAINING_FREQUENCY == 0:
            self.trainMinibatch()
                
        # Remember the chosen Action since it will be required for the next iteration
        self.previous_action = curr_action
        
        if self.epsilon < MAX_EPSILON:
            self.epsilon *= EPSILON_UPDATE
            self.log.logMessage("UPDATED EPSILON: %.5f" % self.epsilon)