##################
  scores = []
  agent = ReinforceAgent(model, actions=ACTIONS, noise=EXPLORE_RATE)
  for _ in range(TEST_EPISODES_PER_EPOCH):
    env.reset()
    agent.reset()
    replay = []
    done = False
    # roll out one episode, storing (state, action, reward) transitions
    while not done:
      action = agent.process(env.state)
      _, reward, done, prevState = env.apply(action)
      replay.append((prevState, action, reward))
    ##
    states, actions, rewards = zip(*replay)
    
    actions = ACTIONS.toIndex(actions)
    # one gradient update on the whole episode: states, action indices, discounted returns
    trainable.fit(
      [
        np.array(states),
        np.array(actions),
        np.array(discountedReturns(rewards, GAMMA))
      ],
      epochs=1, verbose=0
    )
    ##
    scores.append(env.score)

  Utils.trackScores(scores, metrics)
  ##################
  EXPLORE_RATE = max(0.001, EXPLORE_RATE * EXPLORE_RATE_DECAY)
  plotData2file(metrics, 'chart.jpg')
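
# discountedReturns() is used above but not defined in this excerpt.
# A minimal sketch of what it is assumed to compute: per-step Monte Carlo returns
# G_t = r_t + GAMMA * G_{t+1}. The original helper may also normalize the returns.
def discountedReturns(rewards, gamma):
  returns = np.zeros(len(rewards), dtype=np.float32)
  G = 0.0
  for t in reversed(range(len(rewards))):
    G = rewards[t] + gamma * G  # accumulate reward, discounted one step per iteration
    returns[t] = G
  return returns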
Example #2
TRAIN_EPISODES = 200
TEST_EPISODES = 1
EPOCHS = 100
NOISE_STD = 0.1
NOISE_STD_DECAY = 0.99
TAU = 0.005

memory = CebLinear(maxSize=50000, sampleWeight='same')
metrics = {}
for epoch in range(EPOCHS):
    print('Start of epoch %d. Noise std: %.3f' % (epoch, NOISE_STD))
    ##################
    print('Testing...')
    scores = Utils.testAgent(RawActionAgent(model,
                                            processor=addNoise(NOISE_STD)),
                             memory,
                             TEST_EPISODES,
                             env=RawPendulumEnvironment)
    Utils.trackScores(scores, metrics)
    ##################
    # train model
    lossesActor = []
    lossesCritic = []
    for _ in range(TRAIN_EPISODES):
        states, actions, rewards, nextStates, nextStateScoreMultiplier = memory.sampleBatch(
            BATCH_SIZE)
        nextStateScoreMultiplier = tf.convert_to_tensor(
            nextStateScoreMultiplier * GAMMA, dtype=tf.float32)
        rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)

        lossCritic, lossActor = model.fit(states, actions, rewards, nextStates,
                                          nextStateScoreMultiplier)
        lossesCritic.append(lossCritic)
        lossesActor.append(lossActor)
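
# addNoise(NOISE_STD) above is an action processor that is not shown in this excerpt.
# A rough sketch, assuming it adds zero-mean Gaussian exploration noise to the raw
# continuous action and clips to a normalized [-1, 1] range; the actual helper may differ.
def addNoise(std):
    def process(action):
        noisy = action + np.random.normal(scale=std, size=np.shape(action))
        return np.clip(noisy, -1.0, 1.0)  # keep the perturbed action in the valid range
    return process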
Example #3
for epoch in range(EPOCHS):
  print('Start of epoch %d. Explore rate: %.3f' % (epoch, EXPLORE_RATE))
  # for stability
  modelClone.set_weights(model.get_weights())
  lossSum = 0
  for _ in range(TRAIN_EPISODES):
    states, actions, rewards, nextStates, nextStateScoreMultiplier = memory.sampleBatch(
      batch_size=BATCH_SIZE, maxSamplesFromEpisode=16
    )
    actions = ACTIONS.toIndex(actions)
    
    # bootstrap the future value from the frozen clone, scaled by nextStateScoreMultiplier
    futureScores = modelClone.predict(nextStates).max(axis=-1) * nextStateScoreMultiplier
    # Q-learning targets: overwrite only the entries of the actions actually taken
    targets = modelClone.predict(states)
    targets[np.arange(len(targets)), actions] = rewards + futureScores * GAMMA

    lossSum += model.fit(states, targets, epochs=1, verbose=0).history['loss'][0]
  print('Avg. train loss: %.4f' % (lossSum / TRAIN_EPISODES))
  ##################
  print('Testing...')
  scores = Utils.testAgent(
    DQNAgent(model, actions=ACTIONS, exploreRate=EXPLORE_RATE),
    memory, TEST_EPISODES
  )
  Utils.trackScores(scores, metrics)
  ##################
  if (epoch % 10) == 0: # debug
    Utils.showAgentPlay( DQNAgent(model, actions=ACTIONS, exploreRate=0) )
  ##################
  EXPLORE_RATE = max(0.001, EXPLORE_RATE * EXPLORE_RATE_DECAY)
  plotData2file(metrics, 'chart.jpg')
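
# The DQNAgent above is constructed with an exploreRate, which is assumed to drive
# epsilon-greedy action selection. A minimal standalone sketch of that rule
# (selectAction is an illustrative name; the real agent also handles state preprocessing):
def selectAction(model, state, actionsN, exploreRate):
  if np.random.random() < exploreRate:
    return np.random.randint(actionsN)  # explore: uniformly random action index
  qValues = model.predict(np.array([state]))[0]
  return int(np.argmax(qValues))  # exploit: greedy action w.r.t. predicted Q-values
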
STEPS_PER_EPISODE = 200
BOOTSTRAPPED_STEPS = 10

metrics = {}

env = PendulumEnvironment()
memory = EB.CebLinear(maxSize=10 * TEST_EPISODES * STEPS_PER_EPISODE,
                      sampleWeight='abs')
curiosityModel = CCuriosityIRWatched(CCuriosityIR(layersSizes=[10, 10, 10]))
processor = replayProcessor(curiosityModel,
                            rewardScale=1.0 / BOOTSTRAPPED_STEPS,
                            normalize=True)
# collect random experience
for episodeN in range(2):
    Utils.testAgent(RandomAgent(low=-1, high=1),
                    memory,
                    episodes=100,
                    processor=processor)
print('random experience collected')
####################
model = createFatModel(input_shape=(3, ), output_size=ACTIONS.N)
model.compile(optimizer=tf.optimizers.Adam(learning_rate=1e-4),
              loss=tf.keras.losses.Huber(delta=1.0))

ghostNetwork = GhostNetwork(model, mixer='hard')
for epoch in range(EPOCHS):
    print('Start of epoch %d. Explore rate: %.3f' % (epoch, EXPLORE_RATE))
    ##################
    # Training
    ghostNetwork.update()
    trainLoss = train(
        ghostNetwork, memory, {