# Adjust each move's stored reward by a share of xreward that grows for
# later moves, then store the sample in replay memory.
samples[i][2] = samples[i][2] - xreward * (i + 1) / num_samples
memory.add_sample(samples[i])

# Draw a training batch from replay memory (it may hold fewer than batch_size
# samples early in training).
sample_batch = memory.sample_samples(batch_size)
actual_batch_size = len(sample_batch)

# Each sample is (state, action, reward, next_state); next_state is None
# for the final move of a game.
state_batch = np.zeros((actual_batch_size, 9))
next_state_batch = np.zeros((actual_batch_size, 9))
action_batch = [sample[1] for sample in sample_batch]
for i, sample in enumerate(sample_batch):
    state_batch[i] = sample[0]
    if sample[3] is not None:
        next_state_batch[i] = sample[3]

# Current Q-value predictions for the whole batch.
qsa_batch = model.predict_batch(state_batch, sess)

# Overwrite the taken action's Q-value with the Q-learning target:
# r for terminal moves, r + gamma * max Q(s', a') otherwise.
for i in range(actual_batch_size):
    if sample_batch[i][3] is None:
        qsa_batch[i, action_batch[i]] = sample_batch[i][2]
    else:
        qsa_batch[i, action_batch[i]] = (
            sample_batch[i][2]
            + gamma * get_max_reward(next_state_batch[i].reshape(1, 9), model, sess))

model.train_batch(state_batch, qsa_batch, sess)

# Decay the exploration rate as the number of games played grows.
epsilon = 0.9 * np.exp(-0.0001 * game)

# After training, save the model and plot the loss curve.
model.save(sess, 'tic_tac_toe_model_nobad')
model.plot_losses('losses.png')
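The snippet above calls get_max_reward, which is not defined in this section. A minimal sketch of what it plausibly does, assuming it returns the best predicted Q-value over the empty squares only (consistent with restricting the agent to legal moves); the original helper may differ in detail:

import numpy as np

def get_max_reward(state, model, sess):
    # Predicted Q-values for every square of the given board state
    # (assumes predict_one returns a (1, 9) array).
    qsa = model.predict_one(state, sess)[0]
    # Only empty squares (value 0) are legal moves in tic-tac-toe.
    legal = state.flatten() == 0
    # Best Q-value among the legal moves.
    return np.max(qsa[legal])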
# The final move of a game has no next state, so append None as the
# sample's next_state before storing it in replay memory.
samples[i].append(None)
memory.add_sample(samples[i])

# Draw a training batch from replay memory (it may hold fewer than batch_size
# samples early in training).
sample_batch = memory.sample_samples(batch_size)
actual_batch_size = len(sample_batch)

state_batch = np.zeros((actual_batch_size, 9))
next_state_batch = np.zeros((actual_batch_size, 9))
action_batch = [sample[1] for sample in sample_batch]
for i, sample in enumerate(sample_batch):
    state_batch[i] = sample[0]
    if sample[3] is not None:
        next_state_batch[i] = sample[3]

qsa_batch = model.predict_batch(state_batch, sess)

for i in range(actual_batch_size):
    # Force the target Q-value of every already-occupied square to -2,
    # so the network learns not to pick illegal moves.
    for choice in range(9):
        if state_batch[i, choice] != 0:
            qsa_batch[i, choice] = -2
    # Q-learning target for the action actually taken.
    if sample_batch[i][3] is None:
        qsa_batch[i, action_batch[i]] = sample_batch[i][2]
    else:
        qsa_batch[i, action_batch[i]] = (
            sample_batch[i][2]
            + gamma * np.amax(model.predict_one(next_state_batch[i].reshape((1, 9)), sess)))

model.train_batch(state_batch, qsa_batch, sess)

# Decay epsilon faster than in the previous version (-0.001 vs -0.0001).
epsilon = 0.9 * np.exp(-0.001 * game)

model.save(sess)
model.plot_losses()
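Both versions rely on a replay memory exposing add_sample and sample_samples, and the actual_batch_size = len(sample_batch) check suggests it can return fewer samples than requested while the buffer is still filling. A minimal sketch under those assumptions (the capacity handling in the original class may differ):

import random

class Memory:
    def __init__(self, max_memory):
        self.max_memory = max_memory
        self.samples = []

    def add_sample(self, sample):
        # Append the new (state, action, reward, next_state) sample and
        # drop the oldest one once the buffer exceeds its capacity.
        self.samples.append(sample)
        if len(self.samples) > self.max_memory:
            self.samples.pop(0)

    def sample_samples(self, n):
        # Uniform random sample; returns fewer than n if the buffer is small.
        return random.sample(self.samples, min(n, len(self.samples)))

Usage would be something like memory = Memory(50000) before the training loop, where the capacity 50000 is a hypothetical value not given in this section.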