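"""q_main_y.py

Tabular Q-learning on y-axis command/feedback traces. Reads the .DAT files
in ./data, splits each trace into fixed-length chunks, and learns a per-step
correction (chosen from a discrete set of magnitudes) that drives the
feedback signal toward the commanded signal. The learned Q-table is saved
to y_qtable.txt and the mean error per epoch is plotted.
"""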
import os
import numpy as np
import random
import pandas as pd
from parse import parser
import matplotlib.pyplot as plt
#import plaidml.keras
#plaidml.keras.install_backend()
#import keras
def chunk_list(seq, n):
    """Yield successive n-sized chunks of seq (the last chunk may be shorter)."""
    for i in range(0, len(seq), n):
        yield seq[i : i + n]
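# e.g. list(chunk_list(np.arange(5), 2)) -> [array([0, 1]), array([2, 3]), array([4])]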
class dataset:
    """One chunk of y-axis data: commanded (ycom) and feedback (yfbk) traces."""

    def __init__(self, data):
        self.ycom = data[:, 0]
        self.yfbk = data[:, 1]
        self.init_mean = np.mean(np.absolute(self.ycom - self.yfbk))
        self.num_steps = len(self.ycom)
        self.errors = np.zeros(self.num_steps)
        self.mean_errors = []

    def pad_size(self, max_steps):
        """Zero-pad the traces out to max_steps steps."""
        difference = max_steps - self.num_steps
        filler = np.zeros(difference)
        self.ycom = np.concatenate((self.ycom, filler))
        self.yfbk = np.concatenate((self.yfbk, filler))
        self.errors = np.concatenate((self.errors, filler))
        self.num_steps = len(self.ycom)
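# Shorter tail chunks get zero-padded by pad_size so every dataset spans the
# same number of steps and shares the Q-table's state indexing.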
# Discrete correction magnitudes (the action space): +1 down to -1 in steps of 0.1.
actions = [1, .9, .8, .7, .6, .5, .4, .3, .2, .1, 0, -.1, -.2, -.3, -.4, -.5, -.6, -.7, -.8, -.9, -1]
action_size = len(actions)

dirname = os.path.join(os.getcwd(), 'data') + os.sep
data_parser = parser()
files = [f for f in os.listdir(dirname) if f.endswith(".DAT")]

# Parse each .DAT file and keep its y-axis (command, feedback) columns.
file_data = []
for file in files:
    data_parser.parse_data(dirname + file)
    file_data.append(data_parser.get_y().values)
# Split every trace into 500-step chunks, padding the final chunk if needed.
set_length = 500
datasets = []
for data in file_data:
    for chunk in chunk_list(data, set_length):
        temp = dataset(chunk)
        if temp.num_steps < set_length:
            temp.pad_size(set_length)
        datasets.append(temp)
random.shuffle(datasets)

state_size = set_length
qtable = np.zeros((state_size, action_size))
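# The state is simply the time-step index within a chunk, so the Q-table has
# one row per step and one column per correction action: shape (500, 21).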
# Function below gives our reward system: positive when the action reduced the error.
def check_error(e_n, e_nplus):
    # Current error minus next-step error: positive means the error shrank.
    result = e_n - e_nplus
    if result > 0:
        # Return a positive reward.
        return 100
    elif result == 0:
        # Return a neutral reward.
        return 0
    else:
        # Return a negative reward.
        return -1
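# e.g. check_error(0.5, 0.2) -> 100 (error shrank), check_error(0.2, 0.5) -> -1.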
# SPECIFY HYPERPARAMETERS
total_epochs = 1000    # Total training epochs
learning_rate = 0.003  # Learning rate
gamma = 0.95           # Discount rate

# Exploration parameters
epsilon = 1.0          # Exploration rate
max_epsilon = 1.0      # Exploration probability at start
min_epsilon = 0.01     # Minimum exploration rate
decay_rate = 0.01      # Exponential decay rate for exploration probability

# List of rewards
rewards = []
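# With decay_rate = 0.01, epsilon decays from 1.0 toward min_epsilon roughly
# exponentially: about 0.61 after 50 epochs, 0.37 after 100, 0.14 after 200.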
# Baseline: the mean initial error averaged over all datasets.
overall_init_mean = np.mean([d.init_mean for d in datasets])
mean_errors = np.zeros((len(datasets), total_epochs))
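# mean_errors[i, e] will hold dataset i's mean absolute error after epoch e.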
for epoch in range(total_epochs):
    total_rewards = 0
    for i in range(len(datasets)):
        data = datasets[i]
        for step in range(state_size - 1):
            # Choose an action a in the current world state s: exploitation or exploration.
            # First we randomise a number.
            exp_exp_tradeoff = random.uniform(0, 1)
            # If this number > epsilon --> exploitation (take the biggest Q value for this state).
            if exp_exp_tradeoff > epsilon:
                action = np.argmax(qtable[step, :])
            # Else make a random choice --> exploration.
            else:
                action = np.random.randint(action_size)
            # Take the action a and observe the outcome state s' and reward r:
            # nudge the feedback toward the command by the chosen magnitude.
            data.yfbk[step] += actions[action] * np.sign(data.ycom[step] - data.yfbk[step])
            data.errors[step] = np.absolute(data.ycom[step] - data.yfbk[step])
            data.errors[step + 1] = np.absolute(data.ycom[step + 1] - data.yfbk[step + 1])
            reward = check_error(data.errors[step], data.errors[step + 1])
            # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)],
            # where qtable[step + 1, :] holds all the actions we can take from the next state.
            qtable[step, action] += learning_rate * (
                reward + gamma * np.max(qtable[step + 1, :]) - qtable[step, action]
            )
            total_rewards += reward
        n_mean_step = np.mean(np.absolute(data.errors))
        data.mean_errors.append(n_mean_step)
        mean_errors[i, epoch] = n_mean_step
        delta_err = data.init_mean - n_mean_step
        #print("Initial mean error for dataset {} was: {} In this epoch, ({}), Q-Learning has changed this by {} to: {} ".format(i, data.init_mean, epoch, delta_err, n_mean_step))
    overall_mean = np.mean(mean_errors[:, epoch])
    prev_mean = np.mean(mean_errors[:, epoch - 1]) if epoch > 0 else overall_init_mean
    init_delta = np.absolute(overall_init_mean - overall_mean)
    epoch_delta = np.absolute(overall_mean - prev_mean)
    print("Initial mean error was: {}, reduced in epoch {} to {}. "
          "A change of {} from previous, {} from init.".format(
              overall_init_mean, epoch, overall_mean, epoch_delta, init_delta))
    # Reduce epsilon (because we need less and less exploration).
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * epoch)
    rewards.append(total_rewards)
np.savetxt('y_qtable.txt', qtable)

# Plot the mean error per epoch, averaged over all datasets.
end_mean = [np.mean(mean_errors[:, i]) for i in range(total_epochs)]
plt.plot(end_mean)
plt.ylabel('Mean error per epoch')
plt.xlabel('Number of epochs')
plt.show()
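# A minimal sketch (not part of this script, names hypothetical) of how the
# saved table could be applied greedily at run time on a fresh 500-step chunk:
#   qtable = np.loadtxt('y_qtable.txt')
#   for step in range(len(qtable) - 1):
#       action = np.argmax(qtable[step, :])
#       yfbk[step] += actions[action] * np.sign(ycom[step] - yfbk[step])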