Example 1

import json
import random

import numpy as np

from src.utils2 import c_ex as c, get_path
import src.dataloaders as d
from src.logistic import fit_logistic_regression


path = get_path(__file__) + '/..'

D = d.trainingset_extended()
Dt = d.testset_extended()

cols = c('sde5', 'v11', 'e9')

# Shuffle the row indices to draw a random train/held-out split.
a = list(range(D.shape[0]))
random.shuffle(a)

X = D[:, cols]
X = X[a[:320000], :]
y = D[a[:320000], c('isalert')]
y = y.astype(int) ^ 1  # XOR with 1 flips the 0/1 labels

# Held-out rows from the same shuffled index list.
Xt = D[:, cols]
Xt = Xt[a[320000:], :]
yt = D[a[320000:], c('isalert')]
yt = yt.astype(int) ^ 1

num_tests = 1
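# The snippet is cut off before the model is actually fitted. A minimal
# sketch of the missing step, using scikit-learn directly rather than the
# repo's fit_logistic_regression wrapper (whose signature isn't shown here):
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

clf = LogisticRegression()  # default C; later examples use much larger C values
clf.fit(X, y)
probs = clf.predict_proba(Xt)[:, 1]
print("Held-out AUC: %.3f" % roc_auc_score(yt, probs))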
Example 2
from __future__ import division
import numpy as np

from src.dataloaders import testset
from src.utils2 import c


T = testset()
length = T.shape[0]

# Treat E9 and V5 as direct 0/1 predictions of IsAlert: summing the
# absolute differences counts the misclassified rows.
fails_e9 = np.abs(T[:,c('E9')]-T[:,c('IsAlert')]).sum()
fails_v5 = np.abs(T[:,c('V5')]-T[:,c('IsAlert')]).sum()

print "Percent classified by E9: %.2f" % ((length - fails_e9)/length,) 
print "Percent classified by V5: %.2f" % ((length - fails_v5)/length,) 
Example 3
import itertools as it

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

from src.dataloaders import trainingset
from src.utils2 import get_path, c, L


L = list(L[4:])  # drop the first four (non-feature) column labels
D = trainingset()

path = get_path(__file__) + '/..'
savepath_template = '{0}/plots/scatterplots/{1}-{2}.pdf'
num_points = 100

# np.random.random_integers was removed from modern NumPy; randint covers it.
rows = np.random.randint(0, D.shape[0], num_points)
data = D[rows,:]

colors = ['blue' if x == 1 else 'red' for x in data[:, c('IsAlert')]]
# Proxy artists so the legend can show fixed colour patches.
blue = Rectangle((0, 0), 1, 1, fc='b')
red = Rectangle((0, 0), 1, 1, fc='r')

exclude = ['V7', 'V9', 'P8', 'E3', 'E7', 'E8', 'E9', 'V3', 'V5', 'V10']
features = [f for f in L if f not in exclude]

for f1, f2 in it.combinations(features, 2):
    plt.figure()  # fresh figure per pair so points don't accumulate
    plt.title('Feature {0} vs {1} ({2} points)'.format(f1, f2, num_points),
              {'size': 20})
    plt.legend((blue, red), ('Alert', 'Not Alert'))
    plt.scatter(data[:, c(f1)], data[:, c(f2)], c=colors)
    plt.gca().set_xlabel(f1, {'size': 18})
    plt.gca().set_ylabel(f2, {'size': 18})
    # papertype was removed from recent matplotlib's savefig.
    plt.savefig(savepath_template.format(path, f1, f2), format='pdf')
    plt.close()
Example 4
import random

from pybrain.tools.shortcuts import buildNetwork
from pybrain.datasets import ClassificationDataSet
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SigmoidLayer

import src.dataloaders as d
from src.utils2 import c


D = d.testset()

a = list(range(D.shape[0]))
random.shuffle(a)

num_train_rows = 10000
num_test_rows = 5000

tr_rows = a[:num_train_rows]
ts_rows = a[num_train_rows : (num_train_rows + num_test_rows)]

features = ["V11", "sdE5", "E9"]

# Select rows first, then columns; fancy-indexing with two index lists
# of different lengths would raise a broadcasting error.
X = D[tr_rows, :][:, c(*features)]
Y = D[tr_rows, c("IsAlert")]
Xt = D[ts_rows, :][:, c(*features)]
Yt = D[ts_rows, c("IsAlert")]

nn = buildNetwork(3, 3, 1, outclass=SigmoidLayer)
ds = ClassificationDataSet(3, 1)
for i, row in enumerate(X):
    ds.addSample(row, Y[i])
trainer = BackpropTrainer(nn, ds)
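# The example stops right after constructing the trainer. A minimal sketch
# of the missing train-and-score step (assuming IsAlert is a 0/1 target):
import numpy as np

trainer.trainEpochs(10)  # a handful of backprop epochs
preds = np.array([nn.activate(row)[0] for row in Xt])
accuracy = ((preds > 0.5).astype(int) == Yt.astype(int)).mean()
print("Held-out accuracy: %.3f" % accuracy)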
Example 5

import json

import src.dataloaders as d
from src.utils2 import c_ex as c, get_path
# get_bins and _fit_logistic_regression are used below without imports in
# the original; they presumably come from the repo's own modules
# (src.logistic exports fit_logistic_regression in the other examples).

path = get_path(__file__) + '/..'

with open(path + '/data/forward-selection-results-2-for-docs.json') as f:
    _tmp = json.load(f)
    features = _tmp['labels_chosen']

# Controls how many of the 48 features
# from forward selection are included
# in the logistic regression.
features = features[:3]

D = d.trainingset_extended()
T = d.testset_extended()

cols = c(*features)

Xt = T[:, cols]
yt = T[:, c.isalert]

C = 1000000
num_bins = 10

bins = get_bins(T.shape[0], num_bins)

classifier = _fit_logistic_regression(D[:, cols], D[:, c.isalert], C)

auc = num_bins*[0]
fpr = num_bins*[0]
tpr = num_bins*[0]
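# The original cuts off after initialising these lists. A sketch of the
# per-bin evaluation they point to, assuming get_bins returns lists of row
# indices into T and the classifier exposes sklearn's predict_proba:
from sklearn.metrics import roc_curve, auc as auc_fn

for i, b in enumerate(bins):
    probs = classifier.predict_proba(Xt[b])[:, 1]
    fpr[i], tpr[i], _ = roc_curve(yt[b], probs)
    auc[i] = auc_fn(fpr[i], tpr[i])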
Example 6

import json
import random

import numpy as np

from src.utils2 import c_ex as c, get_path
import src.dataloaders as d
from src.logistic import fit_logistic_regression


path = get_path(__file__) + '/..'

D = d.trainingset_extended_window()

cols = c('sde5', 'v11', 'e9')

a = list(range(int(D.shape[0])))
random.shuffle(a)

C = 10000
num_bins = 8
# Split the shuffled row indices into num_bins roughly equal folds.
bin_size = int(np.ceil(len(a) / num_bins))
bins = [a[i*bin_size:(i+1)*bin_size] for i in range(num_bins)]

X = D[:, cols]
y = D[:, c.isalert]

auc = num_bins*[0]
fpr = num_bins*[0]
tpr = num_bins*[0]
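# Truncated again after the list initialisation. A sketch of the
# leave-one-bin-out cross-validation this sets up, assuming
# fit_logistic_regression(X, y, C) returns a fitted sklearn-style
# classifier, as the similar call in Example 5 suggests:
from sklearn.metrics import roc_curve, auc as auc_fn

for i, b in enumerate(bins):
    train = [r for fold in bins[:i] + bins[i+1:] for r in fold]
    clf = fit_logistic_regression(X[train], y[train], C)
    probs = clf.predict_proba(X[b])[:, 1]
    fpr[i], tpr[i], _ = roc_curve(y[b], probs)
    auc[i] = auc_fn(fpr[i], tpr[i])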
Example 7

import random

import numpy as np

import src.dataloaders as d
# LabelIndex and the extended label list L_ex are assumed to live in
# src.utils2 alongside c_ex, matching their use below.
from src.utils2 import c_ex as c, L_ex, LabelIndex

D = d.trainingset_extended_window()

a = list(range(int(D.shape[0])))
random.shuffle(a)

num_train_rows = 10000
num_test_rows = 5000
max_features = 12

tr_rows = a[:num_train_rows]
ts_rows = a[num_train_rows:(num_train_rows+num_test_rows)]

# The first four columns are not features (cf. L_ex[4:] below).
X = D[:, 4:]
X = X[tr_rows, :]
y = D[tr_rows, c('isalert')]

Xt = D[:, 4:]
Xt = Xt[ts_rows, :]
yt = D[ts_rows, c('isalert')]

auc = np.zeros((max_features, 90))  # one row per selection step; 90 is presumably the number of candidate columns

# Remove P3, P6, P8, V7 and V9 and
# the corresponding running features.
# See session 9 on data exploration
# for details.
cc = LabelIndex(L_ex[4:])
exclude = cc('p3', 'p6', 'p8', 'v7', 'v9', 
             'mp3', 'mp6', 'mp8', 'mv7', 'mv9',
             'sdp3', 'sdp6', 'sdp8', 'sdv7', 'sdv9')
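# The snippet ends with the setup. A sketch of the greedy forward selection
# it points to (cf. the forward-selection-results JSON read in Example 5),
# using scikit-learn directly since the repo's helper isn't shown. Assumes
# cc(...) yields integer column indices into X and that X has 90 candidate
# columns, matching the auc matrix above:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

candidates = [j for j in range(X.shape[1]) if j not in exclude]
chosen = []
for step in range(max_features):
    # Try adding each remaining candidate and record its held-out AUC.
    for j in candidates:
        if j in chosen:
            continue
        clf = LogisticRegression().fit(X[:, chosen + [j]], y)
        probs = clf.predict_proba(Xt[:, chosen + [j]])[:, 1]
        auc[step, j] = roc_auc_score(yt, probs)
    # Keep the candidate that scored best at this step.
    best = int(np.argmax(auc[step]))
    chosen.append(best)
    print("Step %d: added column %d (AUC %.3f)" % (step, best, auc[step, best]))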