import json
import random
import numpy as np
from src.utils2 import c_ex as c, get_path
import src.dataloaders as d
from src.logistic import fit_logistic_regression

# Set-up for a hold-out experiment: shuffle the rows of the extended
# training set, take the first `num_train_rows` shuffled rows as the
# training split (X, y) and the remaining rows as the hold-out split
# (Xt, yt).

path = get_path(__file__) + '/..'

D = d.trainingset_extended()
# Dt is loaded here but not used within this excerpt.
Dt = d.testset_extended()

cols = c('sde5', 'v11', 'e9')

# Size of the training split; every row after it in the shuffled order
# goes to the hold-out split.
num_train_rows = 320000

# random.shuffle works in place and needs a mutable sequence, so the row
# indices are materialised as a list (a bare range is not a list on
# Python 3).
a = list(range(D.shape[0]))
random.shuffle(a)
train_rows = a[:num_train_rows]
holdout_rows = a[num_train_rows:]

# Select the feature columns once, then pick rows in a second step.
feature_data = D[:, cols]
label_col = c('isalert')

X = feature_data[train_rows, :]
y = D[train_rows, label_col]
# XOR with 1 inverts the labels (assuming they are 0/1 -- TODO confirm).
y = y.astype(int) ^ 1

Xt = feature_data[holdout_rows, :]
yt = D[holdout_rows, label_col]
yt = yt.astype(int) ^ 1

num_tests = 1
from __future__ import division import numpy as np from src.dataloaders import testset from src.utils2 import c T = testset() length = T.shape[0] fails_e9 = np.abs(T[:,c('E9')]-T[:,c('IsAlert')]).sum() fails_v5 = np.abs(T[:,c('V5')]-T[:,c('IsAlert')]).sum() print "Percent classified by E9: %.2f" % ((length - fails_e9)/length,) print "Percent classified by V5: %.2f" % ((length - fails_v5)/length,)
from src.dataloaders import trainingset
from src.utils2 import get_path, c, L

# Draws one scatter plot per pair of (non-excluded) features from a random
# sample of training rows, coloured by the IsAlert column, and saves each
# plot as a PDF.
# NOTE: np, plt, it and Rectangle are used below but are brought into scope
# outside this excerpt.

L = list(L[4:])
D = trainingset()
path = get_path(__file__) + '/..'
savepath_template = '{0}/plots/scatterplots/{1}-{2}.pdf'

num_points = 100
# randint excludes its upper bound, so (0, D.shape[0]) draws from the same
# inclusive range [0, D.shape[0] - 1] as the deprecated
# random_integers(0, D.shape[0] - 1) call it replaces.
rows = np.random.randint(0, D.shape[0], num_points)
data = D[rows, :]

# A real list (not a lazy map object) so it can be reused by every scatter
# call in the loop below.
colors = ['blue' if x == 1 else 'red' for x in data[:, c('IsAlert')]]

# Proxy patches used only as legend handles.
blue = Rectangle((0, 0), 1, 1, fc='b')
red = Rectangle((0, 0), 1, 1, fc='r')

exclude = ['V7', 'V9', 'P8', 'E3', 'E7', 'E8', 'E9', 'V3', 'V5', 'V10']
features = [f for f in L if f not in exclude]

for f1, f2 in it.combinations(features, 2):
    # Start from an empty figure; otherwise the points of every previous
    # feature pair stay on the axes and end up in this pair's PDF as well.
    plt.clf()
    plt.title('Feature {0} vs {1} ({2} points)'.format(f1, f2, num_points),
              {'size': 20})
    plt.legend((blue, red), ('Alert', 'Not Alert'))
    plt.scatter(data[:, c(f1)], data[:, c(f2)], c=colors)
    plt.gca().set_xlabel(f1, {'size': 18})
    plt.gca().set_ylabel(f2, {'size': 18})
    # NOTE(review): papertype is kept as in the original; confirm that the
    # installed matplotlib still accepts it.
    plt.savefig(savepath_template.format(path, f1, f2), format='pdf',
                papertype='a4')
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SigmoidLayer
import src.dataloaders as d
from src.utils2 import c

# Builds a small feed-forward network and a pybrain data set from a random
# sample of rows, then creates a backprop trainer for it.
# NOTE: random, buildNetwork and ClassificationDataSet are used below but
# are brought into scope outside this excerpt.

D = d.testset()

# random.shuffle works in place and needs a mutable sequence, so the row
# indices are materialised as a list (a bare range is not a list on
# Python 3).
a = list(range(D.shape[0]))
random.shuffle(a)

num_train_rows = 10000
num_test_rows = 5000
tr_rows = a[:num_train_rows]
ts_rows = a[num_train_rows:(num_train_rows + num_test_rows)]

features = ["V11", "sdE5", "E9"]

# Select columns first and rows second. Indexing D[row_list, col_list] in
# one step makes NumPy pair the two index lists element-wise, which cannot
# broadcast a 10000-long row list against a 3-long column list.
# (Assumes c(*features) returns a list of column indices -- TODO confirm.)
feature_data = D[:, c(*features)]

X = feature_data[tr_rows, :]
Y = D[tr_rows, c("IsAlert")]
Xt = feature_data[ts_rows, :]
Yt = D[ts_rows, c("IsAlert")]

# Input width follows the feature list instead of a hard-coded 3.
num_inputs = len(features)
nn = buildNetwork(num_inputs, 3, 1, outclass=SigmoidLayer)
ds = ClassificationDataSet(num_inputs, 1)
for row, label in zip(X, Y):
    ds.addSample(row, label)

trainer = BackpropTrainer(nn, ds)
# Set-up for evaluating a logistic regression, fitted on the extended
# training set, over bins of the extended test set.
# NOTE: get_path, json, d, c, get_bins and _fit_logistic_regression are
# brought into scope outside this excerpt.

path = get_path(__file__) + '/..'

# Load the stored forward-selection results.
with open(path+'/data/forward-selection-results-2-for-docs.json') as f:
    _tmp = json.load(f)

# The slice controls how many of the 48 features from feature selection
# are included in the logistic regression.
features = _tmp['labels_chosen'][:3]

D = d.trainingset_extended()
T = d.testset_extended()

cols = c(*features)

# Test-set features and labels.
Xt = T[:, cols]
yt = T[:, c.isalert]

# Third argument handed to _fit_logistic_regression below.
C = 1000000

num_bins = 10
bins = get_bins(T.shape[0], num_bins)

classifier = _fit_logistic_regression(D[:, cols], D[:, c.isalert], C)

# One result slot per bin, initialised to 0.
auc = [0] * num_bins
fpr = [0] * num_bins
tpr = [0] * num_bins
import json
import random
import numpy as np
from src.utils2 import c_ex as c, get_path
import src.dataloaders as d
from src.logistic import fit_logistic_regression

# Set-up for a binned evaluation: shuffle the row indices of the windowed
# extended training set and cut them into `num_bins` consecutive bins.

path = get_path(__file__) + '/..'
D = d.trainingset_extended_window()
cols = c('sde5', 'v11', 'e9')

# random.shuffle works in place and needs a mutable sequence, so the row
# indices are materialised as a list (a bare range is not a list on
# Python 3).
a = list(range(int(D.shape[0])))
random.shuffle(a)

C = 10000
num_bins = 8

# Ceiling division done in integer arithmetic. The previous
# int(np.ceil(len(a)/num_bins)) floored first under Python 2 (this script
# has no `from __future__ import division`), so np.ceil had nothing left to
# round up and up to num_bins - 1 shuffled rows fell outside every bin.
bin_size = -(-len(a) // num_bins)
bins = [a[i*bin_size:(i+1)*bin_size] for i in range(num_bins)]

X = D[:, cols]
y = D[:, c.isalert]

# One result slot per bin, initialised to 0.
auc = num_bins*[0]
fpr = num_bins*[0]
tpr = num_bins*[0]
# Set-up for a feature-selection run: sample disjoint training and test
# rows from the windowed extended training set and build the index of
# columns to leave out.
# NOTE: d, random, c, np, LabelIndex and L_ex are brought into scope
# outside this excerpt.

D = d.trainingset_extended_window()

# random.shuffle works in place and needs a mutable sequence, so the row
# indices are materialised as a list (a bare range is not a list on
# Python 3).
a = list(range(D.shape[0]))
random.shuffle(a)

num_train_rows = 10000
num_test_rows = 5000
max_features = 12

# The two row sets are adjacent, non-overlapping slices of the shuffled
# order.
tr_rows = a[:num_train_rows]
ts_rows = a[num_train_rows:(num_train_rows+num_test_rows)]

# Columns from index 4 onward are the candidate features; slice them once
# and pick the rows of each split in a second step.
candidate_data = D[:, 4:]

X = candidate_data[tr_rows, :]
y = D[tr_rows, c('isalert')]

Xt = candidate_data[ts_rows, :]
yt = D[ts_rows, c('isalert')]

# NOTE(review): the second dimension (90) is a hard-coded size whose
# meaning is not visible in this excerpt.
auc = np.zeros((max_features, 90))

# Remove P3, P6, P8, V7 and V9 and the corresponding running features.
# See session 9 on data exploration for details.
cc = LabelIndex(L_ex[4:])
exclude = cc('p3', 'p6', 'p8', 'v7', 'v9',
             'mp3', 'mp6', 'mp8', 'mv7', 'mv9',
             'sdp3', 'sdp6', 'sdp8', 'sdv7', 'sdv9')