-
Notifications
You must be signed in to change notification settings - Fork 0
/
errorDatastructure.py
executable file
·177 lines (162 loc) · 7.35 KB
/
errorDatastructure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/python
from sklearn import cross_validation
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import LeavePOut
from sklearn.metrics.pairwise import *
from scipy.spatial.distance import *
import numpy as np
from micAnalysis import *
import sys
from regressFit import *
from micAnalysis import *
from drawPlot import *
from detectAnomaly import *
from fields import *
from analyze import *
from pickleDump import *
#global inputColumnNames
#global measuredColumnNames
#global outputColumnNames
#global regressionDict
#this is a global container for error datastructure
#targetName vs TargetErrorData map
#global TargetErrorDataMap
#global ErrorDistributionProfileMapForTargetAndFeature
#This map holds the initial train and test samples.
#Later it is populated with prediction functions and prediction errors.
#The top-level key is the target name; each value is a "TargetErrorData" data structure.
#TargetErrorDataMap ={}
#This map keeps the calculated error profile for each target and each feature, along with the curve-fitted error function.
#The top-level key is the target name and the 2nd-level key is the feature name; the content is an "errorDistributionProfile".
#ErrorDistributionProfileMapForTargetAndFeature = {}
class FeatureDataPoint:
    """One data point, held as a feature-name -> value mapping."""

    def __init__(self, nameValueMap):
        # The mapping is stored by reference; no copy is made.
        self.featureNameValueMap = nameValueMap

    def __str__(self):
        """Return the string form of the wrapped name/value mapping."""
        return str(self.featureNameValueMap)
def getSelectedFeaturePtFromProductionDataPoint(prodFeaturePt, selectedFeatureMap):
    """Build a FeatureDataPoint restricted to the selected features.

    Parameters:
        prodFeaturePt: object exposing a ``featureNameValueMap`` mapping of
            feature name -> value.
        selectedFeatureMap: mapping whose keys are the feature names to keep
            (its values are not read here).

    Returns:
        A new FeatureDataPoint holding only those entries of the production
        point whose feature name is a key of ``selectedFeatureMap``.
    """
    # Build the lookup set once so every membership test is O(1) instead of
    # a scan over a key list.
    wantedFeatures = set(selectedFeatureMap.keys())
    # items() rather than the Python-2-only iteritems() keeps this function
    # runnable on both Python 2 and Python 3.
    selectedDataPointMap = {
        feature: value
        for feature, value in prodFeaturePt.featureNameValueMap.items()
        if feature in wantedFeatures
    }
    return FeatureDataPoint(selectedDataPointMap)
def getSelectedInputArrFromSelectedDataPoint(selectedFeaturePt, selectedFeatureSortedByInIndex):
    """Return the values of the selected features, in the order given.

    Each entry of ``selectedFeatureSortedByInIndex`` is indexed with ``[0]``
    to obtain a feature name, which is then looked up in
    ``selectedFeaturePt.featureNameValueMap``.

    Remember: the entries should have the format
    <featureName> <=> <index in input feature Arr>, and are supposed to be
    sorted by that index in increasing order. This function does not sort
    or verify the order itself.
    """
    return [
        selectedFeaturePt.featureNameValueMap[entry[0]]
        for entry in selectedFeatureSortedByInIndex
    ]
class Observations:
    """Holds one set of observations: inputs, targets and prediction results.

    While creating the observations, specify observationType, TRAIN or TEST.
    Per the original author's note, observation data is populated differently
    depending on this type, and PredictedArr / PredictionErrArr are only
    calculated for observationType=TEST. This class itself only stores the
    values; it does not enforce that rule.
    """

    def __init__(self, inArr, tarArr, observationType):
        # TRAIN or TEST. Certain operations are valid on TRAIN and certain
        # are valid on TEST.
        self.observeType = observationType
        self.ParamArr = inArr
        self.TargetArr = tarArr
        # The prediction-related fields start empty; nothing in this class
        # fills them in.
        self.PredictedArr = None
        self.PredictionErrArr = None
        self.DistanceToTargetArr = None

    def __str__(self):
        """Return a tab-separated, one-line summary of the observation.

        ParamArr, TargetArr (and PredictedArr, when shown) are rendered with
        np.array_str. The prediction fields are appended only when
        PredictionErrArr has been set.
        """
        parts = [
            "\n\tObservationType:" + self.observeType,
            "\tPARAM Arr: " + np.array_str(self.ParamArr),
            "\tTARGET Arr: " + np.array_str(self.TargetArr),
        ]
        # Identity check: `!= None` on a NumPy array compares elementwise and
        # makes the `if` raise ValueError for multi-element arrays.
        # NOTE(review): SOURCE indentation was lost; all three prediction
        # lines are assumed to belong to this branch -- confirm.
        if self.PredictionErrArr is not None:
            parts.append("\tPREDICTED Arr: " + np.array_str(self.PredictedArr))
            parts.append("\tPREDICTION ERROR: " + str(self.PredictionErrArr))
            parts.append("\tDISTANCE: " + str(self.DistanceToTargetArr))
        return "".join(parts)
class FeatureErrorData:
    """Training observations, test observations and regression function
    kept together for a single feature."""

    def __init__(self):
        self.name = ''
        self.TrainingObservations = None
        self.TestObservations = []
        self.RegressionFunction = None

    def __str__(self):
        """Return a multi-line dump of the training and test observations.

        The displayed feature name is obtained by passing ``self.name`` to
        getInputParameterNameFromFeatureIndex (defined outside this block).
        """
        featureName = getInputParameterNameFromFeatureIndex(self.name)
        header = "\n\t\t----------------- Feature id: " + featureName + " --------\n"
        pieces = [
            header,
            "- - - - - - - - - - - - - - - - - - \n",
            "\t\t", "TRAIN OBSERVATIONS: ", str(self.TrainingObservations), "\n",
            "\t\t", "TEST OBSERVATIONS: ",
        ]
        for observation in self.TestObservations:
            pieces.append("\n\t\t\t")
            pieces.append(str(observation))
        # NOTE(review): SOURCE indentation was lost; the closing dash line is
        # assumed to be emitted once, after all test observations -- confirm.
        pieces.append("\n- - - - - - - - - - - - - - - - - - ")
        return "".join(pieces)
#data structure per target for train and test samples
#the internal FeatureErrorDataMap contains list of training samples and test samples for each (sorted)feature
class TargetErrorData:
    """Per-target container for train and test sample data.

    ``FeatureErrorDataMap`` maps a feature name to the data kept for that
    feature (per the original comments, a FeatureErrorData holding the
    training and test samples).
    """

    def __init__(self, targetName=''):
        """Create an empty container.

        ``targetName`` defaults to '' so both ``TargetErrorData()`` and
        ``TargetErrorData(name)`` work. The original declared two __init__
        methods; the second silently replaced the first, which made the
        no-argument form raise TypeError.
        """
        self.name = targetName
        # featureName vs FeatureErrorData map
        self.FeatureErrorDataMap = {}
        # only to be populated by histogram function and will be used in
        # anomaly detection
        self.errors = []

    def __str__(self):
        """Return the target name followed by str() of every feature entry."""
        parts = ["\tTargetErrorData = " + self.name]
        for featureKey in self.FeatureErrorDataMap.keys():
            parts.append(str(self.FeatureErrorDataMap[featureKey]))
        # NOTE(review): SOURCE indentation was lost; the trailing newline is
        # assumed to be appended once, after the loop -- confirm.
        parts.append("\n")
        return "".join(parts)
def printFullErrorDataStructure():
    """Print every entry of the global "TargetErrorDataMap" to stdout.

    The map is fetched with getGlobalObject("TargetErrorDataMap") (defined
    outside this block); each value is printed via its str() form.
    """
    print("\n\n******* Target error data map ********")
    targetErrorMap = getGlobalObject("TargetErrorDataMap")
    for targetKey in targetErrorMap.keys():
        # Joining into one argument yields the same text whether print is
        # the Python 2 statement or the Python 3 function.
        line = " ".join(("\n*****************For target = ", str(targetErrorMap[targetKey])))
        print(line)
class errorDistributionProfile:
    """Error profile for one (target, feature) pair.

    Stores the error samples, the mean point and standard deviation of the
    training set, and the curve-fitted error function. All fields except the
    two names start empty and are filled in by code outside this class.
    """

    def __init__(self, feature_name, target_name):
        self.FeatureName = feature_name
        self.TargetName = target_name
        self.ErrorRegressFunction = None
        self.MeanPointOfTrainingSet = None
        self.StandardDeviationOfTrainingSet = None
        self.ErrorSamples = []

    def __str__(self):
        """Return a multi-line description of the profile.

        Safe to call on a freshly constructed instance: a missing regression
        function is shown as "None", and ErrorSamples may still be the
        initial plain list.
        """
        fittedFunc = self.ErrorRegressFunction
        if fittedFunc is None:
            # Not fitted yet; the original dereferenced None here and raised
            # AttributeError.
            coefText = str(None)
        else:
            # presumably a scikit-learn Pipeline with a step named 'linear'
            # -- verify against the code that sets ErrorRegressFunction
            coefText = str(fittedFunc.named_steps['linear'].coef_)
        # np.array_str needs an ndarray; asarray leaves arrays untouched and
        # converts the initial empty list.
        samplesText = np.array_str(np.asarray(self.ErrorSamples))
        return "".join([
            "\nError profile: TargetName = " + self.TargetName + " FeatureName = " + self.FeatureName,
            "\n\tMean point of training = " + str(self.MeanPointOfTrainingSet)
            + " StdDev of training set = " + str(self.StandardDeviationOfTrainingSet),
            "\n\tError Curve Fitted Fuction: " + coefText,
            "\n\tError Samples: " + samplesText + "\n",
        ])
def printErrorDistributionProfileMapForTargetAndFeatureMap(errProfMap=None):
    """Print every error profile of a target -> feature -> profile map.

    Parameters:
        errProfMap: two-level mapping (target key -> feature name -> profile).
            When None, the map is fetched with
            getGlobalObject("ErrorDistributionProfileMapForTargetAndFeature").

    Each profile is printed via its str() form, preceded by a banner line.
    """
    # Identity test is the reliable way to detect "no argument given"; the
    # original's `== None` plus a duplicate assignment/else were redundant.
    if errProfMap is None:
        errProfMap = getGlobalObject("ErrorDistributionProfileMapForTargetAndFeature")
    for targetKey in errProfMap.keys():
        featureErrMap = errProfMap[targetKey]
        for featureName in featureErrMap.keys():
            # Single-argument print calls behave the same under the Python 2
            # print statement and the Python 3 print function.
            print("\nxxxxxxxxxxxxxxxx ERROR PROFILE xxxxxxxxxxxxxxxxxxxxxxxxxxxxx\n")
            print(str(featureErrMap[featureName]))
# Script entry point: when the module is run directly, just construct an
# empty FeatureErrorData instance (nothing is printed or returned).
if __name__ == "__main__":
    o = FeatureErrorData()