def _makeTable(self, element):
    """Read the data described by a configuration element.

    The element's attributes select the source:

    * ``file``   -- path of the data file
    * ``type``   -- one of "UniTable", "CSV", "XML" or "XMLEvents"
    * ``dir``    -- (CSV only) directory whose files are all read
    * ``header``, ``sep``, ``types`` -- (CSV only) optional values
      forwarded to ``from_csv_file`` as ``header``, ``insep`` and ``types``

    Returns a list of ``uniTableWrapper`` objects for "UniTable" and
    "CSV", the result of ``xmlFile(fileName)`` for "XML", and the result
    of ``xmlTable(fileName, magicheader)`` for "XMLEvents".

    Raises StandardError when the type attribute is not recognized.
    """
    if self.__timer:
        self.__timer.output('Read file')

    fileName = element.get("file")
    fileType = element.get("type")
    # Element.get() yields None for a missing attribute instead of raising,
    # so the optional settings need no exception handling.
    directory = element.get("dir")
    header = element.get("header")
    sep = element.get("sep")
    types = element.get("types")

    if fileType == "UniTable":
        if self.__timer:
            self.__timer.output('Read file')
        wrapped = uniTableWrapper(uni.UniTable().from_any_file(fileName))
        if self.__timer:
            self.__timer.output('returning')
        return [wrapped]

    if fileType == "CSV":
        # Holder for the optional keyword arguments; only options that were
        # actually supplied are forwarded.
        _args = {}
        if header:
            _args['header'] = header
        if sep:
            _args['insep'] = sep
        if types:
            _args['types'] = types

        if directory is not None and os.path.exists(directory):
            # Build the filtered list in one pass.  Removing entries from a
            # list while iterating over it skips the element that follows
            # each removal, which let sub-directories slip through.
            candidates = [os.path.join(directory, entry)
                          for entry in os.listdir(directory)]
            filelist = [path for path in candidates
                        if not os.path.isdir(path)]
            return [uniTableWrapper(uni.UniTable().from_csv_file(path, **_args))
                    for path in filelist]
        return [uniTableWrapper(uni.UniTable().from_csv_file(fileName, **_args))]

    if fileType == "XML":
        return xmlFile(fileName)

    if fileType == "XMLEvents":
        # A MagicHeader child element switches the reader's header handling.
        magicheader = False
        for child in element:
            if child.tag == "MagicHeader":
                magicheader = True
                break
        return xmlTable(fileName, magicheader)

    raise StandardError("File type not recognized")
def _collectTreeStats(self, results, stats):
    """Fold a batch of scored results into the per-segment count tables.

    Each result ends with four trailing values: score, alert, segment and
    extras.  Everything before those is the predicted field followed by
    the fields used to make the prediction, and that leading part is what
    gets stored.

    ``stats`` maps a tupelized segment to a list of tables.  Rows go into
    the last table of the list; once that table holds 20000 rows a new
    one is started.
    """
    for result in results:
        rules = Producer.tupelize(result[-2])
        row = result[:-4]
        tables = stats[rules]

        if not tables:
            # First row seen for this segment: start its table list.
            first = uni.UniTable([self.__field] + self.__attributes,
                                 _prealloc=100000)
            first.append(row)
            stats[rules] = [first]
            continue

        current = tables[-1]
        if len(current) >= 20000:
            # The newest table is full, so roll over to a fresh one.
            tables.append(uni.UniTable([self.__field] + self.__attributes,
                                       _prealloc=100000))
            current = tables[-1]
        current.append(row)
def makeConfigs(inFile, outFile, inPMML, outPMML):
    """Write a configuration file for a discrete-distribution (dDist) test.

    inFile  -- data file; loaded here and also recorded as the baseline file
    outFile -- destination of the generated XML configuration
    inPMML  -- value stored in the model's "input" attribute
    outPMML -- value stored in the model's "output" attribute
    """
    # Open the data file; the table is handed to makeSegment below.
    inf = uni.UniTable().fromfile(inFile)

    # Root of the configuration document.
    root = ET.Element("model")
    for key, value in (("input", str(inPMML)), ("output", str(outPMML))):
        root.set(key, value)

    test = ET.SubElement(root, "test")
    testSettings = (
        ("field", "Automaker"),
        ("weightField", "Count"),
        ("testStatistic", "dDist"),
        ("testType", "threshold"),
        ("windowSize", "200"),
        # For the dDist test the threshold is really a 'ceiling'.
        ("threshold", "0.15"),
    )
    for key, value in testSettings:
        test.set(key, value)

    # The baseline is a discrete distribution built from the input file.
    baseline = ET.SubElement(test, "baseline")
    baselineSettings = (
        ("dist", "discrete"),
        ("file", str(inFile)),
        ("type", "UniTable"),
    )
    for key, value in baselineSettings:
        baseline.set(key, value)

    # Segmentation declaration for the single field "Color".
    segmentation = ET.SubElement(test, "segmentation")
    makeSegment(inf, segmentation, "Color")

    # Output the configuration.
    ET.ElementTree(root).write(outFile)
def makeConfigs(inFile, outFile, inPMML, outPMML):
    """Write a configuration file for a gaussian z-value threshold test.

    inFile  -- data file; loaded here and also recorded as the baseline file
    outFile -- destination of the generated XML configuration
    inPMML  -- value stored in the model's "input" attribute
    outPMML -- value stored in the model's "output" attribute
    """
    # Open the data file; the table is handed to makeSegment below.
    inf = uni.UniTable().fromfile(inFile)

    # Root of the configuration document.
    root = ET.Element("model")
    for key, value in (("input", str(inPMML)), ("output", str(outPMML))):
        root.set(key, value)

    test = ET.SubElement(root, "test")
    testSettings = (
        ("field", "volume"),
        ("testStatistic", "zValue"),
        ("testType", "threshold"),
        ("threshold", "1.5"),
    )
    for key, value in testSettings:
        test.set(key, value)

    # The baseline is a gaussian distribution built from the input file.
    baseline = ET.SubElement(test, "baseline")
    baselineSettings = (
        ("dist", "gaussian"),
        ("file", str(inFile)),
        ("type", "UniTable"),
    )
    for key, value in baselineSettings:
        baseline.set(key, value)

    # Segmentation declarations for the two fields, in this order.
    segmentation = ET.SubElement(test, "segmentation")
    for fieldName in ("fielddeviceid", "locationtimestamp"):
        makeSegment(inf, segmentation, fieldName)

    # Output the configuration.
    ET.ElementTree(root).write(outFile)
name = line[start + 1:] name = name[:name.find('"')] names.append(name) inf.close() #Read in the pmml file and set the "get value function" myReader = pmmlReader() myReader.parse("sample_events.pmml") myPMML = myReader.root myPMML.updateInputFunctions(get) #open the output file out = open("sample_events.out", "w") #read in evente myTable = uni.UniTable() myTable.from_any_file("sample_events.nab") rows = len(myTable) #will hold fifty (or whatever step is set to) values at a time inputValues = {} #Gets the model for convience and speed's sake model = myPMML.getChildrenOfType(pmmlModels)[0] #Tell the model which fields we'll want back out for reporting model.initialize(["Auth_Dt"]) #cache 50 rows at a time cnt = 0 step = 50
def makeTests(self):
    """Build a test-distribution element per segment and add them to the model.

    For every key of ``self._stats`` (visited in sorted order) a baseline
    distribution is made with ``Producer.makeDistribution``.  When an
    alternate is configured (``self._alternate``), an alternate
    distribution is made from ``self._altstats`` as well.  A
    ``pmmlTestDistributions`` element is created only when every required
    distribution was produced.

    In 'Update' mode the model's existing test distributions are removed
    before the new ones are added; in any other mode the new ones are
    added alongside whatever is already there.

    When a debug file is configured, a table with the columns field,
    value, mean and stddev is written to it.
    """
    if self.__timer:
        self.__timer.output("Making test distributions from statistics")

    # TEMPORARY: columns of the optional debug table.
    outFields = []
    outValues = []
    outMeans = []
    outStdDevs = []

    # Extensions shared by every generated test.
    extensions = []
    if self.__skip:
        extensions.append(
            pmmlExtension(children=[
                extensionSkip(attributes={"number": str(self.__skip)})
            ]))

    # Create a test for each segment.  sorted() yields the same ordering
    # as keys() followed by an in-place sort, and also works when keys()
    # is not a list.
    tests = []
    keys = sorted(self._stats.keys())
    baseDist = self._baseline.get("dist")

    if self._alternate:
        # Include alternate distributions.
        altDist = self._alternate.get("dist")
        for entry in keys:
            if not (self._stats[entry] and self._altstats[entry]):
                continue
            child = Producer.makeDistribution(baseDist, self._stats[entry])
            if not child:
                continue
            baseline = pmmlBaseline(children=[child])
            temp = Producer.makeDistribution(altDist, self._altstats[entry])
            if not temp:
                # Without an alternate there is nothing to pair the
                # baseline with, so no test is emitted for this segment.
                continue
            alt = pmmlAlternate(children=[temp])
            segments = pmmlSegments(children=Producer.makeSegments(entry))
            children = list(extensions)
            children.extend([baseline, alt, segments])
            tests.append(
                pmmlTestDistributions(children=children,
                                      attributes=self._attrs))
    else:
        # Do not include alternate distributions.
        for entry in keys:
            if not self._stats[entry]:
                continue
            child = Producer.makeDistribution(baseDist, self._stats[entry],
                                              self.testValidation)
            if not child:
                continue
            baseline = pmmlBaseline(children=[child])
            segments = pmmlSegments(children=Producer.makeSegments(entry))
            children = list(extensions)
            children.extend([baseline, segments])
            tests.append(
                pmmlTestDistributions(children=children,
                                      attributes=self._attrs))

            # TEMPORARY: record one debug row per emitted test.
            # NOTE(review): the original indentation was lost, so the
            # placement of this block (only for entries that produced a
            # test) should be confirmed against the upstream source.
            if self.__debugFile and entry:
                outFields.append(entry[0][0])
                outValues.append(entry[0][1])
                stats = self._stats[entry]
                # presumably stats[2], stats[3] and stats[4] are the sum,
                # the sum of squares and the count -- TODO confirm
                mean = float(stats[2]) / stats[4]
                variance = float(stats[3]) / stats[4] - mean**2
                outMeans.append(mean)
                # Clamp at zero so rounding error cannot make the
                # variance negative before taking the square root.
                outStdDevs.append(math.sqrt(max(variance, 0)))

    # Put the tests in the current model.
    if self.__mode == 'Update':
        originals = self._model.getChildrenOfType(pmmlTestDistributions)
        self._model.removeChildren(originals)
    self._model.addChildren(tests)

    # TEMPORARY: dump the debug table.
    if self.__debugFile:
        out = uni.UniTable(["field", "value", "mean", "stddev"])
        out["field"] = outFields
        out["value"] = outValues
        out["mean"] = outMeans
        out["stddev"] = outStdDevs
        out.to_nab_file(str(self.__debugFile))