def _makeSegmentRecord(self, pmmlSegment, autoSegment=False):
    """Create the SegmentRecord for one PMML Segment and register it with the engine.

    The record is wired to its consumer and producer algorithms, its
    aggregates are initialized, its predicate is filed in ``self._lookup``
    and the per-segment statistics in ``self.metadata.data`` are updated.

    Arguments:
        pmmlSegment: segment element; its two non-extension children are
            unpacked as (predicate, sub-model).  If it has no "id"
            attribute, the name reported by the new record is stored there.
        autoSegment: when true, the record is additionally initialized as a
            new segment, its model is appended to
            ``self.pmmlFile.subModels`` and the creation is logged.

    Returns:
        The new SegmentRecord (already appended to ``self.segmentRecords``).

    Lookup layout written by this method (values are lists of indices into
    ``self.segmentRecords``):
        self._lookup.fields[field][hash(value)]
        self._lookup.tuples[(eqFields, compFields)][hash(eqValues)]
        self._lookup.tuples[(eqFields, compFields)][MATCHRANGES][field][(lower, upper)]
        self._lookup.other   (predicates that could not be indexed)

    NOTE(review): block nesting was reconstructed from a copy of this file
    whose indentation had been flattened; confirm against the original.
    """
    pmmlPredicate, pmmlSubModel = pmmlSegment.matches(pmml.nonExtension)

    originalId = pmmlSegment.attrib.get("id", None)
    segmentRecord = SegmentRecord(pmmlSubModel, pmmlPredicate, self.pmmlOutput, self, originalId)
    if originalId is None:
        pmmlSegment.attrib["id"] = segmentRecord.name()

    modelClass = pmmlSubModel.__class__
    producerConfig = self.producerAlgorithm[modelClass.__name__]
    algoName = producerConfig.attrib["algorithm"]
    segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](self, segmentRecord)
    segmentRecord.producerAlgorithm = producerAlgorithmMap[modelClass, algoName](self, segmentRecord)
    segmentRecord.producerParameters = producerConfig.parameters
    self.setProvenance(pmmlSubModel, algoName, segmentRecord.producerAlgorithm,
                       segmentRecord.producerParameters)

    localTransformations = pmmlSubModel.child(pmml.LocalTransformations, exception=False)
    if localTransformations is not None:
        segmentRecord.aggregates = localTransformations.matches(pmml.Aggregate, maxdepth=None)
        segmentRecord.aggregates.extend(
            localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None))
    else:
        segmentRecord.aggregates = []

    for aggregate in segmentRecord.aggregates:
        aggregate.initialize(self.consumerUpdateScheme)

    # Position this record will occupy in self.segmentRecords once appended.
    index = len(self.segmentRecords)
    added = False

    # Hard-wired switch: when False every segment lands in self._lookup.other.
    wantFastLookup = True

    if wantFastLookup:
        if _segmentHelpers.isSimpleEqual(pmmlPredicate):
            allSimpleEquals = set([pmmlPredicate])
            compoundAnds = []
            isOr = False
        elif _segmentHelpers.isCompoundAnd(pmmlPredicate):
            allSimpleEquals = set()
            compoundAnds = [pmmlPredicate]
            isOr = False
        elif isinstance(pmmlPredicate, pmml.pmmlFalse):
            allSimpleEquals = []
            compoundAnds = []
            # If the top level predicate is False, nothing will ever match.
            # Don't even put this in the lookup list (so set added=True).
            added = True
        else:
            allSimpleEquals = set(pmmlPredicate.matches(_segmentHelpers.isSimpleEqual))
            compoundAnds = pmmlPredicate.matches(_segmentHelpers.isCompoundAnd)
            isOr = True
            if len(allSimpleEquals) + len(compoundAnds) != len(pmmlPredicate.children):
                # Some child is neither a simple equality nor a compound AND:
                # give up on indexing and fall through to the slow list.
                allSimpleEquals = compoundAnds = []

        for element in compoundAnds:
            if isOr:
                added = False
            elif added:
                break

            if element.child(pmml.pmmlTrue, exception=False) or \
                    element.child(pmml.pmmlFalse, exception=False):
                # True short-circuits all matches; put it in the slow-lookup
                # list self._lookup.other
                allSimpleEquals = []
                added = False
                break

            simpleEquals = element.matches(_segmentHelpers.isSimpleEqual)
            simpleComparators = element.matches(_segmentHelpers.isComparator)

            if not simpleEquals and not simpleComparators:
                # If any of the compound ands have neither an equals nor a
                # comparator the entire predicate has to be added to the
                # slow-lookup list self._lookup.other.  Clearing
                # allSimpleEquals (as the True/False case above does) keeps
                # the simple-equals loop below from marking the segment as
                # indexed, which would keep it out of the slow list.
                allSimpleEquals = []
                added = False
                break

            # field -> required value
            addEq = dict((x['field'], x['value']) for x in simpleEquals)

            # field -> ((lowerValue, lowerFunc), (upperValue, upperFunc));
            # a missing bound is (None, None).
            addComp = {}
            for s in simpleComparators:
                field = s['field']
                bound = (s['value'], _segmentHelpers[s['operator']])
                lower, upper = addComp.get(field, ((None, None), (None, None)))
                if s['operator'].startswith("g"):  # greater
                    addComp[field] = (bound, upper)
                else:
                    addComp[field] = (lower, bound)

            # Reuse an existing bucket keyed on the same set of fields, if any.
            for eqTuple, compTuple in self._lookup.tuples.keys():
                if len(addEq) == len(eqTuple) and \
                        len(addComp) == len(compTuple) and \
                        all(key in addEq for key in eqTuple) and \
                        all(key in addComp for key in compTuple):
                    bucket = self._lookup.tuples[(eqTuple, compTuple)]
                    # Hash the values in the bucket's own field order.
                    match = hash(tuple([addEq[key] for key in eqTuple]))
                    bucket.setdefault(match, []).append(index)
                    if len(compTuple):
                        ranges = bucket.setdefault(MATCHRANGES, {})
                        for field, tup in addComp.items():
                            ranges.setdefault(field, {}).setdefault(tup, []).append(index)
                    added = True
                    break

            if not added:
                # No bucket for this combination of fields yet: create one.
                eqTuple = tuple(addEq.keys())
                compTuple = tuple(addComp.keys())
                if len(addEq):
                    match = hash(tuple([addEq[key] for key in eqTuple]))
                    self._lookup.tuples[(eqTuple, compTuple)] = {match: [index]}
                bucket = self._lookup.tuples.setdefault((eqTuple, compTuple), {})
                if len(compTuple):
                    ranges = bucket.setdefault(MATCHRANGES, {})
                    for field, tup in addComp.items():
                        ranges[field] = {tup: [index]}
                added = True

        for element in allSimpleEquals:
            field = element['field']
            value = hash(element['value'])
            lookup = self._lookup.fields.setdefault(field, {})
            lookup.setdefault(value, []).append(index)
            added = True

    if not added:
        self._lookup.other.append(index)

    data = self.metadata.data
    if self.firstSegment:
        # Seed the counters and running averages on the first segment seen.
        data["Total segments"] = 0
        data["New segments created"] = 0
        data["Average aggregations per segment"] = len(segmentRecord.aggregates)
        data["Average predicates per segment"] = (
            1.0 + len(pmmlPredicate.matches(lambda x: True, maxdepth=None)))
        data["Average local transformations per segment"] = \
            len(segmentRecord.pmmlModel.dataContext.cast) - len(self.pmmlModel.dataContext.cast)
        self.firstSegment = False
        data["First segment model type"] = segmentRecord.pmmlModel.tag

    self.segmentRecords.append(segmentRecord)

    if autoSegment:
        segmentRecord.initialize(existingSegment=False,
                                 customProcessing=self.customProcessing,
                                 setModelMaturity=(not self.hasProducer))
        self.pmmlFile.subModels.append(segmentRecord.pmmlModel)
        if self.customProcessing is not None:
            self.customProcessing.allSegments.append(segmentRecord.userFriendly)
        data["New segments created"] += 1
        self.metadata.info("New segment created: %s, ID=%s" %
                           (segmentRecord.expressionTree, segmentRecord.name()))

    segmentRecord.pmmlModel.dataContext.clear()

    # Incremental mean: avg_n = avg_{n-1} * (1 - 1/n) + x_n * (1/n)
    data["Total segments"] += 1
    increment = 1.0 / float(data["Total segments"])

    data["Average aggregations per segment"] *= (1.0 - increment)
    data["Average aggregations per segment"] += len(segmentRecord.aggregates) * increment

    data["Average predicates per segment"] *= (1.0 - increment)
    data["Average predicates per segment"] += (
        1.0 + len(pmmlPredicate.matches(lambda x: True, maxdepth=None))) * increment

    data["Average local transformations per segment"] *= (1.0 - increment)
    data["Average local transformations per segment"] += \
        (len(segmentRecord.pmmlModel.dataContext.cast) -
         len(self.pmmlModel.dataContext.cast)) * increment

    return segmentRecord
def _makeSegmentRecord(self, pmmlSegment, autoSegment=False):
    """Create the SegmentRecord for one PMML Segment and register it with the engine.

    The record is wired to its consumer and producer algorithms, its
    aggregates are initialized, its predicate is filed in ``self._lookup``
    and the per-segment statistics in ``self.metadata.data`` are updated.

    Arguments:
        pmmlSegment: segment element; its two non-extension children are
            unpacked as (predicate, sub-model), and its "id" attribute (or
            None) is handed to the SegmentRecord.
        autoSegment: when true, the record is additionally initialized as a
            new segment, its model is appended to
            ``self.pmmlFile.subModels`` and the creation is logged.

    Returns:
        The new SegmentRecord (already appended to ``self.segmentRecords``).

    Lookup layout written by this method (values are lists of indices into
    ``self.segmentRecords``):
        self._lookup.fields[field][hash(value)]
        self._lookup.tuples[(eqFields, compFields)][hash(eqValues)]
        self._lookup.tuples[(eqFields, compFields)][MATCHRANGES][field][(lower, upper)]
        self._lookup.other   (predicates that could not be indexed)

    NOTE(review): block nesting was reconstructed from a copy of this file
    whose indentation had been flattened; confirm against the original.
    """
    pmmlPredicate, pmmlSubModel = pmmlSegment.matches(pmml.nonExtension)

    segmentRecord = SegmentRecord(pmmlSubModel, pmmlPredicate, self.pmmlOutput, self,
                                  pmmlSegment.attrib.get("id", None))

    modelClass = pmmlSubModel.__class__
    producerConfig = self.producerAlgorithm[modelClass.__name__]
    algoName = producerConfig.attrib["algorithm"]
    segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](self, segmentRecord)
    segmentRecord.producerAlgorithm = producerAlgorithmMap[modelClass, algoName](self, segmentRecord)
    segmentRecord.producerParameters = producerConfig.parameters
    self.setProvenance(pmmlSubModel, algoName, segmentRecord.producerAlgorithm,
                       segmentRecord.producerParameters)

    localTransformations = pmmlSubModel.child(pmml.LocalTransformations, exception=False)
    if localTransformations is not None:
        segmentRecord.aggregates = localTransformations.matches(pmml.Aggregate, maxdepth=None)
        segmentRecord.aggregates.extend(
            localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None))
    else:
        segmentRecord.aggregates = []

    for aggregate in segmentRecord.aggregates:
        aggregate.initialize(self.consumerUpdateScheme)

    # Position this record will occupy in self.segmentRecords once appended.
    index = len(self.segmentRecords)
    added = False

    # Hard-wired switch: when False every segment lands in self._lookup.other.
    wantFastLookup = True

    if wantFastLookup:
        if _segmentHelpers.isSimpleEqual(pmmlPredicate):
            allSimpleEquals = set([pmmlPredicate])
            compoundAnds = []
            isOr = False
        elif _segmentHelpers.isCompoundAnd(pmmlPredicate):
            allSimpleEquals = set()
            compoundAnds = [pmmlPredicate]
            isOr = False
        elif isinstance(pmmlPredicate, pmml.pmmlFalse):
            allSimpleEquals = []
            compoundAnds = []
            # If the top level predicate is False, nothing will ever match.
            # Don't even put this in the lookup list (so set added=True).
            added = True
        else:
            allSimpleEquals = set(pmmlPredicate.matches(_segmentHelpers.isSimpleEqual))
            compoundAnds = pmmlPredicate.matches(_segmentHelpers.isCompoundAnd)
            isOr = True
            if len(allSimpleEquals) + len(compoundAnds) != len(pmmlPredicate.children):
                # Some child is neither a simple equality nor a compound AND:
                # give up on indexing and fall through to the slow list.
                allSimpleEquals = compoundAnds = []

        for element in compoundAnds:
            if isOr:
                added = False
            elif added:
                break

            if element.child(pmml.pmmlTrue, exception=False) or \
                    element.child(pmml.pmmlFalse, exception=False):
                # True short-circuits all matches; put it in the slow-lookup
                # list self._lookup.other
                allSimpleEquals = []
                added = False
                break

            simpleEquals = element.matches(_segmentHelpers.isSimpleEqual)
            simpleComparators = element.matches(_segmentHelpers.isComparator)

            if not simpleEquals and not simpleComparators:
                # If any of the compound ands have neither an equals nor a
                # comparator the entire predicate has to be added to the
                # slow-lookup list self._lookup.other.  Clearing
                # allSimpleEquals (as the True/False case above does) keeps
                # the simple-equals loop below from marking the segment as
                # indexed, which would keep it out of the slow list.
                allSimpleEquals = []
                added = False
                break

            # field -> required value
            addEq = dict((x['field'], x['value']) for x in simpleEquals)

            # field -> ((lowerValue, lowerFunc), (upperValue, upperFunc));
            # a missing bound is (None, None).
            addComp = {}
            for s in simpleComparators:
                field = s['field']
                bound = (s['value'], _segmentHelpers[s['operator']])
                lower, upper = addComp.get(field, ((None, None), (None, None)))
                if s['operator'].startswith("g"):  # greater
                    addComp[field] = (bound, upper)
                else:
                    addComp[field] = (lower, bound)

            # Reuse an existing bucket keyed on the same set of fields, if any.
            for eqTuple, compTuple in self._lookup.tuples.keys():
                if len(addEq) == len(eqTuple) and \
                        len(addComp) == len(compTuple) and \
                        all(key in addEq for key in eqTuple) and \
                        all(key in addComp for key in compTuple):
                    bucket = self._lookup.tuples[(eqTuple, compTuple)]
                    # Hash the values in the bucket's own field order.
                    match = hash(tuple([addEq[key] for key in eqTuple]))
                    bucket.setdefault(match, []).append(index)
                    if len(compTuple):
                        ranges = bucket.setdefault(MATCHRANGES, {})
                        for field, tup in addComp.items():
                            ranges.setdefault(field, {}).setdefault(tup, []).append(index)
                    added = True
                    break

            if not added:
                # No bucket for this combination of fields yet: create one.
                eqTuple = tuple(addEq.keys())
                compTuple = tuple(addComp.keys())
                if len(addEq):
                    match = hash(tuple([addEq[key] for key in eqTuple]))
                    self._lookup.tuples[(eqTuple, compTuple)] = {match: [index]}
                bucket = self._lookup.tuples.setdefault((eqTuple, compTuple), {})
                if len(compTuple):
                    ranges = bucket.setdefault(MATCHRANGES, {})
                    for field, tup in addComp.items():
                        ranges[field] = {tup: [index]}
                added = True

        for element in allSimpleEquals:
            field = element['field']
            value = hash(element['value'])
            lookup = self._lookup.fields.setdefault(field, {})
            lookup.setdefault(value, []).append(index)
            added = True

    if not added:
        self._lookup.other.append(index)

    data = self.metadata.data
    if self.firstSegment:
        # Seed the counters and running averages on the first segment seen.
        data["Total segments"] = 0
        data["New segments created"] = 0
        data["Average aggregations per segment"] = len(segmentRecord.aggregates)
        data["Average predicates per segment"] = (
            1.0 + len(pmmlPredicate.matches(lambda x: True, maxdepth=None)))
        data["Average local transformations per segment"] = \
            len(segmentRecord.pmmlModel.dataContext.cast) - len(self.pmmlModel.dataContext.cast)
        self.firstSegment = False
        data["First segment model type"] = segmentRecord.pmmlModel.tag

    self.segmentRecords.append(segmentRecord)

    if autoSegment:
        segmentRecord.initialize(existingSegment=False)
        self.pmmlFile.subModels.append(segmentRecord.pmmlModel)
        data["New segments created"] += 1
        self.metadata.info("New segment created: %s, ID=%s" %
                           (segmentRecord.expressionTree, segmentRecord.name()))

    segmentRecord.pmmlModel.dataContext.clear()

    # Incremental mean: avg_n = avg_{n-1} * (1 - 1/n) + x_n * (1/n)
    data["Total segments"] += 1
    increment = 1.0 / float(data["Total segments"])

    data["Average aggregations per segment"] *= (1.0 - increment)
    data["Average aggregations per segment"] += len(segmentRecord.aggregates) * increment

    data["Average predicates per segment"] *= (1.0 - increment)
    data["Average predicates per segment"] += (
        1.0 + len(pmmlPredicate.matches(lambda x: True, maxdepth=None))) * increment

    data["Average local transformations per segment"] *= (1.0 - increment)
    data["Average local transformations per segment"] += \
        (len(segmentRecord.pmmlModel.dataContext.cast) -
         len(self.pmmlModel.dataContext.cast)) * increment

    return segmentRecord
def initialize(self):
    """Interpret PMML file, set up SegmentRecords list, and initialize all algorithms.

    Steps, in order:
      1. Stamp the PMML header with fresh random-seed, event-stamp and
         timestamp elements (replacing any that are already present).
      2. Select the model to run: the first top-level model, or the one
         whose "modelName" attribute equals ``self.modelName``.
      3. Attach the data stream, reset the model's data context and the
         segment lookup tables.
      4. Build the segment records: one per Segment of a MiningModel, or a
         single record wrapping any other model type.
      5. Call ``self.reinitialize()``.

    Raises:
        RuntimeError: ``self.modelName`` is set but no top-level model has
            that name.
        NotImplementedError: aggregate transformations appear in the
            TransformationDictionary or in a MiningModel's
            LocalTransformations, a MiningModel has no Segmentation
            element, or its multipleModelMethod is not "selectFirst" or
            "selectAll".

    NOTE(review): block nesting was reconstructed from a copy of this file
    whose indentation had been flattened; confirm against the original.
    """
    self.firstSegment = True

    # set up the header, so that our models can be stamped with time and event number
    header = self.pmmlFile.child(pmml.Header)

    if header.exists(pmml.Extension):
        headerExtension = header.child(pmml.Extension)
    else:
        headerExtension = pmml.Extension()
        header.children.insert(0, headerExtension)

    if headerExtension.exists(pmml.X_ODG_RandomSeed):
        del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
    augustusRandomSeed = pmml.X_ODG_RandomSeed(value=self.augustusRandomSeed)
    headerExtension.children.append(augustusRandomSeed)

    if headerExtension.exists(pmml.X_ODG_Eventstamp):
        del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
    self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
    headerExtension.children.append(self.eventStamp)

    if header.exists(pmml.Timestamp):
        del header[header.index(pmml.Timestamp)]
    self.timeStamp = pmml.Timestamp(xmlbase.XMLText(datetime.datetime.today().isoformat()))
    header.children.append(self.timeStamp)

    # select the first model or select a model by name
    if self.modelName is None:
        self.pmmlModel = self.pmmlFile.topModels[0]
    else:
        self.pmmlModel = None
        for model in self.pmmlFile.topModels:
            if "modelName" in model.attrib and model.attrib["modelName"] == self.modelName:
                self.pmmlModel = model
                break
        if self.pmmlModel is None:
            raise RuntimeError("No model named \"%s\" was found in the PMML file" % self.modelName)

    # connect the dataContext to the dataStream, so that events will flow
    # from the input file into the transformations
    self.resetDataStream(self.dataStream)

    # clear the cache of the model's DataContext (initializes some dictionaries)
    self.pmmlModel.dataContext.clear()
    if self.pmmlModel.dataContext.transformationDictionary:
        self.metadata.data["Transformation dictionary elements"] = len(
            self.pmmlModel.dataContext.transformationDictionary.cast)
    else:
        self.metadata.data["Transformation dictionary elements"] = 0

    self.segmentRecords = []
    self._lookup = NameSpace(tuples={}, fields={}, other=[])
    # These thresholds are set on the SegmentRecord class itself.
    SegmentRecord.maturityThreshold = self.maturityThreshold
    SegmentRecord.lockingThreshold = self.lockingThreshold

    if self.pmmlFile.exists(pmml.TransformationDictionary):
        transformationDictionary = self.pmmlFile.child(pmml.TransformationDictionary)
        if transformationDictionary.exists(pmml.Aggregate, maxdepth=None):
            raise NotImplementedError("Aggregate transformations in the TransformationDictionary are not supported")
        if transformationDictionary.exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
            raise NotImplementedError("X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported")

    # MiningModels are special because we handle segmentation at the Engine level
    # Currently no support for MiningModels nested within MiningModels
    if isinstance(self.pmmlModel, pmml.MiningModel):
        self.pmmlOutput = self.pmmlModel.child(pmml.Output, exception=False)
        segmentation = self.pmmlModel.child(pmml.Segmentation, exception=False)
        # for now, assume a MiningModel without any segments will be populated through autosegmentation

        if self.pmmlModel.exists(pmml.LocalTransformations):
            modelTransformations = self.pmmlModel.child(pmml.LocalTransformations)
            if modelTransformations.exists(pmml.Aggregate, maxdepth=None):
                raise NotImplementedError("Aggregate transformations in the MiningModel's LocalTransformations are not supported")
            if modelTransformations.exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
                raise NotImplementedError("X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported")

        if segmentation is None:
            # child(..., exception=False) yields None for a missing element;
            # fail with a clear message rather than an AttributeError below.
            raise NotImplementedError("MiningModels without a Segmentation element are not supported")

        if segmentation.attrib["multipleModelMethod"] == "selectFirst":
            self.multipleModelMethod = SELECTFIRST
        elif segmentation.attrib["multipleModelMethod"] == "selectAll":
            self.multipleModelMethod = SELECTALL
        else:
            raise NotImplementedError("Only 'selectFirst', 'selectAll', and no segmentation have been implemented.")
        self.metadata.data["Match all segments"] = self.multipleModelMethod != SELECTFIRST

        for pmmlSegment in segmentation.matches(pmml.Segment):
            self._makeSegmentRecord(pmmlSegment)

    else:
        # Any other model type is treated as a single, predicate-less segment.
        self.multipleModelMethod = SELECTONLY

        segmentRecord = SegmentRecord(self.pmmlModel, None, None, self)

        modelClass = self.pmmlModel.__class__
        producerConfig = self.producerAlgorithm[modelClass.__name__]
        algoName = producerConfig.attrib["algorithm"]
        segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](self, segmentRecord)
        segmentRecord.producerAlgorithm = producerAlgorithmMap[modelClass, algoName](self, segmentRecord)
        segmentRecord.producerParameters = producerConfig.parameters
        self.setProvenance(self.pmmlModel, algoName, segmentRecord.producerAlgorithm,
                           segmentRecord.producerParameters)

        localTransformations = self.pmmlModel.child(pmml.LocalTransformations, exception=False)
        if localTransformations is not None:
            segmentRecord.aggregates = localTransformations.matches(pmml.Aggregate, maxdepth=None)
            segmentRecord.aggregates.extend(
                localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None))
        else:
            segmentRecord.aggregates = []

        for aggregate in segmentRecord.aggregates:
            aggregate.initialize(self.consumerUpdateScheme)

        self.segmentRecords.append(segmentRecord)
        self.metadata.data["First segment model type"] = segmentRecord.pmmlModel.tag

    self.reinitialize()
def initialize(self):
    """Interpret PMML file, set up SegmentRecords list, and initialize all algorithms.

    Steps, in order:
      1. Stamp the PMML header with fresh random-seed, event-stamp and
         timestamp elements (replacing any that are already present).
      2. Select the model to run: the first top-level model, or the one
         whose "modelName" attribute equals ``self.modelName``.
      3. Attach the data stream, reset the model's data context and the
         segment lookup tables.
      4. Build the segment records: one per Segment of a MiningModel, or a
         single record wrapping any other model type.
      5. Initialize every segment record as an existing segment.

    Raises:
        RuntimeError: ``self.modelName`` is set but no top-level model has
            that name.
        NotImplementedError: aggregate transformations appear in the
            TransformationDictionary or in a MiningModel's
            LocalTransformations, a MiningModel has no Segmentation
            element, or its multipleModelMethod is not "selectFirst" or
            "selectAll".

    NOTE(review): block nesting was reconstructed from a copy of this file
    whose indentation had been flattened; confirm against the original.
    """
    self.firstSegment = True

    # set up the header, so that our models can be stamped with time and event number
    header = self.pmmlFile.child(pmml.Header)

    if header.exists(pmml.Extension):
        headerExtension = header.child(pmml.Extension)
    else:
        headerExtension = pmml.Extension()
        header.children.insert(0, headerExtension)

    if headerExtension.exists(pmml.X_ODG_RandomSeed):
        del headerExtension[headerExtension.index(pmml.X_ODG_RandomSeed)]
    augustusRandomSeed = pmml.X_ODG_RandomSeed(value=self.augustusRandomSeed)
    headerExtension.children.append(augustusRandomSeed)

    if headerExtension.exists(pmml.X_ODG_Eventstamp):
        del headerExtension[headerExtension.index(pmml.X_ODG_Eventstamp)]
    self.eventStamp = pmml.X_ODG_Eventstamp(number=0)
    headerExtension.children.append(self.eventStamp)

    if header.exists(pmml.Timestamp):
        del header[header.index(pmml.Timestamp)]
    self.timeStamp = pmml.Timestamp(xmlbase.XMLText(datetime.datetime.today().isoformat()))
    header.children.append(self.timeStamp)

    # select the first model or select a model by name
    if self.modelName is None:
        self.pmmlModel = self.pmmlFile.topModels[0]
    else:
        self.pmmlModel = None
        for model in self.pmmlFile.topModels:
            if "modelName" in model.attrib and model.attrib["modelName"] == self.modelName:
                self.pmmlModel = model
                break
        if self.pmmlModel is None:
            raise RuntimeError("No model named \"%s\" was found in the PMML file" % self.modelName)

    # connect the dataContext to the dataStream, so that events will flow
    # from the input file into the transformations
    self.resetDataStream(self.dataStream)

    # clear the cache of the model's DataContext (initializes some dictionaries)
    self.pmmlModel.dataContext.clear()
    if self.pmmlModel.dataContext.transformationDictionary:
        self.metadata.data["Transformation dictionary elements"] = len(
            self.pmmlModel.dataContext.transformationDictionary.cast)
    else:
        self.metadata.data["Transformation dictionary elements"] = 0

    self.segmentRecords = []
    self._lookup = NameSpace(tuples={}, fields={}, other=[])
    # These thresholds are set on the SegmentRecord class itself.
    SegmentRecord.maturityThreshold = self.maturityThreshold
    SegmentRecord.lockingThreshold = self.lockingThreshold

    if self.pmmlFile.exists(pmml.TransformationDictionary):
        transformationDictionary = self.pmmlFile.child(pmml.TransformationDictionary)
        if transformationDictionary.exists(pmml.Aggregate, maxdepth=None):
            raise NotImplementedError("Aggregate transformations in the TransformationDictionary are not supported")
        if transformationDictionary.exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
            raise NotImplementedError("X-ODG-AggregateReduce transformations in the TransformationDictionary are not supported")

    # MiningModels are special because we handle segmentation at the Engine level
    # Currently no support for MiningModels nested within MiningModels
    if isinstance(self.pmmlModel, pmml.MiningModel):
        self.pmmlOutput = self.pmmlModel.child(pmml.Output, exception=False)
        segmentation = self.pmmlModel.child(pmml.Segmentation, exception=False)
        # for now, assume a MiningModel without any segments will be populated through autosegmentation

        if self.pmmlModel.exists(pmml.LocalTransformations):
            modelTransformations = self.pmmlModel.child(pmml.LocalTransformations)
            if modelTransformations.exists(pmml.Aggregate, maxdepth=None):
                raise NotImplementedError("Aggregate transformations in the MiningModel's LocalTransformations are not supported")
            if modelTransformations.exists(pmml.X_ODG_AggregateReduce, maxdepth=None):
                raise NotImplementedError("X-ODG-AggregateReduce transformations in the MiningModel's LocalTransformations are not supported")

        if segmentation is None:
            # child(..., exception=False) yields None for a missing element;
            # fail with a clear message rather than an AttributeError below.
            raise NotImplementedError("MiningModels without a Segmentation element are not supported")

        if segmentation.attrib["multipleModelMethod"] == "selectFirst":
            self.multipleModelMethod = SELECTFIRST
        elif segmentation.attrib["multipleModelMethod"] == "selectAll":
            self.multipleModelMethod = SELECTALL
        else:
            raise NotImplementedError("Only 'selectFirst', 'selectAll', and no segmentation have been implemented.")
        self.metadata.data["Match all segments"] = self.multipleModelMethod != SELECTFIRST

        for pmmlSegment in segmentation.matches(pmml.Segment):
            self._makeSegmentRecord(pmmlSegment)

    else:
        # Any other model type is treated as a single, predicate-less segment.
        self.multipleModelMethod = SELECTONLY

        segmentRecord = SegmentRecord(self.pmmlModel, None, None, self)

        modelClass = self.pmmlModel.__class__
        producerConfig = self.producerAlgorithm[modelClass.__name__]
        algoName = producerConfig.attrib["algorithm"]
        segmentRecord.consumerAlgorithm = consumerAlgorithmMap[modelClass](self, segmentRecord)
        segmentRecord.producerAlgorithm = producerAlgorithmMap[modelClass, algoName](self, segmentRecord)
        segmentRecord.producerParameters = producerConfig.parameters
        self.setProvenance(self.pmmlModel, algoName, segmentRecord.producerAlgorithm,
                           segmentRecord.producerParameters)

        localTransformations = self.pmmlModel.child(pmml.LocalTransformations, exception=False)
        if localTransformations is not None:
            segmentRecord.aggregates = localTransformations.matches(pmml.Aggregate, maxdepth=None)
            segmentRecord.aggregates.extend(
                localTransformations.matches(pmml.X_ODG_AggregateReduce, maxdepth=None))
        else:
            segmentRecord.aggregates = []

        for aggregate in segmentRecord.aggregates:
            aggregate.initialize(self.consumerUpdateScheme)

        self.segmentRecords.append(segmentRecord)
        self.metadata.data["First segment model type"] = segmentRecord.pmmlModel.tag

    # Segments read from the file already exist in the model.
    for segmentRecord in self.segmentRecords:
        segmentRecord.initialize(existingSegment=True)