def field_not_in_list(field, values):
    """
    Build a "field is not one of ``values``" derived-feature definition.

    The PMML side is a ``MapValues`` element with one ``FieldColumnPair`` for
    ``field`` and an ``InlineTable`` that maps every listed value to 0, with
    ``defaultValue=1`` for anything not listed. The Python side is a callable
    that returns the logical AND of ``df[field] != v`` over all listed values.

    :param field: name of the field to test
    :param values: iterable of values the field must differ from
    :return: dict holding the PMML element under
        ``DerivedFeatureTransformations.TRANSFORMATION`` and the callable
        under ``DerivedFeatureTransformations.FUNCTION``
    """
    # Snapshot the values: they are walked once to build the inline table and
    # again on every call of the returned function, which a one-shot iterator
    # would not survive.
    values = list(values)

    mv = pmml.MapValues(outputColumn='output', defaultValue=1)
    mv.append(pmml.FieldColumnPair(field=field, column='input'))
    it = pmml.InlineTable()
    for v in values:
        it.append(pmml_row(input=v, output=0))
    mv.append(it)

    def not_in_list(df):
        column = df[field]
        if not values:
            # reduce() without an initial value raises TypeError on an empty
            # sequence. With nothing excluded every row passes, mirroring the
            # defaultValue=1 of the PMML mapping.
            # NOTE(review): assumes df[field] supports len() -- confirm.
            return np.ones(len(column), dtype=bool)
        return reduce(np.logical_and, [column != v for v in values])

    return {
        DerivedFeatureTransformations.TRANSFORMATION: mv,
        DerivedFeatureTransformations.FUNCTION: not_in_list,
    }
def transformation_dictionary(self):
    """
    Assemble the PMML TransformationDictionary for this converter's context.

    Every feature in the DERIVED schema contributes a DerivedField wrapping
    the feature's own transformation. Every categorical feature in the INPUT
    schema contributes a DerivedField with a MapValues lookup from each raw
    value to its position in ``value_list``. As a side effect,
    ``self.context.schemas[Schema.NUMERIC]`` is first reset to an empty list
    and finally replaced with the numeric counterparts of the MODEL schema.

    :return: the populated TransformationDictionary element
    """
    dictionary = pmml.TransformationDictionary()

    # Start from an empty numeric schema; the real one is assigned at the end.
    self.context.schemas[Schema.NUMERIC] = []
    numeric_by_name = {}

    # Step 1: register _all_ derived fields, since they may be requested later.
    for derived in self.context.schemas[Schema.DERIVED]:
        numeric = RealNumericFeature(name=derived.name)
        field_element = pmml.DerivedField(
            name=numeric.full_name,
            optype=numeric.optype.value,
            dataType=numeric.data_type.value,
        )
        field_element.append(derived.transformation)
        dictionary.append(field_element)
        assert derived.name not in numeric_by_name, \
            'Duplicate field definition: {}'.format(derived.name)
        numeric_by_name[derived.name] = numeric

    # Step 2: give every categorical input a numeric encoding; other inputs
    # are used as they are.
    for raw in self.context.schemas[Schema.INPUT]:
        assert raw.name not in numeric_by_name, \
            'Duplicate field definition: {}'.format(raw.name)
        if not isinstance(raw, CategoricalFeature):
            numeric_by_name[raw.name] = raw
            continue

        numeric = RealNumericFeature(
            name=raw.name,
            namespace=Schema.NUMERIC.namespace,
        )
        field_element = pmml.DerivedField(
            name=numeric.full_name,
            optype=numeric.optype.value,
            dataType=numeric.data_type.value,
        )
        lookup = pmml.MapValues(
            outputColumn='output',
            dataType=numeric.data_type.value,
        )
        lookup.append(pmml.FieldColumnPair(field=raw.full_name, column='input'))

        # Raw category value -> its index in the feature's value list.
        table = pmml.InlineTable()
        for position, raw_value in enumerate(raw.value_list):
            table.append(pmml_row(input=raw_value, output=position))

        # The *results* of append() are what get nested, so this relies on
        # append() handing back the element it was called on -- presumably
        # the binding's behaviour; TODO confirm.
        lookup_with_table = lookup.append(table)
        populated_field = field_element.append(lookup_with_table)
        dictionary.append(populated_field)
        numeric_by_name[raw.name] = numeric

    # Step 3: mirror the model schema into the numeric schema.
    mirrored = []
    for model_feature in self.context.schemas[Schema.MODEL]:
        mirrored.append(numeric_by_name[model_feature.name])
    self.context.schemas[Schema.NUMERIC] = mirrored

    return dictionary
def test_transform_with_derived_field(self):
    """
    Convert a depth-2 decision tree whose model schema contains a derived
    field (``x3``, defined by a two-column MapValues over ``x1`` and ``x2``).

    There are no explicit assertions: the test passes when building the PMML
    and serialising it with ``toxml()`` does not raise.
    """
    samples = [
        [0, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [1, 1, 1],
    ]
    labels = [0, 1, 1, 1]
    self.est = DecisionTreeClassifier(max_depth=2)
    self.est.fit(samples, labels)

    # Derived field definition: (x1, x2) -> output via an inline lookup table.
    mapping = pmml.MapValues(dataType="double", outputColumn="output")
    for column_name in ("x1", "x2"):
        mapping.append(pmml.FieldColumnPair(column=column_name, field=column_name))

    lookup_rows = (
        (0, 'zero', 0),
        (0, 'one', 0),
        (1, 'zero', 0),
        (1, 'one', 1),
    )
    table = pmml.InlineTable()
    for x1_value, x2_value, mapped in lookup_rows:
        table.append(pmml_row(x1=x1_value, x2=x2_value, output=mapped))
    mapping.append(table)

    input_schema = [
        IntegerNumericFeature('x1'),
        StringCategoricalFeature('x2', ['zero', 'one']),
    ]
    derived_schema = [
        DerivedFeature(
            feature=RealNumericFeature(name='x3'),
            transformation=mapping,
        ),
    ]
    model_schema = [
        IntegerNumericFeature('x1'),
        StringCategoricalFeature('x2', ['zero', 'one']),
        RealNumericFeature(name='x3'),
    ]
    output_schema = [IntegerCategoricalFeature('output', ['neg', 'pos'])]

    self.ctx = TransformationContext({
        Schema.INPUT: input_schema,
        Schema.DERIVED: derived_schema,
        Schema.MODEL: model_schema,
        Schema.OUTPUT: output_schema,
    })
    self.converter = DecisionTreeConverter(
        estimator=self.est,
        context=self.ctx,
        mode=ModelMode.CLASSIFICATION,
    )

    document = self.converter.pmml()
    document.toxml()
def map_values(field, value_map, default_value):
    """
    Build a "look ``field`` up in ``value_map``" derived-feature definition.

    The PMML side is a ``MapValues`` element with one ``FieldColumnPair`` for
    ``field`` and an ``InlineTable`` row per ``value_map`` entry. The Python
    side is a callable that applies the same lookup element-wise to
    ``df[field]`` through ``np.vectorize``.

    :param field: name of the field whose values are mapped
    :param value_map: dict from raw field value to mapped output value
    :param default_value: output for values that are not keys of ``value_map``
    :return: dict holding the PMML element under
        ``DerivedFeatureTransformations.TRANSFORMATION`` and the callable
        under ``DerivedFeatureTransformations.FUNCTION``
    """
    # The attribute is spelled ``defaultValue``, as in field_not_in_list and
    # in the PMML MapValues definition.
    mv = pmml.MapValues(outputColumn='output', defaultValue=default_value)
    mv.append(pmml.FieldColumnPair(field=field, column='input'))
    it = pmml.InlineTable()
    for k, v in value_map.items():
        it.append(pmml_row(input=k, output=v))
    mv.append(it)

    def lookup(value):
        # The key comes first and the fallback second. Binding default_value
        # with functools.partial would put it in the key position, i.e. look
        # up the default and fall back to the input value.
        return value_map.get(value, default_value)

    return {
        DerivedFeatureTransformations.TRANSFORMATION: mv,
        DerivedFeatureTransformations.FUNCTION:
            lambda df: np.vectorize(lookup)(df[field]),
    }