def create_right_node(obj, derived_col_names):
    """
    Build the right-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split_feature', 'threshold' and
        'right_child' are read here.
    derived_col_names : list
        Column names after preprocessing; indexed with
        ``int(obj['split_feature'])``.

    Returns
    -------
    right : Nyoka's Node object
        Node whose SimplePredicate uses the GREATER_THAN operator against
        the threshold, after ``create_node`` has been applied to the
        right child.
    """
    feature_name = derived_col_names[int(obj['split_feature'])]
    predicate = pml.SimplePredicate(
        field=xgboostToPmml.replace_name_with_derivedColumnNames(
            feature_name, derived_col_names),
        operator=SIMPLE_PREDICATE_OPERATOR.GREATER_THAN.value,
        # The threshold is serialised with 16 decimal places.
        value="{:.16f}".format(obj['threshold']),
    )
    right = pml.Node()
    right.set_SimplePredicate(predicate)
    # Hand the child description to create_node together with the new node.
    create_node(obj['right_child'], right, derived_col_names)
    return right
def get_targets(model, target_name):
    """
    It returns the Targets element of the model.

    Parameters
    ----------
    model :
        An Xgboost model instance.
    target_name : String
        Name of the Target column.

    Returns
    -------
    targets :
        Nyoka's Targets object when the model class is named
        'XGBRegressor'; ``None`` for every other model class.
    """
    if model.__class__.__name__ != 'XGBRegressor':
        # Only the regressor gets a Targets element. Return None explicitly
        # so the result never depends on an unbound local or an implicit
        # fall-through.
        return None

    # Use 0.5 when the model does not define a base_score.
    base_score = model.base_score if model.base_score is not None else 0.5
    return pml.Targets(
        Target=[
            pml.Target(
                field=target_name,
                rescaleConstant="{:.16f}".format(base_score)
            )
        ]
    )
def create_right_node(obj, derived_col_names):
    """
    Build the right-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split', 'split_condition' and
        'children' are read here; ``obj['children'][1]`` is treated as the
        right child.
    derived_col_names : list
        Column names after preprocessing.

    Returns
    -------
    node : Nyoka's Node object
        Node carrying a 'greaterOrEqual' SimplePredicate on the split
        condition, after ``create_node`` has been applied to the right child.
    """
    field_name = replace_name_with_derivedColumnNames(obj['split'], derived_col_names)
    # The split condition is serialised with 16 decimal places.
    threshold_text = "{:.16f}".format(obj['split_condition'])

    node = pml.Node()
    node.set_SimplePredicate(
        pml.SimplePredicate(
            field=field_name,
            operator='greaterOrEqual',
            value=threshold_text,
        )
    )
    create_node(obj['children'][1], node, derived_col_names)
    return node
def get_data_dictionary():
    """
    Build the DataDictionary listing one continuous double field per input.

    ``self`` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view.

    Returns
    -------
    data_dict : Nyoka's DataDictionary object
        Without lag, every hull contributes ``self._length_of_fingerprint``
        fields named ``<hull name><_UNDERSCORE><position>``. With lag, every
        hull contributes a single field named after the hull.
    """
    if not self._use_lag:
        field_names = [
            hull["name"] + _UNDERSCORE + str(position)
            for hull in self._hulls
            for position in range(self._length_of_fingerprint)
        ]
    else:
        field_names = [hull["name"] for hull in self._hulls]

    data_fields = [
        pml.DataField(
            name=field_name,
            optype=OPTYPE.CONTINUOUS,
            dataType=DATATYPE.DOUBLE
        )
        for field_name in field_names
    ]
    return pml.DataDictionary(
        numberOfFields=len(data_fields),
        DataField=data_fields
    )
def create_right_node(obj, derived_col_names):
    """
    Build the right-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split_feature', 'threshold' and
        'right_child' are read here.
    derived_col_names : list
        Column names after preprocessing; indexed with
        ``int(obj['split_feature'])``.

    Returns
    -------
    right : Nyoka's Node object
        Node carrying a 'greaterOrEqual' SimplePredicate on the threshold,
        after ``create_node`` has been applied to the right child.
    """
    feature_name = derived_col_names[int(obj['split_feature'])]
    # NOTE(review): the operator is 'greaterOrEqual'; confirm this matches
    # the split convention of the model that produced ``obj``.
    predicate = pml.SimplePredicate(
        field=xgboostToPmml.replace_name_with_derivedColumnNames(
            feature_name, derived_col_names),
        operator='greaterOrEqual',
        value="{:.16f}".format(obj['threshold']),
    )
    right = pml.Node()
    right.set_SimplePredicate(predicate)
    create_node(obj['right_child'], right, derived_col_names)
    return right
def create_right_node(obj, derived_col_names):
    """
    Build the right-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split', 'split_condition' and
        'children' are read here; ``obj['children'][1]`` is treated as the
        right child.
    derived_col_names : list
        Column names after preprocessing.

    Returns
    -------
    node : Nyoka's Node object
        Node whose SimplePredicate uses
        ``SIMPLE_PREDICATE_OPERATOR.GREATER_OR_EQUAL`` against the split
        condition, after ``create_node`` has been applied to the right child.
    """
    field_name = replace_name_with_derivedColumnNames(obj['split'], derived_col_names)
    # The split condition is serialised with 16 decimal places.
    threshold_text = "{:.16f}".format(obj['split_condition'])

    node = pml.Node()
    node.set_SimplePredicate(
        pml.SimplePredicate(
            field=field_name,
            operator=SIMPLE_PREDICATE_OPERATOR.GREATER_OR_EQUAL,
            value=threshold_text,
        )
    )
    create_node(obj['children'][1], node, derived_col_names)
    return node
def __init__(self, predictedClasses=None):
    """
    Initialise the Output element and register its OutputFields.

    Parameters
    ----------
    predictedClasses : optional
        When truthy, three fields are added: 'predicted_label',
        'top1_prob' and 'top5_prob'. Otherwise a single continuous
        'predicted_predictions' field is added.
    """
    ny.Output.__init__(self)

    if predictedClasses:
        fields = [
            ny.OutputField(name="predicted_label",
                           feature="predictedValue",
                           dataType="string",
                           optype="categorical"),
            ny.OutputField(name="top1_prob",
                           feature="probability",
                           dataType="double"),
            ny.OutputField(name="top5_prob",
                           feature="topCategories",
                           numTopCategories="5",
                           dataType="string",
                           optype="categorical"),
        ]
    else:
        fields = [
            ny.OutputField(name="predicted_predictions",
                           feature="predictedValue",
                           dataType="double",
                           optype="continuous"),
        ]

    # Fields are registered in the order listed above.
    for field in fields:
        ny.Output.add_OutputField(self, field)
def lag(trfm, col_names):
    """
    Generates pre-processing elements for Nyoka's Lag

    Parameters
    ----------
    trfm :
        Contains the Nyoka's Lag instance. Its ``aggregation`` and ``value``
        attributes are read here.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Holds the derived fields under 'der_fld' and their names under
        'der_col_names'.
    """
    derived_colnames = get_derived_colnames(trfm.aggregation, col_names)

    # One DerivedField per input column, named by position.
    derived_flds = [
        pml.DerivedField(
            name=derived_colnames[position],
            Lag=pml.Lag(field=column, n=trfm.value, aggregate=trfm.aggregation),
            optype=OPTYPE.CONTINUOUS.value,
            dataType=DATATYPE.DOUBLE.value,
        )
        for position, column in enumerate(col_names)
    ]

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
    }
def __init__(self, dataSet, script_args):
    """
    Initialise the TransformationDictionary with one 'customFunc' DefineFunction.

    Parameters
    ----------
    dataSet :
        Name used for the function's ParameterField and for the FieldRef
        passed to the 'python' Apply.
    script_args : dict
        Keys read here:

        - 'content': either the script source as a string, or an object
          whose source is obtained with ``inspect.getsource``.
        - 'def_name': function name; required only when 'content' is a
          string (otherwise ``content.__name__`` is used).
        - 'encode' (optional, default True): when truthy the source is
          base64-encoded before being embedded.
        - 'return_type': lower-cased and used as the function's dataType;
          'string' maps to optype 'categorical', anything else to
          'continuous'.
    """
    source = script_args['content']
    if isinstance(source, str):
        content = source
        def_name = script_args['def_name']
    else:
        import inspect
        content = inspect.getsource(source)
        def_name = source.__name__

    encode = script_args['encode'] if "encode" in script_args else True
    if encode:
        content = base64.b64encode(content.encode()).decode()

    return_type = script_args['return_type'].lower()
    # PMML's OPTYPE enumeration spells this value 'continuous'.
    optype = 'categorical' if return_type == 'string' else 'continuous'

    extension = [
        ny.Extension(extender='ADAPA',
                     name=def_name,
                     value=return_type,
                     anytypeobjs_=[content])
    ]
    def_func = ny.DefineFunction(
        name='customFunc',
        optype=optype,
        dataType=return_type,
        ParameterField=[
            ny.ParameterField(name=dataSet, dataType='binary')
        ],
        Apply=ny.Apply(function='python',
                       Extension=extension,
                       FieldRef=[ny.FieldRef(field=dataSet)]),
    )
    ny.TransformationDictionary.__init__(self)
    ny.TransformationDictionary.add_DefineFunction(self, def_func)
def lbl_binarizer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for a Label Binarizer.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Label Binarizer preprocessing instance.
        Its ``classes_`` attribute is read here.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Accepted for call compatibility; no keyword is read or required.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Label Binarizer preprocessing.
    """
    derived_flds = list()
    derived_colnames = list()
    categoric_lbls = trfm.classes_.tolist()

    for col_name in col_names:
        prefix = "labelBinarizer(" + str(col_name)
        if len(categoric_lbls) == 2:
            # Two classes: a single derived column, for the last class only.
            derived_colnames = get_derived_colnames(prefix, [categoric_lbls[-1]], ")")
            # NOTE(review): the field is taken from col_names[-1], not the
            # current column; the two coincide when one column is passed.
            norm_descr = pml.NormDiscrete(field=str(col_names[-1]),
                                          value=str(categoric_lbls[-1]))
            derived_flds.append(
                pml.DerivedField(NormDiscrete=norm_descr,
                                 name=derived_colnames[-1],
                                 optype="categorical",
                                 dataType="double"))
        else:
            # Otherwise: one derived column per class label.
            derived_colnames = get_derived_colnames(prefix, categoric_lbls, ")")
            for attribute_name in col_names:
                for class_idx, class_name in enumerate(categoric_lbls):
                    norm_descr = pml.NormDiscrete(field=str(attribute_name),
                                                  value=str(class_name))
                    derived_flds.append(
                        pml.DerivedField(NormDiscrete=norm_descr,
                                         name=derived_colnames[class_idx],
                                         optype="categorical",
                                         dataType="double"))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_class_lbl'] = categoric_lbls
    pp_dict['pp_feat_name'] = col_names[0]
    return pp_dict
def get_output_for_regression_model(index):
    """
    Build the Output element for the regression model at position ``index``.

    Parameters
    ----------
    index :
        Converted with ``str`` and appended to the output field name.

    Returns
    -------
    Output : Nyoka's Output object
        Contains a single continuous double OutputField named
        ``"normalizedDistance" + _UNDERSCORE + str(index)``.
    """
    distance_field = pml.OutputField(
        name="normalizedDistance" + _UNDERSCORE + str(index),
        optype=OPTYPE.CONTINUOUS.value,
        dataType=DATATYPE.DOUBLE.value,
    )
    return pml.Output(OutputField=[distance_field])
def get_header():
    """
    Build the Header element.

    ``self`` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view.

    Returns
    -------
    header : Nyoka's Header object
        Carries an Application built from ``HEADER_INFO`` and the
        description stored in ``self._fingerprint_description``.
    """
    application = pml.Application(
        name=HEADER_INFO.APPLICATION_NAME,
        version=HEADER_INFO.APPLICATION_VERSION
    )
    return pml.Header(
        Application=application,
        description=self._fingerprint_description
    )
def __init__(self, description, copyright):
    """
    Initialise the Header element.

    Parameters
    ----------
    description :
        Header description; a falsy value is replaced with
        "Keras Model in PMML".
    copyright :
        Copyright notice; a falsy value is replaced with
        "Copyright (c) 2018 Software AG".
    """
    description = description or "Keras Model in PMML"
    copyright = copyright or "Copyright (c) 2018 Software AG"

    # The timestamp is the string form of the current local datetime.
    timestamp = ny.Timestamp(str(datetime.datetime.now()))
    application = ny.Application(name="Nyoka", version=metadata.__version__)

    ny.Header.__init__(self,
                       copyright=copyright,
                       description=description,
                       Timestamp=timestamp,
                       Application=application)
def create_left_node(obj, derived_col_names):
    """
    Build the left-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split', 'split_condition' and
        'children' are read here; ``obj['children'][0]`` is treated as the
        left child.
    derived_col_names : list
        Column names after preprocessing.

    Returns
    -------
    nd : Nyoka's Node object
        Node carrying a 'lessThan' SimplePredicate on the split condition,
        after ``create_node`` has been applied to the left child.
    """
    nd = pml.Node()
    nd.set_SimplePredicate(
        pml.SimplePredicate(
            field=replace_name_with_derivedColumnNames(
                obj['split'], derived_col_names),
            operator='lessThan',
            # Serialise the threshold with a fixed 16-decimal format so both
            # branches of a split compare against the same threshold text.
            value="{:.16f}".format(obj['split_condition'])))
    create_node(obj['children'][0], nd, derived_col_names)
    return nd
def lbl_binarizer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's LabelBinarizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Label Binarizer preprocessing instance.
        Its ``classes_`` attribute is read here.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Accepted but not read by this function.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Label Binarizer preprocessing.
    """
    categoric_lbls = trfm.classes_.tolist()
    derived_flds = []
    derived_colnames = []

    for col_name in col_names:
        prefix = "labelBinarizer(" + str(col_name)

        if len(categoric_lbls) == 2:
            # Two classes: a single derived column, for the last class only.
            last_lbl = categoric_lbls[-1]
            derived_colnames = get_derived_colnames(prefix, [last_lbl], ")")
            discrete = pml.NormDiscrete(field=str(col_names[-1]), value=str(last_lbl))
            derived_flds.append(
                pml.DerivedField(
                    NormDiscrete=discrete,
                    name=derived_colnames[-1],
                    optype=OPTYPE.CATEGORICAL.value,
                    dataType=DATATYPE.DOUBLE.value,
                )
            )
            continue

        # Otherwise: one derived column per class label.
        derived_colnames = get_derived_colnames(prefix, categoric_lbls, ")")
        for attribute_name in col_names:
            for class_idx, class_name in enumerate(categoric_lbls):
                discrete = pml.NormDiscrete(field=str(attribute_name), value=str(class_name))
                derived_flds.append(
                    pml.DerivedField(
                        NormDiscrete=discrete,
                        name=derived_colnames[class_idx],
                        optype=OPTYPE.CATEGORICAL.value,
                        dataType=DATATYPE.DOUBLE.value,
                    )
                )

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
        'pp_feat_class_lbl': categoric_lbls,
        'pp_feat_name': col_names[0],
    }
def __init__(self, dataSet, predictedClasses):
    """
    Initialise the MiningSchema with one active and one target field.

    Parameters
    ----------
    dataSet :
        Name of the active MiningField.
    predictedClasses :
        When truthy the target field is named "labels", otherwise
        "predictions".
    """
    ny.MiningSchema.__init__(self)

    target_name = "labels" if predictedClasses else "predictions"
    # The active field is registered first, then the target field.
    for field_name, usage in ((dataSet, "active"), (target_name, "target")):
        ny.MiningSchema.add_MiningField(
            self,
            ny.MiningField(name=field_name,
                           usageType=usage,
                           invalidValueTreatment="asIs"))
def create_left_node(obj, derived_col_names):
    """
    Build the left-hand child Node for the split described by ``obj``.

    Parameters
    ----------
    obj : dict-like
        Split description. The keys 'split_feature', 'threshold' and
        'left_child' are read here.
    derived_col_names : list
        Column names after preprocessing; indexed with
        ``int(obj['split_feature'])``.

    Returns
    -------
    nd : Nyoka's Node object
        Node carrying a 'lessThan' SimplePredicate on the threshold, after
        ``create_node`` has been applied to the left child.
    """
    nd = pml.Node()
    # NOTE(review): the operator is 'lessThan'; confirm this matches the
    # split convention of the model that produced ``obj``.
    nd.set_SimplePredicate(
        pml.SimplePredicate(
            field=xgboostToPmml.replace_name_with_derivedColumnNames(
                derived_col_names[int(obj['split_feature'])],
                derived_col_names),
            operator='lessThan',
            # Serialise the threshold with a fixed 16-decimal format so both
            # branches of a split compare against the same threshold text.
            value="{:.16f}".format(obj['threshold'])))
    create_node(obj['left_child'], nd, derived_col_names)
    return nd
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names; only ``col_names[0]`` is used.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing.
    """
    # get_feature_names() was removed in scikit-learn 1.2; use
    # get_feature_names_out() when it exists, else fall back to the old method.
    names_getter = getattr(trfm, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = trfm.get_feature_names

    # str() of a bytes object gives "b'...'"; slicing [2:-1] drops the b'
    # prefix and the closing quote, leaving non-ASCII characters escaped.
    features = [str(feat.encode("utf8"))[2:-1] for feat in names_getter()]
    idfs = trfm.idf_
    # NOTE(review): vocabulary keys are indexed by feature position below;
    # confirm the key order lines up with the feature order.
    extra_features = list(trfm.vocabulary_.keys())

    source_column = col_names[0]
    lowercase_name = 'lowercase(' + source_column + ')'
    derived_flds = list()
    derived_colnames = get_derived_colnames('tfidf@[' + source_column + ']', features)

    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name=lowercase_name,
                             optype=OPTYPE.CATEGORICAL.value,
                             dataType=DATATYPE.STRING.value,
                             Apply=pml.Apply(
                                 function=FUNCTION.LOWERCASE.value,
                                 FieldRef=[pml.FieldRef(field=source_column)])))

    # NOTE(review): every TextIndex references the lowercase(...) field even
    # when trfm.lowercase is falsy and that field was not added above.
    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        derived_flds.append(
            pml.DerivedField(
                name=derived_colnames[feat_idx],
                optype=OPTYPE.CONTINUOUS.value,
                dataType=DATATYPE.DOUBLE.value,
                Apply=pml.Apply(
                    function=FUNCTION.MULTIPLICATION.value,
                    TextIndex=[
                        pml.TextIndex(
                            textField=lowercase_name,
                            wordSeparatorCharacterRE='\\s+',
                            tokenize='true',
                            Constant=pml.Constant(valueOf_=feature),
                            Extension=[
                                pml.Extension(value=extra_features[feat_idx])
                            ])
                    ],
                    Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])))

    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_column
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def get_mining_model():
    """
    Build the regression MiningModel that sums its segments.

    ``self`` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view.

    Returns
    -------
    mining_model : Nyoka's MiningModel object
        Named ``self._model_name`` when that is not None, otherwise
        ``self._fingerprint_name``.
    """
    output = get_output_for_mining_model()

    model_name = self._model_name
    if model_name is None:
        model_name = self._fingerprint_name

    mining_schema = pml.MiningSchema(
        MiningField=get_mining_fields_for_mining_model())
    segmentation = pml.Segmentation(
        multipleModelMethod=MULTIPLE_MODEL_METHOD.SUM.value,
        Segment=get_segments())

    return pml.MiningModel(
        functionName=MINING_FUNCTION.REGRESSION.value,
        modelName=model_name,
        MiningSchema=mining_schema,
        Output=output,
        Segmentation=segmentation)
def get_mining_fields_for_regression_model(index):
    """
    Build the active MiningFields for the hull at position ``index``.

    ``self`` is not a parameter; it is resolved from the enclosing scope,
    which lies outside this view.

    Parameters
    ----------
    index :
        Position of the hull in ``self._hulls``.

    Returns
    -------
    mining_fields : list
        Without lag, ``self._length_of_fingerprint`` fields named
        ``<hull name><_UNDERSCORE><position>``. With lag, a single field
        named after the hull.
    """
    if self._use_lag:
        return [
            pml.MiningField(name=self._hulls[index]["name"],
                            usageType="active")
        ]

    return [
        pml.MiningField(
            name=self._hulls[index]["name"] + _UNDERSCORE + str(position),
            usageType="active")
        for position in range(self._length_of_fingerprint)
    ]
def get_output(model, target_name):
    """
    It returns the output element of the model.

    Parameters
    ----------
    model :
        An Xgboost model instance.
    target_name : String
        Name of the Target column.

    Returns
    -------
    Output :
        Nyoka's Output object. Without a target it holds a single
        'predicted' field. For classification it holds one
        'probability_<class>' field per entry of ``model.classes_`` followed
        by 'predicted_<target_name>'. Otherwise it holds only a continuous
        'predicted_<target_name>' field.
    """
    mining_func = get_mining_func(model)

    if not has_target(model):
        return pml.Output(OutputField=[
            pml.OutputField(
                name='predicted',
                feature=RESULT_FEATURE.PREDICTED_VALUE,
                optype=OPTYPE.CONTINUOUS,
                dataType=DATATYPE.DOUBLE
            )
        ])

    alt_target_name = 'predicted_' + target_name

    if mining_func == MINING_FUNCTION.CLASSIFICATION:
        # Per-class probabilities come first, then the predicted label.
        output_fields = [
            pml.OutputField(
                name='probability_' + str(cls),
                feature=RESULT_FEATURE.PROBABILITY,
                optype=OPTYPE.CONTINUOUS,
                dataType=DATATYPE.DOUBLE,
                value=str(cls)
            )
            for cls in model.classes_
        ]
        output_fields.append(pml.OutputField(
            name=alt_target_name,
            feature=RESULT_FEATURE.PREDICTED_VALUE,
            optype=OPTYPE.CATEGORICAL,
            # The label dataType is derived from the first class value.
            dataType=get_dtype(model.classes_[0])))
    else:
        output_fields = [pml.OutputField(
            name=alt_target_name,
            feature=RESULT_FEATURE.PREDICTED_VALUE,
            optype=OPTYPE.CONTINUOUS,
            dataType=DATATYPE.DOUBLE)]

    return pml.Output(OutputField=output_fields)
def polynomial_features(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PolynomialFeatures

    Each call increments the function attribute
    ``polynomial_features.poly_ctr``, which must already be set; the new
    value becomes part of every derived field name.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
        Its ``powers_`` attribute is read here.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PolynomialFeatures preprocessing.
    """
    polynomial_features.poly_ctr += 1
    name_prefix = "poly" + str(polynomial_features.poly_ctr) + '-' + "x"

    powers = trfm.powers_
    derived_flds = []
    derived_colnames = []

    for term_idx in range(powers.shape[0]):
        # One pow(column, exponent) Apply per input column ...
        factors = [
            pml.Apply(
                function='pow',
                Constant=[pml.Constant(
                    dataType="integer",
                    valueOf_=int(powers[term_idx][col_idx])
                )],
                FieldRef=[pml.FieldRef(field=col_names[col_idx])])
            for col_idx in range(len(col_names))
        ]
        # ... combined by a single product Apply.
        term_field = pml.DerivedField(
            Apply=pml.Apply(function="product", Apply_member=factors),
            dataType="double",
            optype="continuous",
            name=name_prefix + str(term_idx)
        )
        derived_flds.append(term_field)
        derived_colnames.append(term_field.get_name())

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
    }
def get_output_for_mining_model():
    """
    Build the Output element of the mining model.

    Returns
    -------
    Output : Nyoka's Output object
        Holds two continuous double fields: "totalDistance" (predicted
        value) and "finalResult" (transformed value, whose Apply comes from
        ``get_normalization_function()``).
    """
    total_distance = pml.OutputField(
        name="totalDistance",
        optype=OPTYPE.CONTINUOUS.value,
        dataType=DATATYPE.DOUBLE.value,
        feature=RESULT_FEATURE.PREDICTED_VALUE.value,
    )
    final_result = pml.OutputField(
        name="finalResult",
        optype=OPTYPE.CONTINUOUS.value,
        dataType=DATATYPE.DOUBLE.value,
        feature=RESULT_FEATURE.TRANSFORMED_VALUE.value,
        Apply=get_normalization_function(),
    )
    return pml.Output(OutputField=[total_distance, final_result])
def lbl_encoder(trfm, col_names):
    """
    Generates pre-processing elements for a LabelEncoder.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's LabelEncoder preprocessing instance
    col_names : list
        Contains list of feature/column names; only ``col_names[0]`` is
        mapped.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to LabelEncoder preprocessing.
    """
    categoric_lbls = trfm.classes_.tolist()
    # Encode every known class to get the label -> number mapping.
    encoded_lbls = trfm.transform(trfm.classes_.tolist()).tolist()
    derived_colnames = get_derived_colnames('labelEncoder', col_names)

    # One InlineTable row per (label, encoded value) pair.
    rows = []
    for label, encoded in zip(categoric_lbls, encoded_lbls):
        table_row = pml.row()
        table_row.elementobjs_ = ['input', 'output']
        table_row.input = label
        table_row.output = str(encoded)
        rows.append(table_row)

    field_column_pair = [
        pml.FieldColumnPair(field=str(col_names[0]), column="input")
    ]
    map_values = pml.MapValues(outputColumn="output",
                               FieldColumnPair=field_column_pair,
                               InlineTable=pml.InlineTable(row=rows))
    derived_flds = [
        pml.DerivedField(MapValues=map_values,
                         name=derived_colnames[0],
                         optype="continuous",
                         dataType="double")
    ]

    return {
        'der_fld': derived_flds,
        'der_col_names': derived_colnames,
        'pp_feat_class_lbl': categoric_lbls,
        'pp_feat_name': col_names[0],
    }
def __init__(self, dataSet=None):
    """
    Initialise the MiningSchema with one active and one target field.

    Parameters
    ----------
    dataSet : optional
        Name of the active MiningField; a falsy value is replaced with
        "dataSet". The target field is always named "predictions".
    """
    ny.MiningSchema.__init__(self)

    active_name = dataSet if dataSet else "dataSet"
    # The active field is registered first, then the target field.
    for field_name, usage in ((active_name, "active"), ("predictions", "target")):
        ny.MiningSchema.add_MiningField(
            self,
            ny.MiningField(name=field_name,
                           usageType=usage,
                           invalidValueTreatment="asIs"))
def get_ensemble_models(model, derived_col_names, col_names, target_name,
                        mining_imp_val, categoric_values):
    """
    It returns the Mining Model element of the model

    Parameters
    ----------
    model :
        Contains LGB model object.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    mining_models : list
        Single-element list holding the MiningModel named "LightGBModel".
    """
    model_kwargs = sklToPmml.get_model_kwargs(model, col_names, target_name,
                                              mining_imp_val)
    segmentation = get_outer_segmentation(model, derived_col_names, col_names,
                                          target_name, mining_imp_val,
                                          categoric_values)
    lgb_mining_model = pml.MiningModel(modelName="LightGBModel",
                                       Segmentation=segmentation,
                                       **model_kwargs)
    return [lgb_mining_model]
def get_segments_for_lgbr(model, derived_col_names, feature_names,
                          target_name, mining_imp_val, categorical_values):
    """
    It returns the Segmentation element holding one Segment per tree.

    Parameters
    ----------
    model :
        Contains LGB model object; ``model.booster_.dump_model()`` is read.
    derived_col_names : List
        Contains column names after preprocessing.
    feature_names : List
        Contains list of feature/column names.
    target_name : List
        Name of the Target column. Not read by this function.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
        Not read by this function.
    categorical_values : tuple
        Contains Categorical attribute names and its values.
        Not read by this function.

    Returns
    -------
    segmentation :
        Nyoka's Segmentation object with multipleModelMethod "sum".
    """
    lgb_dump = model.booster_.dump_model()
    # Collect the 'tree_structure' of every entry in the dump's 'tree_info'.
    tree_structures = [
        tree_info['tree_structure'] for tree_info in lgb_dump['tree_info']
    ]
    return pml.Segmentation(
        multipleModelMethod="sum",
        Segment=generate_Segments_Equal_To_Estimators(
            tree_structures, derived_col_names, feature_names))
def get_outer_segmentation(model, derived_col_names, col_names, target_name,
                           mining_imp_val, categoric_values):
    """
    It returns the Segmentation element of the model.

    Parameters
    ----------
    model :
        Contains LGB model object.
    derived_col_names : List
        Contains column names after preprocessing.
    col_names : List
        Contains list of feature/column names.
    target_name : String
        Name of the Target column.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value
    categoric_values : tuple
        Contains Categorical attribute names and its values

    Returns
    -------
    segmentation :
        For a model whose class string contains 'LGBMRegressor', the result
        of ``get_segments`` is returned as-is. Otherwise that result is
        wrapped in a new Segmentation whose method comes from
        ``get_multiple_model_method(model)``.
    """
    if 'LGBMRegressor' in str(model.__class__):
        return get_segments(model, derived_col_names, col_names, target_name,
                            mining_imp_val, categoric_values)

    combine_method = get_multiple_model_method(model)
    inner_segments = get_segments(model, derived_col_names, col_names,
                                  target_name, mining_imp_val,
                                  categoric_values)
    return pml.Segmentation(multipleModelMethod=combine_method,
                            Segment=inner_segments)
def get_segments_for_xgbr(model, derived_col_names, feature_names,
                          target_name, mining_imp_val, categorical_values):
    """
    It returns the Segmentation element built from the booster's JSON dump.

    Parameters
    ----------
    model :
        Contains Xgboost model object; ``model._Booster.get_dump`` is called
        with ``dump_format='json'``.
    derived_col_names : List
        Contains column names after preprocessing.
    feature_names : List
        Contains list of feature/column names.
    target_name : List
        Name of the Target column. Not read by this function.
    mining_imp_val : tuple
        Contains the mining_attributes,mining_strategy, mining_impute_value.
        Not read by this function.
    categorical_values : tuple
        Contains Categorical attribute names and its values.
        Not read by this function.

    Returns
    -------
    segmentation :
        Nyoka's Segmentation object using ``MULTIPLE_MODEL_METHOD.SUM``.
    """
    json_trees = model._Booster.get_dump(dump_format='json')
    segments = generate_Segments_Equal_To_Estimators(
        json_trees, derived_col_names, feature_names)
    return pml.Segmentation(
        multipleModelMethod=MULTIPLE_MODEL_METHOD.SUM,
        Segment=segments)
def _get_layer_weights_n_biases(self, layer):
    """
    Pulls out the Weights and Bias elements from a given Keras layer

    Parameters
    ----------
    layer : Keras layer object
        A Keras Layer; ``layer.get_weights()`` is called and, when present,
        ``layer.use_bias`` is consulted.

    Returns
    -------
    layer_weights :
        ``ny.LayerWeights`` built from ``self._get_flatten_weights``, or
        None when the layer has no weights.
    layer_biases :
        ``ny.LayerBias`` built from the last weight array when the layer
        uses a bias, otherwise None.
    """
    all_weights = layer.get_weights()
    if not all_weights:
        return None, None

    uses_bias = hasattr(layer, 'use_bias') and layer.use_bias
    if uses_bias:
        # The last array is taken as the bias; the rest are the weights.
        biases = all_weights[-1]
        weight_arrays = all_weights[0:-1]
    else:
        biases = None
        weight_arrays = all_weights

    weights, w_shape = self._get_flatten_weights(weight_arrays)
    layer_weights = ny.LayerWeights(content=weights,
                                    floatsPerLine=0,
                                    weightsShape=w_shape,
                                    weightsFlattenAxis="0")

    layer_biases = None
    if biases is not None:
        bs_shape = biases.shape
        # A 1-D bias of length n is reported with shape (n, 1).
        if len(bs_shape) == 1:
            final_bs_shape = str((bs_shape[0], 1))
        else:
            final_bs_shape = str(bs_shape)
        layer_biases = ny.LayerBias(content=biases,
                                    biasShape=final_bs_shape,
                                    biasFlattenAxis="0",
                                    floatsPerLine=0)

    return layer_weights, layer_biases