def get_local_transformation(index):
    derived_fields = []
    derived_field_names = []
    hull = self._hulls[index]
    if self._use_lag:
        for i in range(1, self._length_of_fingerprint):
            name = hull["name"] + _UNDERSCORE + str(i - 1)
            derived_fields.append(
                pml.DerivedField(
                    name=name,
                    optype=OPTYPE.CONTINUOUS,
                    dataType=DATATYPE.DOUBLE,
                    Lag=pml.Lag(field=hull["name"], n=self._length_of_fingerprint - i)
                )
            )
        last_derived_name = hull["name"] + _UNDERSCORE + str(self._length_of_fingerprint - 1)
        derived_fields.append(
            pml.DerivedField(
                name=last_derived_name,
                optype=OPTYPE.CONTINUOUS,
                dataType=DATATYPE.DOUBLE,
                FieldRef=pml.FieldRef(field=hull["name"])
            )
        )
    for idx, val in enumerate(hull["values"]):
        name = "distance_tag_" + str(idx)
        derived_field_names.append(name)
        derived_fields.append(
            pml.DerivedField(
                name=name,
                optype=OPTYPE.CONTINUOUS,
                dataType=DATATYPE.DOUBLE,
                Apply=pml.Apply(
                    function=_CALCULATE_DISTANCE,
                    FieldRef=[pml.FieldRef(field=hull["name"] + _UNDERSCORE + str(idx))],
                    Constant=[
                        pml.Constant(valueOf_=val["maxValue"]),
                        pml.Constant(valueOf_=val["minValue"])
                    ]
                )
            )
        )
    derived_fields.append(
        pml.DerivedField(
            name=_SUM_OF_DISTANCE,
            optype=OPTYPE.CONTINUOUS,
            dataType=DATATYPE.DOUBLE,
            Apply=pml.Apply(
                function=FUNCTION.SUM,
                FieldRef=[pml.FieldRef(field=field) for field in derived_field_names]
            )
        )
    )
    return pml.LocalTransformations(DerivedField=derived_fields)
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing.
    """
    pp_dict = dict()
    features = [str(feat.encode("utf8"))[2:-1] for feat in trfm.get_feature_names()]
    idfs = trfm.idf_
    extra_features = list(trfm.vocabulary_.keys())
    derived_flds = list()
    derived_colnames = get_derived_colnames('tfidf@[' + col_names[0] + ']', features)
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name='lowercase(' + col_names[0] + ')',
                             optype=OPTYPE.CATEGORICAL.value,
                             dataType=DATATYPE.STRING.value,
                             Apply=pml.Apply(
                                 function=FUNCTION.LOWERCASE.value,
                                 FieldRef=[pml.FieldRef(field=col_names[0])])))
    for feat_idx, idf in zip(range(len(features)), idfs):
        derived_flds.append(
            pml.DerivedField(
                name=derived_colnames[feat_idx],
                optype=OPTYPE.CONTINUOUS.value,
                dataType=DATATYPE.DOUBLE.value,
                Apply=pml.Apply(
                    function=FUNCTION.MULTIPLICATION.value,
                    TextIndex=[
                        pml.TextIndex(
                            textField='lowercase(' + col_names[0] + ')',
                            wordSeparatorCharacterRE='\\s+',
                            tokenize='true',
                            Constant=pml.Constant(valueOf_=features[feat_idx]),
                            Extension=[pml.Extension(value=extra_features[feat_idx])])
                    ],
                    Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = col_names[0]
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def pca(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PCA

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing.
    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()
    val = trfm.mean_
    zero = 0.0
    for preprocess_idx in range(trfm.n_components_):
        add = list()
        for pca_idx in range(trfm.n_features_):
            apply_inner = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                Constant=[
                    pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                 valueOf_="{:.16f}".format(val[pca_idx]))
                ],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            apply_outer = pml.Apply(
                function=FUNCTION.MULTIPLICATION.value,
                Apply_member=[apply_inner],
                Constant=[
                    pml.Constant(
                        dataType=DATATYPE.DOUBLE.value,
                        valueOf_=zero if trfm.components_[preprocess_idx][pca_idx] == 0.0
                        else "{:.16f}".format(trfm.components_[preprocess_idx][pca_idx]))
                ])
            add.append(apply_outer)
        app0 = pml.Apply(function=FUNCTION.SUM.value, Apply_member=add)
        derived_flds.append(
            pml.DerivedField(Apply=app0,
                             dataType=DATATYPE.DOUBLE.value,
                             optype=OPTYPE.CONTINUOUS.value,
                             name="PCA" + str(pca.counter) + "-" + str(preprocess_idx)))
        name = derived_flds[preprocess_idx].get_name()
        derived_colnames.append(name)
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def pca(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PCA

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing.
    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()
    val = trfm.mean_
    zero = 0.0
    for preprocess_idx in range(trfm.n_components_):
        add = list()
        for pca_idx in range(trfm.n_features_):
            apply_inner = pml.Apply(
                function='-',
                Constant=[
                    pml.Constant(dataType="double",
                                 valueOf_="{:.16f}".format(val[pca_idx]))
                ],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            apply_outer = pml.Apply(
                function="*",
                Apply_member=[apply_inner],
                Constant=[
                    pml.Constant(
                        dataType="double",
                        valueOf_=zero if trfm.components_[preprocess_idx][pca_idx] == 0.0
                        else "{:.16f}".format(trfm.components_[preprocess_idx][pca_idx]))
                ])
            add.append(apply_outer)
        app0 = pml.Apply(function="sum", Apply_member=add)
        derived_flds.append(
            pml.DerivedField(Apply=app0,
                             dataType="double",
                             optype="continuous",
                             name="PCA" + str(pca.counter) + "-" + str(preprocess_idx)))
        name = derived_flds[preprocess_idx].get_name()
        derived_colnames.append(name)
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
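# Illustrative sketch (not part of the exporter): for a non-whitened PCA the
# derived fields built above encode (X - mean_) @ components_.T, which is the
# same arithmetic sklearn performs at transform time. All names below are
# illustrative, not taken from the exporter.
import numpy as np
from sklearn.decomposition import PCA

X = np.array([[1.0, 2.0, 3.0], [2.0, 4.0, 1.0], [3.0, 1.0, 5.0], [4.0, 3.0, 2.0]])
pca_est = PCA(n_components=2).fit(X)
manual = (X - pca_est.mean_) @ pca_est.components_.T  # what the PMML Apply tree computes
assert np.allclose(manual, pca_est.transform(X))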
def lag(trfm, col_names):
    """
    Generates pre-processing elements for Nyoka's Lag

    Parameters
    ----------
    trfm :
        Contains the Nyoka's Lag instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Lag preprocessing.
    """
    derived_flds = list()
    pp_dict = dict()
    derived_colnames = get_derived_colnames(trfm.aggregation, col_names)
    if trfm.aggregation == 'stddev':
        for idx, name in enumerate(col_names):
            applyies = list()
            for i in range(trfm.value):
                lags = list()
                lags.append(pml.Lag(field=name, n=i + 1))
                lags.append(pml.Lag(field=name, n=trfm.value, aggregate="avg"))
                sub_func = pml.Apply(function="-", Lag=lags)
                pow_func = pml.Apply(
                    function="pow",
                    Apply_member=[sub_func],
                    Constant=[pml.Constant(dataType="integer", valueOf_=2)])
                applyies.append(pow_func)
            add_func = pml.Apply(function="+", Apply_member=applyies)
            div_func = pml.Apply(function="/",
                                 Apply_member=[add_func],
                                 Constant=[
                                     pml.Constant(dataType="double",
                                                  valueOf_=float(trfm.value))
                                 ])
            sqrt_func = pml.Apply(function="sqrt", Apply_member=[div_func])
            derived_fld = pml.DerivedField(name=derived_colnames[idx],
                                           Apply=sqrt_func,
                                           optype="continuous",
                                           dataType="double")
            derived_flds.append(derived_fld)
    else:
        for idx, name in enumerate(col_names):
            lag = pml.Lag(field=name, n=trfm.value, aggregate=trfm.aggregation)
            derived_fld = pml.DerivedField(name=derived_colnames[idx],
                                           Lag=lag,
                                           optype="continuous",
                                           dataType="double")
            derived_flds.append(derived_fld)
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
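# Illustrative sketch (not part of the exporter): the 'stddev' branch above
# encodes sqrt(sum_i (x[t-i] - avg_n)^2 / n), i.e. the population standard
# deviation of the last n lagged values of the field. The series and names
# below are illustrative, and the window is shown in plain numpy terms.
import numpy as np

series = np.array([3.0, 5.0, 4.0, 8.0, 6.0])
n = 3
window = series[-n:]                       # the previous n values the Lag elements refer to
manual = np.sqrt(np.sum((window - window.mean()) ** 2) / n)
assert np.isclose(manual, np.std(window))  # ddof=0, i.e. population std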
def std_scaler(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's StandardScaler

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Standard Scaler preprocessing.
    """
    derived_flds = list()
    pp_dict = dict()
    if is_present("labelBinarizer", col_names):
        derived_flds_hidden = kwargs['derived_fld']
        if derived_flds_hidden:
            derived_flds.extend(derived_flds_hidden)
    derived_colnames = get_derived_colnames('standardScaler', col_names)
    for col_name_idx in range(len(col_names)):
        apply_inner = list()
        apply_inner.append(
            pml.Apply(
                function='-',
                Constant=[
                    pml.Constant(
                        dataType="double",
                        valueOf_="{:.16f}".format(trfm.mean_[col_name_idx]))
                ],
                FieldRef=[pml.FieldRef(field=col_names[col_name_idx])]))
        apply_outer = pml.Apply(
            Apply_member=apply_inner,
            function='/',
            Constant=[
                pml.Constant(
                    dataType="double",
                    valueOf_="{:.16f}".format(trfm.scale_[col_name_idx]))
            ])
        derived_flds.append(
            pml.DerivedField(Apply=apply_outer,
                             name=derived_colnames[col_name_idx],
                             optype="continuous",
                             dataType="double"))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
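# Illustrative sketch (not part of the exporter): each derived field above
# encodes (x - mean_) / scale_, matching StandardScaler.transform. All names
# below are illustrative.
import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler = StandardScaler().fit(X)
manual = (X - scaler.mean_) / scaler.scale_  # what the PMML Apply tree computes
assert np.allclose(manual, scaler.transform(X))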
def rbst_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's RobustScaler

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing.
    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    for col_name_idx in range(len(col_names)):
        if (col_names[col_name_idx] not in exception_cols):
            apply_inner = list()
            apply_inner.append(
                pml.Apply(
                    function='-',
                    Constant=[
                        pml.Constant(
                            dataType="double",
                            valueOf_="{:.16f}".format(trfm.center_[col_name_idx]))
                    ],
                    FieldRef=[pml.FieldRef(field=col_names[col_name_idx])],
                    Extension=[
                        pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])
                    ]))
            apply_outer = pml.Apply(
                Apply_member=apply_inner,
                function='/',
                Constant=[
                    pml.Constant(
                        dataType="double",
                        valueOf_="{:.16f}".format(trfm.scale_[col_name_idx]))
                ])
            derived_flds.append(
                pml.DerivedField(Apply=apply_outer,
                                 name=derived_colnames[col_name_idx],
                                 optype="continuous",
                                 dataType="double"))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing.
    """
    pp_dict = dict()
    features = [str(feat.encode("utf8"))[2:-1] for feat in trfm.get_feature_names()]
    idfs = trfm.idf_
    derived_flds = list()
    derived_colnames = get_derived_colnames('tfidf@[' + col_names[0] + ']', features)
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name='lowercase(' + col_names[0] + ')',
                             optype='categorical',
                             dataType='string',
                             Apply=pml.Apply(
                                 function='lowercase',
                                 FieldRef=[pml.FieldRef(field=col_names[0])])))
    for feat_idx, idf in zip(range(len(features)), idfs):
        derived_flds.append(
            pml.DerivedField(
                name=derived_colnames[feat_idx],
                optype='continuous',
                dataType='double',
                Apply=pml.Apply(
                    function='*',
                    TextIndex=[
                        pml.TextIndex(
                            textField='lowercase(' + col_names[0] + ')' if trfm.lowercase
                            else col_names[0],
                            wordSeparatorCharacterRE=trfm.token_pattern,
                            tokenize='true',
                            Constant=pml.Constant(valueOf_=features[feat_idx]))
                    ],
                    Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = col_names[0]
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
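# Illustrative sketch (not part of the exporter): each derived field above
# multiplies a raw term count (TextIndex) by that term's idf_, which matches
# TfidfVectorizer output only when norm=None (no document-length normalisation)
# and sublinear_tf is left at its default. Names below are illustrative.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

docs = ["good movie", "bad movie", "good good plot"]
tfidf = TfidfVectorizer(norm=None).fit(docs)
counts = CountVectorizer(vocabulary=tfidf.vocabulary_).fit_transform(docs).toarray()
manual = counts * tfidf.idf_                 # count * idf, as encoded in the PMML Apply
assert np.allclose(manual, tfidf.transform(docs).toarray())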
def rbst_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's RobustScaler

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing.
    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    for col_name_idx in range(len(col_names)):
        if (col_names[col_name_idx] not in exception_cols):
            apply_inner = list()
            apply_inner.append(
                pml.Apply(
                    function=FUNCTION.SUBSTRACTTION.value,
                    Constant=[
                        pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                     valueOf_="{:.16f}".format(trfm.center_[col_name_idx]))
                    ],
                    FieldRef=[pml.FieldRef(field=col_names[col_name_idx])]))
            apply_outer = pml.Apply(Apply_member=apply_inner,
                                    function=FUNCTION.DIVISION.value,
                                    Constant=[
                                        pml.Constant(
                                            dataType=DATATYPE.DOUBLE.value,
                                            valueOf_="{:.16f}".format(trfm.scale_[col_name_idx]))
                                    ])
            derived_flds.append(
                pml.DerivedField(Apply=apply_outer,
                                 name=derived_colnames[col_name_idx],
                                 optype=OPTYPE.CONTINUOUS.value,
                                 dataType=DATATYPE.DOUBLE.value))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
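# Illustrative sketch (not part of the exporter): the derived field above
# encodes (x - center_) / scale_, matching RobustScaler.transform with the
# default with_centering/with_scaling settings. Names below are illustrative.
import numpy as np
from sklearn.preprocessing import RobustScaler

X = np.array([[1.0, 5.0], [2.0, 9.0], [4.0, 1.0], [8.0, 7.0]])
scaler = RobustScaler().fit(X)
manual = (X - scaler.center_) / scaler.scale_  # what the PMML Apply tree computes
assert np.allclose(manual, scaler.transform(X))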
def min_max_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's MinMaxScaler

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MinMaxScaler preprocessing.
    """
    pp_dict = dict()
    derived_flds = list()
    # col_names = list(filter(lambda x: x not in exception_cols, col_names))
    derived_colnames = get_derived_colnames("minMaxScaler", col_names)
    for col_name_idx in range(len(col_names)):
        if (col_names[col_name_idx] not in exception_cols):
            apply_inner = list()
            apply_inner.append(
                pml.Apply(
                    function='*',
                    Constant=[
                        pml.Constant(dataType="double",
                                     valueOf_="{:.16f}".format(trfm.scale_[col_name_idx]))
                    ],
                    FieldRef=[pml.FieldRef(field=col_names[col_name_idx])]))
            apply_outer = pml.Apply(Apply_member=apply_inner,
                                    function='+',
                                    Constant=[
                                        pml.Constant(
                                            dataType="double",
                                            valueOf_="{:.16f}".format(trfm.min_[col_name_idx]))
                                    ])
            derived_flds.append(
                pml.DerivedField(Apply=apply_outer,
                                 name=derived_colnames[col_name_idx],
                                 optype="continuous",
                                 dataType="double"))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
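# Illustrative sketch (not part of the exporter): the derived field above
# encodes x * scale_ + min_, matching MinMaxScaler.transform. Names below are
# illustrative.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
scaler = MinMaxScaler().fit(X)
manual = X * scaler.scale_ + scaler.min_  # what the PMML Apply tree computes
assert np.allclose(manual, scaler.transform(X))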
def count_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's CountVectorizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's CountVectorizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to CountVectorizer preprocessing.
    """
    pp_dict = dict()
    features = [str(feat.encode("utf8"))[2:-1] for feat in trfm.get_feature_names()]
    # extra_features = [str(feat.encode("utf8"))[2:-1] for feat in list(trfm.vocabulary_.keys())]
    # features = trfm.get_feature_names()
    # extra_features = list(trfm.vocabulary_.keys())
    derived_flds = list()
    derived_colnames = get_derived_colnames('count_vec@[' + col_names[0] + ']', features)
    # derived_colnames = list()
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name='lowercase(' + col_names[0] + ')',
                             optype='categorical',
                             dataType='string',
                             Apply=pml.Apply(
                                 function='lowercase',
                                 FieldRef=[pml.FieldRef(field=col_names[0])])))
    for imp_features, index in zip(features, range(len(features))):
        # no_punct_word = remove_punctuation(imp_features)
        # if len(no_punct_word) == 0:
        #     df_name = 'count_vec@[' + col_names[0] + '](' + imp_features + ')'
        #     derived_colnames.append(df_name)
        df_name = derived_colnames[index]
        derived_flds.append(
            pml.DerivedField(name=df_name,
                             optype='continuous',
                             dataType='double',
                             TextIndex=pml.TextIndex(
                                 textField='lowercase(' + col_names[0] + ')' if trfm.lowercase
                                 else col_names[0],
                                 # wordSeparatorCharacterRE='\s+',
                                 wordSeparatorCharacterRE=trfm.token_pattern,
                                 tokenize='true',
                                 Constant=pml.Constant(dataType="string",
                                                       valueOf_=imp_features),
                                 # Extension=[pml.Extension(
                                 #     anytypeobjs_=[extra_features[index]])]
                             )))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = col_names[0]
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def get_calculate_distance_function():
    value_for_true = pml.Constant(valueOf_=0)
    value_for_true.original_tagname_ = _CONSTANT
    calculate_distance = pml.DefineFunction(
        name=_CALCULATE_DISTANCE,
        optype=OPTYPE.CONTINUOUS.value,
        dataType=DATATYPE.DOUBLE.value,
        ParameterField=[
            pml.ParameterField(name=_TAG,
                               optype=OPTYPE.CONTINUOUS.value,
                               dataType=DATATYPE.DOUBLE.value),
            pml.ParameterField(name=_TAG_UPPER_BOUNDARY,
                               optype=OPTYPE.CONTINUOUS.value,
                               dataType=DATATYPE.DOUBLE.value),
            pml.ParameterField(name=_TAG_LOWER_BOUNDARY,
                               optype=OPTYPE.CONTINUOUS.value,
                               dataType=DATATYPE.DOUBLE.value)
        ],
        Apply=pml.Apply(
            function=FUNCTION.IF.value,
            Apply_member=[
                pml.Apply(function=_IS_INSIDE_BOUNDARY,
                          FieldRef=[
                              pml.FieldRef(field=_TAG),
                              pml.FieldRef(field=_TAG_UPPER_BOUNDARY),
                              pml.FieldRef(field=_TAG_LOWER_BOUNDARY)
                          ]),
                value_for_true,
                pml.Apply(
                    function=FUNCTION.IF.value,
                    Apply_member=[
                        pml.Apply(
                            function=FUNCTION.LESS_OR_EQUAL.value,
                            FieldRef=[
                                pml.FieldRef(field=_TAG),
                                pml.FieldRef(field=_TAG_LOWER_BOUNDARY)
                            ]),
                        pml.Apply(
                            function=FUNCTION.SUBSTRACTTION.value,
                            FieldRef=[
                                pml.FieldRef(field=_TAG_LOWER_BOUNDARY),
                                pml.FieldRef(field=_TAG)
                            ]),
                        pml.Apply(
                            function=FUNCTION.SUBSTRACTTION.value,
                            FieldRef=[
                                pml.FieldRef(field=_TAG),
                                pml.FieldRef(field=_TAG_UPPER_BOUNDARY)
                            ])
                    ])
            ]))
    return calculate_distance
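# Illustrative sketch (not part of the exporter): a plain-Python equivalent of
# the calculateDistance DefineFunction built above, assuming the
# isInsideBoundary helper checks lower <= tag <= upper. It returns 0 inside the
# band and otherwise the distance to the violated boundary. Names are
# illustrative.
def calculate_distance(tag, upper, lower):
    if lower <= tag <= upper:   # isInsideBoundary branch (assumed inclusive)
        return 0
    if tag <= lower:
        return lower - tag
    return tag - upper

assert calculate_distance(5.0, 10.0, 2.0) == 0
assert calculate_distance(1.0, 10.0, 2.0) == 1.0
assert calculate_distance(12.5, 10.0, 2.0) == 2.5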
def count_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's CountVectorizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's CountVectorizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to CountVectorizer preprocessing.
    """
    pp_dict = dict()
    features = [str(feat.encode("utf8"))[2:-1] for feat in trfm.get_feature_names()]
    extra_features = list(trfm.vocabulary_.keys())
    derived_flds = list()
    derived_colnames = get_derived_colnames('count_vec@[' + col_names[0] + ']', features)
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name='lowercase(' + col_names[0] + ')',
                             optype=OPTYPE.CATEGORICAL,
                             dataType=DATATYPE.STRING,
                             Apply=pml.Apply(
                                 function=FUNCTION.LOWERCASE,
                                 FieldRef=[pml.FieldRef(field=col_names[0])])))
    for imp_features, index in zip(features, range(len(features))):
        df_name = derived_colnames[index]
        derived_flds.append(
            pml.DerivedField(name=df_name,
                             optype=OPTYPE.CONTINUOUS,
                             dataType=DATATYPE.DOUBLE,
                             TextIndex=pml.TextIndex(
                                 textField='lowercase(' + col_names[0] + ')' if trfm.lowercase
                                 else col_names[0],
                                 wordSeparatorCharacterRE='\\s+',
                                 tokenize='true',
                                 Constant=pml.Constant(dataType=DATATYPE.STRING,
                                                       valueOf_=imp_features),
                                 Extension=[pml.Extension(value=extra_features[index])])))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = col_names[0]
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
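# Illustrative sketch (not part of the exporter): each derived field above is a
# TextIndex, i.e. the raw occurrence count of one vocabulary term in the
# (lowercased) text field - the same counts CountVectorizer.transform produces.
# The documents and names below are illustrative.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["Good movie", "bad movie", "good good plot"]
cv = CountVectorizer(lowercase=True).fit(docs)
print(cv.get_feature_names())        # one derived field per vocabulary term
print(cv.transform(docs).toarray())  # the counts the TextIndex elements encode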
def polynomial_features(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PolynomialFeatures

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PolynomialFeatures preprocessing.
    """
    polynomial_features.poly_ctr += 1
    pp_dict = dict()
    derived_flds = []
    derived_colnames = []
    for polyfeat_idx in range(trfm.powers_.shape[0]):
        apply_inner_container = []
        for col_name_idx in range(len(col_names)):
            val = int(trfm.powers_[polyfeat_idx][col_name_idx])
            apply_inner = pml.Apply(
                function='pow',
                Constant=[pml.Constant(dataType="integer", valueOf_=val)],
                FieldRef=[pml.FieldRef(field=col_names[col_name_idx])])
            apply_inner_container.append(apply_inner)
        apply_outer = pml.Apply(function="product",
                                Apply_member=apply_inner_container)
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            dataType="double",
            optype="continuous",
            name="poly" + str(polynomial_features.poly_ctr) + '-' + "x" + str(polyfeat_idx)))
        name = derived_flds[polyfeat_idx].get_name()
        derived_colnames.append(name)
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
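# Illustrative sketch (not part of the exporter): output column k of
# PolynomialFeatures is prod_j x_j ** powers_[k, j], which is what the nested
# pow/product Apply elements above encode. Names below are illustrative.
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[1.0, 2.0], [3.0, 4.0]])
poly = PolynomialFeatures(degree=2).fit(X)
manual = np.stack([np.prod(X ** p, axis=1) for p in poly.powers_], axis=1)
assert np.allclose(manual, poly.transform(X))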
def get_normalization_function():
    if len(self._hulls) == 1:
        max_distance = pml.Constant(valueOf_=self._max_distances[0])
    else:
        max_distance = pml.Constant(
            valueOf_=self._length_of_fingerprint * len(self._hulls))
    max_distance.original_tagname_ = "Constant"
    constant_100 = pml.Constant(valueOf_=100)
    constant_100.original_tagname_ = "Constant"
    # constant_max_distance = pml.Constant(valueOf_=self._max_distances[0])
    # constant_max_distance.original_tagname_ = "Constant"
    substraction_function = pml.Apply(
        function=FUNCTION.MULTIPLICATION.value,
        Apply_member=[
            pml.Apply(function=FUNCTION.DIVISION.value,
                      Apply_member=[
                          pml.Apply(
                              function=FUNCTION.SUBSTRACTTION.value,
                              FieldRef=[
                                  max_distance,
                                  pml.FieldRef(field="totalDistance")
                              ])
                      ],
                      Constant=[max_distance]),
            constant_100
        ])
    substraction_function.original_tagname_ = "Apply"
    equal_function = pml.Apply(
        function=FUNCTION.IF.value,
        Apply_member=[
            pml.Apply(function=FUNCTION.EQUAL.value,
                      FieldRef=[pml.FieldRef(field="totalDistance")],
                      Constant=[pml.Constant(valueOf_=0)])
        ],
        Constant=[
            pml.Constant(valueOf_=100),
            substraction_function,
        ])
    equal_function.original_tagname_ = "Apply"
    return pml.Apply(
        function=FUNCTION.IF.value,
        Apply_member=[
            pml.Apply(function=FUNCTION.GREATER_OR_EQUAL.value,
                      FieldRef=[pml.FieldRef(field="totalDistance")],
                      Constant=[max_distance])
        ],
        Constant=[pml.Constant(valueOf_=0), equal_function])
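# Illustrative sketch (not part of the exporter): a plain-Python equivalent of
# the nested if/Apply tree built above - a score of 0 once the total distance
# reaches the maximum, 100 when it is exactly 0, and a linear scale in between.
# Names below are illustrative.
def normalize(total_distance, max_distance):
    if total_distance >= max_distance:
        return 0
    if total_distance == 0:
        return 100
    return (max_distance - total_distance) / max_distance * 100

assert normalize(0, 50) == 100
assert normalize(50, 50) == 0
assert normalize(25, 50) == 50.0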
def max_abs_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's MaxAbsScaler

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MaxabsScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MaxabsScaler preprocessing.
    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames('maxAbsScaler', col_names)
    for col_name_idx in range(len(col_names)):
        if (col_names[col_name_idx] not in exception_cols):
            apply_outer = pml.Apply(
                function='/',
                Constant=[pml.Constant(
                    dataType="double",
                    valueOf_="{:.16f}".format(trfm.max_abs_[col_name_idx]))],
                FieldRef=[pml.FieldRef(field=col_names[col_name_idx])])
            derived_flds.append(pml.DerivedField(
                Apply=apply_outer,
                name=derived_colnames[col_name_idx],
                optype="continuous",
                dataType="double"))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
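# Illustrative sketch (not part of the exporter): the derived field above
# encodes x / max_abs_, matching MaxAbsScaler.transform. Names below are
# illustrative.
import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X = np.array([[1.0, -10.0], [2.0, 20.0], [-4.0, 5.0]])
scaler = MaxAbsScaler().fit(X)
manual = X / scaler.max_abs_  # what the PMML Apply tree computes
assert np.allclose(manual, scaler.transform(X))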
def binarizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's Binarizer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Binarizer preprocessing.
    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames("binarizer", col_names)
    for col_name_idx in range(len(col_names)):
        apply_outer = pml.Apply(
            function='threshold',
            Constant=[pml.Constant(dataType="double", valueOf_=trfm.threshold)],
            FieldRef=[pml.FieldRef(field=col_names[col_name_idx])])
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            name=derived_colnames[col_name_idx],
            optype="continuous",
            dataType="double"))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
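# Illustrative sketch (not part of the exporter): PMML's threshold(x, t) yields
# 1 when x > t and 0 otherwise, which matches Binarizer.transform. Names below
# are illustrative.
import numpy as np
from sklearn.preprocessing import Binarizer

X = np.array([[0.2, 1.5], [3.0, 0.5]])
binar = Binarizer(threshold=1.0).fit(X)
manual = (X > 1.0).astype(float)  # what the PMML threshold Apply computes
assert np.allclose(manual, binar.transform(X))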
def imputer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's Imputer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing.
    """
    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()
    model = kwargs['model']

    mining_strategy = trfm.strategy
    if "mean" in mining_strategy:
        mining_strategy = MISSING_VALUE_TREATMENT_METHOD.AS_MEAN.value
    elif "median" in mining_strategy:
        mining_strategy = MISSING_VALUE_TREATMENT_METHOD.AS_MEDIAN.value
    elif "most_frequent" in mining_strategy:
        mining_strategy = MISSING_VALUE_TREATMENT_METHOD.AS_MODE.value
    mining_replacement_val = trfm.statistics_

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx in range(len(col_names)):
            if (col_names[col_name_idx] not in exception_cols):
                const_list = list()
                apply_inner = list()
                apply_inner.append(
                    pml.Apply(function=FUNCTION.IS_MISSING.value,
                              FieldRef=[
                                  pml.FieldRef(field=col_names[col_name_idx])
                              ]))
                const_obj = pml.Constant(
                    dataType=DATATYPE.DOUBLE.value,
                    valueOf_=mining_replacement_val[col_name_idx])
                fieldref_obj = pml.FieldRef(field=col_names[col_name_idx])
                fieldref_obj.original_tagname_ = "FieldRef"
                const_list.append(const_obj)
                const_list.append(fieldref_obj)
                apply_outer = pml.Apply(Apply_member=apply_inner,
                                        function=FUNCTION.IF.value,
                                        Constant=const_list)
                derived_flds.append(
                    pml.DerivedField(Apply=apply_outer,
                                     name=derived_colnames[col_name_idx],
                                     optype=OPTYPE.CONTINUOUS.value,
                                     dataType=DATATYPE.DOUBLE.value))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def imputer(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's Imputer

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing.
    """
    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()
    model = kwargs['model']

    mining_strategy = trfm.strategy
    if "mean" in mining_strategy:
        mining_strategy = "asMean"
    elif "median" in mining_strategy:
        mining_strategy = "asMedian"
    elif "most_frequent" in mining_strategy:
        mining_strategy = "asMode"
    mining_replacement_val = trfm.statistics_

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx in range(len(col_names)):
            if (col_names[col_name_idx] not in exception_cols):
                const_list = list()
                apply_inner = list()
                apply_inner.append(
                    pml.Apply(function='isMissing',
                              FieldRef=[
                                  pml.FieldRef(field=col_names[col_name_idx])
                              ]))
                const_obj = pml.Constant(
                    dataType="double",
                    valueOf_=mining_replacement_val[col_name_idx])
                fieldref_obj = pml.FieldRef(field=col_names[col_name_idx])
                fieldref_obj.original_tagname_ = "FieldRef"
                const_list.append(const_obj)
                const_list.append(fieldref_obj)
                apply_outer = pml.Apply(Apply_member=apply_inner,
                                        function='if',
                                        Constant=const_list)
                derived_flds.append(
                    pml.DerivedField(Apply=apply_outer,
                                     name=derived_colnames[col_name_idx],
                                     optype="continuous",
                                     dataType="double"))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
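# Illustrative sketch (not part of the exporter): the if/isMissing Apply above
# replaces a missing value with the column statistic learned at fit time, which
# matches SimpleImputer.transform on numeric data with missing_values=np.nan.
# Names below are illustrative.
import numpy as np
from sklearn.impute import SimpleImputer

X = np.array([[1.0, np.nan], [3.0, 4.0], [np.nan, 6.0]])
imp = SimpleImputer(strategy="mean").fit(X)
manual = np.where(np.isnan(X), imp.statistics_, X)  # if isMissing(x) then statistic else x
assert np.allclose(manual, imp.transform(X))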