def lbl_binarizer(trfm, col_names, **kwargs):
    """
    Describe a LabelBinarizer step as PMML ``NormDiscrete`` derived fields.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Label Binarizer preprocessing instance. Its
        ``classes_`` attribute supplies the class labels.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Must contain ``model``; the class name of that object is checked
        against a fixed list of model names.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Label
        Binarizer preprocessing. When the model's class name is in the fixed
        list, the generated fields are stored under ``'hidden_lb_der_flds'``,
        ``'der_fld'`` is left empty and the derived column names are added to
        the module-level ``exception_cols``.
    """
    fields = []
    out_names = []
    result = {}
    class_labels = trfm.classes_.tolist()
    hidden_for_models = ["LinearRegression", "LogisticRegression", "SVR", "SVC"]
    estimator = kwargs['model']

    for current_col in col_names:
        prefix = "labelBinarizer(" + str(current_col)

        if len(class_labels) == 2:
            # Two classes: a single indicator for the last class label.
            out_names = get_derived_colnames(prefix, [class_labels[-1]], ")")
            discrete = pml.NormDiscrete(field=str(col_names[-1]),
                                        value=str(class_labels[-1]))
            fields.append(pml.DerivedField(NormDiscrete=discrete,
                                           name=out_names[-1],
                                           optype="categorical",
                                           dataType="double"))
            continue

        # Otherwise: one indicator per class label, for every column name.
        out_names = get_derived_colnames(prefix, class_labels, ")")
        for source_col in col_names:
            for label_pos, label in enumerate(class_labels):
                discrete = pml.NormDiscrete(field=str(source_col),
                                            value=str(label))
                fields.append(pml.DerivedField(NormDiscrete=discrete,
                                               name=out_names[label_pos],
                                               optype="categorical",
                                               dataType="double"))

    if any_in([estimator.__class__.__name__], hidden_for_models):
        result['hidden_lb_der_flds'] = fields
        exception_cols.extend(out_names)
        fields = []

    result['der_fld'] = fields
    result['der_col_names'] = out_names
    result['pp_feat_class_lbl'] = class_labels
    result['pp_feat_name'] = col_names[0]
    return result
def tfidf_vectorizer(trfm, col_names):
    """
    Describe a TfidfVectorizer step as PMML ``TextIndex`` derived fields.

    A first derived field lower-cases the text column. It is followed by one
    derived field per vocabulary feature that multiplies a ``TextIndex``
    lookup of the term by the term's IDF weight.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance. Its
        feature names, ``idf_`` and ``vocabulary_`` are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes. Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to
        TfIdfVectorizer preprocessing.
    """
    pp_dict = dict()

    # ``get_feature_names`` was removed in scikit-learn 1.2; use its
    # replacement when the installed version provides it.
    if hasattr(trfm, 'get_feature_names_out'):
        features = list(trfm.get_feature_names_out())
    else:
        features = trfm.get_feature_names()

    idfs = trfm.idf_
    # NOTE(review): these are the vocabulary keys in dict order, indexed below
    # by feature position - confirm that this pairing is what is intended.
    extra_features = list(trfm.vocabulary_.keys())

    text_col = col_names[0]
    lowered_name = 'lowercase(' + text_col + ')'
    derived_colnames = get_derived_colnames('tfidf@[' + text_col + ']', features)

    derived_flds = list()
    derived_flds.append(
        pml.DerivedField(name=lowered_name,
                         optype='categorical',
                         dataType='string',
                         Apply=pml.Apply(function='lowercase',
                                         FieldRef=[pml.FieldRef(field=text_col)])))

    for feat_idx, (term, idf) in enumerate(zip(features, idfs)):
        text_index = pml.TextIndex(
            textField=lowered_name,
            # Raw string: '\s' is not a valid escape in a normal literal.
            wordSeparatorCharacterRE=r'\s+',
            tokenize='true',
            Constant=pml.Constant(valueOf_=term),
            Extension=[pml.Extension(anytypeobjs_=[extra_features[feat_idx]])])
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[feat_idx],
            optype='continuous',
            dataType='double',
            Apply=pml.Apply(function='*',
                            TextIndex=[text_index],
                            Constant=[pml.Constant(valueOf_=idf)])
        ))

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = text_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
def std_scaler(trfm, col_names, **kwargs):
    """
    Describe a StandardScaler step as PMML derived fields of the form
    ``(field - mean) / scale``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance. Its
        ``mean_`` and ``scale_`` attributes are read per column.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        ``derived_fld`` is read when ``is_present("labelBinarizer", col_names)``
        is true; a non-empty value is placed ahead of the scaler's own fields.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Standard
        Scaler preprocessing.
    """
    result = {}
    fields = []

    if is_present("labelBinarizer", col_names):
        hidden_fields = kwargs['derived_fld']
        if hidden_fields:
            fields.extend(hidden_fields)

    out_names = get_derived_colnames('standardScaler', col_names)

    for idx, column in enumerate(col_names):
        mean_const = pml.Constant(dataType="double",
                                  valueOf_=unround_scalers(trfm.mean_[idx]))
        centered = pml.Apply(function='-',
                             Constant=[mean_const],
                             FieldRef=[pml.FieldRef(field=column)])

        scale_const = pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))
        scaled = pml.Apply(Apply_member=[centered],
                           function='/',
                           Constant=[scale_const])

        fields.append(pml.DerivedField(Apply=scaled,
                                       name=out_names[idx],
                                       optype="continuous",
                                       dataType="double"))

    result['der_fld'] = fields
    result['der_col_names'] = out_names
    return result
def rbst_scaler(trfm, col_names):
    """
    Describe a RobustScaler step as PMML derived fields of the form
    ``(field - center) / scale``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance. Its
        ``center_`` and ``scale_`` attributes are read per column.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes. Columns listed in the
        module-level ``exception_cols`` get no derived field.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler
        preprocessing.
    """
    result = {}
    fields = []
    out_names = get_derived_colnames('robustScaler', col_names)

    for idx, column in enumerate(col_names):
        if column in exception_cols:
            continue

        center_const = pml.Constant(dataType="double",
                                    valueOf_=unround_scalers(trfm.center_[idx]))
        centered = pml.Apply(
            function='-',
            Constant=[center_const],
            FieldRef=[pml.FieldRef(field=column)],
            Extension=[pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])])

        scale_const = pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))
        scaled = pml.Apply(Apply_member=[centered],
                           function='/',
                           Constant=[scale_const])

        fields.append(pml.DerivedField(Apply=scaled,
                                       name=out_names[idx],
                                       optype="continuous",
                                       dataType="double"))

    result['der_fld'] = fields
    result['der_col_names'] = out_names
    return result
def min_max_scaler(trfm, col_names):
    """
    Describe a MinMaxScaler step as PMML derived fields of the form
    ``field * scale + min``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance. Its
        ``scale_`` and ``min_`` attributes are read per column.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes. Columns listed in the
        module-level ``exception_cols`` get no derived field.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MinMaxScaler
        preprocessing.
    """
    result = {}
    fields = []
    out_names = get_derived_colnames("minMaxScaler", col_names)

    for idx, column in enumerate(col_names):
        if column in exception_cols:
            continue

        scale_const = pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))
        multiplied = pml.Apply(function='*',
                               Constant=[scale_const],
                               FieldRef=[pml.FieldRef(field=column)])

        offset_const = pml.Constant(dataType="double",
                                    valueOf_=unround_scalers(trfm.min_[idx]))
        shifted = pml.Apply(Apply_member=[multiplied],
                            function='+',
                            Constant=[offset_const])

        fields.append(pml.DerivedField(Apply=shifted,
                                       name=out_names[idx],
                                       optype="continuous",
                                       dataType="double"))

    result['der_fld'] = fields
    result['der_col_names'] = out_names
    return result
def polynomial_features(trfm, col_names):
    """
    Describe a PolynomialFeatures step as PMML derived fields, one per row of
    ``trfm.powers_``, each the product of ``pow(field, exponent)`` terms.

    The function attribute ``polynomial_features.poly_ctr`` is incremented on
    every call and is part of each generated field name.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to
        PolynomialFeatures preprocessing.
    """
    polynomial_features.poly_ctr += 1
    result = {}
    fields = []
    names = []

    for term_idx in range(trfm.powers_.shape[0]):
        factors = []
        for col_idx, column in enumerate(col_names):
            exponent = int(trfm.powers_[term_idx][col_idx])
            factors.append(pml.Apply(
                function='pow',
                Constant=[pml.Constant(dataType="integer", valueOf_=exponent)],
                FieldRef=[pml.FieldRef(field=column)]))

        product = pml.Apply(function="product", Apply_member=factors)
        field_name = ("poly" + str(polynomial_features.poly_ctr)
                      + '-' + "x" + str(term_idx))
        fields.append(pml.DerivedField(Apply=product,
                                       dataType="double",
                                       optype="continuous",
                                       name=field_name))
        # Exactly one field is appended per term, so the term index addresses
        # the field that was just added.
        names.append(fields[term_idx].get_name())

    result['der_fld'] = fields
    result['der_col_names'] = names
    return result
def pca(trfm, col_names):
    """
    Describe a PCA step as PMML derived fields, one per principal component,
    each the sum over features of ``(field - mean) * component_weight``.

    The function attribute ``pca.counter`` is incremented on every call and is
    part of each generated field name.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance. Its ``mean_`` and
        ``components_`` attributes are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA
        preprocessing.
    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()
    means = trfm.mean_

    # Take both dimensions from the fitted component matrix. The constructor
    # parameter ``n_components`` may be None, a float or 'mle', and the
    # ``n_features_`` attribute is no longer available in recent scikit-learn
    # releases.
    n_components, n_features = trfm.components_.shape

    for comp_idx in range(n_components):
        weights = trfm.components_[comp_idx]
        terms = list()
        for feat_idx in range(n_features):
            centered = pml.Apply(
                function='-',
                Constant=[pml.Constant(dataType="double", valueOf_=means[feat_idx])],
                FieldRef=[pml.FieldRef(field=col_names[feat_idx])])
            weight = weights[feat_idx]
            # Any zero weight (including negative zero, which also compares
            # equal to 0.0) is written as a plain 0.0.
            weighted = pml.Apply(
                function="*",
                Apply_member=[centered],
                Constant=[pml.Constant(dataType="double",
                                       valueOf_=0.0 if weight == 0.0 else weight)])
            terms.append(weighted)

        component_sum = pml.Apply(function="sum", Apply_member=terms)
        derived_flds.append(pml.DerivedField(
            Apply=component_sum,
            dataType="double",
            optype="continuous",
            name="PCA" + str(pca.counter) + "-" + str(comp_idx)))
        derived_colnames.append(derived_flds[comp_idx].get_name())

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
def lbl_encoder(trfm, col_names):
    """
    Describe a LabelEncoder step as a single PMML ``MapValues`` derived field
    backed by an inline table of (label, encoded value) rows.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's LabelEncoder preprocessing instance. Its
        ``classes_`` are passed through ``transform`` to obtain the codes.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes. Only the first entry is mapped.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to LabelEncoder
        preprocessing.
    """
    result = {}
    labels = trfm.classes_.tolist()
    codes = trfm.transform(trfm.classes_.tolist()).tolist()
    out_names = get_derived_colnames('labelEncoder', col_names)

    # One table row per class: the label as input, its code (as text) as output.
    table_rows = []
    for idx, code in enumerate(codes):
        table_row = pml.row()
        table_row.elementobjs_ = ['input', 'output']
        table_row.input = labels[idx]
        table_row.output = str(code)
        table_rows.append(table_row)

    column_pairs = [pml.FieldColumnPair(field=str(col_names[0]), column="input")]
    lookup_table = pml.InlineTable(row=table_rows)
    mapping = pml.MapValues(outputColumn="output",
                            FieldColumnPair=column_pairs,
                            InlineTable=lookup_table)

    fields = [pml.DerivedField(MapValues=mapping,
                               name=out_names[0],
                               optype="continuous",
                               dataType="double")]

    result['der_fld'] = fields
    result['der_col_names'] = out_names
    result['pp_feat_class_lbl'] = labels
    result['pp_feat_name'] = col_names[0]
    return result
def binarizer(trfm, col_names):
    """
    Describe a Binarizer step as PMML derived fields that apply the
    ``threshold`` function to each column with ``trfm.threshold``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Binarizer
        preprocessing.
    """
    out_names = get_derived_colnames("binarizer", col_names)

    fields = [
        pml.DerivedField(
            Apply=pml.Apply(
                function='threshold',
                Constant=[pml.Constant(dataType="double",
                                       valueOf_=trfm.threshold)],
                FieldRef=[pml.FieldRef(field=column)]),
            name=out_names[idx],
            optype="continuous",
            dataType="double")
        for idx, column in enumerate(col_names)
    ]

    return {'der_fld': fields, 'der_col_names': out_names}
def imputer(trfm, col_names, **kwargs):
    """
    Describe an Imputer step for PMML.

    ``imputer.col_names`` (a function attribute that must be set before the
    call) is compared with ``col_names`` through ``any_in``:

    * when ``any_in`` reports no match, each column not listed in the
      module-level ``exception_cols`` gets a derived field of the form
      ``if(isMissing(field), replacement, field)``;
    * otherwise no derived field is created and the strategy, replacement
      values and column names are returned under the ``mining_*`` keys.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance. Its
        ``strategy`` and ``statistics_`` attributes are read.
    col_names : list
        Contains list of feature/column names. The column names may represent
        the names of preprocessed attributes.
    **kwargs :
        Accepted for call compatibility; no entry is read.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer
        preprocessing.
    """
    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()

    # Translate the strategy name; a strategy that matches none of these
    # substrings is passed through unchanged.
    mining_strategy = trfm.strategy
    if "mean" in mining_strategy:
        mining_strategy = "asMean"
    elif "median" in mining_strategy:
        mining_strategy = "asMedian"
    elif "most_frequent" in mining_strategy:
        mining_strategy = "asMode"
    mining_replacement_val = trfm.statistics_

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue

            is_missing = pml.Apply(function='isMissing',
                                   FieldRef=[pml.FieldRef(field=col_name)])
            replacement = pml.Constant(
                dataType="double",
                valueOf_=mining_replacement_val[col_name_idx])
            # The field reference is passed in the Constant list together with
            # the replacement value, with its tag name set explicitly.
            passthrough = pml.FieldRef(field=col_name)
            passthrough.original_tagname_ = "FieldRef"

            apply_outer = pml.Apply(Apply_member=[is_missing],
                                    function='if',
                                    Constant=[replacement, passthrough])
            derived_flds.append(pml.DerivedField(
                Apply=apply_outer,
                name=derived_colnames[col_name_idx],
                optype="continuous",
                dataType="double"
            ))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict