Example #1
0
def std_scaler(trfm, col_names, **kwargs):
    """
    Build the PMML derived fields for a Standard Scaler preprocessing step.

    Every column is exported as ``(column - mean) / scale``. When the scaler
    was configured without centering (``with_mean=False``) or without scaling
    (``with_std=False``), the neutral constants ``0.0`` / ``1.0`` are emitted
    so the exported expression matches what the scaler actually applies.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        ``derived_fld`` is read (and is required) only when
        ``is_present("labelBinarizer", col_names)`` is true; its items are
        placed ahead of the derived fields created here.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Standard Scaler preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (names returned by ``get_derived_colnames``).

    """
    derived_flds = list()
    pp_dict = dict()
    if is_present("labelBinarizer", col_names):
        derived_flds_hidden = kwargs['derived_fld']
        if derived_flds_hidden:
            derived_flds.extend(derived_flds_hidden)

    # Honour the scaler configuration: a disabled step must not contribute to
    # the exported expression, and scale_ is None when scaling is disabled.
    means = trfm.mean_ if getattr(trfm, 'with_mean', True) else None
    scales = trfm.scale_ if getattr(trfm, 'with_std', True) else None

    derived_colnames = get_derived_colnames('standardScaler', col_names)
    for col_name_idx, col_name in enumerate(col_names):
        mean_val = 0.0 if means is None else unround_scalers(means[col_name_idx])
        scale_val = 1.0 if scales is None else unround_scalers(scales[col_name_idx])

        # Inner expression: column - mean
        apply_inner = [pml.Apply(
            function='-',
            Constant=[pml.Constant(dataType="double", valueOf_=mean_val)],
            FieldRef=[pml.FieldRef(field=col_name)]
        )]
        # Outer expression: (column - mean) / scale
        apply_outer = pml.Apply(
            Apply_member=apply_inner,
            function='/',
            Constant=[pml.Constant(dataType="double", valueOf_=scale_val)]
        )
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            name=derived_colnames[col_name_idx],
            optype="continuous",
            dataType="double"
        ))

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #2
0
def rbst_scaler(trfm, col_names):
    """
    Build the PMML derived fields for a RobustScaler preprocessing step.

    Every column that is not listed in ``exception_cols`` is exported as
    ``(column - center) / scale``. If ``trfm.center_`` or ``trfm.scale_`` is
    ``None`` (centering or scaling disabled on the scaler), the neutral
    constants ``0.0`` / ``1.0`` are emitted instead of failing on indexing.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (names returned by ``get_derived_colnames`` for all of ``col_names``,
        including columns skipped through ``exception_cols``).

    """
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    centers = trfm.center_
    scales = trfm.scale_
    for col_name_idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        center_val = 0.0 if centers is None else unround_scalers(centers[col_name_idx])
        scale_val = 1.0 if scales is None else unround_scalers(scales[col_name_idx])

        # Inner expression: column - center, tagged with the scaler type.
        apply_inner = [pml.Apply(
            function='-',
            Constant=[pml.Constant(dataType="double", valueOf_=center_val)],
            FieldRef=[pml.FieldRef(field=col_name)],
            Extension=[pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])]
        )]
        # Outer expression: (column - center) / scale
        apply_outer = pml.Apply(
            Apply_member=apply_inner,
            function='/',
            Constant=[pml.Constant(dataType="double", valueOf_=scale_val)]
        )
        derived_flds.append(pml.DerivedField(
            Apply=apply_outer,
            name=derived_colnames[col_name_idx],
            optype="continuous",
            dataType="double"
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #3
0
def min_max_scaler(trfm, col_names):
    """
    Build the PMML derived fields for a MinMaxScaler preprocessing step.

    Every column that is not listed in ``exception_cols`` is exported as
    ``column * scale_ + min_``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MinMaxScaler preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (names returned by ``get_derived_colnames``).

    """
    derived_colnames = get_derived_colnames("minMaxScaler", col_names)
    derived_flds = []
    for idx, column in enumerate(col_names):
        if column in exception_cols:
            continue

        # column * scale_
        scale_const = pml.Constant(
            dataType="double",
            valueOf_=unround_scalers(trfm.scale_[idx])
        )
        scaled = pml.Apply(
            function='*',
            Constant=[scale_const],
            FieldRef=[pml.FieldRef(field=column)]
        )

        # (column * scale_) + min_
        offset_const = pml.Constant(
            dataType="double",
            valueOf_=unround_scalers(trfm.min_[idx])
        )
        shifted = pml.Apply(
            Apply_member=[scaled],
            function='+',
            Constant=[offset_const]
        )

        derived_flds.append(pml.DerivedField(
            Apply=shifted,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))

    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #4
0
def pca(trfm, col_names):
    """
    Build the PMML derived fields for a PCA preprocessing step.

    Each principal component is exported as the sum over all input columns of
    ``(column - mean) * weight``, where the weights come from
    ``trfm.components_``.

    The number of components and of input features is taken from the shape of
    ``trfm.components_`` rather than from ``trfm.n_components`` /
    ``trfm.n_features_``: the constructor argument ``n_components`` is not
    necessarily an integer, and ``n_features_`` is not available on every
    scikit-learn version.

    The function attribute ``pca.counter`` is incremented on every call and is
    used to make the generated field names unique; it is expected to have been
    initialised before the first call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (their names, of the form ``PCA<counter>-<component index>``).

    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()
    means = trfm.mean_
    components = trfm.components_
    n_components, n_features = components.shape
    zero = 0.0
    for preprocess_idx in range(n_components):
        weights = components[preprocess_idx]
        add = list()
        for pca_idx in range(n_features):
            weight = weights[pca_idx]
            # column - mean
            apply_inner = pml.Apply(
                function='-',
                Constant=[pml.Constant(dataType="double", valueOf_=means[pca_idx])],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            # (column - mean) * weight; a zero weight (including -0.0) is
            # written as plain 0.0.
            apply_outer = pml.Apply(
                function="*",
                Apply_member=[apply_inner],
                Constant=[pml.Constant(dataType="double",
                                       valueOf_=zero if weight == 0.0 else weight)])
            add.append(apply_outer)
        app0 = pml.Apply(function="sum", Apply_member=add)

        derived_fld = pml.DerivedField(
            Apply=app0,
            dataType="double",
            optype="continuous",
            name="PCA" + str(pca.counter) + "-" + str(preprocess_idx))
        derived_flds.append(derived_fld)
        derived_colnames.append(derived_fld.get_name())
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #5
0
def tfidf_vectorizer(trfm, col_names):
    """
    Build the PMML derived fields for a TfIdfVectorizer preprocessing step.

    A first derived field lower-cases the text column ``col_names[0]``. Then,
    for every feature (term) of the vectorizer, a derived field multiplies a
    ``TextIndex`` lookup of that term in the lower-cased text by the term's
    idf weight.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing:
        ``'der_fld'``, ``'der_col_names'``, ``'pp_feat_name'`` (the text
        column) and ``'pp_feat_class_lbl'`` (always an empty list).

    """
    pp_dict = dict()
    # Prefer the current scikit-learn accessor and fall back to the legacy one
    # so both old and new releases are supported. Entries are converted to
    # plain ``str`` so downstream code sees the same types either way.
    if hasattr(trfm, 'get_feature_names_out'):
        features = [str(feature) for feature in trfm.get_feature_names_out()]
    else:
        features = trfm.get_feature_names()
    idfs = trfm.idf_
    # NOTE(review): these are the vocabulary keys in dict iteration order,
    # which is presumably not the same order as ``features`` -- confirm that
    # the consumer of the Extension element does not rely on alignment.
    extra_features = list(trfm.vocabulary_.keys())
    text_col = col_names[0]
    lowercase_name = 'lowercase(' + text_col + ')'
    derived_flds = list()
    derived_colnames = get_derived_colnames('tfidf@[' + text_col + ']', features)
    derived_flds.append(
        pml.DerivedField(name=lowercase_name,
                         optype='categorical', dataType='string',
                         Apply=pml.Apply(function='lowercase',
                                         FieldRef=[pml.FieldRef(field=text_col)])))
    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        text_index = pml.TextIndex(
            textField=lowercase_name,
            # Raw string: same value as before, without the invalid-escape warning.
            wordSeparatorCharacterRE=r'\s+',
            tokenize='true',
            Constant=pml.Constant(valueOf_=feature),
            Extension=[pml.Extension(anytypeobjs_=[extra_features[feat_idx]])])
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[feat_idx],
            optype='continuous',
            dataType='double',
            Apply=pml.Apply(function='*',
                            TextIndex=[text_index],
                            Constant=[pml.Constant(valueOf_=idf)])
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = text_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Example #6
0
def polynomial_features(trfm, col_names):
    """
    Build the PMML derived fields for a PolynomialFeatures preprocessing step.

    Each row of ``trfm.powers_`` yields one derived field: the product over
    all input columns of ``pow(column, exponent)``, where the exponent is the
    row's entry for that column.

    The function attribute ``polynomial_features.poly_ctr`` is incremented on
    every call and is used to make the generated field names unique; it is
    expected to have been initialised before the first call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PolynomialFeatures preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (their names, of the form ``poly<counter>-x<row index>``).

    """
    polynomial_features.poly_ctr += 1
    pp_dict = dict()
    derived_flds = []
    derived_colnames = []

    # Read the exponent table once: it does not change during the loop, and
    # re-reading the attribute for every cell is wasted work.
    powers = trfm.powers_

    for polyfeat_idx, power_row in enumerate(powers):
        apply_inner_container = [
            pml.Apply(
                function='pow',
                Constant=[pml.Constant(
                    dataType="integer",
                    valueOf_=int(power_row[col_name_idx])
                )],
                FieldRef=[pml.FieldRef(field=col_name)])
            for col_name_idx, col_name in enumerate(col_names)
        ]
        apply_outer = pml.Apply(function="product",
                                Apply_member=apply_inner_container)
        derived_fld = pml.DerivedField(
            Apply=apply_outer,
            dataType="double",
            optype="continuous",
            name="poly" + str(polynomial_features.poly_ctr) + '-' + "x" + str(polyfeat_idx)
        )
        derived_flds.append(derived_fld)
        derived_colnames.append(derived_fld.get_name())
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #7
0
def binarizer(trfm, col_names):
    """
    Build the PMML derived fields for a Binarizer preprocessing step.

    Every column is exported as a ``threshold`` function applied to the column
    with ``trfm.threshold`` as the constant.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Binarizer preprocessing:
        ``'der_fld'`` (list of derived fields) and ``'der_col_names'``
        (names returned by ``get_derived_colnames``).

    """
    derived_colnames = get_derived_colnames("binarizer", col_names)
    derived_flds = []
    for idx, column in enumerate(col_names):
        threshold_const = pml.Constant(dataType="double", valueOf_=trfm.threshold)
        threshold_apply = pml.Apply(
            function='threshold',
            Constant=[threshold_const],
            FieldRef=[pml.FieldRef(field=column)]
        )
        derived_flds.append(pml.DerivedField(
            Apply=threshold_apply,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))

    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #8
0
def imputer(trfm, col_names, **kwargs):
    """
    Build the PMML attributes for an Imputer preprocessing step.

    Two outcomes are possible, depending on
    ``any_in(imputer.col_names, col_names)``:

    * false: every column not listed in ``exception_cols`` gets a derived
      field of the form ``if(isMissing(column), replacement, column)``, where
      the replacement is the matching entry of ``trfm.statistics_``;
    * true: no derived field is created; the strategy, the replacement values
      and the column names are returned under the ``mining_*`` keys instead.

    The function attribute ``imputer.col_names`` is expected to have been set
    before the call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Accepted for call compatibility; no keyword argument is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing:
        always ``'der_fld'`` and ``'der_col_names'``; additionally
        ``'mining_strategy'``, ``'mining_replacement_val'`` and
        ``'mining_attributes'`` in the second case described above.

    """
    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()

    # Translate the imputer strategy name; any other strategy value is
    # passed through unchanged.
    strategy_names = {
        "mean": "asMean",
        "median": "asMedian",
        "most_frequent": "asMode",
    }
    mining_strategy = strategy_names.get(trfm.strategy, trfm.strategy)
    mining_replacement_val = trfm.statistics_

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue
            is_missing = pml.Apply(function='isMissing',
                                   FieldRef=[pml.FieldRef(field=col_name)])
            replacement = pml.Constant(
                dataType="double",
                valueOf_=mining_replacement_val[col_name_idx]
            )
            # The field reference is deliberately placed in the Constant list
            # after the replacement value; its tag name is set explicitly.
            # NOTE(review): presumably this keeps the "then"/"else" operands
            # in order on export -- confirm against the pml serializer.
            passthrough = pml.FieldRef(field=col_name)
            passthrough.original_tagname_ = "FieldRef"
            apply_outer = pml.Apply(
                Apply_member=[is_missing],
                function='if',
                Constant=[replacement, passthrough]
            )

            derived_flds.append(pml.DerivedField(
                Apply=apply_outer,
                name=derived_colnames[col_name_idx],
                optype="continuous",
                dataType="double"
            ))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict