Ejemplo n.º 1
0
def lbl_binarizer(trfm, col_names, **kwargs):
    """
    Build the PMML derived fields that describe a fitted Label Binarizer.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Label Binarizer preprocessing instance.
        Only its ``classes_`` attribute is read.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Must contain ``model``. Its class name is compared against an
        internal list of model names for which the derived fields are
        returned under ``hidden_lb_der_flds`` instead of ``der_fld``.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Label Binarizer preprocessing:
        ``der_fld``, ``der_col_names``, ``pp_feat_class_lbl``, ``pp_feat_name`` and,
        for the listed models only, ``hidden_lb_der_flds``.

    Raises
    ------
    KeyError
        If ``model`` is not supplied in ``kwargs``.
    IndexError
        If ``col_names`` is empty.

    """
    derived_flds = list()
    derived_colnames = list()
    pp_dict = dict()
    categoric_lbls = trfm.classes_.tolist()
    model_exception_list = ["LinearRegression", "LogisticRegression", "SVR", "SVC"]
    model = kwargs['model']

    # With exactly two classes only one indicator column is emitted, for the
    # last class; otherwise one indicator column is emitted per class.
    if len(categoric_lbls) == 2:
        emitted_lbls = [categoric_lbls[-1]]
    else:
        emitted_lbls = categoric_lbls

    for col_name in col_names:
        names = get_derived_colnames("labelBinarizer(" + str(col_name), emitted_lbls, ")")
        # Accumulate so the returned names stay aligned with the derived
        # fields when more than one column is supplied.
        derived_colnames.extend(names)
        for class_idx, class_name in enumerate(emitted_lbls):
            # Each indicator reads from the column it is named after.
            norm_descr = pml.NormDiscrete(field=str(col_name), value=str(class_name))
            derived_flds.append(
                pml.DerivedField(NormDiscrete=norm_descr,
                                 name=names[class_idx],
                                 optype="categorical",
                                 dataType="double"))

    if any_in([model.__class__.__name__], model_exception_list):
        # For these models the fields are handed back separately and the
        # derived names are recorded in the module-level exception_cols list.
        pp_dict['hidden_lb_der_flds'] = derived_flds
        exception_cols.extend(derived_colnames)
        derived_flds = list()

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_class_lbl'] = categoric_lbls
    pp_dict['pp_feat_name'] = col_names[0]

    return pp_dict
Ejemplo n.º 2
0
def tfidf_vectorizer(trfm, col_names):
    """
    Build the PMML derived fields that describe a fitted TfIdfVectorizer.

    One ``lowercase(<column>)`` field is emitted first, followed by one
    field per vocabulary feature that multiplies a TextIndex term count
    on the lower-cased field by the feature's idf weight.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing.

    """
    pp_dict = dict()
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); keep using the old accessor when it
    # exists so behaviour on older installations is unchanged.
    if hasattr(trfm, 'get_feature_names'):
        features = trfm.get_feature_names()
    else:
        features = trfm.get_feature_names_out().tolist()
    idfs = trfm.idf_
    # NOTE(review): these keys are in the vocabulary dict's own iteration
    # order, which may differ from the order of `features` - confirm that
    # pairing them by position below is intended.
    extra_features = list(trfm.vocabulary_.keys())
    derived_flds = list()
    derived_colnames = get_derived_colnames('tfidf@[' + col_names[0] + ']', features)
    lowercase_name = 'lowercase(' + col_names[0] + ')'
    derived_flds.append(
        pml.DerivedField(name=lowercase_name,
                         optype='categorical', dataType='string',
                         Apply=pml.Apply(function='lowercase',
                                         FieldRef=[pml.FieldRef(field=col_names[0])])))
    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        text_index = pml.TextIndex(
            textField=lowercase_name,
            # Raw string: '\s' is not a valid Python escape sequence.
            wordSeparatorCharacterRE=r'\s+',
            tokenize='true',
            Constant=pml.Constant(valueOf_=feature),
            Extension=[pml.Extension(anytypeobjs_=[extra_features[feat_idx]])])
        derived_flds.append(pml.DerivedField(
            name=derived_colnames[feat_idx],
            optype='continuous',
            dataType='double',
            Apply=pml.Apply(function='*',
                            TextIndex=[text_index],
                            Constant=[pml.Constant(valueOf_=idf)])
        ))
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = col_names[0]
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Ejemplo n.º 3
0
def std_scaler(trfm, col_names, **kwargs):
    """
    Build the PMML derived fields that describe a fitted Standard Scaler.

    For every column a '-' Apply (field reference and ``trfm.mean_`` entry)
    is nested inside a '/' Apply carrying the ``trfm.scale_`` entry.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        ``derived_fld`` is read only when ``is_present("labelBinarizer", col_names)``
        is truthy; a non-empty value is placed ahead of the scaler's own fields.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Standard Scaler preprocessing.

    """
    pp_dict = {}
    scaler_flds = []

    if is_present("labelBinarizer", col_names):
        hidden_flds = kwargs['derived_fld']
        if hidden_flds:
            scaler_flds.extend(hidden_flds)

    derived_colnames = get_derived_colnames('standardScaler', col_names)
    for idx, col_name in enumerate(col_names):
        centered = pml.Apply(
            function='-',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.mean_[idx]))],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        scaled = pml.Apply(
            Apply_member=[centered],
            function='/',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))]
        )
        scaler_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))

    pp_dict['der_fld'] = scaler_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Ejemplo n.º 4
0
def rbst_scaler(trfm, col_names):
    """
    Build the PMML derived fields that describe a fitted RobustScaler.

    Columns listed in the module-level ``exception_cols`` get no derived
    field, but their derived names are still returned.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing.

    """
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    scaler_flds = []
    for idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        # Inner '-' Apply uses center_; it is tagged with a 'scaling' extension.
        centered = pml.Apply(
            function='-',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.center_[idx]))],
            FieldRef=[pml.FieldRef(field=col_name)],
            Extension=[pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])]
        )
        # Outer '/' Apply uses scale_.
        scaled = pml.Apply(
            Apply_member=[centered],
            function='/',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))]
        )
        scaler_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))
    return {'der_fld': scaler_flds, 'der_col_names': derived_colnames}
Ejemplo n.º 5
0
def min_max_scaler(trfm, col_names):
    """
    Build the PMML derived fields that describe a fitted MinMaxScaler.

    For every column a '*' Apply (field reference and ``trfm.scale_`` entry)
    is nested inside a '+' Apply carrying the ``trfm.min_`` entry. Columns
    listed in the module-level ``exception_cols`` get no derived field, but
    their derived names are still returned.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MinMaxScaler preprocessing.

    """
    derived_colnames = get_derived_colnames("minMaxScaler", col_names)
    scaler_flds = []
    for idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        multiplied = pml.Apply(
            function='*',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.scale_[idx]))],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        shifted = pml.Apply(
            Apply_member=[multiplied],
            function='+',
            Constant=[pml.Constant(dataType="double",
                                   valueOf_=unround_scalers(trfm.min_[idx]))]
        )
        scaler_flds.append(pml.DerivedField(
            Apply=shifted,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))
    return {'der_fld': scaler_flds, 'der_col_names': derived_colnames}
Ejemplo n.º 6
0
def polynomial_features(trfm, col_names):
    """
    Build the PMML derived fields that describe fitted PolynomialFeatures.

    Each row of ``trfm.powers_`` becomes one derived field: a 'product'
    Apply over one 'pow' Apply per input column. The function attribute
    ``polynomial_features.poly_ctr`` is incremented on every call and is
    embedded in the generated field names.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PolynomialFeatures preprocessing.

    """
    polynomial_features.poly_ctr += 1
    poly_flds = []
    poly_names = []

    for term_idx, exponents in enumerate(trfm.powers_):
        # One 'pow' term per input column, using this row's exponent for it.
        pow_terms = [
            pml.Apply(
                function='pow',
                Constant=[pml.Constant(dataType="integer",
                                       valueOf_=int(exponents[col_idx]))],
                FieldRef=[pml.FieldRef(field=col_name)])
            for col_idx, col_name in enumerate(col_names)
        ]
        product = pml.Apply(function="product", Apply_member=pow_terms)
        field = pml.DerivedField(
            Apply=product,
            dataType="double",
            optype="continuous",
            name=f"poly{polynomial_features.poly_ctr!s}-x{term_idx!s}"
        )
        poly_flds.append(field)
        poly_names.append(field.get_name())

    return {'der_fld': poly_flds, 'der_col_names': poly_names}
Ejemplo n.º 7
0
def pca(trfm, col_names):
    """
    Build the PMML derived fields that describe a fitted PCA.

    Each row of ``trfm.components_`` becomes one derived field: a 'sum'
    Apply over, per input feature, a '*' Apply (component weight) wrapping
    a '-' Apply (field reference and ``trfm.mean_`` entry). The function
    attribute ``pca.counter`` is incremented on every call and is embedded
    in the generated field names.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing.

    """
    pca.counter += 1
    pp_dict = dict()
    derived_flds = list()
    derived_colnames = list()
    means = trfm.mean_
    zero = 0.0
    # Iterate over the fitted component matrix instead of the constructor
    # argument `n_components` (which may be None, a float or 'mle') and the
    # `n_features_` attribute (absent in newer scikit-learn releases).
    for preprocess_idx, component in enumerate(trfm.components_):
        add = list()
        for pca_idx, weight in enumerate(component):
            apply_inner = pml.Apply(function='-',
                                    Constant=[pml.Constant(dataType="double",
                                                           valueOf_=means[pca_idx])],
                                    FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            # Any zero weight (including -0.0) is written as plain 0.0.
            apply_outer = pml.Apply(function="*",
                                    Apply_member=[apply_inner],
                                    Constant=[pml.Constant(dataType="double",
                                                           valueOf_=zero if weight == 0.0 else weight)])
            add.append(apply_outer)
        app0 = pml.Apply(function="sum", Apply_member=add)

        field = pml.DerivedField(Apply=app0,
                                 dataType="double",
                                 optype="continuous",
                                 name="PCA" + str(pca.counter) + "-" + str(preprocess_idx))
        derived_flds.append(field)
        derived_colnames.append(field.get_name())
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Ejemplo n.º 8
0
def lbl_encoder(trfm, col_names):
    """
    Build the PMML derived field that describes a fitted LabelEncoder.

    A single MapValues field is produced for the first column; its inline
    table has one row per class, mapping the class label ('input') to the
    string form of its encoded value ('output').

    Parameters
    ----------
    trfm :
        Contains the Sklearn's LabelEncoder preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to LabelEncoder preprocessing.

    """
    class_labels = trfm.classes_.tolist()
    encoded_values = trfm.transform(trfm.classes_.tolist()).tolist()
    derived_colnames = get_derived_colnames('labelEncoder', col_names)

    table_rows = []
    for label, code in zip(class_labels, encoded_values):
        table_row = pml.row()
        table_row.elementobjs_ = ['input', 'output']
        table_row.input = label
        table_row.output = str(code)
        table_rows.append(table_row)

    column_pairs = [pml.FieldColumnPair(field=str(col_names[0]), column="input")]
    lookup_table = pml.InlineTable(row=table_rows)
    mapping = pml.MapValues(outputColumn="output",
                            FieldColumnPair=column_pairs,
                            InlineTable=lookup_table)
    encoder_fld = pml.DerivedField(MapValues=mapping,
                                   name=derived_colnames[0],
                                   optype="continuous",
                                   dataType="double")

    return {
        'der_fld': [encoder_fld],
        'der_col_names': derived_colnames,
        'pp_feat_class_lbl': class_labels,
        'pp_feat_name': col_names[0],
    }
Ejemplo n.º 9
0
def binarizer(trfm, col_names):
    """
    Build the PMML derived fields that describe a fitted Binarizer.

    Every column gets one 'threshold' Apply that combines the column's
    field reference with ``trfm.threshold``.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Binarizer preprocessing.

    """
    derived_colnames = get_derived_colnames("binarizer", col_names)
    threshold_flds = []
    for idx, col_name in enumerate(col_names):
        threshold_apply = pml.Apply(
            function='threshold',
            Constant=[pml.Constant(dataType="double", valueOf_=trfm.threshold)],
            FieldRef=[pml.FieldRef(field=col_name)])
        threshold_flds.append(pml.DerivedField(
            Apply=threshold_apply,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))

    return {'der_fld': threshold_flds, 'der_col_names': derived_colnames}
Ejemplo n.º 10
0
def imputer(trfm, col_names, **kwargs):
    """
    Describe a fitted Imputer for PMML export.

    When none of the names in ``imputer.col_names`` (a function attribute
    set elsewhere) occurs in ``col_names``, one derived field per column is
    built: an 'if' Apply whose condition is 'isMissing' on the column and
    whose remaining operands are the replacement constant and the column
    itself. Otherwise no derived field is built and the strategy,
    replacement values and column names are returned for the caller.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Accepted but not read by this function.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing.

    """
    original_col_names = imputer.col_names
    derived_colnames = col_names
    pp_dict = dict()
    derived_flds = list()

    # Translate the strategy to its PMML name; a strategy that matches none
    # of the keywords is passed through unchanged.
    mining_strategy = trfm.strategy
    for keyword, pmml_name in (("mean", "asMean"),
                               ("median", "asMedian"),
                               ("most_frequent", "asMode")):
        if keyword in mining_strategy:
            mining_strategy = pmml_name
            break
    mining_replacement_val = trfm.statistics_

    if not any_in(original_col_names, col_names):
        derived_colnames = get_derived_colnames('imputer', col_names)
        for col_name_idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue
            is_missing = pml.Apply(function='isMissing',
                                   FieldRef=[pml.FieldRef(field=col_name)])
            replacement = pml.Constant(
                dataType="double",
                valueOf_=mining_replacement_val[col_name_idx]
            )
            # The field reference is placed in the Constant list with its tag
            # name overridden - presumably so it is written as a FieldRef
            # right after the constant; confirm against the pml exporter.
            fieldref_obj = pml.FieldRef(field=col_name)
            fieldref_obj.original_tagname_ = "FieldRef"
            apply_outer = pml.Apply(
                Apply_member=[is_missing],
                function='if',
                Constant=[replacement, fieldref_obj]
            )

            derived_flds.append(pml.DerivedField(
                Apply=apply_outer,
                name=derived_colnames[col_name_idx],
                optype="continuous",
                dataType="double"
            ))
    else:
        pp_dict['mining_strategy'] = mining_strategy
        pp_dict['mining_replacement_val'] = mining_replacement_val
        pp_dict['mining_attributes'] = col_names

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict