Example #1
0
        def get_local_transformation(index):
            """Build the ``LocalTransformations`` element for one hull.

            Parameters
            ----------
            index : int
                Position of the hull inside ``self._hulls``.

            Returns
            -------
            pml.LocalTransformations
                Container holding, in this order: the lag fields (only when
                ``self._use_lag`` is set), one distance field per entry of the
                hull's ``"values"`` and a final field summing all distances.
            """
            current_hull = self._hulls[index]
            transformations = []

            if self._use_lag:
                window = self._length_of_fingerprint
                # Field ``<name>_<step - 1>`` is a Lag with n = window - step, so the
                # suffix grows while the lag shrinks.
                for step in range(1, window):
                    lagged_name = current_hull["name"] + _UNDERSCORE + str(step - 1)
                    lag_element = pml.Lag(field=current_hull["name"], n=window - step)
                    transformations.append(
                        pml.DerivedField(
                            name=lagged_name,
                            optype=OPTYPE.CONTINUOUS,
                            dataType=DATATYPE.DOUBLE,
                            Lag=lag_element
                        )
                    )
                # The last slot is a plain reference to the field itself (no lag).
                transformations.append(
                    pml.DerivedField(
                        name=current_hull["name"] + _UNDERSCORE + str(window - 1),
                        optype=OPTYPE.CONTINUOUS,
                        dataType=DATATYPE.DOUBLE,
                        FieldRef=pml.FieldRef(field=current_hull["name"])
                    )
                )

            distance_names = []
            for position, bounds in enumerate(current_hull["values"]):
                distance_name = "distance_tag_" + str(position)
                distance_names.append(distance_name)
                tag_reference = pml.FieldRef(
                    field=current_hull["name"] + _UNDERSCORE + str(position)
                )
                # Argument order matters: upper boundary first, then lower boundary.
                boundaries = [
                    pml.Constant(valueOf_=bounds["maxValue"]),
                    pml.Constant(valueOf_=bounds["minValue"])
                ]
                transformations.append(
                    pml.DerivedField(
                        name=distance_name,
                        optype=OPTYPE.CONTINUOUS,
                        dataType=DATATYPE.DOUBLE,
                        Apply=pml.Apply(
                            function=_CALCULATE_DISTANCE,
                            FieldRef=[tag_reference],
                            Constant=boundaries
                        )
                    )
                )

            # Aggregate every per-tag distance into a single field.
            distance_references = [
                pml.FieldRef(field=distance_name) for distance_name in distance_names
            ]
            transformations.append(
                pml.DerivedField(
                    name=_SUM_OF_DISTANCE,
                    optype=OPTYPE.CONTINUOUS,
                    dataType=DATATYPE.DOUBLE,
                    Apply=pml.Apply(function=FUNCTION.SUM, FieldRef=distance_references)
                )
            )
            return pml.LocalTransformations(DerivedField=transformations)
Example #2
0
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    One DerivedField is produced per feature name; each applies the
    multiplication function to a TextIndex for that feature and the
    corresponding entry of ``trfm.idf_``.  When ``trfm.lowercase`` is set, a
    ``lowercase(<column>)`` helper field is emitted first and the TextIndex
    elements read from it; otherwise they read the raw column directly, so
    no TextIndex ever refers to a field that was not generated.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing
        (keys ``der_fld``, ``der_col_names``, ``pp_feat_name`` and ``pp_feat_class_lbl``).

    """
    source_col = col_names[0]
    # Recent scikit-learn releases dropped get_feature_names() in favour of
    # get_feature_names_out(); use whichever this estimator provides.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # str(bytes)[2:-1] strips the b'...' wrapper, leaving an ASCII-escaped name.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    idfs = trfm.idf_
    # NOTE(review): vocabulary_ key order is not obviously the same as the
    # feature order above - confirm the Extension pairing is intended.
    extra_features = list(trfm.vocabulary_.keys())
    derived_flds = []
    derived_colnames = get_derived_colnames('tfidf@[' + source_col + ']',
                                            features)
    lowercase_field = 'lowercase(' + source_col + ')'
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name=lowercase_field,
                             optype=OPTYPE.CATEGORICAL.value,
                             dataType=DATATYPE.STRING.value,
                             Apply=pml.Apply(
                                 function=FUNCTION.LOWERCASE.value,
                                 FieldRef=[pml.FieldRef(field=source_col)])))
    # Only point at the helper field when it has actually been emitted.
    text_field = lowercase_field if trfm.lowercase else source_col
    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        derived_flds.append(
            pml.DerivedField(
                name=derived_colnames[feat_idx],
                optype=OPTYPE.CONTINUOUS.value,
                dataType=DATATYPE.DOUBLE.value,
                Apply=pml.Apply(
                    function=FUNCTION.MULTIPLICATION.value,
                    TextIndex=[
                        pml.TextIndex(
                            textField=text_field,
                            wordSeparatorCharacterRE='\\s+',
                            tokenize='true',
                            Constant=pml.Constant(valueOf_=feature),
                            Extension=[
                                pml.Extension(value=extra_features[feat_idx])
                            ])
                    ],
                    Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])))
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Example #3
0
def pca(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PCA

    For every principal component a DerivedField named
    ``PCA<counter>-<component index>`` is created.  Its expression is a sum
    over all input features of a multiplication Apply that combines the
    component weight with a subtraction Apply built from the column
    reference and the fitted mean of that column.

    The function attribute ``pca.counter`` is incremented on every call and
    must have been initialised before the first call (not shown here).

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    pca.counter += 1
    # ``n_features_`` was deprecated and later removed from scikit-learn's PCA
    # in favour of ``n_features_in_``; fall back for older releases.
    n_features = getattr(trfm, "n_features_in_", None)
    if n_features is None:
        n_features = trfm.n_features_
    means = trfm.mean_
    derived_flds = []
    derived_colnames = []
    for preprocess_idx in range(trfm.n_components_):
        weights = trfm.components_[preprocess_idx]
        terms = []
        for pca_idx in range(n_features):
            centered = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                Constant=[
                    pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                 valueOf_="{:.16f}".format(means[pca_idx]))
                ],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            weight = weights[pca_idx]
            # An exact zero weight is written as the float 0.0 rather than a
            # 16-decimal string.
            weight_value = 0.0 if weight == 0.0 else "{:.16f}".format(weight)
            terms.append(
                pml.Apply(
                    function=FUNCTION.MULTIPLICATION.value,
                    Apply_member=[centered],
                    Constant=[
                        pml.Constant(dataType=DATATYPE.DOUBLE.value,
                                     valueOf_=weight_value)
                    ]))
        component_field = pml.DerivedField(
            Apply=pml.Apply(function=FUNCTION.SUM.value, Apply_member=terms),
            dataType=DATATYPE.DOUBLE.value,
            optype=OPTYPE.CONTINUOUS.value,
            name="PCA" + str(pca.counter) + "-" + str(preprocess_idx))
        derived_flds.append(component_field)
        derived_colnames.append(component_field.get_name())
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #4
0
def pca(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's PCA

    Each principal component becomes one DerivedField named
    ``PCA<counter>-<component index>`` whose expression is a ``sum`` of
    per-feature ``*`` Apply elements; every such element wraps a ``-`` Apply
    built from the column reference and that column's fitted mean, together
    with the component weight as a constant.

    ``pca.counter`` is a function attribute incremented on each call; it has
    to be initialised before the first call (not shown here).

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PCA preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to PCA preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    pca.counter += 1
    # scikit-learn deprecated and then removed PCA.n_features_; prefer its
    # replacement n_features_in_ and only fall back on older releases.
    feature_count = getattr(trfm, "n_features_in_", None)
    if feature_count is None:
        feature_count = trfm.n_features_
    column_means = trfm.mean_
    derived_flds = []
    derived_colnames = []
    for preprocess_idx in range(trfm.n_components_):
        component = trfm.components_[preprocess_idx]
        weighted_terms = []
        for pca_idx in range(feature_count):
            shifted = pml.Apply(
                function='-',
                Constant=[
                    pml.Constant(dataType="double",
                                 valueOf_="{:.16f}".format(column_means[pca_idx]))
                ],
                FieldRef=[pml.FieldRef(field=col_names[pca_idx])])
            loading = component[pca_idx]
            # Exact zeros are emitted as the float 0.0, everything else as a
            # fixed 16-decimal string.
            if loading == 0.0:
                loading_value = 0.0
            else:
                loading_value = "{:.16f}".format(loading)
            weighted_terms.append(
                pml.Apply(
                    function="*",
                    Apply_member=[shifted],
                    Constant=[
                        pml.Constant(dataType="double", valueOf_=loading_value)
                    ]))
        field = pml.DerivedField(
            Apply=pml.Apply(function="sum", Apply_member=weighted_terms),
            dataType="double",
            optype="continuous",
            name="PCA" + str(pca.counter) + "-" + str(preprocess_idx))
        derived_flds.append(field)
        derived_colnames.append(field.get_name())
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #5
0
def lag(trfm, col_names):
    """
    Generates pre-processing elements for Nyoka's Lag

    For the ``'stddev'`` aggregation the expression is assembled by hand from
    Lag, ``-``, ``pow``, ``+``, ``/`` and ``sqrt`` elements; every other
    aggregation is passed straight through as the ``aggregate`` attribute of
    a single Lag element.

    Parameters
    ----------
    trfm :
        Contains the Nyoka's Lag instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Lag preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    derived_colnames = get_derived_colnames(trfm.aggregation, col_names)
    wants_stddev = trfm.aggregation == 'stddev'
    derived_flds = []
    for position, column in enumerate(col_names):
        if wants_stddev:
            squared_deviations = []
            for offset in range(trfm.value):
                # Pair the value ``offset + 1`` records back with the windowed average.
                operands = [
                    pml.Lag(field=column, n=offset + 1),
                    pml.Lag(field=column, n=trfm.value, aggregate="avg"),
                ]
                deviation = pml.Apply(function="-", Lag=operands)
                squared_deviations.append(
                    pml.Apply(
                        function="pow",
                        Apply_member=[deviation],
                        Constant=[pml.Constant(dataType="integer", valueOf_=2)]))
            # NOTE(review): "+" receives ``trfm.value`` operands here - confirm the
            # consuming PMML engine accepts more (or fewer) than two arguments.
            total = pml.Apply(function="+", Apply_member=squared_deviations)
            mean_square = pml.Apply(
                function="/",
                Apply_member=[total],
                Constant=[
                    pml.Constant(dataType="double", valueOf_=float(trfm.value))
                ])
            root = pml.Apply(function="sqrt", Apply_member=[mean_square])
            field = pml.DerivedField(name=derived_colnames[position],
                                     Apply=root,
                                     optype="continuous",
                                     dataType="double")
        else:
            lag_element = pml.Lag(field=column,
                                  n=trfm.value,
                                  aggregate=trfm.aggregation)
            field = pml.DerivedField(name=derived_colnames[position],
                                     Lag=lag_element,
                                     optype="continuous",
                                     dataType="double")
        derived_flds.append(field)

    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #6
0
def std_scaler(trfm, col_names, **kwargs):
    """
    Generates pre-processing elements for Scikit-Learn's StandardScaler

    Every column yields one DerivedField whose expression is a ``/`` Apply
    (constant: ``trfm.scale_``) wrapping a ``-`` Apply (constant:
    ``trfm.mean_``) over the column reference.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Standard Scaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        ``derived_fld`` is read (and is mandatory) when
        ``is_present("labelBinarizer", col_names)`` is true; its fields are
        placed ahead of the scaler's own fields.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Standard Scaler preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    derived_flds = []
    if is_present("labelBinarizer", col_names):
        carried_over = kwargs['derived_fld']
        if carried_over:
            derived_flds.extend(carried_over)

    derived_colnames = get_derived_colnames('standardScaler', col_names)
    for position, column in enumerate(col_names):
        mean_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.mean_[position]))
        centered = pml.Apply(function='-',
                             Constant=[mean_constant],
                             FieldRef=[pml.FieldRef(field=column)])
        scale_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.scale_[position]))
        scaled = pml.Apply(Apply_member=[centered],
                           function='/',
                           Constant=[scale_constant])
        derived_flds.append(
            pml.DerivedField(Apply=scaled,
                             name=derived_colnames[position],
                             optype="continuous",
                             dataType="double"))

    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #7
0
def rbst_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's RobustScaler

    Columns listed in the module-level ``exception_cols`` get no
    DerivedField, although ``der_col_names`` still covers every input
    column.  The inner ``-`` Apply carries an Extension named ``scaling``
    that tags the field as coming from a RobustScaler.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    derived_flds = []
    for position, column in enumerate(col_names):
        if column in exception_cols:
            continue
        center_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.center_[position]))
        centered = pml.Apply(
            function='-',
            Constant=[center_constant],
            FieldRef=[pml.FieldRef(field=column)],
            Extension=[
                pml.Extension(name='scaling', anytypeobjs_=['RobustScaler'])
            ])
        scale_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.scale_[position]))
        scaled = pml.Apply(Apply_member=[centered],
                           function='/',
                           Constant=[scale_constant])
        derived_flds.append(
            pml.DerivedField(Apply=scaled,
                             name=derived_colnames[position],
                             optype="continuous",
                             dataType="double"))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #8
0
def tfidf_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's TfIdfVectorizer

    One DerivedField is produced per feature name; each applies ``*`` to a
    TextIndex for that feature and the matching entry of ``trfm.idf_``.
    When ``trfm.lowercase`` is set, a ``lowercase(<column>)`` helper field is
    emitted first and used as the text source; otherwise the raw column is.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's TfIdfVectorizer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to TfIdfVectorizer preprocessing
        (keys ``der_fld``, ``der_col_names``, ``pp_feat_name`` and ``pp_feat_class_lbl``).

    """
    source_col = col_names[0]
    # get_feature_names() is gone from recent scikit-learn releases;
    # get_feature_names_out() is its replacement, so prefer it when present.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # str(bytes)[2:-1] strips the b'...' wrapper, leaving an ASCII-escaped name.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    idfs = trfm.idf_
    derived_flds = []
    derived_colnames = get_derived_colnames('tfidf@[' + source_col + ']',
                                            features)
    lowercase_field = 'lowercase(' + source_col + ')'
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name=lowercase_field,
                             optype='categorical',
                             dataType='string',
                             Apply=pml.Apply(
                                 function='lowercase',
                                 FieldRef=[pml.FieldRef(field=source_col)])))
    # Loop invariants: which field the text is read from and how it is split.
    text_field = lowercase_field if trfm.lowercase else source_col
    # NOTE(review): token_pattern describes tokens, whereas the attribute name
    # suggests a separator expression - confirm this is what consumers expect.
    separator_re = trfm.token_pattern
    for feat_idx, (feature, idf) in enumerate(zip(features, idfs)):
        text_index = pml.TextIndex(
            textField=text_field,
            wordSeparatorCharacterRE=separator_re,
            tokenize='true',
            Constant=pml.Constant(valueOf_=feature))
        derived_flds.append(
            pml.DerivedField(
                name=derived_colnames[feat_idx],
                optype='continuous',
                dataType='double',
                Apply=pml.Apply(
                    function='*',
                    TextIndex=[text_index],
                    Constant=[pml.Constant(valueOf_="{:.16f}".format(idf))])))
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Example #9
0
def rbst_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's RobustScaler

    Each column that is not listed in the module-level ``exception_cols``
    yields one DerivedField: a division Apply (constant: ``trfm.scale_``)
    wrapping a subtraction Apply (constant: ``trfm.center_``) over the
    column reference.  ``der_col_names`` still covers every input column.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's RobustScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to RobustScaler preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    derived_colnames = get_derived_colnames('robustScaler', col_names)
    derived_flds = []
    for position, column in enumerate(col_names):
        if column in exception_cols:
            continue
        center_constant = pml.Constant(
            dataType=DATATYPE.DOUBLE.value,
            valueOf_="{:.16f}".format(trfm.center_[position]))
        centered = pml.Apply(function=FUNCTION.SUBSTRACTTION.value,
                             Constant=[center_constant],
                             FieldRef=[pml.FieldRef(field=column)])
        scale_constant = pml.Constant(
            dataType=DATATYPE.DOUBLE.value,
            valueOf_="{:.16f}".format(trfm.scale_[position]))
        scaled = pml.Apply(Apply_member=[centered],
                           function=FUNCTION.DIVISION.value,
                           Constant=[scale_constant])
        derived_flds.append(
            pml.DerivedField(Apply=scaled,
                             name=derived_colnames[position],
                             optype=OPTYPE.CONTINUOUS.value,
                             dataType=DATATYPE.DOUBLE.value))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #10
0
def min_max_scaler(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's MinMaxScaler

    Each column that is not listed in the module-level ``exception_cols``
    yields one DerivedField: a ``+`` Apply (constant: ``trfm.min_``)
    wrapping a ``*`` Apply (constant: ``trfm.scale_``) over the column
    reference.  ``der_col_names`` still covers every input column.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MinMaxScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to MinMaxScaler preprocessing
        (keys ``der_fld`` and ``der_col_names``).

    """
    derived_colnames = get_derived_colnames("minMaxScaler", col_names)
    derived_flds = []
    for position, column in enumerate(col_names):
        if column in exception_cols:
            continue
        scale_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.scale_[position]))
        rescaled = pml.Apply(function='*',
                             Constant=[scale_constant],
                             FieldRef=[pml.FieldRef(field=column)])
        offset_constant = pml.Constant(
            dataType="double",
            valueOf_="{:.16f}".format(trfm.min_[position]))
        shifted = pml.Apply(Apply_member=[rescaled],
                            function='+',
                            Constant=[offset_constant])
        derived_flds.append(
            pml.DerivedField(Apply=shifted,
                             name=derived_colnames[position],
                             optype="continuous",
                             dataType="double"))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #11
0
def count_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's CountVectorizer

    One DerivedField holding a TextIndex is produced per feature name.  When
    ``trfm.lowercase`` is set, a ``lowercase(<column>)`` helper field is
    emitted first and used as the text source; otherwise the raw column is.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's CountVectorizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to CountVectorizer preprocessing
        (keys ``der_fld``, ``der_col_names``, ``pp_feat_name`` and ``pp_feat_class_lbl``).

    """
    source_col = col_names[0]
    # get_feature_names() is gone from recent scikit-learn releases;
    # get_feature_names_out() is its replacement, so prefer it when present.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # str(bytes)[2:-1] strips the b'...' wrapper, leaving an ASCII-escaped name.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    derived_flds = []
    derived_colnames = get_derived_colnames('count_vec@[' + source_col + ']',
                                            features)
    lowercase_field = 'lowercase(' + source_col + ')'
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name=lowercase_field,
                             optype='categorical',
                             dataType='string',
                             Apply=pml.Apply(
                                 function='lowercase',
                                 FieldRef=[pml.FieldRef(field=source_col)])))
    # Loop invariants: which field the text is read from and how it is split.
    text_field = lowercase_field if trfm.lowercase else source_col
    # NOTE(review): token_pattern describes tokens, whereas the attribute name
    # suggests a separator expression - confirm this is what consumers expect.
    separator_re = trfm.token_pattern
    for index, feature in enumerate(features):
        text_index = pml.TextIndex(
            textField=text_field,
            wordSeparatorCharacterRE=separator_re,
            tokenize='true',
            Constant=pml.Constant(dataType="string", valueOf_=feature))
        derived_flds.append(
            pml.DerivedField(name=derived_colnames[index],
                             optype='continuous',
                             dataType='double',
                             TextIndex=text_index))
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Example #12
0
        def get_calculate_distance_function():
            """Build the ``DefineFunction`` element for the distance calculation.

            The function takes three continuous double parameters (tag, upper
            boundary, lower boundary) and is a nested ``if``:

            * when ``_IS_INSIDE_BOUNDARY(tag, upper, lower)`` holds, the constant 0;
            * otherwise, when ``tag <= lower``, ``lower - tag``;
            * otherwise ``tag - upper``.

            Returns
            -------
            pml.DefineFunction
                The definition registered under the name ``_CALCULATE_DISTANCE``.
            """
            continuous = OPTYPE.CONTINUOUS.value
            double = DATATYPE.DOUBLE.value

            zero_when_inside = pml.Constant(valueOf_=0)
            # The constant is passed among the Apply members below, so its tag
            # name is set explicitly.
            zero_when_inside.original_tagname_ = _CONSTANT

            parameters = [
                pml.ParameterField(name=parameter_name,
                                   optype=continuous,
                                   dataType=double)
                for parameter_name in (_TAG, _TAG_UPPER_BOUNDARY, _TAG_LOWER_BOUNDARY)
            ]

            inside_check = pml.Apply(
                function=_IS_INSIDE_BOUNDARY,
                FieldRef=[
                    pml.FieldRef(field=_TAG),
                    pml.FieldRef(field=_TAG_UPPER_BOUNDARY),
                    pml.FieldRef(field=_TAG_LOWER_BOUNDARY)
                ])
            at_or_below_lower = pml.Apply(
                function=FUNCTION.LESS_OR_EQUAL.value,
                FieldRef=[
                    pml.FieldRef(field=_TAG),
                    pml.FieldRef(field=_TAG_LOWER_BOUNDARY)
                ])
            gap_below = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                FieldRef=[
                    pml.FieldRef(field=_TAG_LOWER_BOUNDARY),
                    pml.FieldRef(field=_TAG)
                ])
            gap_above = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                FieldRef=[
                    pml.FieldRef(field=_TAG),
                    pml.FieldRef(field=_TAG_UPPER_BOUNDARY)
                ])
            outside_distance = pml.Apply(
                function=FUNCTION.IF.value,
                Apply_member=[at_or_below_lower, gap_below, gap_above])

            return pml.DefineFunction(
                name=_CALCULATE_DISTANCE,
                optype=continuous,
                dataType=double,
                ParameterField=parameters,
                Apply=pml.Apply(
                    function=FUNCTION.IF.value,
                    Apply_member=[inside_check, zero_when_inside, outside_distance]))
Example #13
0
def count_vectorizer(trfm, col_names):
    """
    Generates pre-processing elements for Scikit-Learn's CountVectorizer

    One DerivedField holding a TextIndex is produced per feature name.  When
    ``trfm.lowercase`` is set, a ``lowercase(<column>)`` helper field is
    emitted first and used as the text source; otherwise the raw column is.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's CountVectorizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
        Only the first entry is used.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to CountVectorizer preprocessing
        (keys ``der_fld``, ``der_col_names``, ``pp_feat_name`` and ``pp_feat_class_lbl``).

    """
    source_col = col_names[0]
    # get_feature_names() is gone from recent scikit-learn releases;
    # get_feature_names_out() is its replacement, so prefer it when present.
    feature_getter = getattr(trfm, "get_feature_names_out", None) or trfm.get_feature_names
    # str(bytes)[2:-1] strips the b'...' wrapper, leaving an ASCII-escaped name.
    features = [str(feat.encode("utf8"))[2:-1] for feat in feature_getter()]
    # NOTE(review): vocabulary_ key order is not obviously the same as the
    # feature order above - confirm the Extension pairing is intended.
    extra_features = list(trfm.vocabulary_.keys())
    derived_flds = []
    derived_colnames = get_derived_colnames('count_vec@[' + source_col + ']',
                                            features)
    lowercase_field = 'lowercase(' + source_col + ')'
    if trfm.lowercase:
        derived_flds.append(
            pml.DerivedField(name=lowercase_field,
                             optype=OPTYPE.CATEGORICAL,
                             dataType=DATATYPE.STRING,
                             Apply=pml.Apply(
                                 function=FUNCTION.LOWERCASE,
                                 FieldRef=[pml.FieldRef(field=source_col)])))
    # Loop invariant: which field the text is read from.
    text_field = lowercase_field if trfm.lowercase else source_col
    for index, feature in enumerate(features):
        text_index = pml.TextIndex(
            textField=text_field,
            wordSeparatorCharacterRE='\\s+',
            tokenize='true',
            Constant=pml.Constant(dataType=DATATYPE.STRING, valueOf_=feature),
            Extension=[pml.Extension(value=extra_features[index])])
        derived_flds.append(
            pml.DerivedField(name=derived_colnames[index],
                             optype=OPTYPE.CONTINUOUS,
                             dataType=DATATYPE.DOUBLE,
                             TextIndex=text_index))
    pp_dict = dict()
    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    pp_dict['pp_feat_name'] = source_col
    pp_dict['pp_feat_class_lbl'] = list()
    return pp_dict
Example #14
0
def polynomial_features(trfm, col_names):
    """
    Builds the pre-processing elements for Scikit-Learn's PolynomialFeatures

    One derived field is created per row of ``trfm.powers_``. Each field is a
    "product" Apply whose members are "pow" Applies, one per input column,
    raising that column to the integer exponent found in the row.

    The function keeps a call counter in its own ``poly_ctr`` attribute. The
    attribute is only incremented here, so it has to be initialised outside
    this function before the first call.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's PolynomialFeatures preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        'der_fld' holds the generated DerivedField objects and
        'der_col_names' holds their names, in the same order.

    """
    polynomial_features.poly_ctr += 1
    # Every derived field of this call shares the same "poly<counter>-x" prefix.
    name_prefix = "poly" + str(polynomial_features.poly_ctr) + '-' + "x"

    fields = []
    for row_idx in range(trfm.powers_.shape[0]):
        exponents = trfm.powers_[row_idx]
        # One pow(column, exponent) term per input column.
        factors = [
            pml.Apply(
                function='pow',
                Constant=[pml.Constant(dataType="integer",
                                       valueOf_=int(exponents[pos]))],
                FieldRef=[pml.FieldRef(field=col_name)])
            for pos, col_name in enumerate(col_names)
        ]
        fields.append(pml.DerivedField(
            Apply=pml.Apply(function="product", Apply_member=factors),
            dataType="double",
            optype="continuous",
            name=name_prefix + str(row_idx)
        ))

    return {
        'der_fld': fields,
        'der_col_names': [fld.get_name() for fld in fields],
    }
Example #15
0
        def get_normalization_function():
            """
            Build the nested Apply tree that normalises the "totalDistance" field.

            Judging by the nesting and the order in which children are passed,
            the intended expression is

                if totalDistance >= max_distance: 0
                elif totalDistance == 0:          100
                else: ((max_distance - totalDistance) / max_distance) * 100

            NOTE(review): the order of children in the serialised PMML is
            decided by the pml exporter, which is not visible here - confirm
            against generated output.

            max_distance is ``self._max_distances[0]`` when there is exactly
            one hull, otherwise ``self._length_of_fingerprint * len(self._hulls)``.

            Returns
            -------
            The outermost ``pml.Apply`` object.
            """
            hull_count = len(self._hulls)
            if hull_count == 1:
                upper_bound = pml.Constant(valueOf_=self._max_distances[0])
            else:
                upper_bound = pml.Constant(
                    valueOf_=self._length_of_fingerprint * hull_count)
            # original_tagname_ is set on every object that is placed in a list
            # named after a different element type (a Constant inside
            # FieldRef=[...], an Apply inside Constant=[...]); presumably the
            # exporter uses it to emit the correct tag - TODO confirm.
            upper_bound.original_tagname_ = "Constant"

            hundred = pml.Constant(valueOf_=100)
            hundred.original_tagname_ = "Constant"

            # max_distance - totalDistance
            remaining = pml.Apply(
                function=FUNCTION.SUBSTRACTTION.value,
                FieldRef=[upper_bound, pml.FieldRef(field="totalDistance")])
            # (...) / max_distance
            ratio = pml.Apply(
                function=FUNCTION.DIVISION.value,
                Apply_member=[remaining],
                Constant=[upper_bound])
            # (...) * 100
            percentage = pml.Apply(
                function=FUNCTION.MULTIPLICATION.value,
                Apply_member=[ratio, hundred])
            percentage.original_tagname_ = "Apply"

            # Inner branch: a distance of exactly 0 maps to the constant 100.
            is_zero = pml.Apply(
                function=FUNCTION.EQUAL.value,
                FieldRef=[pml.FieldRef(field="totalDistance")],
                Constant=[pml.Constant(valueOf_=0)])
            zero_or_percentage = pml.Apply(
                function=FUNCTION.IF.value,
                Apply_member=[is_zero],
                Constant=[pml.Constant(valueOf_=100), percentage])
            zero_or_percentage.original_tagname_ = "Apply"

            # Outer branch: distances at or beyond the bound map to the constant 0.
            beyond_bound = pml.Apply(
                function=FUNCTION.GREATER_OR_EQUAL.value,
                FieldRef=[pml.FieldRef(field="totalDistance")],
                Constant=[upper_bound])
            return pml.Apply(
                function=FUNCTION.IF.value,
                Apply_member=[beyond_bound],
                Constant=[pml.Constant(valueOf_=0), zero_or_percentage])
Example #16
0
def max_abs_scaler(trfm, col_names):
    """
    Builds the pre-processing elements for Scikit-Learn's MaxAbsScaler

    Each column that is not listed in ``exception_cols`` gets a derived field
    that divides the column by the matching entry of ``trfm.max_abs_``
    (written as a fixed-point string with 16 decimals). Columns listed in
    ``exception_cols`` get no derived field, but their derived name is still
    part of the returned name list.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's MaxabsScaler preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        'der_fld' holds the generated DerivedField objects and
        'der_col_names' holds the names returned by get_derived_colnames.

    """
    derived_colnames = get_derived_colnames('maxAbsScaler', col_names)
    derived_flds = []
    for idx, col_name in enumerate(col_names):
        if col_name in exception_cols:
            continue
        divisor = pml.Constant(
            dataType="double",
            valueOf_=format(trfm.max_abs_[idx], ".16f")
        )
        scaled = pml.Apply(
            function='/',
            Constant=[divisor],
            FieldRef=[pml.FieldRef(field=col_name)]
        )
        derived_flds.append(pml.DerivedField(
            Apply=scaled,
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        ))
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #17
0
def binarizer(trfm, col_names):
    """
    Builds the pre-processing elements for Scikit-Learn's Binarizer

    Every input column gets one derived field that applies the 'threshold'
    function to the column, with ``trfm.threshold`` as the constant.

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Binarizer preprocessing instance.
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.

    Returns
    -------
    pp_dict : dictionary
        'der_fld' holds the generated DerivedField objects and
        'der_col_names' holds the names returned by get_derived_colnames.

    """
    derived_colnames = get_derived_colnames("binarizer", col_names)
    derived_flds = [
        pml.DerivedField(
            Apply=pml.Apply(
                function='threshold',
                Constant=[pml.Constant(dataType="double",
                                       valueOf_=trfm.threshold)],
                FieldRef=[pml.FieldRef(field=col_name)]),
            name=derived_colnames[idx],
            optype="continuous",
            dataType="double"
        )
        for idx, col_name in enumerate(col_names)
    ]
    return {'der_fld': derived_flds, 'der_col_names': derived_colnames}
Example #18
0
def imputer(trfm, col_names, **kwargs):
    """
    Builds the pre-processing elements for Scikit-Learn's Imputer

    Two outcomes are possible, chosen by ``any_in(imputer.col_names, col_names)``:

    * When it is true, no derived fields are produced. The dictionary instead
      carries 'mining_strategy', 'mining_replacement_val' and
      'mining_attributes', and the column names are passed through unchanged.
    * Otherwise each column not listed in ``exception_cols`` gets a derived
      field of the form if(isMissing(col), replacement, col).

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Must contain the key 'model'; a missing key raises KeyError.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing.

    """
    original_col_names = imputer.col_names

    # Not used below; the lookup is kept so a call without 'model' still fails.
    model = kwargs['model']

    # Translate the sklearn strategy name; unknown strategies pass through as-is.
    strategy = trfm.strategy
    if "mean" in strategy:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MEAN.value
    elif "median" in strategy:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MEDIAN.value
    elif "most_frequent" in strategy:
        treatment = MISSING_VALUE_TREATMENT_METHOD.AS_MODE.value
    else:
        treatment = strategy
    replacement_vals = trfm.statistics_

    pp_dict = dict()
    derived_flds = []
    if any_in(original_col_names, col_names):
        derived_colnames = col_names
        pp_dict['mining_strategy'] = treatment
        pp_dict['mining_replacement_val'] = replacement_vals
        pp_dict['mining_attributes'] = col_names
    else:
        derived_colnames = get_derived_colnames('imputer', col_names)
        for idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue
            is_missing = pml.Apply(
                function=FUNCTION.IS_MISSING.value,
                FieldRef=[pml.FieldRef(field=col_name)])
            replacement = pml.Constant(
                dataType=DATATYPE.DOUBLE.value,
                valueOf_=replacement_vals[idx])
            # The FieldRef travels inside the Constant list, so it is tagged
            # with its own element name.
            passthrough = pml.FieldRef(field=col_name)
            passthrough.original_tagname_ = "FieldRef"
            if_missing = pml.Apply(
                Apply_member=[is_missing],
                function=FUNCTION.IF.value,
                Constant=[replacement, passthrough])
            derived_flds.append(
                pml.DerivedField(Apply=if_missing,
                                 name=derived_colnames[idx],
                                 optype=OPTYPE.CONTINUOUS.value,
                                 dataType=DATATYPE.DOUBLE.value))

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict
Example #19
0
def imputer(trfm, col_names, **kwargs):
    """
    Builds the pre-processing elements for Scikit-Learn's Imputer

    Two outcomes are possible, chosen by ``any_in(imputer.col_names, col_names)``:

    * When it is true, no derived fields are produced. The dictionary instead
      carries 'mining_strategy', 'mining_replacement_val' and
      'mining_attributes', and the column names are passed through unchanged.
    * Otherwise each column not listed in ``exception_cols`` gets a derived
      field of the form if(isMissing(col), replacement, col).

    Parameters
    ----------
    trfm :
        Contains the Sklearn's Imputer preprocessing instance
    col_names : list
        Contains list of feature/column names.
        The column names may represent the names of preprocessed attributes.
    **kwargs :
        Must contain the key 'model'; a missing key raises KeyError.

    Returns
    -------
    pp_dict : dictionary
        Returns a dictionary that contains attributes related to Imputer preprocessing.

    """
    original_col_names = imputer.col_names

    # Not used below; the lookup is kept so a call without 'model' still fails.
    model = kwargs['model']

    # First matching substring wins; unknown strategies pass through as-is.
    treatment = trfm.strategy
    for sk_name, pmml_name in (("mean", "asMean"),
                               ("median", "asMedian"),
                               ("most_frequent", "asMode")):
        if sk_name in treatment:
            treatment = pmml_name
            break
    replacement_vals = trfm.statistics_

    pp_dict = dict()
    derived_flds = []
    if any_in(original_col_names, col_names):
        derived_colnames = col_names
        pp_dict['mining_strategy'] = treatment
        pp_dict['mining_replacement_val'] = replacement_vals
        pp_dict['mining_attributes'] = col_names
    else:
        derived_colnames = get_derived_colnames('imputer', col_names)
        for idx, col_name in enumerate(col_names):
            if col_name in exception_cols:
                continue
            is_missing = pml.Apply(
                function='isMissing',
                FieldRef=[pml.FieldRef(field=col_name)])
            replacement = pml.Constant(
                dataType="double",
                valueOf_=replacement_vals[idx])
            # The FieldRef travels inside the Constant list, so it is tagged
            # with its own element name.
            passthrough = pml.FieldRef(field=col_name)
            passthrough.original_tagname_ = "FieldRef"
            if_missing = pml.Apply(
                Apply_member=[is_missing],
                function='if',
                Constant=[replacement, passthrough])
            derived_flds.append(
                pml.DerivedField(Apply=if_missing,
                                 name=derived_colnames[idx],
                                 optype="continuous",
                                 dataType="double"))

    pp_dict['der_fld'] = derived_flds
    pp_dict['der_col_names'] = derived_colnames
    return pp_dict