Example #1
def outer_su(idadf1,
             key1,
             idadf2,
             key2,
             target=None,
             features1=None,
             features2=None):
    """
    Compute the symmetric uncertainty coefficients between a set of features
    and a set of target from two different IdaDataFrames on a particular key. 
    
    This is experimental 
    """
    target1, features1 = _check_input(idadf1, target, features1)
    target2, features2 = _check_input(idadf2, None, features2)

    if key1 not in idadf1.columns:
        raise ValueError("%s is not a column in idadf1" % key1)
    if key2 not in idadf2.columns:
        raise ValueError("%s is not a column in idadf2" % key2)

    condition = "a.\"%s\" = b.\"%s\"" % (key1, key2)

    if key2 in features2:
        features2.remove(key2)

    afeaturesas = ", ".join([
        "a.\"%s\" as \"a.%s\" " % (feature, feature) for feature in features1
    ])
    bfeaturesas = ", ".join([
        "b.\"%s\" as \"b.%s\" " % (feature, feature) for feature in features2
    ])

    selectlist = [afeaturesas, bfeaturesas]

    if target1 is not None:
        atargetas = ", ".join(
            ["a.\"%s\" as \"a.%s\" " % (tar, tar) for tar in [target1]])
        selectlist.append(atargetas)
        atarget = "a." + target1
    else:
        atarget = None

    abfeatures = ["a." + feature for feature in features1
                  ] + ["b." + feature for feature in features2]
    selectstr = ", ".join(selectlist)

    expression = "SELECT %s FROM %s as a FULL OUTER JOIN %s as b ON %s" % (
        selectstr, idadf1.name, idadf2.name, condition)

    viewname = idadf1._idadb._create_view_from_expression(expression)

    try:
        idadf_join = ibmdbpy.IdaDataFrame(idadf1._idadb, viewname)
        return su(idadf_join, target=atarget, features=abfeatures)
    except:
        raise
    finally:
        idadf1._idadb.drop_view(viewname)
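
A minimal usage sketch for outer_su; the connection, table, and column names below are placeholders, not part of the original code:

# Hypothetical usage sketch; connection and names are illustrative.
from ibmdbpy import IdaDataBase, IdaDataFrame

idadb = IdaDataBase(dsn="BLUDB")              # assumed data source name
customers = IdaDataFrame(idadb, "CUSTOMERS")  # left table, key column ID, target CHURN
orders = IdaDataFrame(idadb, "ORDERS")        # right table, key column CUSTOMER_ID

# Symmetric uncertainty between CUSTOMERS features (plus target CHURN) and
# ORDERS features, computed over the full outer join of both tables.
result = outer_su(customers, "ID", orders, "CUSTOMER_ID", target="CHURN")
print(result)
idadb.close()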
Example #2
def idaview(request, idadb, idadf):
    """
    IdaDataFrame fixture to be used for the whole testing session. Opens a view
    based on the idadf fixture.
    """
    def fin():
        try:
            idadb.drop_view("TEST_VIEW_ibmdbpy")
            idadb.commit()
        except:
            pass

    request.addfinalizer(fin)

    if idadb.exists_view("TEST_VIEW_ibmdbpy"):
        idadb.drop_view("TEST_VIEW_ibmdbpy")

    idadb._create_view(idadf, "TEST_VIEW_ibmdbpy")
    return ibmdbpy.IdaDataFrame(idadb, "TEST_VIEW_ibmdbpy")
Example #3
def idaview_tmp(request, idadb, idadf):
    """
    IdaDataFrame fixture to be used by destructive and semi-destructive
    functions. To be considered as a temporary DataFrame that is created
    and destroyed for each function that requires it. Opens a view based on
    the idadf fixture.
    """
    def fin():
        try:
            idadb.drop_view("TEST_VIEW_ibmdbpy_TMP")
            idadb.commit()
        except:
            pass

    request.addfinalizer(fin)

    if idadb.exists_view("TEST_VIEW_ibmdbpy_TMP"):
        idadb.drop_view("TEST_VIEW_ibmdbpy_TMP")

    idadb._create_view(idadf, "TEST_VIEW_ibmdbpy_TMP")
    return ibmdbpy.IdaDataFrame(idadb, "TEST_VIEW_ibmdbpy_TMP")
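
A minimal sketch of a test that consumes these fixtures; pytest injects them by parameter name, and the test name and assertion below are only illustrative:

# Hypothetical test body using the idadf and idaview fixtures defined above.
def test_view_mirrors_source(idadf, idaview):
    # The view opened by the fixture should expose the same columns as the
    # underlying IdaDataFrame it was created from.
    assert list(idaview.columns) == list(idadf.columns)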
Example #4
    def predict(self,
                idadf,
                column_id=None,
                outtable=None,
                outtableProb=None,
                mestimation=False):
        """
        Use the Naive Bayes predict stored procedure to apply a Naive Bayes model
        to generate classification predictions for a data set.

        Parameters
        ----------
        idadf : IdaDataFrame
             IdaDataFrame to be used as input.

        column_id : str, optional
            The column of the input table that identifies a unique instance ID.
            By default, the same id column that is specified in the stored
            procedure to build the model.

        outtable : str, optional
            The name of the output table where the predictions are stored.
            It should contain only alphanumerical characters and underscores.
            All lower case characters will be converted to upper case characters.
            If this parameter is not specified, it is generated automatically. If
            the parameter corresponds to an existing table in the database, it
            will be replaced.

        outtableProb : str, optional
            The name of the output table where the probabilities for each of the classes are stored.
            It should contain only alphanumerical characters and underscores.
            All lower case characters will be converted to upper case characters.
            If this parameter is not specified, the table is not created.
            If the parameter corresponds to an existing table in the database, it
            will be replaced.

        mestimation : bool, default: False
            Flag that indicates whether m-estimation is used for probabilities.
            This kind of estimation might be slower than other methods, but it
            might produce better results for small or unbalanced data sets.

        Returns
        -------
        IdaDataFrame
            IdaDataFrame containing the classification decision for each
            data point, referenced by its ID.
        """
        if not isinstance(idadf, ibmdbpy.IdaDataFrame):
            raise TypeError("Argument should be an IdaDataFrame")

        idadf._idadb._check_procedure("PREDICT_NAIVEBAYES",
                                      "Prediction for Naive Bayes")

        # Check the ID
        if column_id is None:
            column_id = self._column_id
        if column_id not in idadf.columns:
            raise ValueError(
                "No ID column \"" + column_id + "\" is available in the IdaDataFrame. "
                "Either create a new ID column using the add_column_id function "
                "or give the name of a column that can be used as ID.")

        if self._idadb is None:
            raise IdaNaiveBayesError(
                "The Naive Bayes model was not trained before.")

        # Check or create an outtable name, drop it if it already exists.
        if outtable is None:
            outtable = idadf._idadb._get_valid_tablename('PREDICT_NAIVEBAYES_')
        else:
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        if outtableProb is not None:
            outtableProb = ibmdbpy.utils.check_tablename(outtableProb)
            if idadf._idadb.exists_table(outtableProb):
                idadf._idadb.drop_table(outtableProb)

        self.outtable = outtable
        self.outtableProb = outtableProb
        self.mestimation = mestimation

        # Create a temporary view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state

        #if "." in tmp_view_name:
        #tmp_view_name = tmp_view_name.split('.')[-1]

        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_NAIVEBAYES ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                id=column_id,
                                                outtable=self.outtable,
                                                outtableProb=self.outtableProb,
                                                mestimation=self.mestimation)
        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf._idadb._autocommit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, self.outtable)
        return self.labels_
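
A hedged end-to-end sketch of how this method might be called; the module path ibmdbpy.learn, the fit() signature, and all table/column/model names are assumptions:

# Hypothetical usage sketch; names and the fit() signature are assumptions.
from ibmdbpy import IdaDataBase, IdaDataFrame
from ibmdbpy.learn import NaiveBayes   # module path assumed

idadb = IdaDataBase(dsn="BLUDB")
train = IdaDataFrame(idadb, "IRIS_TRAIN")
test = IdaDataFrame(idadb, "IRIS_TEST")

nb = NaiveBayes(modelname="IRIS_NB")
nb.fit(train, column_id="ID", target="SPECIES")

# Classification decisions, plus per-class probabilities in a second table.
predictions = nb.predict(test, column_id="ID",
                         outtable="IRIS_NB_PRED",
                         outtableProb="IRIS_NB_PROB",
                         mestimation=True)
print(predictions.head())
idadb.close()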
Example #5
    def predict(self, idadf, column_id=None, outtable=None):
        """
        Apply the K-means clustering model to new data.

        Parameters
        ----------
        idadf : IdaDataFrame
            IdaDataFrame to be used as input.

        column_id : str, optional
            The column of the input table that identifies a unique instance ID.
            By default, the same id column that is specified in the stored procedure to build the model.

        outtable : str, optional
            The name of the output table where the assigned clusters are stored.
            If this parameter is not specified, it is generated automatically.
            If the parameter corresponds to an existing table in the database,
            it is replaced.

        Returns
        -------
        IdaDataFrame
            IdaDataFrame containing the closest cluster for each data point referenced by its ID.
        """
        if not isinstance(idadf, ibmdbpy.IdaDataFrame):
            raise TypeError("Argument should be an IdaDataFrame")

        # Check the ID
        if column_id is None:
            column_id = self._column_id
        if column_id not in idadf.columns:
            raise ValueError(
                "No ID column \"" + column_id + "\" is available in the IdaDataFrame. "
                "Either create a new ID column using the add_column_id function "
                "or give the name of a column that can be used as ID.")

        if self._idadb is None:
            raise IdaKMeansError("No KMeans model was trained before")

        if outtable is None:
            outtable = idadf._idadb._get_valid_modelname('PREDICT_KMEANS_')
        else:
            if self.outtable:
                outtable = self.outtable
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        self.outtable = outtable
        # Create a temporary view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state

        if "." in tmp_view_name:
            tmp_view_name = tmp_view_name.split('.')[-1]

        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_KMEANS ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                id=column_id,
                                                outtable=self.outtable)
        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf._idadb.commit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb,
                                            outtable,
                                            indexer=column_id)
        return self.labels_
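
A similar hedged sketch for the K-means case; the constructor arguments, the module path, and all names are assumptions:

# Hypothetical usage sketch; constructor arguments and names are assumptions.
from ibmdbpy import IdaDataBase, IdaDataFrame
from ibmdbpy.learn import KMeans   # module path assumed

idadb = IdaDataBase(dsn="BLUDB")
data = IdaDataFrame(idadb, "CUSTOMERS")

km = KMeans(n_clusters=3, modelname="CUSTOMERS_KMEANS")
km.fit(data, column_id="ID")

# Closest cluster for each row, written to an explicit output table.
clusters = km.predict(data, column_id="ID", outtable="CUSTOMERS_KMEANS_OUT")
print(clusters.head())
idadb.close()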
Example #6
    def predict(self,
                idadf,
                outtable=None,
                transaction_id=None,
                item_id=None,
                type="rules",
                limit=1,
                sort=None):
        """
        Apply the rules and patterns of an association rules model to other
        transactions. You can apply all rules or only specific rules according
        to specified criteria.

        Parameters
        ----------
        idadf : IdaDataFrame
            IdaDataFrame to be used as input.

        outtable : str, optional
            The name of the output table in which the mapping between the input 
            sequences and the associated rules or patterns is written. If the 
            parameter corresponds to an existing table in the database, it is 
            replaced.

        transaction_id : str, optional
            The column of the input table that identifies the transaction ID. 
            By default, this is the same tid column that is specified in the 
            stored procedure to build the model.


        item_id : str, optional
            The column of the input table that identifies an item of the 
            transaction. By default, this is the same item column that is 
            specified in the stored procedure to build the model.


        type : str, optional, default : "rules"
            The type of information that is written in the output table. The 
            following values are possible: ‘rules’ and ‘patterns’.

        limit : int, optional, >=1, default: 1
            The maximum number of rules or patterns that is written in the 
            output table for each input sequence.

        sort : str or list, optional
            A list of keywords that indicates the order in which the rules or 
            patterns are written in the output table. The order of the list is 
            descending. The items are separated by semicolons. The following 
            values are possible: ‘support’, ‘confidence’, ‘lift’, and ‘length’. 
            The ‘confidence’ value can only be specified if the type parameter 
            is ‘rules’. If the type parameter is ‘rules’, the default is: 
            support;confidence;length.  If the type parameter is ‘patterns’, 
            the default is: support;lift;length. 

        Notes
        -----
        When "type" is set to "rules", it looks like nothing is returned.
        """
        if not isinstance(idadf, ibmdbpy.IdaDataFrame):
            raise TypeError("Argument should be an IdaDataFrame")

        if isinstance(sort, (list, tuple)):
            sort = ';'.join(sort)

        if transaction_id is None:
            transaction_id = self.transaction_id
        if item_id is None:
            item_id = self.item_id

        # Check the ID
        if transaction_id not in idadf.columns:
            raise ValueError("Transaction id column " + transaction_id +
                             " is not available in the IdaDataFrame.")

        if self._idadb is None:
            raise IdaAssociationRulesError(
                "No Association rules model was trained before.")

        # The version where we don't replace the outtable if it exists but raise an exception
        #if outtable is not None:
        #    if idadf._idadb.exists_table(outtable):
        #        raise ValueError("Table "+ outtable +" already exists.")
        #else:
        #    outtable = idadf._idadb._get_valid_modelname('PREDICT_ASSOCRULES_')

        if outtable is None:
            outtable = idadf._idadb._get_valid_tablename('PREDICT_ASSOCRULES_')
        else:
            outtable = ibmdbpy.utils.check_tablename(outtable)
            if idadf._idadb.exists_table(outtable):
                idadf._idadb.drop_table(outtable)

        self.outtable = outtable
        self.type = type
        self.limit = limit
        self.sort = sort
        self.type = type
        self.limit = limit
        self.sort = sort

        # Create a temporary view
        idadf.internal_state._create_view()
        tmp_view_name = idadf.internal_state.current_state

        if "." in tmp_view_name:
            tmp_view_name = tmp_view_name.split('.')[-1]

        try:
            idadf._idadb._call_stored_procedure("IDAX.PREDICT_ASSOCRULES ",
                                                model=self.modelname,
                                                intable=tmp_view_name,
                                                outtable=outtable,
                                                tid=transaction_id,
                                                item=item_id,
                                                type=type,
                                                limit=limit,
                                                sort=sort)
        except:
            raise
        finally:
            idadf.internal_state._delete_view()
            idadf._cursor.commit()

        self.labels_ = ibmdbpy.IdaDataFrame(idadf._idadb, outtable)
        return self.labels_
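
A hedged usage sketch; the module path, the fit() signature, and all table/column/model names are assumptions:

# Hypothetical usage sketch; names and the fit() signature are assumptions.
from ibmdbpy import IdaDataBase, IdaDataFrame
from ibmdbpy.learn import AssociationRules   # module path assumed

idadb = IdaDataBase(dsn="BLUDB")
transactions = IdaDataFrame(idadb, "BASKETS")   # columns: TID, ITEM

rules = AssociationRules(modelname="BASKET_RULES")
rules.fit(transactions, transaction_id="TID", item_id="ITEM")

# Map each input transaction to at most 3 matching rules, sorted by
# confidence, then support, then length.
matched = rules.predict(transactions,
                        outtable="BASKET_RULES_OUT",
                        type="rules",
                        limit=3,
                        sort=["confidence", "support", "length"])
print(matched.head())
idadb.close()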
Example #7
def spearman(idadf, target=None, features=None, ignore_indexer=True):
    """
    Compute the Spearman rho correlation coefficients between a set of features
    and a set of targets in an IdaDataFrame.
    
    Parameters
    ----------
    idadf : IdaDataFrame
    
    target : str or list of str, optional
        A column or list of columns to be used as targets. By default,
        all columns are considered.

    features : str or list of str, optional
        A column or list of columns to be used as features. By default,
        all columns are considered.

    ignore_indexer : bool, default: True
        By default, ignore the column declared as indexer in idadf.
        
    Returns
    -------
    Pandas.DataFrame or Pandas.Series if only one target
    
    Notes
    -----
    Input columns used as targets and features should be numerical.
    This function is a wrapper around pearson, applied to rank-transformed columns.
    The scalability of this approach is limited; it should not be used on
    high-dimensional data.
    
    Examples
    --------
    >>> idadf = IdaDataFrame(idadb, "IRIS")
    >>> spearman(idadf)
    """
    numerical_columns = idadf._get_numerical_columns()
    if features is None:
        features = numerical_columns

    target, features = _check_input(idadf, target, features, ignore_indexer)

    for feature in features:
        if feature not in numerical_columns:
            raise TypeError(
                "Correlation-based measure not available for non-numerical column %s"
                % feature)

    if ignore_indexer is True:
        if idadf.indexer:
            if idadf.indexer in numerical_columns:
                features.remove(idadf.indexer)

    if features is None:
        features = list(idadf.columns)

    numerical_features = [x for x in features if x in numerical_columns]
    numerical_targets = [x for x in target if x in numerical_columns]

    numerical_features = list(set(numerical_features) | set(numerical_targets))

    agg_list = [
        "CAST(RANK() OVER (ORDER BY \"%s\") AS INTEGER) AS \"%s\"" % (x, x)
        for x in numerical_features
    ]
    agg_string = ', '.join(agg_list)

    expression = "SELECT %s FROM %s" % (agg_string, idadf.name)

    viewname = idadf._idadb._create_view_from_expression(expression)

    try:
        idadf_rank = ibmdbpy.IdaDataFrame(idadf._idadb, viewname)
        return pearson(idadf_rank,
                       target=target,
                       features=numerical_features,
                       ignore_indexer=ignore_indexer)
    except:
        raise
    finally:
        idadf._idadb.drop_view(viewname)
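
Since Spearman's rho is Pearson's correlation computed on ranks, the same quantity can be cross-checked locally on a small pandas frame (a standalone sketch, independent of ibmdbpy; the data is made up):

# Local cross-check: Spearman correlation equals Pearson correlation on ranks.
import pandas as pd

df = pd.DataFrame({"sepal_length": [5.1, 4.9, 6.3, 5.8, 6.7],
                   "petal_length": [1.4, 1.5, 4.9, 5.1, 5.7]})

spearman_direct = df.corr(method="spearman")
pearson_on_ranks = df.rank().corr(method="pearson")

# Both routes give the same matrix (up to floating-point noise).
assert (spearman_direct - pearson_on_ranks).abs().max().max() < 1e-12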
Example #8
    def test_idadf_empty(self, idadb, df):
        idadb._create_table(df, "TEST_EMPTY_3496593727406047264076")
        to_test = ibmdbpy.IdaDataFrame(idadb,
                                       "TEST_EMPTY_3496593727406047264076")
        assert to_test.empty is True
        idadb.drop_table("TEST_EMPTY_3496593727406047264076")
Example #9
def discretize(idadf,
               columns=None,
               disc="em",
               target=None,
               bins=None,
               outtable=None,
               clear_existing=False):
    """
    Discretize a set of numerical columns from an IdaDataFrame and return an
    IdaDataFrame opened on the discretized version of the dataset.
    
    Parameters
    ----------
    idadf : IdaDataFrame
    
    columns : str or list of str, optional
        A column or list of columns to be discretized
    
    disc : "ef", "em", "ew", "ewn" default: "em"
        Discretization method to be used
        
        - ef: Discretization bins of equal frequency 
        
        - em: Discretization bins of minimal entropy 
        
        - ew: Discretization bins of equal width
        
        - ewn: Discretization bins of equal width with human-friendly limits 
    
    target : str
        Target column against which the discretization is done. Relevant
        only for "em" discretization.

    bins : int, optional
        Number of bins. Not relevant for "em" discretization.
        
    outtable : str, optional
        The name of the output table where the discretized data is stored.
        If this parameter is not specified, it is generated automatically.
        If the parameter corresponds to an existing table in the database,
        it is replaced.
    
    clear_existing : bool, default: False
        If set to True, an existing table with the same name in the database
        is replaced.
    """
    if columns is None:
        columns = idadf._get_numerical_columns()
        if target is not None:
            columns = [x for x in columns if x != target]
    else:
        if isinstance(columns, six.string_types):
            columns = [columns]

    stored_proc = _check(idadf, columns, disc, target, bins, outtable)

    bound_outtable = idadf._idadb._get_valid_tablename('DISC_BOUNDS_%s_' %
                                                       idadf.tablename)
    intable = idadf.name  # either the table or a view on the top
    incolumn = "\";\"".join(columns)

    # Calculate bounds
    idadf._idadb._call_stored_procedure("IDAX.%s" % stored_proc,
                                        outtable=bound_outtable,
                                        intable=intable,
                                        incolumn=incolumn,
                                        target=target,
                                        bins=bins)

    # Create discretized dataset

    if outtable is None:
        disc_outtable = idadf._idadb._get_valid_tablename('DISC_%s_' %
                                                          idadf.tablename)
    else:
        if clear_existing is True:
            try:
                idadf._idadb.drop_table(outtable)
            except:
                pass
        disc_outtable = outtable

    try:
        idadf._idadb._call_stored_procedure("IDAX.APPLY_DISC",
                                            outtable=disc_outtable,
                                            intable=intable,
                                            btable=bound_outtable,
                                            replace="T")
    except:
        raise
    finally:
        idadf._idadb.drop_table(bound_outtable)

    return ibmdbpy.IdaDataFrame(idadf._idadb, disc_outtable)
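
A hedged usage sketch; the connection, table, and column names are placeholders:

# Hypothetical usage sketch; names are illustrative.
from ibmdbpy import IdaDataBase, IdaDataFrame

idadb = IdaDataBase(dsn="BLUDB")
iris = IdaDataFrame(idadb, "IRIS")

# Entropy-based binning of two numerical columns, guided by the SPECIES target.
iris_disc = discretize(iris,
                       columns=["SEPALLENGTH", "PETALLENGTH"],
                       disc="em",
                       target="SPECIES",
                       outtable="IRIS_DISC",
                       clear_existing=True)
print(iris_disc.head())
idadb.close()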