Example #1
def score_train_test(
        X,
        Y,
        train,
        test,
        X_treat=None,
        Y_treat=None,
        FoldNumber=None,  # pylint: disable=unused-argument
        grad_splits=None,
        progress=None,  # pylint: disable=unused-argument
        **kwargs):
    """ 
    Presents a unified api for ct_v_matrix and loo_v_matrix
    and returns the v_mat, w_pen (possibly calculated, possibly a parameter), and the score

    :param X: Matrix of covariates for untreated units
    :type X: coercible to :class:`numpy.matrix`

    :param Y: Matrix of outcomes for untreated units
    :type Y: coercible to :class:`numpy.matrix`

    :param train: List of rows in the current training set
    :type train: int[]

    :param test: List of rows in the current test set
    :type test: int[]

    :param X_treat: Optional matrix of covariates for treated units
    :type X_treat: coercible to :class:`numpy.matrix`

    :param Y_treat: Optional matrix of outcomes for treated units
    :type Y_treat: ``None`` or coercible to :class:`numpy.matrix`

    :param FoldNumber: Unused, for API compatibility only.
    :type FoldNumber: ``None``

    :param grad_splits: Splits of Fitted vs. Control units in each gradient
                       descent step. An integer, or a list/generator of train
                       and test units for each fold of the gradient descent.
    :type grad_splits: int or int[][], optional

    :param progress: Should progress messages be printed to the console?
    :type progress: boolean

    :param kwargs: additional arguments passed to the underlying matrix method

    :raises ValueError: when X, Y, X_treat, or Y_treat are not coercible to a
       :class:`numpy.matrix` or have incompatible dimensions

    :raises RuntimeError: When a MemoryError is raised and grad_splits
        (which reduces memory requirements) is not used.

    :returns: tuple containing the matrix of covariate weights, the unit
        weights penalty, and the out-of-sample score
    :rtype: tuple
    """
    if (X_treat is None) != (Y_treat is None):
        raise ValueError(
            "parameters `X_treat` and `Y_treat` must both be Matrices or None")

    if X_treat is not None:
        # >> K-fold validation on the Treated units; assuming that Y and
        # Y_treat are pre-intervention outcomes

        # PARAMETER QC
        try:
            X = np.asmatrix(X)
        except ValueError:
            raise ValueError("X is not coercible to a matrix")
        try:
            Y = np.asmatrix(Y)
        except ValueError:
            raise ValueError("Y is not coercible to a matrix")
        if X_treat.shape[1] == 0:
            raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0:
            raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError(
                "X_treat and Y_treat have different number of rows (%s and %s)"
                % (X_treat.shape[0], Y_treat.shape[0]))

        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE w_pen
        # note that the weights, score, and loss function value returned here
        # are for the in-sample predictions
        _, v_mat, _, _, w_pen, _ = ct_v_matrix(
            X=np.vstack((X, X_treat[train, :])),
            Y=np.vstack((Y, Y_treat[train, :])),
            treated_units=[X.shape[0] + i for i in range(len(train))],
            **kwargs)

        # GET THE OUT-OF-SAMPLE PREDICTION ERROR
        s = ct_score(
            X=np.vstack((X, X_treat[test, :])),
            Y=np.vstack((Y, Y_treat[test, :])),
            treated_units=[X.shape[0] + i for i in range(len(test))],
            V=v_mat,
            w_pen=w_pen,
        )

    else:  # X_treat *is* None
        # >> K-fold validation on the control units only; assuming that Y
        # contains post-intervention outcomes
        if grad_splits is not None:

            try:
                iter(grad_splits)
            except TypeError:
                # not iterable
                pass
            else:
                # TRIM THE GRAD SPLITS TO THE TRAINING SET

                # inspired by R's match() function
                match = lambda a, b: np.concatenate(
                    [np.where(a == x)[0] for x in b])
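                # e.g. match(np.array([10, 20, 30]), [30, 10]) -> array([2, 0])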

                grad_splits = [(match(train, _X), match(train, _Y))
                               for _X, _Y in grad_splits]

            # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE w_pen
            # note that the weights, score, and loss function value returned
            # here are for the in-sample predictions
            _, v_mat, _, _, w_pen, _ = fold_v_matrix(
                X=X[train, :],
                Y=Y[train, :],
                # treated_units = [X.shape[0] + i for i in  range(len(train))],
                grad_splits=grad_splits,
                **kwargs)

            # GET THE OUT-OF-SAMPLE PREDICTION ERROR (could also use loo_score, actually...)
            s = ct_score(
                X=X,
                Y=Y,  # formerly: fold_score
                treated_units=test,
                V=v_mat,
                w_pen=w_pen,
            )

        else:

            # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE w_pen
            # note that the weights, score, and loss function value returned
            # here are for the in-sample predictions
            try:
                _, v_mat, _, _, w_pen, _ = loo_v_matrix(
                    X=X[train, :],
                    Y=Y[train, :],
                    # treated_units = [X.shape[0] + i for i in  range(len(train))],
                    **kwargs)
            except MemoryError:
                raise RuntimeError(
                    "MemoryError encountered.  Try setting `grad_splits` " +
                    "parameter to reduce memory requirements.")

            # GET THE OUT-OF-SAMPLE PREDICTION ERROR
            s = ct_score(X=X, Y=Y, treated_units=test, V=v_mat, w_pen=w_pen)

    return v_mat, w_pen, s
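
A minimal usage sketch for the treated-units branch above. The toy data, the KFold splitter, and the assumption that ct_v_matrix's defaults cover any remaining keyword arguments are illustrative, not taken from the source:

import numpy as np
from sklearn.model_selection import KFold

# hypothetical toy data: 20 control and 4 treated units, 5 covariates, 8 pre-period outcomes
X, Y = np.random.rand(20, 5), np.random.rand(20, 8)
X_treat, Y_treat = np.random.rand(4, 5), np.random.rand(4, 8)

scores = []
for fold, (train, test) in enumerate(KFold(n_splits=2).split(X_treat)):
    # each fold fits V and w_pen on the training treated units and scores the held-out ones
    _v_mat, _w_pen, score = score_train_test(
        X, Y, train, test,
        X_treat=X_treat, Y_treat=Y_treat,
        FoldNumber=fold)
    scores.append(score)
print("mean out-of-sample score:", np.mean(scores))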
Example #2
def tensor(X, Y, X_treat=None, Y_treat=None, grad_splits=None, **kwargs):
    """ Presents a unified api for ct_v_matrix and loo_v_matrix
    """
    # PARAMETER QC
    try:
        X = np.float64(X)
    except ValueError:
        raise ValueError("X is not coercible to float64")
    try:
        Y = np.float64(Y)
    except ValueError:
        raise ValueError("Y is not coercible to float64")

    # np.asmatrix should eventually be deprecated here, because
    # Array.dot(Array) != matrix(Array).dot(matrix(Array))
    Y = np.asmatrix(Y)
    X = np.asmatrix(X)

    if X.shape[1] == 0:
        raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0:
        raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]:
        raise ValueError(
            "X and Y have different number of rows (%s and %s)"
            % (X.shape[0], Y.shape[0])
        )

    if (X_treat is None) != (Y_treat is None):
        raise ValueError(
            "parameters `X_treat` and `Y_treat` must both be Matrices or None"
        )

    if X_treat is not None:
        # Fit the Treated units to the control units; assuming that Y contains
        # pre-intervention outcomes:

        # PARAMETER QC
        try:
            X_treat = np.float64(X_treat)
        except ValueError:
            raise ValueError("X_treat is not coercible to float64")
        try:
            Y_treat = np.float64(Y_treat)
        except ValueError:
            raise ValueError("Y_treat is not coercible to float64")

        # np.asmatrix should eventually be deprecated here, because
        # Array.dot(Array) != matrix(Array).dot(matrix(Array))
        Y_treat = np.asmatrix(Y_treat)
        X_treat = np.asmatrix(X_treat)

        if X_treat.shape[1] == 0:
            raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0:
            raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError(
                "X_treat and Y_treat have different number of rows (%s and %s)"
                % (X_treat.shape[0], Y_treat.shape[0])
            )

        # FIT THE V-MATRIX AND POSSIBLY CALCULATE THE w_pen
        # note that the weights, score, and loss function value returned here
        # are for the in-sample predictions
        _, v_mat, _, _, _, _ = ct_v_matrix(
            X=np.vstack((X, X_treat)),
            Y=np.vstack((Y, Y_treat)),
            control_units=np.arange(X.shape[0]),
            treated_units=np.arange(X_treat.shape[0]) + X.shape[0],
            **kwargs
        )

    else:
        # Fit the control units to themselves; Y may contain post-intervention outcomes:

        # flag used only by the commented-out debugging print below;
        # .get avoids a KeyError when w_pen is not supplied
        adjusted = kwargs.get("w_pen", 1) < 1

        if grad_splits is not None:
            _, v_mat, _, _, _, _ = fold_v_matrix(
                X=X,
                Y=Y,
                control_units=np.arange(X.shape[0]),
                treated_units=np.arange(X.shape[0]),
                grad_splits=grad_splits,
                **kwargs
            )

            #if adjusted:
            #    print("vmat: %s" % (np.diag(v_mat)))

        else:
            _, v_mat, _, _, _, _ = loo_v_matrix(
                X=X,
                Y=Y,
                control_units=np.arange(X.shape[0]),
                treated_units=np.arange(X.shape[0]),
                **kwargs
            )
    return v_mat
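
A hedged sketch of calling tensor in the treated-vs-control configuration. The toy data and the w_pen value are assumptions; extra keyword arguments are simply forwarded to ct_v_matrix:

import numpy as np

# hypothetical toy data: 30 control and 6 treated units
X, Y = np.random.rand(30, 4), np.random.rand(30, 10)
X_treat, Y_treat = np.random.rand(6, 4), np.random.rand(6, 10)

# fit the treated units to the controls; w_pen (an assumed value) is passed through to ct_v_matrix
V = tensor(X, Y, X_treat=X_treat, Y_treat=Y_treat, w_pen=0.01)
print(np.diag(V))  # per-covariate weights sit on the diagonal of V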
Example #3
def get_max_v_pen(X, Y, w_pen=None, X_treat=None, Y_treat=None, **kwargs):
    """ 
    Calculates maximum value of v_pen for which the elements of tensor
    matrix (V) are not all zero conditional on the provided w_pen.  If w_pen is
    not provided, a guestimate is used.

    Provides a unified wrapper to the various *_v_matrix functions, passing the
    parameter ``return_max_v_pen = True`` in order to obtain the gradient
    instead of he matrix
    """

    # PARAMETER QC
    try:
        X = np.float64(X)
    except ValueError:
        raise ValueError("X is not coercible to float64")
    try:
        Y = np.float64(Y)
    except ValueError:
        raise ValueError("Y is not coercible to float64")
    # np.asmatrix should eventually be deprecated here, because
    # Array.dot(Array) != matrix(Array).dot(matrix(Array))
    Y = np.asmatrix(Y)
    X = np.asmatrix(X)

    if (X_treat is None) != (Y_treat is None):
        raise ValueError(
            "parameters `X_treat` and `Y_treat` must both be Matrices or None")
    if X.shape[1] == 0:
        raise ValueError("X.shape[1] == 0")
    if Y.shape[1] == 0:
        raise ValueError("Y.shape[1] == 0")
    if X.shape[0] != Y.shape[0]:
        raise ValueError("X and Y have different number of rows (%s and %s)" %
                         (X.shape[0], Y.shape[0]))
    if w_pen is None:
        w_pen = np.mean(np.var(X, axis=0))
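        # i.e. default to the mean per-covariate variance of X as a rough scale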

    if X_treat is not None:

        # PARAMETER QC
        if not isinstance(X_treat, np.matrix):
            raise TypeError("X_treat is not a matrix")
        if not isinstance(Y_treat, np.matrix):
            raise TypeError("Y_treat is not a matrix")
        if X_treat.shape[1] == 0:
            raise ValueError("X_treat.shape[1] == 0")
        if Y_treat.shape[1] == 0:
            raise ValueError("Y_treat.shape[1] == 0")
        if X_treat.shape[0] != Y_treat.shape[0]:
            raise ValueError(
                "X_treat and Y_treat have different number of rows (%s and %s)"
                % (X_treat.shape[0], Y_treat.shape[0]))

        control_units = np.arange(X.shape[0])
        treated_units = np.arange(X.shape[0], X.shape[0] + X_treat.shape[0])

        try:
            iter(w_pen)  # probe whether w_pen is iterable
        except TypeError:
            # w_pen is a single value
            return ct_v_matrix(X=np.vstack((X, X_treat)),
                               Y=np.vstack((Y, Y_treat)),
                               w_pen=w_pen,
                               control_units=control_units,
                               treated_units=treated_units,
                               return_max_v_pen=True,
                               gradient_message=_GRADIENT_MESSAGE,
                               **kwargs)

        else:
            # w_pen is an iterable of values
            return [
                ct_v_matrix(X=np.vstack((X, X_treat)),
                            Y=np.vstack((Y, Y_treat)),
                            control_units=control_units,
                            treated_units=treated_units,
                            return_max_v_pen=True,
                            gradient_message=_GRADIENT_MESSAGE,
                            w_pen=_w_pen,
                            **kwargs) for _w_pen in w_pen
            ]

    else:

        try:
            iter(w_pen)  # probe whether w_pen is iterable
        except TypeError:
            if "grad_splits" in kwargs:
                # w_pen is a single value
                return fold_v_matrix(X=X,
                                     Y=Y,
                                     w_pen=w_pen,
                                     return_max_v_pen=True,
                                     gradient_message=_GRADIENT_MESSAGE,
                                     **kwargs)
            # w_pen is a single value
            try:
                return loo_v_matrix(X=X,
                                    Y=Y,
                                    w_pen=w_pen,
                                    return_max_v_pen=True,
                                    gradient_message=_GRADIENT_MESSAGE,
                                    **kwargs)
            except MemoryError:
                raise RuntimeError(
                    "MemoryError encountered.  Try setting `grad_splits` "
                    "parameter to reduce memory requirements.")
        else:
            if "grad_splits" in kwargs:

                # w_pen is an iterable of values
                return [
                    fold_v_matrix(X=X,
                                  Y=Y,
                                  w_pen=_w_pen,
                                  return_max_v_pen=True,
                                  gradient_message=_GRADIENT_MESSAGE,
                                  **kwargs) for _w_pen in w_pen
                ]

            # w_pen is an iterable of values
            try:
                return [
                    loo_v_matrix(X=X,
                                 Y=Y,
                                 w_pen=_w_pen,
                                 return_max_v_pen=True,
                                 gradient_message=_GRADIENT_MESSAGE,
                                 **kwargs) for _w_pen in w_pen
                ]
            except MemoryError:
                raise RuntimeError(
                    "MemoryError encountered.  Try setting `grad_splits` "
                    "parameter to reduce memory requirements.")