Code Example #1
import numpy as np
from sklearn.linear_model import HuberRegressor
from sklearn.preprocessing import QuantileTransformer
# rolling_window_er is a project-specific helper and is not shown here.
def predict_er(X, E, window=0.21, step=10, q=5, use_box_cox=True):
    qt = QuantileTransformer(n_quantiles=q, random_state=0)

    lr = HuberRegressor()
    lr.fit(X, E)

    E_pred = lr.predict(X)
    idx_sorted = np.argsort(E)
    X = X[idx_sorted]
    E_pred = E_pred[idx_sorted]
    E = E[idx_sorted]
    
    # log transform (the Box-Cox case lambda=0) + quantile transformation so that the data lies uniformly in the interval [0, 1]
    if use_box_cox:
        E_quantile = qt.fit_transform(np.log(E).reshape(-1, 1)).reshape(-1)
    else:
        E_quantile = qt.fit_transform(E.reshape(-1, 1)).reshape(-1)
    
    x, y = rolling_window_er(E_quantile, (E - E_pred) / E, window=window, step=step)
    if use_box_cox:
        x = np.exp(qt.inverse_transform(x.reshape(-1, 1)).reshape(-1))
    else:
        x = qt.inverse_transform(x.reshape(-1, 1)).reshape(-1)

    return x, y
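A note on the composition above: because predict_er stacks a log transform under the uniform quantile map, the inverse must be applied in reverse order (quantile inverse first, then exp). A minimal self-contained round-trip sketch on synthetic data, not part of the original project:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
E = rng.lognormal(mean=0.0, sigma=1.0, size=200)  # strictly positive, right-skewed

qt = QuantileTransformer(n_quantiles=5, random_state=0)
E_uniform = qt.fit_transform(np.log(E).reshape(-1, 1)).reshape(-1)  # roughly uniform on [0, 1]

# invert in reverse order: quantile inverse first, then exp to undo the log
E_back = np.exp(qt.inverse_transform(E_uniform.reshape(-1, 1)).reshape(-1))
print(np.abs(E - E_back).max())  # ~0 up to interpolation/float error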
Code Example #2
# Method excerpted from a larger class; it relies on:
#   import numpy as np
#   import statsmodels.api as sm
#   from sklearn.model_selection import KFold
#   from sklearn.preprocessing import QuantileTransformer
# self.convert_x, self.train_df, self.test_df, and self.target are defined elsewhere in the class.
    def fit(self):
        folds = KFold(n_splits=10, random_state=11, shuffle=True)
        y = self.train_df[self.target]
        oof_preds = np.zeros(len(self.train_df))
        qt = QuantileTransformer(output_distribution='normal', random_state=11)
        qt.fit(y.values.reshape((-1, 1)))
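        # note: qt is fit on the full training target before the CV split, so the folds share quantile information (a mild form of target leakage)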
        test_preds = []
        y = qt.transform(y.values.reshape((-1, 1)))
        X = np.asarray(list(range(0, len(self.train_df))))
        X_test = np.asarray(list(range(len(self.train_df), len(self.train_df) + len(self.test_df))))
        X_test = self.convert_x(X_test)
        for tr_idx, val_idx in folds.split(X):
            X_tr, X_val = X[tr_idx], X[val_idx]
            y_tr, y_val = y[tr_idx], y[val_idx]

            X_tr = self.convert_x(X_tr)
            X_val = self.convert_x(X_val)

            mod = sm.OLS(y_tr, X_tr)
            res = mod.fit()
            oof_preds[val_idx] = qt.inverse_transform(res.predict(X_val).reshape((-1, 1))).ravel()

            test_preds.append(qt.inverse_transform(res.predict(X_test).reshape((-1, 1))).ravel())

        test_preds = np.mean(test_preds, axis=0)
        
        return test_preds, oof_preds, res
Code Example #3
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import QuantileTransformer
# QuantileTransformerTF is the project class under test; its import is omitted in the original.
def test_transform_default_params():
    N = 1000
    rng = np.random.RandomState(22922)
    data = np.stack([
        rng.lognormal(10, 5, N),
        rng.uniform(-10, 0, N),
        rng.normal(10, 10, N),
        rng.normal(-1, 1, N)
    ],
                    axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=3434)
    data_transformed_sk = transformer.fit_transform(data)
    data_double_transformed_sk = transformer.inverse_transform(
        data_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_sk)

    transformer_tf = QuantileTransformerTF(transformer)
    data_transformed_tf = transformer_tf.transform(data.astype(np.float64),
                                                   False)
    data_double_transformed_tf = transformer_tf.transform(
        data_transformed_tf, True)

    with tf.Session() as session:
        data_transformed_tf_val, data_double_transformed_tf_val = session.run(
            [data_transformed_tf, data_double_transformed_tf])
    np.testing.assert_allclose(data_transformed_sk, data_transformed_tf_val)
    np.testing.assert_allclose(data, data_double_transformed_tf_val)
Code Example #4
File: scalers.py  Project: zds0/dfencoder
from sklearn.preprocessing import QuantileTransformer
class GaussRankScaler(object):
    """
    So-called "Gauss Rank" scaling.
    Forces a transformation, uses bins to perform
        inverse mapping.

    Uses sklearn QuantileTransformer to work.
    """
    def __init__(self):
        self.transformer = QuantileTransformer(output_distribution='normal')

    def fit(self, x):
        x = x.reshape(-1, 1)
        self.transformer.fit(x)

    def transform(self, x):
        x = x.reshape(-1, 1)
        result = self.transformer.transform(x)
        return result.reshape(-1)

    def inverse_transform(self, x):
        x = x.reshape(-1, 1)
        result = self.transformer.inverse_transform(x)
        return result.reshape(-1)

    def fit_transform(self, x):
        self.fit(x)
        return self.transform(x)
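A usage sketch for GaussRankScaler on synthetic skewed data (not from dfencoder); the scaler expects a 1-D array and returns an approximately standard-normal result:

import numpy as np

rng = np.random.RandomState(42)
x = rng.exponential(scale=2.0, size=5000)   # heavily right-skewed input

scaler = GaussRankScaler()
z = scaler.fit_transform(x)                 # approximately N(0, 1)
x_back = scaler.inverse_transform(z)        # recovers x up to interpolation error
print(round(float(z.mean()), 3), round(float(z.std()), 3))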
Code Example #5
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import QuantileTransformer
# QuantileTransformerTF is the project class under test; its import is omitted in the original.
def test_transform():
    N = 10000
    rng = np.random.RandomState(223532)
    data_2 = rng.normal(0, 1, N // 4)
    data = np.stack([
        rng.uniform(-10, 10, N),
        rng.lognormal(10, 5, N),
        np.concatenate([data_2] * 4),
        rng.normal(-1, 1, N)
    ],
                    axis=1)
    transformer = QuantileTransformer(output_distribution="normal",
                                      random_state=34214)
    data_transformed_sk = transformer.fit_transform(data)
    data_double_transformed_sk = transformer.inverse_transform(
        data_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_sk)

    # To test that QuantileTransformerTF picks up the right columns
    # we ask it only for [1, 2, 3] columns and when testing use data[:, 1:]
    transformer_tf = QuantileTransformerTF(transformer, [1, 2, 3],
                                           dtype=np.float64)
    data_transformed_tf = transformer_tf.transform(
        data[:, 1:].astype(np.float64), False)
    data_double_transformed_tf = transformer_tf.inverse_transform(
        data_transformed_tf)

    with tf.Session() as session:
        data_transformed_tf_val, data_double_transformed_tf_val = session.run(
            [data_transformed_tf, data_double_transformed_tf])
    np.testing.assert_allclose(data_transformed_sk[:, 1:],
                               data_transformed_tf_val)
    np.testing.assert_allclose(data[:, 1:], data_double_transformed_tf_val)
Code Example #6
import numpy
# stop_out is a project-specific helper and is not shown here.
class Distributed:
    def __init__(self, my_name):
        self.my_name = my_name
        self.store = None

    def say_my_name(self):
        return self.my_name

    def fit(self, array):

        if self.my_name == 'Nothing':

            pass

        elif self.my_name == 'Simple':

            self.store = {'mean': array.mean(), 'std': array.std(ddof=1)}

        elif self.my_name == 'Normal':

            # from sklearn.preprocessing import PowerTransformer
            # self.store = PowerTransformer()
            from sklearn.preprocessing import QuantileTransformer
            self.store = QuantileTransformer(output_distribution='normal')
            arr = array.copy().astype(dtype=numpy.float64)
            self.store.fit(arr)

        elif self.my_name == 'Uniform':

            # from sklearn.preprocessing import PowerTransformer
            # self.store = PowerTransformer()
            from sklearn.preprocessing import QuantileTransformer
            self.store = QuantileTransformer(output_distribution='uniform')
            arr = array.copy().astype(dtype=numpy.float64)
            self.store.fit(arr)

        else:

            raise NotImplementedError("Not Yet!")

    def forward(self, array):

        if self.my_name == 'Nothing':

            arr = array.copy()

            return arr

        elif self.my_name == 'Simple':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = (arr - self.store['mean']) / self.store['std']
            arr = stop_out(arr)

            return arr

        elif self.my_name == 'Normal':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = self.store.transform(arr)
            arr = stop_out(arr)

            return arr

        elif self.my_name == 'Uniform':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = self.store.transform(arr)
            arr = stop_out(arr)

            return arr

        else:

            raise NotImplementedError("Not Yet!")

    def backward(self, array):

        if self.my_name == 'Nothing':

            arr = array.copy()

            return arr

        elif self.my_name == 'Simple':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = arr * self.store['std'] + self.store['mean']
            arr = stop_out(arr)

            return arr

        elif self.my_name == 'Normal':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = self.store.inverse_transform(arr)
            arr = stop_out(arr)

            return arr

        elif self.my_name == 'Uniform':

            arr = array.copy().astype(dtype=numpy.float64)
            arr = self.store.inverse_transform(arr)
            arr = stop_out(arr)

            return arr

        else:

            raise NotImplementedError("Not Yet!")
Code Example #7
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
class CustomQuantileTransformer(TransformerMixin, BaseEstimator):
    def __init__(
        self,
        cols=None,
        n_quantiles=1000,
        output_distribution="normal",
        random_state=42,
        **kwargs,
    ):
        """
        cols: pass column names
        n_quantiles:
        """
        if isinstance(cols, str):
            self.cols = [cols]
        else:
            self.cols = cols
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.random_state = random_state

    def fit(self, X, y=None):
        """
        fit
        """
        self.quant_trans = QuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            random_state=self.random_state,
        )
        if isinstance(X, pd.DataFrame):
            self.quant_trans.fit(X[self.cols])
        elif isinstance(X, np.ndarray):
            self.quant_trans.fit(X)
        else:
            raise ValueError("input should be DataFrame or array")
        return self

    def transform(self, X):
        """
        transform
        """
        if isinstance(X, pd.DataFrame):
            Xo = self.quant_trans.transform(X[self.cols])
            Xo = pd.DataFrame(Xo, columns=self.cols, index=X.index)
            Xo = pd.concat([X.drop(self.cols, axis=1), Xo], axis=1)
        elif isinstance(X, np.ndarray):
            Xo = self.quant_trans.transform(X)
        else:
            raise ValueError("input should be DataFrame or array")
        return Xo

    def inverse_transform(self, X):
        """
        inverse_transform
        """
        if isinstance(X, pd.DataFrame):
            Xo = self.quant_trans.inverse_transform(X[self.cols])
            Xo = pd.DataFrame(Xo, columns=self.cols, index=X.index)
            Xo = pd.concat([X.drop(self.cols, axis=1), Xo], axis=1)
        elif isinstance(X, np.ndarray):
            Xo = self.quant_trans.inverse_transform(X)
        else:
            raise ValueError("input should be DataFrame or array")

        return Xo
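A usage sketch for CustomQuantileTransformer on a toy DataFrame (not from the original project). Note that transform re-appends the scaled columns, so the column order changes:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({
    "a": rng.lognormal(size=100),
    "b": rng.normal(size=100),
    "c": rng.uniform(size=100),
})

cqt = CustomQuantileTransformer(cols=["a", "c"], n_quantiles=100)
df_t = cqt.fit_transform(df)       # columns come back in the order b, a, c
df_back = cqt.inverse_transform(df_t)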
Code Example #8
    for i, n, a in dataset:
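        # counter, qt, node_a, node_b, arr, and train_step are defined earlier in the enclosing script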
        print(counter)
        dim = int(math.sqrt(int(a.shape[0])))
        i = np.reshape(i, (1, 3, 512, 512))
        n = np.reshape(n, (dim, 2))
        a = np.reshape(a, (dim, dim))

        i = torch.Tensor(i).cuda()
        n = torch.Tensor(n).cuda()
        a = torch.Tensor(a).cuda()
        inputi = torch.cat((i, arr), 1)
        metric, nodes_out, edges_out = train_step(inputi, n, a, dim, optimizer)
        node_features = nodes_out.cpu().detach().numpy()
        new_adj = edges_out.cpu().detach().numpy()
        i = i.cpu().detach().numpy()

        node_features = (node_features - node_b) / node_a
        node_features = qt.inverse_transform(node_features)

        #new_adj = np.where(new_adj>.5, 1.0 , 0)
        np.savetxt('./data/output/adj' + str(counter) + '.txt', new_adj)
        np.savetxt('./data/output/node' + str(counter) + '.txt', node_features)
        image = i * 255.0
        image = np.reshape(image, (512, 512, 3))
        image = np.asarray(image, dtype=np.uint8)

        cv2.imwrite('./data/output/img' + str(counter) + '.png', image)
        counter += 1
        if counter == 6:
            break
Code Example #9
import warnings
import numpy as np
from scipy import interpolate
from scipy.special import erf
from scipy.stats import norm
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import QuantileTransformer
from sklearn.utils import check_array
from sklearn.utils.validation import FLOAT_DTYPES
# kde_process_data, kde_bw, kde_make_transformers, and kde_smooth_peaks_1dim
# are project-specific KDE helpers not shown here.
class KDEQuantileTransformer(TransformerMixin, BaseEstimator):
    """ Quantile transformer that uses, for each variable, the CDF obtained with kernel density estimation
    """
    def __init__(self,
                 n_quantiles=1000,
                 output_distribution='uniform',
                 smooth_peaks=True,
                 mirror_left=None,
                 mirror_right=None,
                 rho=0.5,
                 n_adaptive=1,
                 x_min=None,
                 x_max=None,
                 n_integral_bins=1000,
                 use_KDE=True,
                 use_inverse_qt=False,
                 random_state=0,
                 copy=True):
        """ Parameters with the class KDEQuantileTransformer

        KDEQuantileTransformer is a quantile tranformer class using for each variable the CDF obtained with
        kernel density estimation. Besides normal transformation functions, the class also provides the jacobian
        and inverse jacobian of the transformation and inverse transformation respectively.

        The KDE quantile transformation happens in four steps, two of which are transformations:

        1. First KDE PDFs and CDFs are formed for all marginalized input variables.
        2. Using the (smooth) CDFs, all input variables are transformed to uniform distributions.
        3. Using the existing quantile transformer of sklearn, these uniform distributions are then transformed to
           normal distributions.
        4. The KDE PDFs are used to calculate the (inverse) jacobian of the transformation.

        Concerning KDE evaluation of the PDF and CDF, the adaptive bandwidths are evaluated with the equations described in:
        Cranmer KS, Kernel Estimation in High-Energy Physics. Computer Physics Communications 136:198-207, 2001
        e-Print Archive: hep-ex/0011057

        In theory both transformations could be combined into one, but there are practical advantages to using two.
        Essentially the second transformation is a backup for the first one, to smooth out residual bumps.
        For certain edge-case distributions, for example those with odd discrete peaks at the edge
        of a distribution, a single transformation may fail, in which case doing two quantile
        transformations catches any potential imperfections in the first.
        In the inverse transformation, however, the two transformations are by default combined into one,
        because otherwise the impact of the KDE smoothing would be cancelled.

        :param int n_quantiles: number of quantiles/bins used in output histogram. If greater than number of samples,
            this is reset to number of samples. Default is 1000.
        :param str output_distribution: 'uniform' or 'normal' distribution.
        :param bool smooth_peaks: if False, do not smear peaks of non-unique values.
        :param mirror_left: array. Mirror the data on a value on the left to counter signal leakage.
            Default is None, which is no mirroring.
        :param mirror_right: array. Mirror the data on a value on the right to counter signal leakage.
            Default is None, which is no mirroring.
        :param float rho: KDE bandwidth scale parameter. default is 0.5.
        :param int n_adaptive: KDE number of adaptive iterations to be applied to improve the band width. default is 1.
        :param x_min: array. minimum value of pdf's x range. default is None (= - inf)
        :param x_max: array. maximum value of pdf's x range. default is None (= + inf)
        :param int n_integral_bins: for internal evaluation, number of integration bins beyond x-range. default is 1000.
        :param bool use_KDE: Default is True. If false, KDE smoothing is off, using default quantile transformation.
        :param bool use_inverse_qt: Default is False. If true, KDE is not used in inverse transformation.
        :param int random_state: when an integer, the seed given to the random generator.
        :param copy: Copy the data before transforming. Default is True.
        """
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.smooth_peaks = smooth_peaks
        self.n_adaptive = n_adaptive
        self.copy = copy
        self.use_inverse_qt = use_inverse_qt
        self.use_KDE = use_KDE
        self.n_integral_bins = max(n_integral_bins, 1000)
        self.random_state = random_state

        # integration range
        self.x_min = np.array(x_min) if isinstance(x_min, (list, tuple, np.ndarray)) else None
        self.x_max = np.array(x_max) if isinstance(x_max, (list, tuple, np.ndarray)) else None

        # left and right-hand mirror points
        self.mirror_left = np.array(mirror_left) if isinstance(mirror_left, (list, tuple, np.ndarray)) else None
        self.mirror_right = np.array(mirror_right) if isinstance(mirror_right, (list, tuple, np.ndarray)) else None

        # copy x ranges if mirror points not set
        self.mirror_left = self.x_min if self.mirror_left is None else self.mirror_left
        self.mirror_right = self.x_max if self.mirror_right is None else self.mirror_right

        # bandwidth rescaling factor
        self.rho = np.array(rho) if isinstance(rho, (list, tuple, np.ndarray)) else rho

        # basic checks on attributes
        if self.n_quantiles <= 0:
            raise ValueError("Invalid value for 'n_quantiles': %d. The number of quantiles must be at least one."
                             % self.n_quantiles)
        if self.output_distribution not in ('normal', 'uniform'):
            raise ValueError("'output_distribution' has to be either 'normal' or 'uniform'. Got '%s' instead."
                             % self.output_distribution)
        if (isinstance(self.rho, np.ndarray) and any([r <= 0 for r in self.rho])) or \
                (isinstance(self.rho, (float, np.number)) and self.rho <= 0):
            raise ValueError("Invalid value(s) for 'rho': %f. The number(s) must be greater than zero." % self.rho)
        if self.n_adaptive < 0:
            raise ValueError("Invalid value for 'n_adaptive': %d. Must be non-negative." % self.n_adaptive)

    def fit(self, X, y=None):
        """Compute the kde-based quantiles used for transforming.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :param y: Ignored
        :return: self : object
        """
        X = check_array(X, copy=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        # sample profiles
        n_samples, n_features = X.shape

        # continuation of basic checks, now that we know X
        if isinstance(self.rho, np.ndarray):
            if self.rho.shape[0] != n_features:
                raise ValueError("Invalid size of 'rho': %d. The number should match the data: %d."
                                 % (self.rho.shape[0], n_features))
        else:
            self.rho = np.array([self.rho] * n_features)
        if isinstance(self.mirror_left, np.ndarray):
            if self.mirror_left.shape[0] != n_features:
                raise ValueError("Invalid size of 'mirror_left': %d. The number should match the data: %d."
                                 % (self.mirror_left.shape[0], n_features))
        else:
            self.mirror_left = np.array([None] * n_features)
        if isinstance(self.mirror_right, np.ndarray):
            if self.mirror_right.shape[0] != n_features:
                raise ValueError("Invalid size of 'mirror_right': %d. The number should match the data: %d."
                                 % (self.mirror_right.shape[0], n_features))
        else:
            self.mirror_right = np.array([None] * n_features)
        if isinstance(self.x_min, np.ndarray):
            if self.x_min.shape[0] != n_features:
                raise ValueError("Invalid size of 'x_min': %d. The number should match the data: %d."
                                 % (self.x_min.shape[0], n_features))
        else:
            self.x_min = np.array([None] * n_features)
        if isinstance(self.x_max, np.ndarray):
            if self.x_max.shape[0] != n_features:
                raise ValueError("Invalid size of 'x_max': %d. The number should match the data: %d."
                                 % (self.x_max.shape[0], n_features))
        else:
            self.x_max = np.array([None] * n_features)

        # number of quantiles cannot be higher than number of data points. If so, reset.
        if self.n_quantiles > n_samples:
            warnings.warn("n_quantiles (%s) is greater than the total number "
                          "of samples (%s). n_quantiles is set to "
                          "n_samples."
                          % (self.n_quantiles, n_samples))
        self.n_quantiles = max(1, min(self.n_quantiles, n_samples))

        # set the (x_min, x_max) transformation range
        # if not set, by default widen the range beyond min/max to account for signal leakage
        if any([x is None for x in self.x_min]) or any([x is None for x in self.x_max]):
            gstd = np.std(X, axis=0)
            bw = np.power(4 / 3, 0.2) * gstd * np.power(n_samples, -0.2)
            min_orig = np.min(X, axis=0) - 10 * bw
            max_orig = np.max(X, axis=0) + 10 * bw
            for i in range(n_features):
                self.x_min[i] = min_orig[i] if (self.x_min[i] is None and gstd[i] > 0) else self.x_min[i]
                self.x_max[i] = max_orig[i] if (self.x_max[i] is None and gstd[i] > 0) else self.x_max[i]

        if self.use_KDE:
            # Do the actual KDE fit (to uniform distributions)
            self._kde_fit(X)
            # prepare X to do quantile transformer fit.
            # add extreme points so QT knows the true edges for inverse transformation after sampling
            X = self._kde_transform(X)
            low = np.array([[0] * X.shape[1]])
            high = np.array([[1] * X.shape[1]])
            X = np.concatenate([X, low, high], axis=0)
        elif self.smooth_peaks:
            X = self._smooth_peaks(X)
            # create pdf for quantile transformation
            self._qt_pdf(X)

        # perform quantile transformation to smooth out any residual imperfections after kde
        # standard quantile transformer helps to smooth out any residual imperfections after kde transformation,
        # and does conversion to normal.
        self.qt_ = QuantileTransformer(
            n_quantiles=self.n_quantiles, 
            output_distribution=self.output_distribution,
            copy=self.copy,
            random_state=self.random_state,
        )
        self.qt_.fit(X)

        return self

    def _qt_pdf(self, X, min_pdf_value=1e-20):
        """Internal function to make quantile transformer pdf

        Is only run when use_KDE=False

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        """
        self.pdf_ = []

        n_samples, n_features = X.shape
        ps = np.linspace(0, 1, self.n_quantiles + 1)

        # calculate quantiles and pdf
        for i in range(n_features):
            x = X[:, i]
            bin_edges = np.quantile(x, ps)
            bin_entries = [1./self.n_quantiles] * self.n_quantiles
            bin_diffs = np.diff(bin_edges)
            pdf_norm = np.divide(bin_entries, bin_diffs, out=np.zeros_like(bin_entries), where=bin_diffs != 0)
            # ensure interpolate works up to last bin edge, somehow ignored otherwise
            pdf_norm = np.concatenate([pdf_norm, [pdf_norm[-1]]])
            fast_pdf = interpolate.interp1d(bin_edges, pdf_norm, kind='previous', bounds_error=False,
                                            fill_value=(min_pdf_value, min_pdf_value))
            self.pdf_.append({'fast': fast_pdf, 'bin_edges': bin_edges, 'bin_entries': bin_entries})

    def _kde_fit(self, X):
        """Internal function to compute the kde-based quantiles used for transforming.

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: self : object
        """
        # reset
        self.pdf_ = []
        self.cdf_ = []

        n_features = X.shape[1]

        for i in range(n_features):
            # do kde fit, store each pdf
            bin_entries, bin_mean = kde_process_data(X[:, i], self.n_quantiles, self.smooth_peaks,
                                                     self.mirror_left[i], self.mirror_right[i],
                                                     random_state=self.random_state)
            band_width = kde_bw(bin_mean, bin_entries, self.rho[i], self.n_adaptive)
            # transformers to uniform distribution and back
            fast_pdf, F, Finv, kde_norm = kde_make_transformers(bin_mean, bin_entries, band_width,
                                                                x_min=self.x_min[i], x_max=self.x_max[i],
                                                                n_bins=self.n_integral_bins)
            # store cdf, inverse-cdf, and pdf.
            self.cdf_.append((F, Finv))
            pdf = {'bin_entries': bin_entries, 'bin_mean': bin_mean, 'band_width': band_width,
                   'norm': kde_norm, 'fast': fast_pdf}
            self.pdf_.append(pdf)

        return self

    def _smooth_peaks(self, X):
        """Internal function to smooth non-unique peaks

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            x = kde_smooth_peaks_1dim(x, self.mirror_left[feature_idx], self.mirror_right[feature_idx],
                                      copy=False, random_state=self.random_state, smoothing_width=1e-5)
            X[:, feature_idx] = x
        return X

    def _kde_transform(self, X):
        """Internal function to transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # smooth peaks - note: this adds a random component to the data
            # applying smoothing to data that's already been smoothed has no impact, b/c all peaks are already gone.
            if self.smooth_peaks:
                x = kde_smooth_peaks_1dim(x, self.mirror_left[feature_idx], self.mirror_right[feature_idx],
                                          copy=False, random_state=self.random_state)
            # transform distribution to uniform
            y = self.cdf_[feature_idx][0](x)
            # transform uniform [0,1] distribution to normal
            # X[:, feature_idx] = np.sqrt(2.) * erfinv(2. * y - 1.) if self.output_distribution == 'normal' else y
            X[:, feature_idx] = y

        return X

    def transform(self, X):
        """Transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The transformed data
        """
        # 1. kde transformation to uniform.
        if self.use_KDE:
            X = self._kde_transform(X)
        elif self.smooth_peaks:
            X = self._smooth_peaks(X)

        # 2. quantile transformation to smooth out residual bumps and do conversion to normal distribution
        return self.qt_.transform(X)

    def _kde_inverse_transform(self, X):
        """Internal function to inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        n_features = X.shape[1]
        for feature_idx in range(n_features):
            x = X[:, feature_idx]
            # transform normal back to uniform [0,1]
            if not self.use_inverse_qt:
                x = (0.5 + 0.5 * erf(x/np.sqrt(2.))) if self.output_distribution == 'normal' else x
            # transform uniform back to original distribution
            X[:, feature_idx] = self.cdf_[feature_idx][1](x)

        return X

    def inverse_transform(self, X):
        """Inverse transform the data

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, n_features)
            The inverse-transformed data
        """
        # 1. quantile transformation back to kde
        if self.use_inverse_qt or not self.use_KDE:
            X = self.qt_.inverse_transform(X)
        # 2. inverse kde transformation
        return self._kde_inverse_transform(X) if self.use_KDE else X

    def jacobian(self, X):
        """Provide the Jacobian of the transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of each data point
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        # smoothing of peaks
        if self.smooth_peaks:
            X = self._smooth_peaks(X)

        jac = 1.0

        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            jac /= kdfi(X[:, idx])

        if self.output_distribution == 'normal':
            X = self.transform(X)
            for idx in range(X.shape[1]):
                jac *= norm.pdf(X[:, idx])

        return jac

    def inverse_jacobian(self, X):
        """Provide the Jacobian of the inverse transformation

        :param X: ndarray or sparse matrix, shape (n_samples, n_features)
            The data used to inverse scale along the features axis.
        :return: ndarray or sparse matrix, shape (n_samples, )
            An array with the jacobian of the inverse transformation of each input data point
        """
        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan")

        inv_jac = 1.0

        if self.output_distribution == 'normal':
            for idx in range(X.shape[1]):
                inv_jac /= norm.pdf(X[:, idx])

        X = self.inverse_transform(X)

        for idx in range(X.shape[1]):
            kdfi = self.pdf_[idx]['fast']
            inv_jac *= kdfi(X[:, idx])

        return inv_jac
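The jacobian()/inverse_jacobian() pair above implements the change-of-variables factor of the transformation. As a self-contained illustration of that factor (using sklearn's plain QuantileTransformer rather than the KDE version, and a hypothetical finite-difference helper):

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(1)
X = rng.gamma(shape=2.0, scale=1.0, size=(5000, 1))

qt = QuantileTransformer(output_distribution="normal", n_quantiles=1000,
                         random_state=0).fit(X)

def numeric_dz_dx(qt, x, eps=1e-4):
    """Finite-difference estimate of |dz/dx| for the fitted transform."""
    return np.abs(qt.transform(x + eps) - qt.transform(x - eps)) / (2 * eps)

x0 = np.array([[0.5], [1.0], [2.0]])
print(numeric_dz_dx(qt, x0).ravel())
# For an ideal transform this equals pdf(x) / norm.pdf(z) with z = transform(x),
# i.e. the reciprocal of the per-feature factor that jacobian() computes.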
Code Example #10
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
# pipeline, quantile_transformer, seed, and the train/test splits are defined earlier in the script.

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(pipeline, X_train, y_train_norm, cv=kfold)
print("Standardized: %.15f (%.15f) MSE" % (results.mean(), results.std()))

predictions_train_trans = cross_val_predict(pipeline,
                                            X_train,
                                            y_train_norm,
                                            cv=kfold)

#To be able to save the model and evaluate it,
#we have to fit again because cross-validation does not store the fitted estimator
pipeline.fit(X_train, y_train_norm)
predictions_train_new = quantile_transformer.inverse_transform(
    pipeline.predict(X_train))
predictions_test = quantile_transformer.inverse_transform(
    pipeline.predict(X_test))

#Score of the regression
from sklearn.metrics import mean_squared_error
print("MSE train : %.15f " %
      mean_squared_error(y_train, predictions_train_new))
print("MSE test : %.15f " % mean_squared_error(y_test, predictions_test))

with open('Results.txt', 'a+', newline='\n') as f:
    f.write('Results for 1 layer with 13 neurons \n')
    f.write('Standardized: %.15f (%.15f) MSEi \n' %
            (results.mean(), results.std()))
    f.write('MSE train : %.15f \n' %
            mean_squared_error(y_train, predictions_train_new))
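The manual fit-again-then-inverse-transform dance above can be avoided with sklearn's TransformedTargetRegressor, which wires the target transform and its inverse into the estimator itself. A minimal sketch with a stand-in Ridge regressor, since the original pipeline is not shown:

import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(7)
X = rng.normal(size=(500, 5))
y = np.exp(X @ rng.normal(size=5) + rng.normal(scale=0.1, size=500))  # skewed target

model = TransformedTargetRegressor(
    regressor=Ridge(),  # stand-in for the original pipeline
    transformer=QuantileTransformer(output_distribution="normal",
                                    n_quantiles=100, random_state=7),
)
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
scores = cross_val_score(model, X, y, cv=kfold, scoring="neg_mean_squared_error")
print(scores.mean())  # predictions are inverse-transformed back to the y scale automatically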
Code Example #11
import random
import numpy as np
import matplotlib.pyplot as plt
# model, scaler, scaled_data, data, and training_data_len are defined earlier in the script.
#Get the Test data
finalPreds = []
for i in range(14):
    x_test = []
    x_test.append(scaled_data[-60:])
    #format test data
    x_test = np.array(x_test)
    x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

    #make prediction
    predictions = model.predict(x_test)
    scaled_data = np.append(scaled_data,
                            predictions[0] + random.uniform(-0.35, 0.35))
    scaled_data = scaled_data.transpose()
    predictions = scaler.inverse_transform(predictions)
    finalPreds.append(predictions)

#Plot the results
train = data[training_data_len - 60:training_data_len]
valid = data[len(scaled_data) - len(finalPreds):]
valid['Predictions'] = finalPreds
plt.figure(figsize=(16, 8))
plt.title('COVID-19 Predicted Cases (Training)')
plt.xticks(np.arange(0, 1000, 5))
plt.xlabel('Date', fontsize=8)
plt.ylabel('Number of Cases', fontsize=18)
plt.plot(train['Cases'])
plt.plot(valid['Predictions'])
plt.legend(['Train', 'Val', 'Predictions'], loc='lower right')
plt.show()
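The loop above is recursive multi-step forecasting: each prediction is appended to the series and the last 60 values are re-windowed for the next step (here with ad-hoc noise injected). A minimal self-contained sketch of the same pattern, with a stand-in for the Keras model so it runs on its own:

import numpy as np

window = 60
history = np.sin(np.linspace(0, 20, 300))      # toy series

def predict_one_step(w):
    # stand-in for model.predict: forecast the next value as the window mean
    return np.array([w.mean()])

preds = []
for _ in range(14):
    x_test = history[-window:]               # last `window` observations
    next_val = predict_one_step(x_test)      # one-step-ahead forecast
    history = np.append(history, next_val)   # feed the forecast back into the history
    preds.append(float(next_val[0]))
print(preds)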
Code Example #12
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer
class Dataset:
    def __init__(self, type, parameters):
        # Constructs dataset to train and test the following:
        # 1) Pressure to Acoustic converter: converts the output pressures of tubetalker to acoustic parameters.
        # 2) Feedforward controller: Controller that accepts acoustic parameters and estimates muscle parameters, i.e. inputs to tubetalker.
        # 3) Feedback controllers:
        #       a) Somatosensory feedback controller: controller that accepts somatosensory error (targets - predictions of talker) and estimates muscle parameters (inputs to tubetalker).
        #       b) Acoustic feedback controller: controller that accepts acoustic error (targets - predictions of PtoA converter) and estimates muscle parameters (inputs to tubetalker).
        # The datasets consist of targets for 1, 2 & 3 mentioned above, i.e., targets for muscle, acoustic and somatosensory parameters.
        super(Dataset, self).__init__()
        self.training_data, self.testing_data = self.readFile(type, parameters)
        print("training data shape: {}, testing data shape: {}".format(
            self.training_data.shape, self.testing_data.shape))

    def readFile(self, type, parameters):
        # Reads the PtoA converter acoustics file and controllers parameters(muscle, acoustic and somatosensory parameters) file
        ptoa_acoustics_data = np.asarray(pd.read_csv(
            parameters["acoustics-file"]).to_numpy(),
                                         dtype=np.float32)
        controllers_data = np.asarray(pd.read_csv(
            parameters["controllers-file"]).to_numpy(),
                                      dtype=np.float32)
        controllers_acoustics_data = controllers_data[:, 8:12]

        ptoa_data_length = ptoa_acoustics_data.shape[0]
        controllers_data_length = ptoa_data_length + controllers_acoustics_data.shape[
            0]

        # Gather all acoustic data and apply normal transformation to the data.
        # This will normalize the data distribution and better trains the neural network controllers.
        acoustics_data = np.concatenate(
            (ptoa_acoustics_data, controllers_acoustics_data), axis=0)
        acoustics_data_normal = self.transform_to_normal_dist(acoustics_data)

        # Split the normally distributed acoustic data into PtoA acoustic data and controller acoustic data.
        ptoa_acoustics_data_normal = acoustics_data_normal[
            0:ptoa_data_length, :]
        controllers_acoustics_data_normal = acoustics_data_normal[
            ptoa_data_length:controllers_data_length, :]

        # Training and testing datasets are constructed according to the type given by the user.
        if type == "PtoA":
            self.pressure_maximum = 35681.0  # Max value for max normalization.
            training_length = 1500000  # Total length of training data.
            if (parameters["PtoA"]["perform"] == "training"):
                pressure_training_data = np.asarray(
                    pd.read_csv(
                        parameters["pressure-training-file"]).to_numpy(),
                    dtype=np.float32)  # Import data from training file
                training_data = np.concatenate(
                    (pressure_training_data / self.pressure_maximum,
                     ptoa_acoustics_data_normal[0:training_length, :]),
                    axis=-1)  # Select training data as per training_length
                testing_data = np.zeros((10, 1106), dtype=np.float32)
            elif (parameters["PtoA"]["perform"] == "testing"):
                pressure_testing_data = np.asarray(pd.read_csv(
                    parameters["pressure-testing-file"]).to_numpy(),
                                                   dtype=np.float32)
                training_data = np.zeros((10, 1106), dtype=np.float32)
                testing_data = np.concatenate(
                    (pressure_testing_data / self.pressure_maximum,
                     ptoa_acoustics_data_normal[training_length:, :]),
                    axis=-1)
        elif type == "FBControl":  # Feedback controller training.
            self.pressure_maximum = 35681.0
            training_length = 40000
            som_training_data = controllers_data[0:training_length, 4:8]
            som_testing_data = controllers_data[training_length:, 4:8]
            som_training_data[:, 1:] = np.log(som_training_data[:, 1:])
            som_testing_data[:, 1:] = np.log(som_testing_data[:, 1:])
            self.somatosensory_maximums = np.amax(np.concatenate(
                (som_training_data, som_testing_data), axis=0),
                                                  axis=0)
            self.somatosensory_minimums = np.amin(np.concatenate(
                (som_training_data, som_testing_data), axis=0),
                                                  axis=0)
            training_data = np.concatenate(
                (controllers_data[0:training_length, 0:4],
                 (som_training_data - self.somatosensory_minimums) /
                 (self.somatosensory_maximums - self.somatosensory_minimums),
                 controllers_acoustics_data_normal[0:training_length, :]),
                axis=-1)
            testing_data = np.concatenate(
                (controllers_data[training_length:, 0:4],
                 (som_testing_data - self.somatosensory_minimums) /
                 (self.somatosensory_maximums - self.somatosensory_minimums),
                 controllers_acoustics_data_normal[training_length:, :]),
                axis=-1)
            print("pressure max: {}, somatosensory maxs: {}, mins: {}".format(
                self.pressure_maximum, self.somatosensory_maximums,
                self.somatosensory_minimums))
        training_data = np.reshape(
            training_data,
            (training_data.shape[0], 1, training_data.shape[-1]))
        testing_data = np.reshape(
            testing_data, (testing_data.shape[0], 1, testing_data.shape[-1]))
        print("training data shape: {}, testing data shape: {}".format(
            training_data.shape, testing_data.shape))
        return training_data, testing_data

    def transform_to_normal_dist(self, acoustics):
        # Transforms acoustics data to normally distributed acoustic data.
        # Each acoustic parameter (fo, SPL, SC, SNR) is normalized independently.
        self.qt1 = QuantileTransformer(n_quantiles=1000,
                                       output_distribution='normal',
                                       random_state=0)
        self.qt2 = QuantileTransformer(n_quantiles=1000,
                                       output_distribution='normal',
                                       random_state=0)
        self.qt3 = QuantileTransformer(n_quantiles=1000,
                                       output_distribution='normal',
                                       random_state=0)
        self.qt4 = QuantileTransformer(n_quantiles=1000,
                                       output_distribution='normal',
                                       random_state=0)
        fo, SPL, SC, SNR = np.split(acoustics, 4, axis=-1)
        fob = self.qt1.fit_transform(fo)
        SPLb = self.qt2.fit_transform(SPL)
        SCb = self.qt3.fit_transform(SC)
        SNRb = self.qt4.fit_transform(SNR)
        # Computing maxima/minima of each acoustic parameter for normalization and denormalization.
        self.maxima = {}
        self.maxima["fo"] = np.amax(fob, axis=0)
        self.maxima["SPL"] = np.amax(SPLb, axis=0)
        self.maxima["SC"] = np.amax(SCb, axis=0)
        self.maxima["SNR"] = np.amax(SNRb, axis=0)
        self.minima = {}
        self.minima["fo"] = np.amin(fob, axis=0)
        self.minima["SPL"] = np.amin(SPLb, axis=0)
        self.minima["SC"] = np.amin(SCb, axis=0)
        self.minima["SNR"] = np.amin(SNRb, axis=0)
        # minmax normalization of parameters.
        fob = (fob - np.amin(fob, axis=0)) / (np.amax(fob, axis=0) -
                                              np.amin(fob, axis=0))
        SPLb = (SPLb - np.amin(SPLb, axis=0)) / (np.amax(SPLb, axis=0) -
                                                 np.amin(SPLb, axis=0))
        SCb = (SCb - np.amin(SCb, axis=0)) / (np.amax(SCb, axis=0) -
                                              np.amin(SCb, axis=0))
        SNRb = (SNRb - np.amin(SNRb, axis=0)) / (np.amax(SNRb, axis=0) -
                                                 np.amin(SNRb, axis=0))
        return np.concatenate((fob, SPLb, SCb, SNRb), axis=-1)

    def inverse_transform_to_original_dist(self, acoustics):
        # Acoustic predictions need to be converted back to their original distribution.
        # The same transformers used in the forward transformation are used for the inverse transformation.
        fo, SPL, SC, SNR = np.split(acoustics, 4, axis=-1)
        fo = np.reshape(fo, (fo.shape[0], fo.shape[-1])) * (
            self.maxima["fo"] - self.minima["fo"]) + self.minima["fo"]
        SPL = np.reshape(SPL, (SPL.shape[0], SPL.shape[-1])) * (
            self.maxima["SPL"] - self.minima["SPL"]) + self.minima["SPL"]
        SC = np.reshape(SC, (SC.shape[0], SC.shape[-1])) * (
            self.maxima["SC"] - self.minima["SC"]) + self.minima["SC"]
        SNR = np.reshape(SNR, (SNR.shape[0], SNR.shape[-1])) * (
            self.maxima["SNR"] - self.minima["SNR"]) + self.minima["SNR"]
        foi = self.qt1.inverse_transform(fo)
        SPLi = self.qt2.inverse_transform(SPL)
        SCi = self.qt3.inverse_transform(SC)
        SNRi = self.qt4.inverse_transform(SNR)
        foi = np.reshape(foi, (fo.shape[0], 1, fo.shape[-1]))
        SPLi = np.reshape(SPLi, (SPL.shape[0], 1, SPL.shape[-1]))
        SCi = np.reshape(SCi, (SC.shape[0], 1, SC.shape[-1]))
        SNRi = np.reshape(SNRi, (SNR.shape[0], 1, SNR.shape[-1]))
        return np.concatenate((foi, SPLi, SCi, SNRi), axis=-1)

    def getFullShuffledDataset(self, data):
        # shuffles data on the first axis.
        np.random.shuffle(data)
        return data
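transform_to_normal_dist fits four separate QuantileTransformers, one per acoustic parameter. Since QuantileTransformer already treats every feature independently, a single transformer over the stacked 4-column array would give identical output; a quick synthetic check of that equivalence (not project code):

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
A = rng.lognormal(size=(2000, 4))

joint = QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                            random_state=0).fit_transform(A)
per_column = np.hstack([
    QuantileTransformer(n_quantiles=1000, output_distribution='normal',
                        random_state=0).fit_transform(A[:, [i]])
    for i in range(4)
])
print(np.allclose(joint, per_column))  # True: the transform is per-feature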
Code Example #13
# Indented fragment from a longer method; x is a torch tensor (apparently image data in [0, 1]).
# Relies on: import numpy as np; import torch; from sklearn.preprocessing import QuantileTransformer
        x = (x * 256 - .5).int()

        quintiles = np.percentile(x.numpy(), [0, 50])
        q = quintiles.searchsorted(x.numpy())
        # print(quintiles)
        # print(q, q.min(), q.max())
        # 1/0

        qt = QuantileTransformer()
        xt = torch.tensor(qt.fit_transform(x.view(x.size(0), -1).numpy())).float().view(x.size())
        # print(x)
        print(xt.min(), xt.max(), xt.size(), x.size())
        print(quintiles)
        for buckets in [2, 4, 8, 16]:
            xd = (xt * buckets).int().float() / buckets
            xr = torch.tensor(qt.inverse_transform(xd.numpy() * 0 + 1)).float()

            quintiles = np.percentile(x.numpy(), 100 * np.linspace(0, 1, buckets + 1)[:-1])
            print(quintiles)

            out = torch.zeros_like(xr)


            print(list(set(xt[0].numpy())))
            print(list(set(xd[0].numpy())))
            #print(list(set(x[0].numpy())))
            print(sorted(list(set(xr[0].numpy()))))
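            # 1/0 raises ZeroDivisionError: a deliberate halt while debugging; the loop below is unreachable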
            1/0

        for deq in [1, 2, 2, 4, 8, 16, 32, 64, 128]:
            xd = ((x // deq) * deq).float()
Code Example #14
# Notebook excerpt; X_normal and normalizer are defined in earlier cells.
import matplotlib.pyplot as plt
plt.xlabel('Acceleration (Total)')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(X_normal[:, 1])
plt.xlabel('Gyration (Total)')
plt.ylabel('Frequency')

plt.tight_layout(pad=3)
plt.show()


# inverse transform (to verify that it works)

X = normalizer.inverse_transform(X_normal)

plt.figure(figsize=(8, 4))
plt.subplot(1, 2, 1)
plt.hist(X[:, 0])
plt.xlabel('Acceleration (Total)')
plt.ylabel('Frequency')

plt.subplot(1, 2, 2)
plt.hist(X[:, 1])
plt.xlabel('Gyration (Total)')
plt.ylabel('Frequency')

plt.tight_layout(pad=3)
plt.show()
Code Example #15
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import QuantileTransformer
# QuantileTransformerTF is the project class under test; its import is omitted in the original.
def test_transform_test():
    N = 10000

    def gen_data(seed):
        rng = np.random.RandomState(seed)
        data_1 = rng.uniform(-10, 0, N // 4)
        return np.stack([
            rng.lognormal(10, 5, N),
            np.concatenate([data_1] * 4),
            rng.normal(10, 10, N),
            rng.normal(-1, 1, N)
        ],
                        axis=1)

    data = gen_data(23342)
    transformer = QuantileTransformer(output_distribution="normal",
                                      n_quantiles=100,
                                      random_state=3434)
    data_transformed_sk = transformer.fit_transform(data)
    data_double_transformed_sk = transformer.inverse_transform(
        data_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_sk)

    transformer_tf = QuantileTransformerTF(transformer)
    data_transformed_tf = transformer_tf.transform(data.astype(np.float64),
                                                   False)
    data_double_transformed_tf = transformer_tf.transform(
        data_transformed_tf, True)

    data_test = np.vstack(
        [gen_data(1321), [[100, 100, 100, 111], [-100, -124, -241, -1]]])

    test_transformed_sk = transformer.transform(data_test)
    test_double_transformed_sk = transformer.inverse_transform(
        test_transformed_sk)

    test_transformed_tf = transformer_tf.transform(
        data_test.astype(np.float64), False)
    test_double_transformed_tf = transformer_tf.inverse_transform(
        test_transformed_tf)

    rng = np.random.RandomState(223532)
    data_inverse = rng.normal(size=[N, 4])
    inverse_sk = transformer.inverse_transform(data_inverse)
    inverse_tf = transformer_tf.inverse_transform(data_inverse)

    with tf.Session() as session:
        test_transformed_tf_val, xtest_double_transformed_tf_val, \
            data_transformed_tf_val, data_double_transformed_tf_val, \
            inverse_tf_val = session.run([
                test_transformed_tf, test_double_transformed_tf,
                data_transformed_tf, data_double_transformed_tf,
                inverse_tf])

    np.testing.assert_allclose(data_transformed_sk, data_transformed_tf_val)
    np.testing.assert_allclose(data, data_double_transformed_sk)
    np.testing.assert_allclose(data, data_double_transformed_tf_val)

    np.testing.assert_allclose(test_transformed_sk, test_transformed_tf_val)

    np.testing.assert_allclose(inverse_sk, inverse_tf_val)
Code Example #16
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import QuantileTransformer
class DFQuantileTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None, **kwargs):
        self.columns = columns
        self.model = QuantileTransformer(**kwargs)
        self.transform_cols = None
        self.stat_df = None

    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]
        self.model.fit(X[self.transform_cols])

        # Reference: https://help.gooddata.com/doc/en/reporting-and-dashboards/maql-analytical-query-language/maql-expression-reference/aggregation-functions/statistical-functions/predictive-statistical-use-cases/normality-testing-skewness-and-kurtosis
        # Highly skewed:            Skewness < -1 or Skewness > 1
        # Moderately skewed:        -1 < Skewness < -0.5 or 0.5 < Skewness < 1
        # Approximately symmetric:  -0.5 < Skewness < 0.5
        skew_df = X[self.transform_cols].skew().to_frame(name='Skewness')
        # Kurtosis of a normal distribution is 3 (pandas kurt() reports excess kurtosis, i.e. 0 for normal)
        kurt_df = X[self.transform_cols].kurt().to_frame(name='Kurtosis')
        self.stat_df = skew_df.merge(kurt_df,
                                     left_index=True,
                                     right_index=True,
                                     how='left')

        return self

    def transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.transform(
            X[self.transform_cols])

        # Transformed skewness & kurtosis
        skew_df = new_X[self.transform_cols].skew().to_frame(
            name='Skewness (Transformed)')
        kurt_df = new_X[self.transform_cols].kurt().to_frame(
            name='Kurtosis (Transformed)')
        stat_df = skew_df.merge(kurt_df,
                                left_index=True,
                                right_index=True,
                                how='left')
        self.stat_df = self.stat_df.merge(stat_df,
                                          left_index=True,
                                          right_index=True,
                                          how='left')

        return new_X

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

    def inverse_transform(self, X):
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )

        new_X = X.copy()
        new_X[self.transform_cols] = self.model.inverse_transform(
            X[self.transform_cols])

        return new_X
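A usage sketch for DFQuantileTransformer on synthetic data (column names made up for illustration); stat_df collects the skewness/kurtosis diagnostics before and after transforming:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({
    'income': rng.lognormal(mean=10, sigma=1, size=1000),  # heavily right-skewed
    'age': rng.normal(loc=40, scale=10, size=1000),        # roughly symmetric
})

dfqt = DFQuantileTransformer(columns=['income'],
                             output_distribution='normal', random_state=0)
df_t = dfqt.fit_transform(df)
print(dfqt.stat_df)   # skewness/kurtosis before vs. after the transform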
Code Example #17
File: xforms.py  Project: josepm/Utilities
import os
import numpy as np
import pandas as pd
from scipy.stats import norm
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
# su (a project utility module) and the Linearizer, LogTransform, Anscombe, and
# NoTransform classes are project-specific and not shown here.
class Transform(object):
    #     Yeo-Johnson transform, which is an extension of Box-Cox transformation but can handle both positive and negative values.
    #     References:
    #     Weisberg, Yeo-Johnson Power Transformations. https://www.stat.umn.edu/arc/yjpower.pdf
    #     Yeo + Johnson, A new family of power transformations to improve normality or symmetry, http://www.stat.wisc.edu/sites/default/files/tr1002.pdf
    #     For the Anscombe transform, see https://en.wikipedia.org/wiki/Anscombe_transform
    #
    #     Bias removal
    #     for bias removal, ciw and alpha must not be None, otherwise no bias removal is done and we return the median rather than the mean
    #     for bias removal, see https://robjhyndman.com/hyndsight/backtransforming/
    #     see http://davegiles.blogspot.co.uk/2013/08/forecasting-from-log-linear-regressions.html
    #     see http://data.princeton.edu/wws509/notes/c2s10.html
    #     and https://robjhyndman.com/hyndsight/backtransforming/

    def __init__(self, method, nqs, ceiling=None, floor=None, unbias=False):
        self.method = method
        self.ceiling = ceiling
        self.floor = floor
        self.lmbda = None
        self.name = method
        self.xf_done = False
        self.unbias = unbias  # not implemented
        self.lbl = None
        if method == 'yeo-johnson' or method == 'box-cox':
            self.xobj = PowerTransformer(
                method=method, standardize=False,
                copy=False)  # MUST have standardize = False
        elif method == 'quantile':
            self.xobj = QuantileTransformer(n_quantiles=int(nqs),
                                            output_distribution='normal',
                                            copy=False)
        elif method == 'logistic':
            self.xobj = Linearizer(ceiling, floor, self.unbias)
        elif method == 'log':
            self.xobj = LogTransform(self.unbias)
        elif method == 'anscombe':
            self.xobj = Anscombe()
        elif method is None:
            self.method = None
            self.xobj = NoTransform()
        else:
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: set_xform: invalid method: ' + str(method))
            self.method = None
            self.xobj = NoTransform()

    def check_input(self, y):
        if isinstance(y, (float, int, np.floating, np.integer)):
            y = np.array([y])

        if isinstance(y, np.ndarray) is False:
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: invalid type: ' + str(type(y)))
            return None

        yc = np.copy(y)
        if len(np.shape(yc)) > 1:
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: invalid input shape: ' + str(np.shape(yc)))
            return None
        yx = np.reshape(yc, (1, -1))[0]

        if np.max(yx) == np.min(yx):
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: constant series: ' + str(np.min(yc)))
            return None
        if np.min(yx) < 0.0 and self.method in ('box-cox', 'log', 'anscombe'):
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: invalid range for method: ' + self.method +
                        ' min: ' + str(np.min(yx)) + ' lbda: ' +
                        str(self.lmbda))
            return None
        return np.reshape(yx, (-1, 1))

    def transform(self, y):
        if self.xf_done is False:
            su.my_print('pid: ' + str(os.getpid()) + ' ' + self.method +
                        ' : must fit before transform')
            return None
        if self.method == 'logistic':
            return self.xobj.transform(y)
        elif self.method == 'log':
            return self.xobj.transform(y)
        else:
            return self._transform(y)

    def _transform(self, y):
        if self.method is None:
            return y
        else:
            ys = self.check_input(y)
            if ys is None:
                return None
            yt = self.xobj.transform(ys)
            ya = np.reshape(yt, (1, -1))[0]
            return self.check_output(ya, y)

    def fit(self, y):
        if self.xf_done is True:
            su.my_print('pid: ' + str(os.getpid()) + ' ' + self.method +
                        ' : already fit. Create new object')
            return None
        else:
            self.xf_done = True
            if self.method == 'logistic':
                self.xobj.fit(y)
                return self
            elif self.method == 'log':
                self.xobj.fit(y)
                return self
            else:
                return self._fit(y)

    def _fit(self, y):
        ys = self.check_input(y)
        if ys is None:
            return None
        else:
            try:
                self.xobj.fit(ys)
            except ValueError:
                return None
            try:
                self.lmbda = self.xobj.lambdas_[0]
            except AttributeError:
                self.lmbda = None
            return self

    def fit_transform(self, y):
        if self.xf_done is True:
            su.my_print('pid: ' + str(os.getpid()) + ' ' + self.method +
                        ' : already fit. Create new Transform instance')
            return None
        else:
            r = self.fit(y)
            return None if r is None else self.transform(y)

    def check_output(self, ya, y):
        if len(np.unique(ya)) == 1:  # xform failed: over/underflow?
            su.my_print(' WARNING: transform ' + self.method +
                        ' failed with lambda ' + str(self.lmbda) +
                        ' and label: ' + str(self.lbl) + ' Trying Quantile')
            return self.reset_xform(y)
        else:
            return ya

    def reset_xform(self, y):  # in case of failure, try quantile
        if self.method == 'yeo-johnson' or self.method == 'box-cox':
            self.method = 'quantile'
            del self.xobj  # drops lmbda also
            self.xobj = QuantileTransformer(output_distribution='normal',
                                            copy=False)
            self.xf_done = False
            return self.fit_transform(y)
        else:
            return y

    def fcast_var(self, rdf, w):  # used to unbias inverse transforms
        # see https://robjhyndman.com/hyndsight/backtransforming/
        # rdf contains yhat_upr and yhat_lwr
        # w is the confidence level used for the yhat bounds
        # y_upr/lwr = yhat +/- qval * sig, so half the interval width
        # is qval * sig and y_var = (half_width / qval) ** 2
        q = 1.0 - (1.0 - w) / 2.0
        qval = norm.ppf(q)
        try:
            diff = np.abs(rdf.diff(axis=1).dropna(axis=1)) / 2.0  # half-width
        except TypeError:
            return 0.0
        if len(diff) > 0 and len(diff.columns) == 1:
            diff.columns = ['var']
            return (diff / qval)**2  # y_var
        else:
            return 0.0

    def _u_inverse_transform(self, y_mean, y_var):
        # return f(y_mean) + (y_var / 2) * f''(y_mean), where f is the inverse transform function
        if self.unbias is False:
            return y_mean
        else:
            if y_mean is None:
                return None
            else:
                y_mean_ = np.reshape(y_mean, (1, -1))[0]
                y_var_ = 0.0 if y_var is None else np.reshape(y_var,
                                                              (1, -1))[0]
                if self.method == 'box-cox':
                    fy, d2fy = self._box_cox_unbias(y_mean_)
                elif self.method == 'yeo-johnson':
                    fy, d2fy = self._yeo_johnson_unbias(y_mean_)
                elif self.method == 'quantile':
                    fy, d2fy = self._quantile_unbias(y_mean_)
                else:
                    fy, d2fy = y_mean_, 0.0
                if fy is None:
                    return None
                if d2fy is None:
                    return fy
                return fy + (y_var_ / 2.0) * d2fy

    def _box_cox_unbias(self, y_mean):
        if np.abs(self.lmbda) > 1.0e-02:
            z = 1 + self.lmbda * y_mean
            fy = np.power(z, 1.0 / self.lmbda)
            fy = self.interpolate_(fy, y_mean, nan_pct=0.2)
            d2fy = (1 - self.lmbda) * np.power(z, (1.0 / self.lmbda) - 2.0)
            d2fy = self.interpolate_(d2fy, y_mean, nan_pct=0.2)
        else:
            fy = np.exp(y_mean)
            d2fy = fy
        return fy, d2fy

    def _yeo_johnson_unbias(self, y_mean):
        # for y >= 0 the inverse is f(y) = (1 + lmbda * y)^(1/lmbda) - 1,
        # so f''(y) = (1 - lmbda) * (1 + lmbda * y)^(1/lmbda - 2)
        # for y < 0, with theta = 2 - lmbda, the inverse is
        # f(y) = 1 - (1 - theta * y)^(1/theta),
        # so f''(y) = (theta - 1) * (1 - theta * y)^(1/theta - 2)
        fy = np.zeros_like(y_mean)
        d2fy = np.zeros_like(y_mean)
        pos = y_mean >= 0  # binary mask
        if np.abs(self.lmbda) < 1.0e-02:
            fy[pos] = np.exp(y_mean[pos]) - 1
            d2fy[pos] = np.exp(y_mean[pos])
        else:
            z = 1 + self.lmbda * y_mean[pos]
            fy[pos] = np.power(z, (1.0 / self.lmbda)) - 1.0
            d2fy[pos] = (1 - self.lmbda) * np.power(z,
                                                    (1.0 / self.lmbda) - 2.0)

        if np.abs(2 - self.lmbda) < 1.0e-02:
            fy[~pos] = 1.0 - np.exp(-y_mean[~pos])
            d2fy[~pos] = -np.exp(-y_mean[~pos])
        else:
            theta = 2 - self.lmbda
            z = 1 - theta * y_mean[~pos]
            fy[~pos] = 1 - np.power(z, 1.0 / theta)
            d2fy[~pos] = (theta - 1) * np.power(z, (1.0 / theta) - 2.0)
        return fy, d2fy

    def _quantile_unbias(self, y_mean):
        return y_mean, 0.0

    def inverse_transform(self, y, y_var, lbl=None):
        self.lbl = lbl
        if y is not None:
            if isinstance(y, (pd.core.series.Series, pd.core.frame.DataFrame)):
                y = y.values
        else:
            return None

        if y_var is not None:
            if isinstance(y_var,
                          (pd.core.series.Series, pd.core.frame.DataFrame)):
                y_var = y_var.values

        if isinstance(y, np.ndarray) is False:
            su.my_print('pid: ' + str(os.getpid()) +
                        ' WARNING: invalid type: ' + str(type(y)))
            return None

        if self.xf_done is False:
            su.my_print(
                'pid: ' + str(os.getpid()) +
                ' WARNING: cannot inverse_transform before fit is done')
            return None

        yc = copy.deepcopy(y)
        if self.method in ('logistic', 'log'):
            yt = self.xobj.inverse_transform(y, y_var, lbl=lbl)
            yt = self.interpolate_(yt, yc, nan_pct=0.2)
            if yt is None:
                su.my_print('pid: ' + str(os.getpid()) +
                            ' WARNING: inverse transform failed for label: ' +
                            str(self.lbl) + ' (method: ' + str(self.method) +
                            ')')
                return None
            else:
                return yt
        elif self.method is None:
            return y
        else:  # box-cox, yj
            yt = self._inverse_transform(y, yc, y_var)
            if yt is None:
                su.my_print('pid: ' + str(os.getpid()) +
                            ' WARNING: inverse transform failed for label: ' +
                            str(self.lbl) + ' (method: ' + str(self.method) +
                            ' and lambda: ' + str(self.lmbda) + ')')
                return None
            else:
                yout = np.reshape(yt, (1, -1))[0]
                return yout

    def _inverse_transform(self, y, yc, y_var):
        if self.unbias is False:
            ys = np.reshape(y, (-1, 1))
            yt = self.xobj.inverse_transform(
                ys)  # box-cox returns NaN on failure
        else:
            yt = self._u_inverse_transform(copy.deepcopy(y),
                                           y_var)  # unbiased inverse transform
        yt = self.interpolate_(yt, yc, nan_pct=0.2)
        return yt

    def interpolate_(self, y, yt, nan_pct=0.2):
        # y: inverse-transformed values (values in natural scale)
        # yt: pre-inverse transform (values in transformed scale)
        if y is None:
            return None
        else:
            yx = np.reshape(y, (1, -1))[0] if self.method is not None else y
        nulls = pd.Series(yx).isnull().sum()
        pct = 100.0 * np.round(nulls / len(yx), 2)
        if nulls > nan_pct * np.ceil(len(yx)):
            su.my_print('WARNING: Too many NaN to interpolate for label ' +
                        str(self.lbl) + ': ' + str(nulls) + ' out of ' +
                        str(len(yx)) + ' (' + str(pct) +
                        '%) data points and lambda ' + str(self.lmbda))
            f = pd.DataFrame({'yt': list(yt), 'yx': list(yx)})
            f['lmbda'] = self.lmbda
            p_ut.save_df(f, '~/my_tmp/interpolDF')
            return None
        elif 0 < nulls <= nan_pct * np.ceil(
                len(yx)):  # interpolate yhat if some NaNs
            su.my_print('WARNING: interpolating for label ' + str(self.lbl) +
                        ': ' + str(nulls) + ' NaNs out of ' + str(len(yx)) +
                        ' data points (' + str(pct) + '%)')
            st = pd.Series(yx)
            sint = st.interpolate(limit_direction='both')
            yhat = sint.values
            ys = np.reshape(yhat, (1, -1))
            return ys[0]
        else:  # all OK
            return y
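
The unbias logic above follows Hyndman's back-transformation note: if f is the inverse transform and the forecast on the transformed scale has mean yhat and variance sig2, the mean on the natural scale is approximately f(yhat) + (sig2 / 2) * f''(yhat). Below is a minimal standalone sketch of that correction for the Box-Cox case; the function names are illustrative and not part of the class above.

import numpy as np
from scipy.stats import norm


def interval_to_var(y_lwr, y_upr, w=0.8):
    # recover sig^2 from a symmetric interval yhat +/- norm.ppf(q) * sig,
    # where q = 1 - (1 - w) / 2 and w is the confidence level
    q = 1.0 - (1.0 - w) / 2.0
    half_width = (np.asarray(y_upr) - np.asarray(y_lwr)) / 2.0
    return (half_width / norm.ppf(q)) ** 2


def box_cox_unbiased_inverse(y_mean, y_var, lmbda):
    # f(y) = (1 + lmbda * y)^(1/lmbda) and
    # f''(y) = (1 - lmbda) * (1 + lmbda * y)^(1/lmbda - 2)
    if abs(lmbda) < 1.0e-02:  # log limit: f = exp, so f'' = exp
        fy = np.exp(y_mean)
        d2fy = fy
    else:
        z = 1.0 + lmbda * np.asarray(y_mean)
        fy = np.power(z, 1.0 / lmbda)
        d2fy = (1.0 - lmbda) * np.power(z, 1.0 / lmbda - 2.0)
    return fy + (np.asarray(y_var) / 2.0) * d2fy  # first-order bias correction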
Code example #18
0
File: Mercari-6th-A.py Project: ffedericoni/Mercari
    def train_model(params, seed, model_num):

        if model_num == 0:  # fit the Keras model directly on the target y
            num_cores = 1
            GPU = False
            CPU = True
            if GPU:
                num_GPU = 1
                num_CPU = 1
            if CPU:
                num_CPU = 1
                num_GPU = 0

            config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \
                                    inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \
                                    device_count={'CPU': num_CPU, 'GPU': num_GPU})
            session = tf.Session(config=config)
            K.set_session(session)

            batchsize = 2000  #3000
            epochs = 3
            np.random.seed(seed)
            tf.set_random_seed(seed)

            model = keras_mercari_model(seed, params)

            train_idx, val_idx = cvlist[seed]

            X_tr = [x[train_idx] for x in X]
            X_val = [x[val_idx] for x in X]

            lr1, lr2, lr3 = params[-3:]
            lrs = [lr1, lr2, lr3]

            def schedule(epoch):
                return lrs[epoch]

            lr_schedule = LearningRateScheduler(schedule)
            # val_store = TestCallback(X_val, X_test)
            gc.collect()
            if valid:
                model.fit(X_tr,
                          y[train_idx],
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          validation_data=(X_val, y[val_idx]),
                          shuffle=True,
                          callbacks=[lr_schedule])
                y_val = y[val_idx, 0]
                y_pred = model.predict(X_val)[:, 0]
                print(np.sqrt(metrics.mean_squared_error(y_val, y_pred)))
            else:
                model.fit(X,
                          y,
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          shuffle=True,
                          callbacks=[lr_schedule])
            y_test_pred = model.predict(X_test)[:, 0]
            K.clear_session()
            return y_test_pred

        if model_num == 1:  # fit on the standardized target ynorm; scale predictions back with std and mean
            num_cores = 1
            GPU = False
            CPU = True
            if GPU:
                num_GPU = 1
                num_CPU = 1
            if CPU:
                num_CPU = 1
                num_GPU = 0

            config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \
                                    inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \
                                    device_count={'CPU': num_CPU, 'GPU': num_GPU})
            session = tf.Session(config=config)
            K.set_session(session)

            batchsize = 2000
            epochs = 3
            np.random.seed(seed)
            tf.set_random_seed(seed)

            model = keras_mercari_model(seed, params)

            train_idx, val_idx = cvlist[seed]

            X_tr = [x[train_idx] for x in X]
            X_val = [x[val_idx] for x in X]

            lr1, lr2, lr3 = params[-3:]
            lrs = [lr1, lr2, lr3]

            def schedule(epoch):
                return lrs[epoch]

            lr_schedule = LearningRateScheduler(schedule)
            # val_store = TestCallback(X_val, X_test)
            gc.collect()
            if valid:
                model.fit(X_tr,
                          ynorm[train_idx],
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          validation_data=(X_val, ynorm[val_idx]),
                          shuffle=True,
                          callbacks=[lr_schedule])
                y_val = y[val_idx, 0]
                y_pred = model.predict(X_val)[:, 0] * std + mean
                print(np.sqrt(metrics.mean_squared_error(y_val, y_pred)))
            else:
                model.fit(X,
                          ynorm,
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=1,
                          shuffle=True,
                          callbacks=[lr_schedule])
            y_test_pred = model.predict(X_test)[:, 0] * std + mean
            K.clear_session()
            return y_test_pred

        if model_num == 2:  # fit on a quantile-Gaussianized relative-price target; invert and rescale by cat_price
            normll = QuantileTransformer(output_distribution='normal')
            ynorm2 = normll.fit_transform(yrel)
            num_cores = 1
            GPU = False
            CPU = True
            if GPU:
                num_GPU = 1
                num_CPU = 1
            if CPU:
                num_CPU = 1
                num_GPU = 0

            config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \
                                    inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \
                                    device_count={'CPU': num_CPU, 'GPU': num_GPU})
            session = tf.Session(config=config)
            K.set_session(session)

            batchsize = 2000
            epochs = 3
            np.random.seed(seed)
            tf.set_random_seed(seed)

            model = keras_mercari_model(seed, params)

            train_idx, val_idx = cvlist[seed]

            X_tr = [x[train_idx] for x in X]
            X_val = [x[val_idx] for x in X]

            lr1, lr2, lr3 = params[-3:]
            lrs = [lr1, lr2, lr3]

            def schedule(epoch):
                return lrs[epoch]

            lr_schedule = LearningRateScheduler(schedule)
            # val_store = TestCallback(X_val, X_test)
            gc.collect()
            if valid:
                model.fit(X_tr,
                          ynorm2[train_idx],
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          validation_data=(X_val, ynorm2[val_idx]),
                          shuffle=True,
                          callbacks=[lr_schedule])
                y_val = y[val_idx, 0]
                y_pred = (normll.inverse_transform(model.predict(X_val))[:, 0]
                          + 1) * train_data['cat_price'].values[val_idx]
                print(np.sqrt(metrics.mean_squared_error(y_val, y_pred)))
            else:
                model.fit(X,
                          ynorm2,
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          shuffle=True,
                          callbacks=[lr_schedule])
            y_test_pred = (
                normll.inverse_transform(model.predict(X_test))[:, 0] +
                1) * test_data['cat_price'].values

            K.clear_session()
            return y_test_pred

        if model_num == 3:  # same setup as model_num == 0
            num_cores = 1
            GPU = False
            CPU = True
            if GPU:
                num_GPU = 1
                num_CPU = 1
            if CPU:
                num_CPU = 1
                num_GPU = 0

            config = tf.ConfigProto(intra_op_parallelism_threads=num_cores, \
                                    inter_op_parallelism_threads=num_cores, allow_soft_placement=True, \
                                    device_count={'CPU': num_CPU, 'GPU': num_GPU})
            session = tf.Session(config=config)
            K.set_session(session)

            batchsize = 2000
            epochs = 3
            np.random.seed(seed)
            tf.set_random_seed(seed)

            model = keras_mercari_model(seed, params)

            train_idx, val_idx = cvlist[seed]

            X_tr = [x[train_idx] for x in X]
            X_val = [x[val_idx] for x in X]

            lr1, lr2, lr3 = params[-3:]
            lrs = [lr1, lr2, lr3]

            def schedule(epoch):
                return lrs[epoch]

            lr_schedule = LearningRateScheduler(schedule)
            # val_store = TestCallback(X_val, X_test)
            gc.collect()
            if valid:
                model.fit(X_tr,
                          y[train_idx],
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          validation_data=(X_val, y[val_idx]),
                          shuffle=True,
                          callbacks=[lr_schedule])
                y_val = y[val_idx, 0]
                y_pred = model.predict(X_val)[:, 0]
                print(np.sqrt(metrics.mean_squared_error(y_val, y_pred)))
            else:
                model.fit(X,
                          y,
                          batch_size=batchsize,
                          epochs=epochs,
                          verbose=0,
                          shuffle=True,
                          callbacks=[lr_schedule])
            y_test_pred = model.predict(X_test)[:, 0]
            K.clear_session()
            return y_test_pred
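
The model_num == 2 branch above Gaussianizes the target with a QuantileTransformer before fitting and maps predictions back with inverse_transform. A minimal sketch of that target round trip, with Ridge standing in for the Keras model and synthetic data in place of the Mercari features:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.rand(500, 5)
y = np.exp(rng.randn(500, 1))  # skewed, price-like target

qt = QuantileTransformer(n_quantiles=500, output_distribution='normal',
                         random_state=0)
y_norm = qt.fit_transform(y)  # train on the Gaussianized target

model = Ridge().fit(X, y_norm.ravel())
y_pred = qt.inverse_transform(model.predict(X).reshape(-1, 1)).ravel()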
Code example #19
0
class QuantileMapper(BaseEstimator, TransformerMixin):
    """ Transform features using quantile mapping.
    
    Parameters
    ----------
    detrend : boolean, optional
        If True, detrend the data before quantile mapping and add the trend
        back after transforming. Default is False.
    lt_kwargs : dict, optional
        Dictionary of keyword arguments to pass to the LinearTrendTransformer
    qt_kwargs : dict, optional
        Dictionary of keyword arguments to pass to the QuantileTransformer

    Attributes
    ----------
    x_cdf_fit_ : QuantileTransformer
        QuantileTransformer fitted on the training data in fit(X)
    """
    def __init__(self, detrend=False, lt_kwargs={}, qt_kwargs={}):

        self.lt_kwargs = lt_kwargs
        self.qt_kwargs = qt_kwargs

        self.detrend = detrend

    def fit(self, X):
        """ Fit the quantile mapping model.
        
        Parameters
        ----------
        X : array-like, shape  [n_samples, n_features]
            Training data.
        """
        X = ensure_samples_features(X)

        qt_kws = self.qt_kwargs.copy()

        if "n_quantiles" not in qt_kws:
            qt_kws["n_quantiles"] = len(X)

        # maybe detrend the input datasets
        if self.detrend:
            x_to_cdf = LinearTrendTransformer(
                **self.lt_kwargs).fit_transform(X)
        else:
            x_to_cdf = X

        # calculate the cdfs for X
        # TODO: replace this transformer with something that uses robust
        # empirical cdf plotting positions
        self.x_cdf_fit_ = QuantileTransformer(**qt_kws).fit(x_to_cdf)

        return self

    def transform(self, X):
        """ Perform the quantile mapping.

        Parameters
        ----------
        X : array_like, shape [n_samples, n_features]
            Samples.
        """
        X = ensure_samples_features(X)

        # maybe detrend the datasets
        if self.detrend:
            x_trend = LinearTrendTransformer(**self.lt_kwargs).fit(X)
            x_to_cdf = x_trend.transform(X)
        else:
            x_to_cdf = X

        # do the final mapping
        qt_kws = self.qt_kwargs.copy()
        if "n_quantiles" not in qt_kws:
            qt_kws["n_quantiles"] = len(X)

        x_quantiles = quantile_transform(x_to_cdf, copy=True, **qt_kws)
        x_qmapped = self.x_cdf_fit_.inverse_transform(x_quantiles)

        # add the trend back
        if self.detrend:
            x_qmapped = x_trend.inverse_transform(x_qmapped)

        return x_qmapped
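
QuantileMapper.transform performs CDF matching: each value is sent to its empirical quantile and then pulled back through the CDF fitted on the training data. The same mapping can be spelled out directly with scikit-learn; this is a sketch of the core idea on synthetic data, not the full detrending pipeline:

import numpy as np
from sklearn.preprocessing import QuantileTransformer, quantile_transform

rng = np.random.RandomState(1)
ref = rng.normal(10.0, 2.0, size=(1000, 1))    # distribution seen in fit()
biased = rng.normal(12.0, 3.0, size=(500, 1))  # data passed to transform()

ref_cdf = QuantileTransformer(n_quantiles=len(ref)).fit(ref)
quantiles = quantile_transform(biased, n_quantiles=len(biased), copy=True)
mapped = ref_cdf.inverse_transform(quantiles)  # now follows ref's distribution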
Code example #20
0
class PowerForecaster:
    """
    Check out the class spec at
    https://docs.google.com/document/d/1-ceuHfJ2bNbgmKddLTUCS0HJ1juE5t0042Mts_yEUD8v
    sample data is in
    https://drive.google.com/uc?export=download&id=1z2MBYJ8k4M5J3udlFVc2d8opE_f-S4BK
    """

    def __init__(self, df, model=Models.PROPHET,
                 upsample_freq=None,
                 train_test_split_ratio=Constants.TRAIN_TEST_SPLIT_RATIO.value,
                 epochs=Constants.EPOCHS.value,
                 initial_epoch=Constants.INITIAL_EPOCH.value,
                 batch_size=Constants.BATCH_SIZE.value,
                 sliding_window_size_or_time_steps=Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value,
                 do_shuffle=True):
        logging.info("resample: {}. future_prediction: {}, epochs: {}, batch_size: {},"
                     " window_size: {}, neurons: {}"
                     .format(Constants.RESAMPLING_FREQ.value
                             , Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value
                             , epochs
                             , batch_size
                             , sliding_window_size_or_time_steps
                             , Constants.NEURONS.value
                             ))
        if logging.getLogger().isEnabledFor(logging.INFO):
            explore_data(df)
        # first step is to create a timestamp column as index to turn it to a TimeSeries data
        df.index = pd.to_datetime(df[ColumnNames.DATE.value] + df[ColumnNames.TIME.value],
                                  format='%Y-%m-%d%H:%M:%S', errors='raise')
        if 'Unnamed: 0' in df.columns:
            df.drop('Unnamed: 0', axis=1, inplace=True)

        # keep a copy of original dataset for future comparison
        self.df_original = df.copy()

        # we interpolate temperature using prophet to use it in a multivariate forecast
        temperature = ColumnNames.TEMPERATURE.value
        interpolated_df = facebook_prophet_filter(df, temperature,
                                                  Constants.FORECASTED_TEMPERATURE_FILE.value)
        interpolated_df.index = df.index
        df[[temperature]] = interpolated_df[[ColumnNames.FORECAST.value]]

        # lets also interpolate missing kwh using facebook prophet (or we could simply drop them)

        # now turn to kwh and make the format compatible with prophet
        power = ColumnNames.POWER.value
        interpolated_df = facebook_prophet_filter(df, power,
                                                  Constants.FORECASTED_POWER_FILE.value)
        interpolated_df.index = df.index
        df[[power]] = interpolated_df[[ColumnNames.FORECAST.value]]

        df = df.rename(columns={power: ColumnNames.LABEL.value})
        df.drop(columns=[ColumnNames.DATE.value,
                         ColumnNames.TIME.value,
                         ColumnNames.DAY_OF_WEEK.value,
                         ColumnNames.MONTH.value],
                inplace=True
                )
        if upsample_freq is not None:
            df = df.resample(upsample_freq).mean()

        # for any regression or forecasting it is better to work with normalized data
        self.transformer = QuantileTransformer()  # handles outliers better than MinMaxScaler
        features = ColumnNames.FEATURES.value
        normalized = normalize(df, features, transformer=self.transformer)

        # we use the last part (after 12/1/2013) that doesn't have temperature for testing
        cutoff_date = Constants.CUTOFF_DATE.value
        self.df = normalized[normalized.index < cutoff_date].copy()
        self.testing = normalized[normalized.index >= cutoff_date].copy()

        self.df[ColumnNames.DATE_STAMP.value] = self.df.index
        self.df_blocked = None
        self.train_test_split_ratio = train_test_split_ratio
        self.model_type = model
        self.train_X, self.test_X, self.train_test_split_index = self.train_test_split(self.df[features])
        self.train_y, self.test_y, _ = self.train_test_split(self.df[ColumnNames.LABELS.value])
        self.model_fit = None
        self.epochs = epochs
        self.initial_epoch = initial_epoch
        self.batch_size = batch_size
        self.history = None
        # the following are defined in sliding_window
        self.do_shuffle = do_shuffle
        self.val_idx = None
        self.shuffled_X = None
        self.shuffled_y = None
        self.train = None
        self.label = None
        self.train_size = None
        self.val_size = None

        if logging.getLogger().isEnabledFor(logging.INFO):
            explore_data(self.df)

    def train_test_split(self, df):
        split_index = int(self.train_test_split_ratio * df.shape[0])
        train = df.iloc[:split_index, :]
        test = df.iloc[split_index:, :]
        return train, test, split_index

    def stationary_test(self):
        dataset = self.test_y.dropna()
        seasonal_dataset = sm.tsa.seasonal_decompose(dataset, freq=365)
        fig = seasonal_dataset.plot()
        fig.set_figheight(8)
        fig.set_figwidth(15)
        fig.show()

        def p_value(dataset):
            # ADF-test(Original-time-series)
            dataset.dropna()
            p_value = sm.tsa.adfuller(dataset, regression='ct')
            logging.debug('p-value:{}'.format(p_value))
            p_value = sm.tsa.adfuller(dataset, regression='c')
            logging.debug('p-value:{}'.format(p_value))

        p_value(self.train_y)
        p_value(self.test_y)

        # the Johansen test supports at most 12 variables; check the eigenvalues
        johnsen_test = coint_johansen(self.df[ColumnNames.FEATURES.value].dropna(), -1, 1).eig
        return johnsen_test

    def seasonal_prediction(self):
        from statsmodels.tsa.api import SimpleExpSmoothing
        y_hat_avg = self.test_y.copy()
        fit2 = SimpleExpSmoothing(np.asarray(self.train_y['Count'])).fit(smoothing_level=0.6, optimized=False)
        y_hat_avg['SES'] = fit2.forecast(len(self.test_y))
        plt.figure(figsize=(16, 8))
        plt.plot(self.train_y['Count'], label='Train')
        plt.plot(self.test_y['Count'], label='Test')
        plt.plot(y_hat_avg['SES'], label='SES')
        plt.legend(loc='best')
        plt.show()

    def fit(self):
        if self.model_type == Models.PROPHET:
            self.prophet_fit()
        elif self.model_type == Models.ARIMA:
            self.arima_fit()
        elif self.model_type == Models.VAR:
            self.var_fit()
        elif self.model_type == Models.LSTM:
            self.lstm_fit()
        else:
            raise ValueError("{} is not defined".format(self.model_type))

    def evaluate(self):
        self.loss_metrics = self.model_type.value.evaluate(
            self.val_X,
            self.val_y,
            batch_size=self.batch_size,
            verbose=0
        )

        logging.info("Metric names:{}".format(self.model_type.value.metrics_names))
        logging.info("Loss Metrics:{}".format(self.loss_metrics))

    def resultToDataFrame(self, data, start_index, end_index, do_scale_back=False):
        label_column = ColumnNames.LABEL.value
        df = self.df.iloc[start_index:end_index].copy()  # copy to avoid chained assignment
        df[label_column] = data
        if do_scale_back:
            features = ColumnNames.FEATURES.value
            df[features] = self.transformer.inverse_transform(df[features])
        return df[[label_column]]


    def block_after_date(self, start_block_date_st):
        index, _ = find_index(self.df, start_block_date_st)
        logging.debug("Index of block is {} with length of {}".format(index, len(self.df) - index))
        self.df_blocked = self.df.iloc[index:]
        self.df_blocked.reindex()
        logging.info("Blocked from {} to {} fromo training and validation"
                     .format(self.df_blocked.index[0], self.df_blocked.index[-1]))


    def adjust_index_and_training_shift(self, start_date_in_labeling_st
                                        , training_duration_in_frequency = None
                                        , start_date_training_st = None
                                        ):
        logging.debug("Original range data of data: [{}-{}]".format(self.df.index[0], self.df.index[-1]))
        index_start_labeling, _ = find_index(self.df, start_date_in_labeling_st)
        if start_date_training_st is not None:
            index_start_training, _ = find_index(self.df, start_date_training_st)
            if index_start_labeling < index_start_training:
                raise ValueError("Labeling should be after training")
            self.shift = index_start_labeling - index_start_training
        else:
            index_start_training = 0
            self.shift = index_start_labeling

        if training_duration_in_frequency is None:
            logging.info("Shift is set to be {}".format(self.shift))
        else:
            final_index = index_start_training + training_duration_in_frequency + self.shift
            logging.debug("start index: {}, final_index: {}".format(index_start_training, final_index))
            self.df = self.df.iloc[index_start_training:index_start_training + training_duration_in_frequency + self.shift]

            logging.info("Shift is set to be {}, we picked the slice of [{} : {}] for trainig".format(
                self.shift, self.df.index[0]
                , self.df.index[-1]
            ))

    def lstm_predict(self, model
                     , start_date_to_predict_st=None
                     , duration_in_freq = None
                     , do_scale_back = False
                     ):
        X, true_y = self.get_whole()

        if start_date_to_predict_st is not None:
            y_index_i, _ = find_index(self.df, start_date_to_predict_st)
            x_index_i = 0 if y_index_i <= self.shift else y_index_i - self.shift
            x_index_f = x_index_i + duration_in_freq
            y_index_f = y_index_i + duration_in_freq

            logging.info("Predicting time slice [{} : {}] from [{} : {}]".format(
                self.df.index[y_index_i],self.df.index[y_index_f]
                , self.df.index[x_index_i], self.df.index[x_index_f]
            ))

            X = X[x_index_i:x_index_f]
            true_y = true_y[y_index_i:y_index_f]

        predicted = model.predict(X)
        logging.debug("Predicted Labels shape: {}".format(predicted.shape))

        plt.plot(predicted, 'r')
        plt.plot(true_y, 'b')
        plt.show()
        df_predicted = self.resultToDataFrame(predicted, x_index_i + self.shift
                                              , x_index_f + self.shift, do_scale_back)
        return df_predicted

    def scale_back(self, df_predicted, start_index, end_index):
        label_column = ColumnNames.LABEL.value
        features = ColumnNames.FEATURES.value
        df = self.df[features].iloc[start_index:end_index].copy()
        df[label_column] = df_predicted[label_column]
        scaled_predicted = self.transformer.inverse_transform(df[features])
        df[features] = scaled_predicted
        return df

    def prophet_fit(self):
        past = self.train_y.copy()
        past[ColumnNames.DATE_STAMP.value] = self.train_y.index
        self.model_type.value.fit(past)

    def arima_fit(self):
        model = sm.tsa.statespace.SARIMAX(self.train_y,
                                          order=Constants.SARIMAX_ORDER.value,
                                          seasonal_order=Constants.SARIMAX_SEASONAL_ORDER.value)
        # ,enforce_stationarity=False, enforce_invertibility=False, freq='15T')
        logging.debug("SARIMAX fitting ....")
        self.model_fit = model.fit()
        self.model_fit.summary()
        logging.debug("SARIMAX forecast: {}".format(self.model_fit.forecast()))

    def var_fit(self):
        logging.debug("making VAR model")
        model = VAR(endog=self.train_X[ColumnNames.FEATURES.value].dropna())
        logging.debug("VAR fitting ....")
        self.model_fit = model.fit()
        print(self.model_fit.summary())

    def lstm_fit(self):
        if logging.getLogger().isEnabledFor(logging.INFO):
            print(self.model_type.value.summary())

        callbacks = Callbacks(Constants.MODEL_NAME.value, self.batch_size, self.epochs)
        X, y = self.get_shuff_train_label()

        self.history = self.model_type.value.fit(
            X,
            y,
            epochs=self.epochs,
            batch_size=self.batch_size,
            validation_split=0.35,
            verbose=0,
            callbacks=callbacks.getDefaultCallbacks(),
            initial_epoch=self.initial_epoch,

        )
        logging.debug("history of performance:{}".format(self.history.history))

    def predict(self, feature_set=None):
        future = feature_set if feature_set is not None \
            else Constants.DEFAULT_FUTURE_PERIODS.value
        if self.model_type == Models.PROPHET:
            self.future = self.model_type.value.make_future_dataframe(periods=future,
                                                                      freq=Constants.DEFAULT_FUTURE_FREQ.value,
                                                                      include_history=False)
            predicted = self.model_type.value.predict(self.future)
            predicted[ColumnNames.LABEL.value] = predicted[ColumnNames.FORECAST.value]
        elif self.model_type == Models.ARIMA:
            predicted = self.arima_predict(future)
        elif self.model_type == Models.VAR:
            predicted = self.var_predict(future)
        elif self.model_type == Models.LSTM:
            return self.lstm_predict(self.model_type.value,
                                     start_date_to_predict_st="2013-6-01",
                                     duration_in_freq=3 * 30)
        else:
            raise ValueError("{} is not defined".format(self.model_type))
        df_predicted = self.resultToDataFrame(predicted, self.train_test_split_index
                                              , self.train_test_split_index + len(predicted))

        return df_predicted

    def arima_predict(self, future):
        end = str(self.train_y.index[-1])
        start = str(self.train_y.index[-future])
        print(start, end)
        predicted = self.model_fit.predict(start=start[:10], end=end[:10], dynamic=True)
        return predicted

    def var_predict(self, future):
        predicted_array = self.model_fit.forecast(self.model_fit.y, future)
        predicted = pd.DataFrame(predicted_array)
        predicted.columns = ColumnNames.FEATURES.value
        predicted.index = self.test_y.index[:len(predicted)]
        return predicted

    def sliding_window(self):
        # Generate the data matrix
        length0 = self.df.shape[0]
        window_size = Constants.SLIDING_WINDOW_SIZE_OR_TIME_STEPS.value
        future_time_steps = Constants.SHIFT_IN_TIME_STEP_TO_PREDICT.value
        features_column = ColumnNames.FEATURES.value
        label_column = ColumnNames.LABEL.value

        sliding_window_feature = np.zeros((length0 - window_size - future_time_steps,
                                           window_size, len(features_column)))
        sliding_window_label = np.zeros((length0 - window_size - future_time_steps, 1))

        for counter in range(length0 - window_size - future_time_steps):
            sliding_window_label[counter, :] = self.df[label_column][counter + window_size + future_time_steps]

        for counter in range(length0 - window_size - future_time_steps):
            sliding_window_feature[counter, :] = self.df[features_column][
                                                 counter: counter + window_size]
        if self.do_shuffle:
            logging.debug('Random shuffling')
        length = sliding_window_feature.shape[0]
        if self.df_blocked is not None:
            length -= len(self.df_blocked)
            logging.info("length of data reduced by {} due to blocking. The last date is {}"
                         .format(len(self.df_blocked), self.df.index[length]))

        logging.debug("sliding window length: {}".format(length))

        split_ratio = Constants.TRAIN_TEST_SPLIT_RATIO.value
        idx = np.random.choice(length, length, replace=False) if self.do_shuffle else np.arange(length)
        self.val_idx = idx[int(split_ratio * length):]

        feature_window_shuffled = sliding_window_feature[idx, :]
        label_window_shuffled = sliding_window_label[idx, :]

        self.shuffled_X = feature_window_shuffled
        self.shuffled_y = label_window_shuffled
        self.train = sliding_window_feature
        self.label = sliding_window_label

        self.train_X = self.shuffled_X[:int(split_ratio * length), :]
        self.train_y = self.shuffled_y[:int(split_ratio * length), :]
        self.train_size = int(split_ratio * length)

        self.val_X = self.shuffled_X[int(split_ratio * length):, :]
        self.val_y = self.shuffled_y[int(split_ratio * length):, :]
        self.val_size = length - self.train_size

    def get_shuff_train_label(self):
        X = self.shuffled_X  # np.expand_dims(self.shuffled_X, axis=-1)
        Y = self.shuffled_y
        return X, Y

    def evaluate_performance(self):
        # make a prediction
        X = self.test_X  # np.expand_dims(self.test_X, axis=-1)
        yhat = self.model_type.value.predict(X)
        test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2]))
        # invert scaling for forecast
        inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
        inv_yhat = self.transformer.inverse_transform(inv_yhat)
        inv_yhat = inv_yhat[:, 0]
        # invert scaling for actual
        test_y = self.test_y.reshape((len(self.test_y), 1))
        inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
        inv_y = self.transformer.inverse_transform(inv_y)
        inv_y = inv_y[:, 0]
        # calculate RMSE
        rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
        logging.debug('Test RMSE: %.3f' % rmse)

    def plot_future(self, predicted):
        self.model_type.value.plot(predicted, xlabel='Date', ylabel='KWH')
        self.model_type.value.plot_components(predicted)

    #        by_dow.plot(xticks=ticks, style=style, title='Averaged on Days of the Week')
    #        plt.show()

    def visual_inspection(self):
        style = [':', '--', '-']
        pd.plotting.register_matplotlib_converters()
        df = self.df

        self.df_original[ColumnNames.ORIGINAL_FEATURES.value].plot(style=style, title='Original Data')
        plt.show()

        self.df[ColumnNames.FEATURES.value].plot(style=style, title='Normalized Data')
        plt.show()

        sampled = df.resample('M').sum()[ColumnNames.FEATURES.value]
        sampled.plot(style=style, title='Aggregated Monthly')
        plt.show()

        sampled = df.resample('W').sum()[ColumnNames.FEATURES.value]
        sampled.plot(style=style, title='Aggregated Weekly')
        plt.show()

        sampled = df.resample('D').sum()[ColumnNames.FEATURES.value]
        sampled.rolling(30, center=True).sum().plot(style=style, title='Aggregated Daily')
        plt.show()

        by_time = df.groupby(by=df.index.time).mean()[ColumnNames.FEATURES.value]
        ticks = 4 * 60 * 60 * np.arange(6)
        by_time.plot(xticks=ticks, style=style, title='Averaged Hourly')
        plt.show()

        days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]

        def tick(x):
            if x % 24 == 12:
                return days[int(x) // 24]
            else:
                return ""

        #        ax.xaxis.set_major_formatter(NullFormatter())
        #        ax.xaxis.set_minor_formatter(FuncFormatter(tick))
        #        ax.tick_params(which="major", axis="x", length=10, width=1.5)

        #by_dow = df.groupby(by=df.dow).mean()[ColumnNames.FEATURES.value]
        #ticks = 4 * 60 * 60 * np.arange(6)

    def plot_prediction(self, start_index, end_index):
        pd.plotting.register_matplotlib_converters()
        # self.train and self.label are the numpy arrays built in sliding_window
        t = np.arange(start_index, end_index)
        X = self.train[start_index:end_index]
        true_y = self.label[start_index:end_index]
        y = self.model_type.value.predict(X)

        plt.plot(t, y, 'r', t, true_y, 'b')
        plt.show()

    def plot_history(self):
        plt.plot(np.arange(self.epochs - self.initial_epoch),
                 self.history.history['loss'], label='train')
        plt.plot(np.arange(self.epochs - self.initial_epoch),
                 self.history.history['val_loss'], label='validation')
        plt.title('model loss')
        plt.ylabel('loss')
        plt.xlabel('epoch')
        plt.legend(loc='upper left')
        plt.show()

    def get_next_train_batch(self):
        # getting the next train batch from the shuffled training arrays
        if self.pointer + self.batch_size >= self.train_size:
            end = self.train_size
            start = self.pointer
            self.pointer = 0
            self.epoch += 1
        else:
            end = self.pointer + self.batch_size
            start = self.pointer
            self.pointer += self.batch_size
        X = self.train_X[start:end, :]
        Y = self.train_y[start:end, :]
        return X, Y

    def get_val(self):
        X = self.val_X  # np.expand_dims(self.val_X, axis=-1)

        return X, self.val_y[:]

    def get_whole(self):
        # get whole, for validation set
        X = self.train[:, :]  # np.expand_dims(self.train[:, :], axis=-1)
        Y = self.label[:, :]
        return X, Y

    def reset(self):
        self.pointer = 0
        self.epoch = 0
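
The sliding_window method above materializes a (samples, window_size, n_features) tensor together with labels shifted future_time_steps ahead. A compact NumPy sketch of the same indexing, with illustrative array names and sizes:

import numpy as np


def make_windows(features, labels, window_size, future_steps):
    # features: (T, n_features), labels: (T,)
    n = len(features) - window_size - future_steps
    X = np.stack([features[i:i + window_size] for i in range(n)])
    y = labels[window_size + future_steps:window_size + future_steps + n]
    return X, y.reshape(-1, 1)


T, F = 100, 3
X, y = make_windows(np.random.rand(T, F), np.random.rand(T),
                    window_size=12, future_steps=4)
assert X.shape == (84, 12, 3) and y.shape == (84, 1)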