Code example #1
File: regression.py Project: 0xqq/spark
 def save(self, sc, path):
     """Save a IsotonicRegressionModel."""
     java_boundaries = _py2java(sc, self.boundaries.tolist())
     java_predictions = _py2java(sc, self.predictions.tolist())
     java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel(
         java_boundaries, java_predictions, self.isotonic)
     java_model.save(sc._jsc.sc(), path)
Code example #2
File: classification.py Project: OspreyX/spark
 def save(self, sc, path):
     java_labels = _py2java(sc, self.labels.tolist())
     java_pi = _py2java(sc, self.pi.tolist())
     java_theta = _py2java(sc, self.theta.tolist())
     java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel(
         java_labels, java_pi, java_theta)
     java_model.save(sc._jsc.sc(), path)
Code example #3
File: clustering.py Project: 11wzy001/spark
 def save(self, sc, path):
     """
     Save this model to the given path.
     """
     java_centers = _py2java(sc, [_convert_to_vector(c) for c in self.centers])
     java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel(java_centers)
     java_model.save(sc._jsc.sc(), path)
Code example #4
File: ARIMA.py Project: pegli/spark-timeseries
 def forecast(self, ts, nfuture):
     """
     Provides fitted values for timeseries ts as 1-step ahead forecasts, based on current
     model parameters, and then provides `nfuture` periods of forecast. We assume AR terms
     prior to the start of the series are equal to the model's intercept term (or 0.0, if fit
     without an intercept term). Meanwhile, MA terms prior to the start are assumed to be 0.0.
     If there is differencing, the first d terms come from the original series.

     Parameters
     ----------
     ts:
         Timeseries to use as the gold standard. Each value i in the returned series
         is a 1-step ahead forecast of ts(i). We use the difference ts(i) -
         estimate(i) to calculate the error at time i, which is used for the moving
         average terms. Numpy array.
     nfuture:
         Periods in the future to forecast (beyond the length of ts)

     Returns a series consisting of fitted 1-step ahead forecasts for historicals and then
     `nfuture` periods of forecasts. Note that for future values the error terms become
     zero and prior predictions are used for any AR terms.
     
     """
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jfore = self._jmodel.forecast(jts, nfuture)
     return _java2py(self._ctx, jfore)
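A usage sketch (not from the project), assuming a fitted ARIMAModel named `model`; the variable names and values are illustrative:

    import numpy as np

    # Historical observations; entry i of the result is a 1-step ahead forecast of ts[i].
    ts = np.array([10.1, 10.4, 10.2, 10.8, 11.0])
    # The result holds len(ts) fitted values followed by 3 out-of-sample forecasts.
    combined = model.forecast(ts, 3)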
Code example #5
File: classification.py Project: vijaykiran/spark
 def save(self, sc, path):
     """
     Save this model to the given path.
     """
     java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(
         _py2java(sc, self._coeff), self.intercept, self.numFeatures, self.numClasses)
     java_model.save(sc._jsc.sc(), path)
Code example #6
File: classification.py Project: vijaykiran/spark
 def save(self, sc, path):
     """
     Save this model to the given path.
     """
     java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel(
         _py2java(sc, self._coeff), self.intercept)
     java_model.save(sc._jsc.sc(), path)
Code example #7
File: util.py Project: Anhmike/spark-sklearn
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
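A hypothetical invocation, assuming `java_model` is a Py4J handle to a JVM-side model exposing a zero-argument `weights` method (both names are illustrative, not from any specific API):

    # Calls java_model.weights() on the JVM and converts the result back to Python.
    weights = _call_java(sc, java_model, "weights")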
Code example #8
File: ARIMA.py Project: zachahuy/spark-timeseries
def autofit(ts, maxp=5, maxd=2, maxq=5, sc=None):
    """
    Utility function to help in fitting an automatically selected ARIMA model based on approximate
    Akaike Information Criterion (AIC) values. The model search is based on the heuristic
    developed by Hyndman and Khandakar (2008) and described in
    [[http://www.jstatsoft.org/v27/i03/paper]]. In contrast to the algorithm in the paper, we use an approximation to
    the AIC, rather than an exact value. Note that if the maximum differencing order provided
    does not suffice to induce stationarity, the function returns a failure, with the appropriate
    message. Additionally, note that the heuristic only considers models that have parameters
    satisfying the stationarity/invertibility constraints. Finally, note that our algorithm is
    slightly more lenient than the original heuristic. For example, the original heuristic
    rejects models with parameters "close" to violating stationarity/invertibility. We only
    reject those that actually violate it.
   
    This functionality is even less mature than some of the other model fitting functions here, so
    use it with caution.
   
    Parameters
    ----------
    ts:
        time series to which to automatically fit an ARIMA model
    maxp:
        limit for the AR order
    maxd:
        limit for differencing order
    maxq:
        limit for the MA order
    sc:
        The SparkContext, required.
    
    returns an ARIMAModel
    """
    jmodel = sc._jvm.com.cloudera.sparkts.models.ARIMA.autoFit(_py2java(sc, ts), maxp, maxd, maxq)
    return ARIMAModel(jmodel=jmodel, sc=sc)
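A minimal usage sketch, assuming an active SparkContext; the series is synthetic and purely illustrative:

    import numpy as np
    from pyspark import SparkContext

    sc = SparkContext(appName="autofit-example")
    # A random walk is non-stationary, so the search should settle on d >= 1.
    ts = np.random.randn(200).cumsum()
    model = autofit(ts, maxp=5, maxd=2, maxq=5, sc=sc)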
Code example #9
def perform_pca(matrix, row_count, nr_principal_components=2):
    """Return principal components of the input matrix.

    This function uses MLlib's ``RowMatrix`` to compute principal components.

    Args:
        matrix: An RDD of sparse rows, each given as a list of (int, float)
            index-value pairs, together representing a sparse matrix. This is
            what ``center_matrix`` returns, but it is not required to center
            the matrix first.
        row_count: The size (N) of the N x N ``matrix``.
        nr_principal_components: Number of components we want to obtain. This
            value must be less than or equal to the number of rows in the input
            square matrix.

    Returns:
        A ``numpy`` array with ``nr_principal_components`` columns and the same
        number of rows as the input ``matrix``.
    """

    py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row))
    sc = pyspark.SparkContext._active_spark_context
    java_rdd = mllib_common._py2java(sc, py_rdd)
    scala_rdd = java_rdd.rdd()
    row_matrix = sc._jvm.org.apache.spark.mllib.linalg.distributed.RowMatrix(scala_rdd)
    pca = row_matrix.computePrincipalComponents(nr_principal_components)
    pca = mllib_common._java2py(sc, pca)
    return pca.toArray()
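A usage sketch, assuming an active SparkContext `sc` and that each RDD element is a list of (column_index, value) pairs, which is what the ``linalg.Vectors.sparse`` call above accepts:

    # A 3 x 3 sparse matrix, one list of (index, value) pairs per row.
    rows = sc.parallelize([
        [(0, 1.0), (2, 2.0)],
        [(1, 3.0)],
        [(0, 4.0), (1, 5.0), (2, 6.0)],
    ])
    components = perform_pca(rows, row_count=3, nr_principal_components=2)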
Code example #10
File: GARCH.py Project: BabelTower/spark-timeseries
 def log_likelihood(self, ts):
     """
     Returns the log likelihood of the parameters on the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     """
     likelihood = self._jmodel.logLikelihood(_py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, likelihood)
Code example #11
File: wrapper.py Project: Atry/spark
 def _make_java_param_pair(self, param, value):
     """
     Makes a Java param pair.
     """
     sc = SparkContext._active_spark_context
     param = self._resolveParam(param)
     java_param = self._java_obj.getParam(param.name)
     java_value = _py2java(sc, value)
     return java_param.w(java_value)
Code example #12
File: ARIMA.py Project: zachahuy/spark-timeseries
 def add_time_dependent_effects(self, ts, destts):
     """
     Given a timeseries, apply an ARIMA(p, d, q) model to it.
     We assume that prior MA terms are 0.0 and prior AR terms are equal to the intercept, or 0.0
     if fit without an intercept.
     
     Parameters
     ----------
     ts:
         Time series of i.i.d. observations as a DenseVector
     destts:
         Time series with added time-dependent effects as a DenseVector.
     
     returns the dest series, representing the application of the model to provided error
     terms, for convenience.
     """
     result = self._jmodel.addTimeDependentEffects(_py2java(self._ctx, ts), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result)
Code example #13
File: ARIMA.py Project: zachahuy/spark-timeseries
 def remove_time_dependent_effects(self, ts, destts):
     """
     Given a timeseries, assume that it is the result of an ARIMA(p, d, q) process, and apply
     inverse operations to obtain the original series of underlying errors.
     To do so, we assume prior MA terms are 0.0, and prior AR terms are equal to the model's
     intercept, or 0.0 if fit without an intercept.
     
     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics as a DenseVector
     destts:
         Time series with removed time-dependent effects as a DenseVector.
     
     returns the dest series, representing remaining errors, for convenience.
     """
     result = self._jmodel.removeTimeDependentEffects(_py2java(self._ctx, ts), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result)
Code example #14
File: util.py Project: Anhmike/spark-sklearn
def _new_java_obj(sc, java_class, *args):
    """
    Construct a new Java object.
    """
    java_obj = _jvm()
    for name in java_class.split("."):
        java_obj = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return java_obj(*java_args)
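For illustration only, with an arbitrary JVM class: since `_py2java` passes primitive Python values straight through to Py4J, plain strings and numbers map directly onto the Java constructor arguments:

    # Builds a java.lang.StringBuilder("hello") on the JVM via the gateway.
    builder = _new_java_obj(sc, "java.lang.StringBuilder", "hello")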
Code example #15
File: GARCH.py Project: BabelTower/spark-timeseries
 def gradient(self, ts):
     """
     Find the gradient of the log likelihood with respect to the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     
     Returns a 3-element array containing the gradient for the alpha, beta, and omega parameters.
     """
     gradient = self._jmodel.gradient(_py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, gradient)
Code example #16
File: wrapper.py Project: Atry/spark
 def _new_java_obj(java_class, *args):
     """
     Returns a new Java object.
     """
     sc = SparkContext._active_spark_context
     java_obj = _jvm()
     for name in java_class.split("."):
         java_obj = getattr(java_obj, name)
     java_args = [_py2java(sc, arg) for arg in args]
     return java_obj(*java_args)
Code example #17
File: _model.py Project: BabelTower/spark-timeseries
 def remove_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply inverse operations to obtain the original series of underlying errors.
     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics as a Numpy array
     
     returns the time series with removed time-dependent effects as a Numpy array
     """
     destts = Vectors.dense(np.array([0] * len(ts)))
     result = self._jmodel.removeTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
Code example #18
File: _model.py Project: BabelTower/spark-timeseries
 def add_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply a model to it.
     
     Parameters
     ----------
     ts:
         Time series of i.i.d. observations as a Numpy array
     
     returns the time series with added time-dependent effects as a Numpy array.
     """
     destts = Vectors.dense([0] * len(ts))
     result = self._jmodel.addTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
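Since `remove_time_dependent_effects` above applies the inverse operations of this method, a round trip is a simple sanity check; `model` here is a hypothetical fitted model instance:

    import numpy as np

    ts = np.array([1.2, 1.5, 1.1, 1.7, 1.6])
    errors = model.remove_time_dependent_effects(ts)
    # Should approximately recover ts, up to numerical error.
    recovered = model.add_time_dependent_effects(errors)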
Code example #19
File: ARIMA.py Project: pegli/spark-timeseries
    def log_likelihood_css(self, y):
        """
        Log likelihood based on the conditional sum of squares.
        
        Source: http://www.nuffield.ox.ac.uk/economics/papers/1997/w6/ma.pdf

        Parameters
        ----------
        y:
            time series as a DenseVector

        returns log likelihood as a double
        """
        likelihood = self._jmodel.logLikelihoodCSS(_py2java(self._ctx, y))
        return _java2py(self._ctx, likelihood)
Code example #20
File: ARIMA.py Project: pegli/spark-timeseries
 def approx_aic(self, ts):
     """
     Calculates an approximation to the Akaike Information Criterion (AIC). This is an approximation
     as we use the conditional likelihood, rather than the exact likelihood. Please see
     [[https://en.wikipedia.org/wiki/Akaike_information_criterion]] for more information on this
     measure.
     
     Parameters
     ----------
     ts:
         the timeseries to evaluate under current model
         
     Returns an approximation to the AIC under the current model as a double
     """
     return self._jmodel.approxAIC(_py2java(self._ctx, Vectors.dense(ts)))
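Since the AIC trades goodness of fit against the number of parameters, its typical use is comparing candidate orders on the same series; a sketch with two hypothetical fitted models `model_1` and `model_2`:

    # Lower AIC is better: keep the specification with the smaller value.
    better = model_1 if model_1.approx_aic(ts) < model_2.approx_aic(ts) else model_2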
Code example #21
def fit_model(ts, sc=None):
    """
    Fits an AR(1) + GARCH(1, 1) model to the given time series.
    
    Parameters
    ----------
    ts:
        the time series to which we want to fit an AR+GARCH model as a Numpy array
        
    Returns an ARGARCH model
    """
    assert sc is not None, "Missing SparkContext"
    
    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.ARGARCH.fitModel(_py2java(sc, Vectors.dense(ts)))
    return ARGARCHModel(jmodel=jmodel, sc=sc)
Code example #22
def fit_model(ts, maxLag=1, noIntercept=False, sc=None):
    """
    Fits an AR(maxLag) model to the given time series.

    Parameters
    ----------
    ts:
        the time series to which we want to fit an autoregression model as a Numpy array
    maxLag:
        the autoregressive order of the model. Default is 1
    noIntercept:
        if True, fit the model without an intercept term. Default is False
    sc:
        The SparkContext, required.

    Returns an ARModel
    """
    assert sc is not None, "Missing SparkContext"
    
    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.Autoregression.fitModel(_py2java(sc, Vectors.dense(ts)), maxLag, noIntercept)
    return ARModel(jmodel=jmodel, sc=sc)
Code example #23
File: ARIMA.py Project: pegli/spark-timeseries
def fit_model(p, d, q, ts, includeIntercept=True, method="css-cgd", userInitParams=None, sc=None):
    """
    Given a time series, fit a non-seasonal ARIMA model of order (p, d, q), where p represents
    the autoregression terms, d represents the order of differencing, and q the moving average
    error terms. If includeIntercept is true, the model is fitted with an intercept. In order
    to select the appropriate order of the model, users are advised to inspect ACF and PACF
    plots, or compare the values of the objective function. Finally, while the current
    implementation of `fitModel` verifies that parameters fit stationarity and invertibility
    requirements, there is currently no function to transform them if they do not. It is up to
    the user to make these changes as appropriate (or select a different model specification).

    Parameters
    ----------
    p:
        autoregressive order
    d:
        differencing order
    q:
        moving average order
    ts:
        time series to which to fit an ARIMA(p, d, q) model as a Numpy array.
    includeIntercept:
        if true the model is fit with an intercept term. Default is true
    method:
        objective function and optimization method, current options are 'css-bobyqa',
        and 'css-cgd'. Both optimize the log likelihood in terms of the
        conditional sum of squares. The first uses BOBYQA for optimization, while
        the second uses conjugate gradient descent. Default is 'css-cgd'
    userInitParams:
        A set of user-provided initial parameters for optimization as a float list.
        If None (default), initialized using the Hannan-Rissanen algorithm. If provided,
        the order of parameters should be: intercept term, AR parameters (in
        increasing order of lag), MA parameters (in increasing order of lag).
    sc:
        The SparkContext, required.
    
    returns an ARIMAModel
    """
    assert sc is not None, "Missing SparkContext"
    
    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.ARIMA.fitModel(
        p, d, q, _py2java(sc, Vectors.dense(ts)), includeIntercept, method,
        _py2java_double_array(sc, userInitParams))
    return ARIMAModel(jmodel=jmodel, sc=sc)
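An illustrative call, assuming an active SparkContext and a synthetic series; the order (1, 1, 1) is arbitrary here, not a recommendation:

    import numpy as np
    from pyspark import SparkContext

    sc = SparkContext(appName="arima-example")
    ts = np.random.randn(100).cumsum()
    # Fit ARIMA(1, 1, 1) with an intercept, using the default 'css-cgd' method.
    model = fit_model(1, 1, 1, ts, sc=sc)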
Code example #24
File: tuning.py Project: Atry/spark
    def _to_java(self):
        """
        Transfer this instance to a Java CrossValidatorModel. Used for ML persistence.

        :return: Java object equivalent to this instance.
        """

        sc = SparkContext._active_spark_context

        _java_obj = JavaParams._new_java_obj("org.apache.spark.ml.tuning.CrossValidatorModel",
                                             self.uid,
                                             self.bestModel._to_java(),
                                             _py2java(sc, []))
        estimator, epms, evaluator = super(CrossValidatorModel, self)._to_java_impl()

        _java_obj.set("evaluator", evaluator)
        _java_obj.set("estimator", estimator)
        _java_obj.set("estimatorParamMaps", epms)
        return _java_obj
Code example #25
    def execute(self, script):
        """
        Execute a DML / PyDML script.

        Parameters
        ----------
        script: Script instance
            Script instance defined with the appropriate input and output variables.

        Returns
        -------
        ml_results: MLResults
            MLResults instance.
        """
        if not isinstance(script, Script):
            raise ValueError("Expected script to be an instance of Script")
        scriptString = script.scriptString
        if script.scriptType == "dml":
            if scriptString.endswith(".dml"):
                if os.path.exists(scriptString):
                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dmlFromFile(scriptString)
                else:
                    raise ValueError("path: %s does not exist" % scriptString)
            else:
                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.dml(scriptString)
        elif script.scriptType == "pydml":
            if scriptString.endswith(".pydml"):
                if os.path.exists(scriptString):
                    script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydmlFromFile(scriptString)
                else:
                    raise ValueError("path: %s does not exist" % scriptString)
            else:
                script_java = self._sc._jvm.org.apache.sysml.api.mlcontext.ScriptFactory.pydml(scriptString)
        else:
            raise ValueError("Unsupported script type: %s" % script.scriptType)

        for key, val in script._input.items():
            script_java.input(key, _py2java(self._sc, val))
        for val in script._output:
            script_java.out(val)
        return MLResults(self._ml.execute(script_java), self._sc)
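A minimal call sequence in the SystemML Python API that this method belongs to; a sketch assuming an active SparkContext `sc`:

    from systemml import MLContext, dml

    ml = MLContext(sc)
    # dml() wraps an inline DML string in a Script instance.
    script = dml("print('hello world')")
    ml.execute(script)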
Code example #26
File: EWMA.py Project: BabelTower/spark-timeseries
def fit_model(ts, sc=None):
    """
    Fits an EWMA model to a time series. Uses the first point in the time series as a starting
    value, and uses the sum of squared errors as the objective function when optimizing to find
    the smoothing parameter. The EWMA model is recursively defined as
    S_t = (1 - a) * X_t + a * S_{t-1}, where a is the smoothing parameter, X is the original
    series, and S is the smoothed series. Note that the optimization is performed as unbounded
    optimization, although in its formal definition the smoothing parameter is <= 1, which
    corresponds to an inequality-bounded optimization. Given this, the resulting smoothing
    parameter should always be sanity-checked. See
    https://en.wikipedia.org/wiki/Exponential_smoothing
    
    Parameters
    ----------
    ts:
        the time series to which we want to fit an EWMA model as a Numpy array
        
    Returns an EWMA model
    """
    assert sc is not None, "Missing SparkContext"

    jvm = sc._jvm
    jmodel = jvm.com.cloudera.sparkts.models.EWMA.fitModel(_py2java(sc, Vectors.dense(ts)))
    return EWMAModel(jmodel=jmodel, sc=sc)
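The recursion in the docstring is easy to replicate directly. A pure-NumPy reference (a hypothetical helper, not part of the library) for sanity-checking the fitted smoothing parameter:

    import numpy as np

    def ewma_reference(x, a):
        """Compute S_t = (1 - a) * X_t + a * S_{t-1}, seeded with the first observation."""
        s = np.empty(len(x), dtype=float)
        s[0] = x[0]
        for t in range(1, len(x)):
            s[t] = (1 - a) * x[t] + a * s[t - 1]
        return s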
Code example #27
File: regression.py Project: BeforeRain/spark
 def save(self, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel(
         _py2java(sc, self._coeff), self.intercept)
     java_model.save(sc._jsc.sc(), path)
Code example #28
def py2java(x):
    sc = SparkContext._active_spark_context
    return _py2java(sc, x)
Code example #29
 def forecast(self, ts, ts1):
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jts1 = _py2java(self._ctx, Vectors.dense(ts1))
     jfore = self._jmodel.forecast(jts, jts1)
     return _java2py(self._ctx, jfore)
Code example #30
File: regression.py Project: 0xqq/spark
 def save(self, sc, path):
     """Save a LassoModel."""
     java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel(
         _py2java(sc, self._coeff), self.intercept)
     java_model.save(sc._jsc.sc(), path)
Code example #31
File: wrapper.py Project: Atry/spark
 def _call_java(self, name, *args):
     m = getattr(self._java_obj, name)
     sc = SparkContext._active_spark_context
     java_args = [_py2java(sc, arg) for arg in args]
     return _java2py(sc, m(*java_args))
Code example #32
File: classification.py Project: OspreyX/spark
 def save(self, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel(
         _py2java(sc, self._coeff), self.intercept)
     java_model.save(sc._jsc.sc(), path)