Code example #1
File: regression.py Project: MickDavies/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
         sc._jsc.sc(), path)
     py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()
     py_predictions = _java2py(sc, java_model.predictionVector()).toArray()
     return IsotonicRegressionModel(py_boundaries, py_predictions,
                                    java_model.isotonic)
Code example #2
File: classification.py Project: bopopescu/SparkNew
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
         sc._jsc.sc(), path)
     py_labels = _java2py(sc, java_model.labels())
     py_pi = _java2py(sc, java_model.pi())
     py_theta = _java2py(sc, java_model.theta())
     return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
Code example #3
    def __init__(self, regressionCoeff=None, arimaOrders=None, arimaCoeff=None, jmodel=None, sc=None):
        """
        Parameters
        ----------
        regressionCoeff:
            coefficients for the regression, including the intercept; for example, if the model
            has 3 regressors then the length of [[regressionCoeff]] is 4
        arimaOrders:
            p, d, q for the ARIMA error structure; the length of [[arimaOrders]] must be 3
        arimaCoeff:
            AR, d, and MA terms; length of arimaCoeff = p + d + q
        """
        assert sc != None, "Missing SparkContext"

        self._ctx = sc
        if jmodel == None:
            self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.RegressionARIMAModel(
                _py2java_double_array(self._ctx, regressionCoeff),
                _py2java_int_array(self._ctx, arimaOrders),
                _py2scala_arraybuffer(self._ctx, arimaCoeff),
            )
        else:
            self._jmodel = jmodel

        self.regressionCoeff = _java2py(sc, self._jmodel.regressionCoeff())
        self.arimaOrders = _java2py(sc, self._jmodel.arimaOrders())
        self.arimaCoeff = _java2py(sc, self._jmodel.arimaCoeff())
Code example #4
File: regression.py Project: 0xqq/spark
 def load(cls, sc, path):
     """Load a IsotonicRegressionModel."""
     java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
         sc._jsc.sc(), path)
     py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()
     py_predictions = _java2py(sc, java_model.predictionVector()).toArray()
     return IsotonicRegressionModel(py_boundaries, py_predictions, java_model.isotonic)
Code example #5
File: classification.py Project: OspreyX/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
         sc._jsc.sc(), path)
     py_labels = _java2py(sc, java_model.labels())
     py_pi = _java2py(sc, java_model.pi())
     py_theta = _java2py(sc, java_model.theta())
     return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
Code example #6
    def __init__(self,
                 regressionCoeff=None,
                 arimaOrders=None,
                 arimaCoeff=None,
                 jmodel=None,
                 sc=None):
        """
        Parameters
        ----------
        regressionCoeff:
            coefficients for the regression, including the intercept; for example, if the model
            has 3 regressors then the length of [[regressionCoeff]] is 4
        arimaOrders:
            p, d, q for the ARIMA error structure; the length of [[arimaOrders]] must be 3
        arimaCoeff:
            AR, d, and MA terms; length of arimaCoeff = p + d + q
        """
        assert sc != None, "Missing SparkContext"

        self._ctx = sc
        if jmodel == None:
            self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.RegressionARIMAModel(
                _py2java_double_array(self._ctx, regressionCoeff),
                _py2java_int_array(self._ctx, arimaOrders),
                _py2scala_arraybuffer(self._ctx, arimaCoeff))
        else:
            self._jmodel = jmodel

        self.regressionCoeff = _java2py(sc, self._jmodel.regressionCoeff())
        self.arimaOrders = _java2py(sc, self._jmodel.arimaOrders())
        self.arimaCoeff = _java2py(sc, self._jmodel.arimaCoeff())
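
A hedged construction sketch for the initializer above, assuming the wrapper class is named RegressionARIMAModel to match the JVM model it delegates to; the coefficient values are toy numbers, not from any real fit.

# Hypothetical usage of the RegressionARIMAModel initializer shown above.
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
model = RegressionARIMAModel(
    regressionCoeff=[1.0, 0.5, -0.2, 0.1],  # intercept + 3 regressors
    arimaOrders=[1, 0, 1],                  # p, d, q
    arimaCoeff=[0.3, 0.4],                  # one AR and one MA term (p + d + q = 2)
    sc=sc)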
Code example #7
File: util.py Project: vipul1409/handyspark
def call_scala_method(py_class, scala_method, df, *args):
    """Given a Python class, calls a method from its Scala equivalent
    """
    sc = df.sql_ctx._sc
    # Gets the Java class from the JVM, given the name built from the Python class
    java_class = getattr(sc._jvm, get_jvm_class(py_class))
    # Converts all columns into doubles and accesses the result as a Java DataFrame
    jdf = df.select(*(F.col(col).astype('double') for col in df.columns))._jdf
    # Creates a Java object from both Java class and DataFrame
    java_obj = java_class(jdf)
    # Converts remaining args from Python to Java as well
    args = [_py2java(sc, a) for a in args]
    # Gets method from Java Object and passes arguments to it to get results
    java_res = getattr(java_obj, scala_method)(*args)
    # Converts results from Java back to Python
    res = _java2py(sc, java_res)
    # If result is an RDD, it could be the case its elements are still
    # serialized tuples from Scala...
    if isinstance(res, RDD):
        try:
            # Takes the first element from the result, to check what it is
            first = res.take(1)[0]
            # If it is a dictionary, we need to check its value
            if isinstance(first, dict):
                first = list(first.values())[0]
                # If the value is a scala tuple, we need to deserialize it
                if first.startswith('scala.Tuple'):
                    serde = sc._jvm.org.apache.spark.mllib.api.python.SerDe
                    # We assume it is a Tuple2 and deserialize it
                    java_res = serde.fromTuple2RDD(java_res)
                    # Finally, we convert the deserialized result from Java to Python
                    res = _java2py(sc, java_res)
        except IndexError:
            pass
    return res
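
A hedged usage sketch for call_scala_method: handyspark resolves the JVM class from the Python class name via get_jvm_class, so the class and method names below are placeholders rather than confirmed parts of its API.

# Hypothetical invocation of call_scala_method; SomePyClass and "someMethod"
# stand in for a Python class with a Scala counterpart in the JVM.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0, 2.0), (3.0, 4.0)], ["a", "b"])

# Builds the Scala object from the all-double version of df and calls the
# named method on it, converting arguments and the result between languages:
# result = call_scala_method(SomePyClass, "someMethod", df, 0.5)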
Code example #8
File: classification.py Project: MickDavies/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
         sc._jsc.sc(), path)
     # Can not unpickle array.array from Pyrolite in Python3 with "bytes"
     py_labels = _java2py(sc, java_model.labels(), "latin1")
     py_pi = _java2py(sc, java_model.pi(), "latin1")
     py_theta = _java2py(sc, java_model.theta(), "latin1")
     return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
Code example #9
File: classification.py Project: AsafZ/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
         sc._jsc.sc(), path)
     # Can not unpickle array.array from Pyrolite in Python3 with "bytes"
     py_labels = _java2py(sc, java_model.labels(), "latin1")
     py_pi = _java2py(sc, java_model.pi(), "latin1")
     py_theta = _java2py(sc, java_model.theta(), "latin1")
     return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
Code example #10
File: regression.py Project: zero323/spark
    def load(cls, sc: SparkContext, path: str) -> "IsotonicRegressionModel":
        """Load an IsotonicRegressionModel."""
        assert sc._jvm is not None

        java_model = sc._jvm.org.apache.spark.mllib.regression.IsotonicRegressionModel.load(
            sc._jsc.sc(), path)
        py_boundaries = _java2py(sc, java_model.boundaryVector()).toArray()
        py_predictions = _java2py(sc, java_model.predictionVector()).toArray()
        return IsotonicRegressionModel(py_boundaries, py_predictions,
                                       java_model.isotonic)
Code example #11
File: ARIMA.py Project: zachahuy/spark-timeseries
 def __init__(self, p=0, d=0, q=0, coefficients=None, hasIntercept=False, jmodel=None, sc=None):
     self._ctx = sc
     if jmodel == None:
         self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARIMAModel(p, d, q, _py2java(self._ctx, coefficients), hasIntercept)
     else:
         self._jmodel = jmodel
         
     self.p = _java2py(sc, self._jmodel.p())
     self.d = _java2py(sc, self._jmodel.d())
     self.q = _java2py(sc, self._jmodel.q())
     self.coefficients = _java2py(sc, self._jmodel.coefficients())
     self.has_intercept = _java2py(sc, self._jmodel.hasIntercept())
Code example #12
File: ARIMA.py Project: BabelTower/spark-timeseries
    def __init__(self, p=0, d=0, q=0, coefficients=None, hasIntercept=True, jmodel=None, sc=None):
        assert sc != None, "Missing SparkContext"

        self._ctx = sc
        if jmodel == None:
            self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARIMAModel(p, d, q, _py2java_double_array(self._ctx, coefficients), hasIntercept)
        else:
            self._jmodel = jmodel
            
        self.p = _java2py(sc, self._jmodel.p())
        self.d = _java2py(sc, self._jmodel.d())
        self.q = _java2py(sc, self._jmodel.q())
        self.coefficients = _java2py(sc, self._jmodel.coefficients())
        self.has_intercept = _java2py(sc, self._jmodel.hasIntercept())
Code example #13
File: classification.py Project: zhengruifeng/spark
    def load(cls, sc: SparkContext, path: str) -> "NaiveBayesModel":
        """
        Load a model from the given path.
        """
        assert sc._jvm is not None

        java_model = sc._jvm.org.apache.spark.mllib.classification.NaiveBayesModel.load(
            sc._jsc.sc(), path
        )
        # Can not unpickle array.array from Pickle in Python3 with "bytes"
        py_labels = _java2py(sc, java_model.labels(), "latin1")
        py_pi = _java2py(sc, java_model.pi(), "latin1")
        py_theta = _java2py(sc, java_model.theta(), "latin1")
        return NaiveBayesModel(py_labels, py_pi, numpy.array(py_theta))
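
For context, a minimal save/load round trip through the public MLlib API; the load() classmethod above is what runs in the final load call. The path and training data are placeholders.

from pyspark import SparkContext
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.regression import LabeledPoint

sc = SparkContext.getOrCreate()
data = sc.parallelize([
    LabeledPoint(0.0, [1.0, 0.0]),
    LabeledPoint(1.0, [0.0, 1.0]),
])
model = NaiveBayes.train(data)
model.save(sc, "/tmp/nb_model")                        # placeholder path
same_model = NaiveBayesModel.load(sc, "/tmp/nb_model")
print(same_model.predict([1.0, 0.0]))                  # expected: 0.0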
Code example #14
File: regression.py Project: yunchat/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     model = RidgeRegressionModel(weights, intercept)
     return model
Code example #15
def arima_ts(df):

    sc = SparkContext.getOrCreate()

    train = df.filter(df['date'].between('2013-01-01', '2014-11-01'))
    test = df.filter(df['date'].between('2014-11-01', '2015-05-01'))

    tr = numpy.array(train.select("sales").collect()).flatten()
    te = numpy.array(test.select("sales").collect()).flatten()
    nte = len(te)

    #model = autofit(Vectors.dense(tr), sc=sc)
    model = fit_model(p=0, d=1, q=0, ts=Vectors.dense(tr), sc=sc)
    prev = model.forecast(Vectors.dense(tr), nte)

    x = _java2py(sc, prev)[len(tr):]

    #print("ARIMA spark-ts R2: ", r2_score(te, x))

    test = test.toPandas()
    test = test.set_index('date')

    df = df.toPandas()
    df = df.set_index('date')

    x = pd.DataFrame(x, index=test.index, columns=['prediction'])

    pd.concat([test, x], axis=1).plot()
    pd.concat([df, x], axis=1).plot()

    return r2_score(te, x)
Code example #16
def perform_pca(matrix, row_count, nr_principal_components=2):
    """Return principal components of the input matrix.

    This function uses MLlib's ``RowMatrix`` to compute principal components.

    Args:
        matrix: An RDD[int, (int, float)] representing a sparse matrix. This
            is returned by ``center_matrix`` but it is not required to center
            the matrix first.
        row_count: The size (N) of the N x N ``matrix``.
        nr_principal_components: Number of components we want to obtain. This
            value must be less than or equal to the number of rows in the input
            square matrix.

    Returns:
        An array of ``nr_principal_components`` columns, and same number of rows
        as the input ``matrix``. This array is a ``numpy`` array.
    """

    py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row))
    sc = pyspark.SparkContext._active_spark_context
    java_rdd = mllib_common._py2java(sc, py_rdd)
    scala_rdd = java_rdd.rdd()
    sc = pyspark.SparkContext._active_spark_context
    row_matrix = (sc._jvm.org.apache.spark.mllib.linalg.distributed.
        RowMatrix(scala_rdd)
    )
    pca = row_matrix.computePrincipalComponents(nr_principal_components)
    pca = mllib_common._java2py(sc, pca)
    return pca.toArray()
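
A hedged usage sketch for perform_pca, assuming each RDD element is one row given as a list of (column_index, value) pairs, which is the form accepted by linalg.Vectors.sparse(size, pairs).

# Toy 3 x 3 sparse matrix: one list of (column, value) pairs per row.
import pyspark

sc = pyspark.SparkContext.getOrCreate()
rows = sc.parallelize([
    [(0, 1.0), (1, 0.2)],
    [(1, 2.0)],
    [(0, 0.2), (2, 3.0)],
])
components = perform_pca(rows, row_count=3, nr_principal_components=2)
print(components.shape)  # (3, 2): one row per matrix column, one column per component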
Code example #17
File: gorspark.py Project: gorpipe/gor-spark
def gor(self, qry):
    df = _py2java(sc, self)
    ReflectionUtil = spark._jvm.py4j.reflection.ReflectionUtil
    Rowclass = ReflectionUtil.classForName("org.apache.spark.sql.Row")
    ct = spark._jvm.scala.reflect.ClassTag.apply(Rowclass)
    gds = spark._jvm.org.gorpipe.spark.GorDatasetFunctions(df, ct, ct)
    return _java2py(sc, gds.gor(qry, True, sgs))
Code example #18
 def load(cls, sc, path):
     """
     Load a model from the given path.
     """
     java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(
         sc._jsc.sc(), path)
     return KMeansModel(_java2py(sc, java_model.clusterCenters()))
Code example #19
File: ARIMA.py Project: zachahuy-zz/spark-timeseries
 def gradient_log_likelihood_css_arma(self, diffedy):
     """
     Calculates the gradient for the log likelihood function using CSS
     Derivation:
         L(y | \theta) = -\frac{n}{2}log(2\pi\sigma^2) - \frac{1}{2\pi}\sum_{i=1}^n \epsilon_t^2 \\
         \sigma^2 = \frac{\sum_{i = 1}^n \epsilon_t^2}{n} \\
         \frac{\partial L}{\partial \theta} = -\frac{1}{\sigma^2}
         \sum_{i = 1}^n \epsilon_t \frac{\partial \epsilon_t}{\partial \theta} \\
         \frac{\partial \epsilon_t}{\partial \theta} = -\frac{\partial \hat{y}}{\partial \theta} \\
         \frac{\partial\hat{y}}{\partial c} = 1 +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial c} \\
         \frac{\partial\hat{y}}{\partial \theta_{ar_i}} =  y_{t - i} +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ar_i}} \\
         \frac{\partial\hat{y}}{\partial \theta_{ma_i}} =  \epsilon_{t - i} +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ma_i}} \\
     
     Parameters
     ----------
     diffedY:
         array of differenced values
     
     returns the gradient log likelihood as an array of double
     """
     # need to copy diffedy to a double[] for Java
     result = self._jmodel.gradientlogLikelihoodCSSARMA(
         _py2java_double_array(diffedy, self._ctx._gateway))
     return _java2py(self._ctx, result)
Code example #20
File: regression.py Project: BeforeRain/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.regression.RidgeRegressionModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     model = RidgeRegressionModel(weights, intercept)
     return model
Code example #21
def _call_java(sc, java_obj, name, *args):
    """
    Method copied from pyspark.ml.wrapper.  Uses private Spark APIs.
    """
    m = getattr(java_obj, name)
    java_args = [_py2java(sc, arg) for arg in args]
    return _java2py(sc, m(*java_args))
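
A small sketch of how _call_java can invoke an arbitrary JVM method from Python; the target object and method here (the Scala SparkContext and its appName getter) are just an illustration.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
scala_sc = sc._jsc.sc()                         # underlying Scala SparkContext
app_name = _call_java(sc, scala_sc, "appName")  # Java String -> Python str
print(app_name)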
Code example #22
File: ARIMA.py Project: zachahuy-zz/spark-timeseries
 def forecast(self, ts, nfuture):
     """
     Provides fitted values for time series ts as 1-step ahead forecasts, based on the current
     model parameters, and then provides `nFuture` periods of forecast. We assume AR terms
     prior to the start of the series are equal to the model's intercept term (or 0.0, if fit
     without an intercept term). Meanwhile, MA terms prior to the start are assumed to be 0.0.
     If there is differencing, the first d terms come from the original series.
    
     Parameters
     ----------
     ts:
         Timeseries to use as gold-standard. Each value (i) in the returning series
         is a 1-step ahead forecast of ts(i). We use the difference between ts(i) -
         estimate(i) to calculate the error at time i, which is used for the moving
         average terms.
     nFuture:
         Periods in the future to forecast (beyond length of ts)
         
     Returns a series consisting of fitted 1-step ahead forecasts for historicals and then
     `nFuture` periods of forecasts. Note that in the future values error terms become
     zero and prior predictions are used for any AR terms.
     
     """
     jts = _py2java(self._ctx, ts)
     jfore = self._jmodel.forecast(jts, nfuture)
     return _java2py(self._ctx, jfore)
Code example #23
File: tests.py Project: tomwilson28/spark
 def check_params(self, py_stage):
     if not hasattr(py_stage, "_to_java"):
         return
     java_stage = py_stage._to_java()
     if java_stage is None:
         return
     for p in py_stage.params:
         java_param = java_stage.getParam(p.name)
         py_has_default = py_stage.hasDefault(p)
         java_has_default = java_stage.hasDefault(java_param)
         self.assertEqual(
             py_has_default, java_has_default,
             "Default value mismatch of param %s for Params %s" %
             (p.name, str(py_stage)))
         if py_has_default:
             if p.name == "seed":
                 return  # Random seeds between Spark and PySpark are different
             java_default =\
                 _java2py(self.sc, java_stage.clear(java_param).getOrDefault(java_param))
             py_stage._clear(p)
             py_default = py_stage.getOrDefault(p)
             self.assertEqual(
                 java_default, py_default,
                 "Java default %s != python default %s of param %s for Params %s"
                 % (str(java_default), str(py_default), p.name,
                    str(py_stage)))
Code example #24
File: ARIMA.py Project: pegli/spark-timeseries
 def forecast(self, ts, nfuture):
     """
     Provides fitted values for time series ts as 1-step ahead forecasts, based on the current
     model parameters, and then provides `nFuture` periods of forecast. We assume AR terms
     prior to the start of the series are equal to the model's intercept term (or 0.0, if fit
     without an intercept term). Meanwhile, MA terms prior to the start are assumed to be 0.0.
     If there is differencing, the first d terms come from the original series.
    
     Parameters
     ----------
     ts:
         Timeseries to use as gold-standard. Each value (i) in the returning series
         is a 1-step ahead forecast of ts(i). We use the difference between ts(i) -
         estimate(i) to calculate the error at time i, which is used for the moving
         average terms. Numpy array.
     nFuture:
         Periods in the future to forecast (beyond length of ts)
         
     Returns a series consisting of fitted 1-step ahead forecasts for historicals and then
     `nFuture` periods of forecasts. Note that in the future values error terms become
     zero and prior predictions are used for any AR terms.
     
     """
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jfore = self._jmodel.forecast(jts, nfuture)
     return _java2py(self._ctx, jfore)
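
A hedged end-to-end sketch for forecast(), assuming the spark-timeseries Python bindings expose fit_model from sparkts.models.ARIMA (the import path is an assumption; code example #15 calls fit_model with the same signature).

import numpy as np
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from sparkts.models.ARIMA import fit_model  # assumed import path

sc = SparkContext.getOrCreate()
ts = np.sin(np.linspace(0.0, 10.0, 100))    # toy series
model = fit_model(p=1, d=0, q=1, ts=Vectors.dense(ts), sc=sc)
# 100 fitted 1-step-ahead values followed by 10 out-of-sample forecasts
forecasts = model.forecast(ts, 10)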
Code example #25
def perform_pca(matrix, row_count, nr_principal_components=2):
    """Return principal components of the input matrix.

    This function uses MLlib's ``RowMatrix`` to compute principal components.

    Args:
        matrix: An RDD[int, (int, float)] representing a sparse matrix. This
            is returned by ``center_matrix`` but it is not required to center
            the matrix first.
        row_count: The size (N) of the N x N ``matrix``.
        nr_principal_components: Number of components we want to obtain. This
            value must be less than or equal to the number of rows in the input
            square matrix.

    Returns:
        An array of ``nr_principal_components`` columns, and same number of rows
        as the input ``matrix``. This array is a ``numpy`` array.
    """

    py_rdd = matrix.map(lambda row: linalg.Vectors.sparse(row_count, row))
    sc = pyspark.SparkContext._active_spark_context
    java_rdd = mllib_common._py2java(sc, py_rdd)
    scala_rdd = java_rdd.rdd()
    sc = pyspark.SparkContext._active_spark_context
    row_matrix = (sc._jvm.org.apache.spark.mllib.linalg.distributed.
        RowMatrix(scala_rdd)
    )
    pca = row_matrix.computePrincipalComponents(nr_principal_components)
    pca = mllib_common._java2py(sc, pca)
    return pca.toArray()
Code example #26
File: ARIMA.py Project: pegli/spark-timeseries
 def gradient_log_likelihood_css_arma(self, diffedy):
     """
     Calculates the gradient for the log likelihood function using CSS
     Derivation:
         L(y | \theta) = -\frac{n}{2}log(2\pi\sigma^2) - \frac{1}{2\pi}\sum_{i=1}^n \epsilon_t^2 \\
         \sigma^2 = \frac{\sum_{i = 1}^n \epsilon_t^2}{n} \\
         \frac{\partial L}{\partial \theta} = -\frac{1}{\sigma^2}
         \sum_{i = 1}^n \epsilon_t \frac{\partial \epsilon_t}{\partial \theta} \\
         \frac{\partial \epsilon_t}{\partial \theta} = -\frac{\partial \hat{y}}{\partial \theta} \\
         \frac{\partial\hat{y}}{\partial c} = 1 +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial c} \\
         \frac{\partial\hat{y}}{\partial \theta_{ar_i}} =  y_{t - i} +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ar_i}} \\
         \frac{\partial\hat{y}}{\partial \theta_{ma_i}} =  \epsilon_{t - i} +
         \phi_{t-q}^{t-1}*\frac{\partial \epsilon_{t-q}^{t-1}}{\partial \theta_{ma_i}} \\
     
     Parameters
     ----------
     diffedY:
         array of differenced values
     
     returns the gradient log likelihood as an array of double
     """
     # need to copy diffedy to a double[] for Java
     result =  self._jmodel.gradientlogLikelihoodCSSARMA(_py2java_double_array(self._ctx, diffedy))
     return _java2py(self._ctx, result)
Code example #27
def checkLoadBalancing(df: DataFrame,
                       kind: str = "frac",
                       numberOfElements: int = -1):
    """
    Return a DataFrame containing the weight of each partition.
    You can choose between outputting the size (number of rows) of each partition
    or its fractional size (%) relative to the total number of rows. This is
    useful to check whether the load is correctly balanced.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame
    kind : str
        print the load balancing in terms of fractional size (kind="frac")
        or number of rows per partition (kind="size"). Default is "frac".
    numberOfElements : int
        (optional). Total number of elements in the DataFrame.
        Only needed if you choose to output fractional sizes (kind="frac").
        If not provided (i.e. default value of -1) and kind="frac",
        it will be computed (count).

    Returns
    ----------
    dfout : DataFrame containing the weight of each partition.

    Examples
    ----------
    Load data
    >>> df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/astro_obs.fits")

    Fake repartitioning in 10 equal sized partitions
    >>> df = df.repartition(10)

    Compute the load balancing %
    >>> df_load = checkLoadBalancing(df, kind="frac")

    Note that this is a DataFrame, so you can use df.show()
    Here we will check that the total is indeed 100%
    >>> val = df_load.select("Load (%)").collect()
    >>> assert(int(sum([i[0] for i in val])) == 100)

    Same using number of rows instead of fractional contribution
    >>> df_load = checkLoadBalancing(df, kind="size")
    >>> val = df_load.select("Load (#Rows)").collect()

    >>> assert(int(sum([i[0] for i in val])) == df.count())
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Checkers.checkLoadBalancing".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    dfout = _java2py(get_spark_context(),
                     scalaclass(df._jdf, kind, numberOfElements))

    return dfout
Code example #28
File: regression.py Project: Ignalina/spark311
 def load(cls, sc, path):
     """Load a LassoModel."""
     java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     model = LassoModel(weights, intercept)
     return model
Code example #29
File: GARCH.py Project: BabelTower/spark-timeseries
 def log_likelihood(self, ts):
     """
     Returns the log likelihood of the parameters on the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     """
     likelihood = self._jmodel.logLikelihood(_py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, likelihood)
Code example #30
File: regression.py Project: 0xqq/spark
 def load(cls, sc, path):
     """Load a LassoModel."""
     java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     model = LassoModel(weights, intercept)
     return model
Code example #31
 def to_double_rdd(self, column_index):
     """
     Returns an RDD with the values of the given column index converted to double
     :param column_index: One column index in TransformableRDD
     :return: RDD
     """
     rdd = self._transformable_rdd.toDoubleRDD(column_index).rdd()
     return _java2py(self.spark_context, rdd)
Code example #32
 def to_double_rdd(self, column_index):
     """
     Returns an RDD with the values of the given column index converted to double
     :param column_index: One column index in TransformableRDD
     :return: RDD
     """
     rdd = self._transformable_rdd.toDoubleRDD(column_index).rdd()
     return _java2py(self.spark_context, rdd)
Code example #33
File: hbaseUtils.py Project: saucam/fink-broker
def explodearrayofstruct(df: DataFrame, columnname: str) -> DataFrame:
    """From a nested column (array of struct),
    create one column per array element.

    The routine accesses the JVM under the hood, and calls the
    Scala routine explodeArrayOfStruct. Make sure you have the fink_broker jar
    in your classpath.

    Example:
    |    |-- prv_candidates: array (nullable = true)
    |    |    |-- element: struct (containsNull = true)
    |    |    |    |-- jd: double (nullable = true)
    |    |    |    |-- fid: integer (nullable = true)

    Would become:
    |-- prv_candidates_jd: array (nullable = true)
    |    |-- element: double (containsNull = true)
    |-- prv_candidates_fid: array (nullable = true)
    |    |-- element: integer (containsNull = true)

    Parameters
    ----------
    df : DataFrame
        Input nested Spark DataFrame
    columnname : str
        The name of the column to explode

    Returns
    -------
    DataFrame
        Spark DataFrame with new columns from the input column.

    Examples
    -------
    >>> df = spark.read.format("avro").load(ztf_alert_sample)

    # Candidate is nested
    >>> s = df.schema
    >>> typeOf = {i.name: i.dataType.typeName() for  i in s.fields}
    >>> typeOf['prv_candidates'] == 'array'
    True

    # Flatten it
    >>> df_flat = explodearrayofstruct(df, "prv_candidates")
    >>> "prv_candidates_ra" in df_flat.schema.fieldNames()
    True

    # Each new column contains array element cast to string
    >>> s_flat = df_flat.schema
    >>> typeOf = {i.name: i.dataType.typeName() for  i in s_flat.fields}
    >>> typeOf['prv_candidates_ra'] == 'string'
    True
    """
    sc = get_spark_context()
    obj = sc._jvm.com.astrolabsoftware.fink_broker.catalogUtils
    _df = obj.explodeArrayOfStruct(df._jdf, columnname)
    df_flatten = _java2py(sc, _df)
    return df_flatten
Code example #34
 def multiply_columns(self, first_column, second_column):
     """
     Returns an RDD which is the product of the values in @first_column and @second_column
     :param first_column: One column index
     :param second_column: Another column index
     :return: RDD
     """
     _rdd = self._transformable_rdd.multiplyColumns(first_column, second_column).rdd()
     return _java2py(self.spark_context, _rdd)
Code example #35
File: clustering.py Project: wangyum/spark
    def load(cls, sc: SparkContext, path: str) -> "KMeansModel":
        """
        Load a model from the given path.
        """
        assert sc._jvm is not None

        java_model = sc._jvm.org.apache.spark.mllib.clustering.KMeansModel.load(
            sc._jsc.sc(), path)
        return KMeansModel(_java2py(sc, java_model.clusterCenters()))
Code example #36
File: regression.py Project: zhengruifeng/spark
    def load(cls, sc: SparkContext, path: str) -> "LassoModel":
        """Load a LassoModel."""
        assert sc._jvm is not None

        java_model = sc._jvm.org.apache.spark.mllib.regression.LassoModel.load(sc._jsc.sc(), path)
        weights = _java2py(sc, java_model.weights())
        intercept = java_model.intercept()
        model = LassoModel(weights, intercept)
        return model
Code example #37
File: classification.py Project: bopopescu/SparkNew
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     threshold = java_model.getThreshold().get()
     model = SVMModel(weights, intercept)
     model.setThreshold(threshold)
     return model
Code example #38
File: GARCH.py Project: vidhyamanisankar/SML
 def log_likelihood(self, ts):
     """
     Returns the log likelihood of the parameters on the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     """
     likelihood = self._jmodel.logLikelihood(
         _py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, likelihood)
Code example #39
File: classification.py Project: OspreyX/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.SVMModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     threshold = java_model.getThreshold().get()
     model = SVMModel(weights, intercept)
     model.setThreshold(threshold)
     return model
Code example #40
File: queries.py Project: shongscience/spark3D
def knn(df: DataFrame, p: list, k: int, coordSys: str, unique: bool):
    """ Finds the K nearest neighbors of the query object.

    The naive implementation here searches through all the objects in
    the DataFrame to get the KNN. The nearness of the objects here
    is decided on the basis of the distance between their centers.

    Parameters
    ----------
    df : DataFrame
        Input Dataframe. Must have 3 columns corresponding to the
        coordinates (x, y, z) if cartesian or (r, theta, phi) if spherical.
    p : list of float
        Targeted point for which we want neighbors.
    k : int
        Number of neighbours
    coordSys : str
        Coordinate system: spherical or cartesian
    unique : bool
        Boolean. If true, returns only distinct objects. Default is false.

    Returns
    --------
    out : DataFrame
        DataFrame with the coordinates of the k neighbours found.

    Examples
    --------
    >>> df = spark.read.format("fits")\
        .option("hdu", 1)\
        .load("../src/test/resources/cartesian_points.fits")

    Get the 100 closest neighbours around the point [0.2, 0.2, 0.2]
    >>> K = 100
    >>> target = [0.2, 0.2, 0.2]
    >>> unique = False
    >>> neighbours = knn(df.select("x", "y", "z"), target, K, "spherical", unique)

    >>> print(neighbours.count())
    100

    You can add back the metadata
    >>> neighboursWithMeta = df.join(neighbours, ["x", "y", "z"], "left_semi")
    """
    prefix = "com.astrolabsoftware.spark3d"
    scalapath = "{}.Queries.KNN".format(prefix)
    scalaclass = load_from_jvm(scalapath)

    # # To convert python List to Scala Map
    convpath = "{}.python.PythonClassTag.javaListtoscalaList".format(prefix)
    conv = load_from_jvm(convpath)

    out = _java2py(get_spark_context(),
                   scalaclass(df._jdf, conv(p), k, coordSys, unique))

    return out
Code example #41
File: wrapper.py Project: mindis/snappy-spark
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             value = _java2py(sc, self._java_obj.getOrDefault(java_param))
             self._paramMap[param] = value
Code example #42
 def smooth(self, column_index, smoothing_method):
     """
     Returns a new RDD containing smoothed values of @column_index using @smoothing_method
     :param column_index: Index of the column
     :param smoothing_method: smoothing method by which you want to smooth data
     :return: RDD
     """
     method = smoothing_method._get_smoothing_method(self.spark_context)
     rdd = self._transformable_rdd.smooth(column_index, method)
     return _java2py(self.spark_context, rdd.rdd())
Code example #43
File: wrapper.py Project: AsafZ/spark
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             value = _java2py(sc, self._java_obj.getOrDefault(java_param))
             self._paramMap[param] = value
Code example #44
 def smooth(self, column_index, smoothing_method):
     """
     Returns a new RDD containing smoothed values of @column_index using @smoothing_method
     :param column_index: Index of the column
     :param smoothing_method: smoothing method by which you want to smooth data
     :return: RDD
     """
     method = smoothing_method._get_smoothing_method(self.spark_context)
     rdd = self._transformable_rdd.smooth(column_index, method)
     return _java2py(self.spark_context, rdd.rdd())
Code example #45
File: GARCH.py Project: BabelTower/spark-timeseries
 def gradient(self, ts):
     """
     Find the gradient of the log likelihood with respect to the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     
      Returns a 3-element array containing the gradient for the alpha, beta, and omega parameters.
     """
     gradient = self._jmodel.gradient(_py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, gradient)
Code example #46
 def multiply_columns(self, first_column, second_column):
     """
     Returns an RDD which is the product of the values in @first_column and @second_column
     :param first_column: One column index
     :param second_column: Another column index
     :return: RDD
     """
     _rdd = self._transformable_rdd.multiplyColumns(first_column,
                                                    second_column).rdd()
     return _java2py(self.spark_context, _rdd)
Code example #47
File: hbaseUtils.py Project: saucam/fink-broker
def flattenstruct(df: DataFrame, columnname: str) -> DataFrame:
    """ From a nested column (struct of primitives),
    create one column per struct element.

    The routine accesses the JVM under the hood, and calls the
    Scala routine flattenStruct. Make sure you have the fink_broker jar
    in your classpath.

    Example:
    |-- candidate: struct (nullable = true)
    |    |-- jd: double (nullable = true)
    |    |-- fid: integer (nullable = true)

    Would become:
    |-- candidate_jd: double (nullable = true)
    |-- candidate_fid: integer (nullable = true)

    Parameters
    ----------
    df : DataFrame
        Nested Spark DataFrame
    columnname : str
        The name of the column to flatten.

    Returns
    -------
    DataFrame
        Spark DataFrame with new columns from the input column.

    Examples
    -------
    >>> df = spark.read.format("avro").load(ztf_alert_sample)

    # Candidate is nested
    >>> s = df.schema
    >>> typeOf = {i.name: i.dataType.typeName() for  i in s.fields}
    >>> typeOf['candidate'] == 'struct'
    True

    # Flatten it
    >>> df_flat = flattenstruct(df, "candidate")
    >>> "candidate_ra" in df_flat.schema.fieldNames()
    True

    # Each new column contains array element
    >>> s_flat = df_flat.schema
    >>> typeOf = {i.name: i.dataType.typeName() for  i in s_flat.fields}
    >>> typeOf['candidate_ra'] == 'double'
    True
    """
    sc = get_spark_context()
    obj = sc._jvm.com.astrolabsoftware.fink_broker.catalogUtils
    _df = obj.flattenStruct(df._jdf, columnname)
    df_flatten = _java2py(sc, _df)
    return df_flatten
Code example #48
 def get(self, *outputs):
     """
     Parameters
     ----------
     outputs: string, list of strings
         Output variables as defined inside the DML script.
     """
     outs = [_java2py(self.sc, self._java_results.get(out)) for out in outputs]
     if len(outs) == 1:
         return outs[0]
     return outs
Code example #49
File: classification.py Project: AsafZ/spark
 def load(cls, sc, path):
     java_model = sc._jvm.org.apache.spark.mllib.classification.LogisticRegressionModel.load(
         sc._jsc.sc(), path)
     weights = _java2py(sc, java_model.weights())
     intercept = java_model.intercept()
     numFeatures = java_model.numFeatures()
     numClasses = java_model.numClasses()
     threshold = java_model.getThreshold().get()
     model = LogisticRegressionModel(weights, intercept, numFeatures, numClasses)
     model.setThreshold(threshold)
     return model
Code example #50
 def __init__(self, c=0, coefficients=None, jmodel=None, sc=None):
     assert sc != None, "Missing SparkContext"
     
     self._ctx = sc
     if jmodel == None:
         self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARModel(c, _py2java_double_array(self._ctx, coefficients))
     else:
         self._jmodel = jmodel
     
     self.c = self._jmodel.c()
     self.coefficients = _java2py(self._ctx, self._jmodel.coefficients())
Code example #51
File: wrapper.py Project: hbhanawat/spark
 def _transfer_param_map_from_java(self, javaParamMap):
     """
     Transforms a Java ParamMap into a Python ParamMap.
     """
     sc = SparkContext._active_spark_context
     paramMap = dict()
     for pair in javaParamMap.toList():
         param = pair.param()
         if self.hasParam(str(param.name())):
             paramMap[self.getParam(param.name())] = _java2py(sc, pair.value())
     return paramMap
Code example #52
File: wrapper.py Project: 15652101501/spark
 def _transfer_params_from_java(self):
     """
     Transforms the embedded params from the companion Java object.
     """
     sc = SparkContext._active_spark_context
     for param in self.params:
         if self._java_obj.hasParam(param.name):
             java_param = self._java_obj.getParam(param.name)
             # SPARK-14931: Only check set params back to avoid default params mismatch.
             if self._java_obj.isSet(java_param):
                 value = _java2py(sc, self._java_obj.getOrDefault(java_param))
                 self._set(**{param.name: value})
Code example #53
File: _model.py Project: BabelTower/spark-timeseries
 def remove_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply inverse operations to obtain the original series of underlying errors.
     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics as a Numpy array
     
     returns the time series with removed time-dependent effects as a Numpy array
     """
     destts = Vectors.dense(np.array([0] * len(ts)))
     result =  self._jmodel.removeTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
Code example #54
File: ARIMA.py Project: pegli/spark-timeseries
 def sample(self, n):
     """
     Sample a series of size n assuming an ARIMA(p, d, q) process.
     
     Parameters
     ----------
     n:
         size of sample
         
     Returns a series reflecting ARIMA(p, d, q) process as a DenseVector
     """
     rg = self._ctx._jvm.org.apache.commons.math3.random.JDKRandomGenerator()
     return _java2py(self._ctx, self._jmodel.sample(n, rg))
Code example #55
 def sample(self, n):
     """
     Samples a random time series of a given length with the properties of the model.
     
     Parameters
     ----------
     n:
         The length of the time series to sample.
     
     Returns the sampled time series.
     """
     rg = self._ctx._jvm.org.apache.commons.math3.random.JDKRandomGenerator()
     return _java2py(self._ctx, self._jmodel.sample(n, rg))
Code example #56
 def __init__(self, c=0.0, coefficients=[], yMaxLag=0, xMaxLag=0, includesOriginalX=True, jmodel=None, sc=None):
     assert sc != None, "Missing SparkContext"
     
     self._ctx = sc
     if jmodel == None:
         self._jmodel = self._ctx._jvm.com.cloudera.sparkts.models.ARXModel(float(c), _py2java_double_array(self._ctx, coefficients), yMaxLag, xMaxLag, includesOriginalX)
     else:
         self._jmodel = jmodel
     
     self.c = self._jmodel.c()
     self.coefficients = _java2py(self._ctx, self._jmodel.coefficients())
     self.yMaxLag = self._jmodel.yMaxLag()
     self.xMaxLag = self._jmodel.xMaxLag()
Code example #57
File: _model.py Project: BabelTower/spark-timeseries
 def add_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply a model to it.
     
     Parameters
     ----------
     ts:
         Time series of i.i.d. observations as a Numpy array
     
     returns the time series with added time-dependent effects as a Numpy array.
     """
     destts = Vectors.dense([0] * len(ts))
     result =  self._jmodel.addTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
Code example #58
File: ARIMA.py Project: pegli/spark-timeseries
    def log_likelihood_css(self, y):
        """
        log likelihood based on conditional sum of squares
        
        Source: http://www.nuffield.ox.ac.uk/economics/papers/1997/w6/ma.pdf

        Parameters
        ----------
        y:
            time series as a DenseVector

        returns log likelihood as a double
        """
        likelihood = self._jmodel.logLikelihoodCSS(_py2java(self._ctx, y))
        return _java2py(self._ctx, likelihood)