Example #1
    def stronglyConnectedComponents(self, maxIter):
        """
        Runs the strongly connected components algorithm on this graph.

        See Scala documentation for more details.

        :param maxIter: the number of iterations to run
        :return: DataFrame with new vertex column "component"
        """
        jdf = self._jvm_graph.stronglyConnectedComponents().maxIter(
            maxIter).run()
        return DataFrame(jdf, self._sqlContext)
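For orientation, here is a minimal usage sketch of this method through the public GraphFrames API. The SparkSession `spark` and the toy vertex/edge data are assumptions added for illustration, not part of the snippet above.

from graphframes import GraphFrame

vertices = spark.createDataFrame([("a",), ("b",), ("c",)], ["id"])
edges = spark.createDataFrame([("a", "b"), ("b", "a"), ("b", "c")],
                              ["src", "dst"])
g = GraphFrame(vertices, edges)

components = g.stronglyConnectedComponents(maxIter=10)  # wraps the call shown above
components.select("id", "component").show()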
Example #2
    def vacuum(self, retentionHours=None):
        """
        Recursively delete files and directories in the table that are not needed by the table for
        maintaining older versions up to the given retention threshold. This method will return an
        empty DataFrame on successful completion.

        Example::

            deltaTable.vacuum()     # vacuum files not required by versions more than 7 days old

            deltaTable.vacuum(100)  # vacuum files not required by versions more than 100 hours old

        :param retentionHours: Optional number of hours to retain history. If not specified, then the
                               default retention period of 168 hours (7 days) will be used.
        """
        jdt = self._jdt
        if retentionHours is None:
            return DataFrame(jdt.vacuum(), self._spark._wrapped)
        else:
            return DataFrame(jdt.vacuum(float(retentionHours)),
                             self._spark._wrapped)
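A minimal usage sketch, assuming the delta-spark package, an active SparkSession `spark`, and a hypothetical table path. Note that current Delta Lake releases refuse a retention shorter than the 168-hour default unless spark.databricks.delta.retentionDurationCheck.enabled is set to false.

from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")  # hypothetical path
deltaTable.vacuum()     # use the default 168-hour (7-day) retention
deltaTable.vacuum(200)  # keep only the last 200 hours of history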
Example #3
    def inDegrees(self):
        """
        The in-degree of each vertex in the graph, returned as a DataFrame with two columns:
         - "id": the ID of the vertex
         - "inDegree" (int) storing the in-degree of the vertex

        Note that vertices with 0 in-edges are not returned in the result.

        :return:  DataFrame with new vertices column "inDegree"
        """
        jdf = self._jvm_graph.inDegrees()
        return DataFrame(jdf, self._sqlContext)
Example #4
    def degrees(self):
        """
        The degree of each vertex in the graph, returned as a DataFrame with two columns:
         - "id": the ID of the vertex
         - "degree" (integer): the degree of the vertex

        Note that vertices with 0 edges are not returned in the result.

        :return:  DataFrame with new vertices column "degree"
        """
        jdf = self._jvm_graph.degrees()
        return DataFrame(jdf, self._sqlContext)
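A short usage sketch covering both degree accessors, reusing the GraphFrame `g` from the earlier sketch. In the released graphframes package these accessors are exposed as @property attributes (the decorators are simply not shown in the snippets above).

g.inDegrees.show()   # columns "id" and "inDegree"; vertices with no in-edges are omitted
g.degrees.show()     # columns "id" and "degree"; isolated vertices are omitted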
Example #5
    def Embedding2Caption(self, embeddingDF, vocab, embeddingColumn,
                          captionColumn):
        """Get the captions from the embeddings.

        :param embeddingDF: the DataFrame which contains the embedding
        :param vocab: the vocab object
        :param embeddingColumn: the column name in embeddingDF which contains the caption embedding
        :param captionColumn: the caption column name
        """
        df = self.__dict__.get('conversions').Embedding2Caption(
            embeddingDF._jdf, vocab.vocabObject, embeddingColumn,
            captionColumn)
        pydf = DataFrame(df, self.__dict__.get('sqlContext'))
        return pydf
Example #6
    def successMetricsAsDataFrame(self):
        try:
            df = self.jvmAnalyzerContext.successMetricsAsDataFrame(
                self._jsparkSession, self.jvmAnalyzerContext,
                getattr(self.jvmAnalyzerContext,
                        "successMetricsAsDataFrame$default$3")())
            out = DataFrame(df, self.spark)
            return out
        except Exception:
            self.spark.sparkContext._gateway.close()
            self.spark.stop()
            raise AttributeError
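The snippet above appears to wrap Deequ's AnalyzerContext. As a hedged sketch, the equivalent flow with the published pydeequ package (assuming a SparkSession `spark` and an input DataFrame `df`; the analyzed column name is illustrative) looks roughly like this:

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Size, Completeness

result = (AnalysisRunner(spark)
          .onData(df)
          .addAnalyzer(Size())
          .addAnalyzer(Completeness("review_id"))
          .run())
metrics_df = AnalyzerContext.successMetricsAsDataFrame(spark, result)
metrics_df.show()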
Example #7
    def countKmers(self, kmerLength):
        """
        Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.

        :param int kmerLength: The value of _k_ to use for cutting _k_-mers.
        :return: Returns a DataFrame containing k-mer/count pairs.
        :rtype: DataFrame containing "kmer" string and "count" long.
        """

        return DataFrame(
            self._jvmRdd.countKmersAsDataset(kmerLength).toDF(),
            SQLContext(self.sc))
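A usage sketch under the assumption that this method lives on ADAM's alignment dataset wrapper (bdgenomics.adam). The input path is hypothetical, and the ADAMContext constructor has taken either a SparkSession or a SparkContext depending on the ADAM release, so treat this as a sketch rather than a reference:

from bdgenomics.adam.adamContext import ADAMContext

ac = ADAMContext(spark)                  # may be a SparkContext in older releases
reads = ac.loadAlignments("sample.bam")  # hypothetical input file
kmers = reads.countKmers(21)             # DataFrame with "kmer" and "count" columns
kmers.orderBy(kmers["count"].desc()).show()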
Example #8
    def fromJava(self, stuff):
        if stuff.__class__.__name__ == "JavaObject":
            clazz = stuff.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                stuff = stuff.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                if Environment.sparkVersion == 1:
                    return DataFrame(
                        stuff,
                        SQLContext(SparkContext.getOrCreate(),
                                   stuff.sqlContext()))
                else:
                    return DataFrame(
                        stuff,
                        ShellAccess.SparkSession.builder.getOrCreate()._wrapped
                    )
            elif clazz == "org.apache.spark.sql.SQLContext":
                return SQLContext(SparkContext.getOrCreate(), stuff)
        return stuff
Example #9
    def aggProfiles(self,
                    datetime_ts: Optional[datetime] = None,
                    timestamp_ms: Optional[int] = None) -> DataFrame:  # noqa
        if datetime_ts is not None:
            timestamp_ms = int(datetime_ts.timestamp() * 1000)
        elif timestamp_ms is None:
            timestamp_ms = int(
                datetime.now(tz=timezone.utc).timestamp() * 1000)

        jdf = self._create_j_session().aggProfiles(timestamp_ms)

        return DataFrame(jdf=jdf, sql_ctx=self._df.sql_ctx)
Example #10
    def executeCompaction(self) -> DataFrame:
        """
        Compact the small files in selected partitions.

        :return: DataFrame containing the OPTIMIZE execution metrics
        :rtype: pyspark.sql.DataFrame
        """
        return DataFrame(
            self._jbuilder.executeCompaction(),
            getattr(self._spark, "_wrapped",
                    self._spark)  # type: ignore[attr-defined]
        )
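A usage sketch assuming delta-spark 2.x, where DeltaTable.optimize() returns the builder whose executeCompaction() is wrapped above; the table path and partition filter are illustrative:

from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")   # hypothetical path
metrics = (deltaTable.optimize()
           .where("date = '2024-01-01'")                     # optional partition filter
           .executeCompaction())
metrics.show(truncate=False)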
Example #11
    def _convertJavaOutputToPythonObject(self, X, output):
        """
        Converts a Java-side object (either MatrixBlock or Java DataFrame) to a Python object (based on the type of X).

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        output: a java-side object (either MatrixBlock or Java DataFrame)
        """
        if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
            retDF = DataFrame(output, self.sparkSession)
            retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
            return retPDF.as_matrix().flatten() if isinstance(X, np.ndarray) else retPDF
        elif isinstance(X, SUPPORTED_TYPES):
            return convertToNumPyArr(self.sc, output)
        elif hasattr(X, '_jdf'):
            retDF = DataFrame(output, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #12
def streamBingImages(self,
                     searchTerms,
                     key,
                     url,
                     batchSize=10,
                     imgsPerBatch=10):
    ctx = SparkContext.getOrCreate()
    reader = ctx._jvm.org.apache.spark.sql.execution.streaming.BingImageSource
    sql_ctx = pyspark.SQLContext.getOrCreate(ctx)
    jsession = sql_ctx.sparkSession._jsparkSession
    jdf = reader(searchTerms, key, url, batchSize, imgsPerBatch).load(jsession)
    return DataFrame(jdf, sql_ctx)
Example #13
    def triplets(self):
        """
        The triplets: (source vertex)-[edge]->(destination vertex) for all edges in the graph.
        Returned as a :class:`DataFrame` with three columns:
         - 'src': source vertex with schema matching 'vertices'
         - 'edge': edge with schema matching 'edges'
         - 'dst': destination vertex with schema matching 'vertices'

        :return:  DataFrame with columns 'src', 'edge', and 'dst'
        """
        jdf = self._jvm_graph.triplets()
        return DataFrame(jdf, self._sqlContext)
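Reusing the GraphFrame `g` from the first sketch (triplets is likewise a property in the released package), a short usage sketch; the filter predicate refers to the nested edge struct and is purely illustrative:

g.triplets.show()                           # struct columns "src", "edge", "dst"
g.triplets.filter("edge.src = 'a'").show()  # filter on a field of the edge struct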
Example #14
def _java2py(gateway, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.javaToPython(
                r)
            return RDD(jrdd, get_spark_context())

        if clsName == 'DataFrame':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == 'Dataset':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == "ImageFrame[]":
            return r

        if clsName in _picklable_classes:
            r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                r)
        elif isinstance(r, (JavaArray, JavaList)) and len(r) != 0 \
                and isinstance(r[0], JavaObject) \
                and r[0].getClass().getSimpleName() in ['DataFrame', 'Dataset']:
            spark = get_spark_sql_context(get_spark_context())
            r = list(map(lambda x: DataFrame(x, spark), r))
        elif isinstance(r, (JavaArray, JavaList, JavaMap)):
            try:
                r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                    r)
            except Py4JJavaError:
                pass  # not pickable

        if isinstance(r, (bytearray, bytes)):
            r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Example #15
    def smvDedupByKey(self, *keys):
        """Remove duplicate records from the DataFrame by arbitrarly selecting the first record from a set of records with same primary key or key combo.

            Args:
                keys (\*string or \*Column): the column names or Columns on which to apply dedup

            Example:
                input DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+
                | 2   | B       | C4      |
                +-----+---------+---------+

                >>> df.dedupByKey("id")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

                >>> df.dedupByKey("id", "product")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

            Returns:
                (DataFrame): a DataFrame without duplicates for the specified keys
        """
        jdf = self._jPythonHelper.smvDedupByKey(self._jdf, smv_copy_array(self._sc, *keys))
        return DataFrame(jdf, self._sql_ctx)
Example #16
    def get_ancestors(self, uri=None, version=None):
        """
        Returns a dataset of all ancestors.
        """
        df = DataFrame(self._jconcept_maps.getAncestors(), self._spark_session)

        if uri is not None:
            df = df.where(df.conceptMapUri == functions.lit(uri))

        if version is not None:
            df = df.where(df.conceptMapVersion == functions.lit(version))

        return df
Example #17
    def readDataset(self,
                    spark,
                    path,
                    delimiter="|",
                    outputPosCol="tags",
                    outputDocumentCol="text"):

        # ToDo Replace with std pyspark
        jSession = spark._jsparkSession

        jdf = self._java_obj.readDataset(jSession, path, delimiter,
                                         outputPosCol, outputDocumentCol)
        return DataFrame(jdf, spark._wrapped)
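If this is the part-of-speech training helper from spark-nlp, a usage sketch with a hypothetical corpus path would look like the following (an active SparkSession `spark` is assumed):

from sparknlp.training import POS

pos_df = POS().readDataset(spark, "anc-pos-corpus/training.txt", delimiter="|")
pos_df.show(5, truncate=False)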
Example #18
    def history(self, limit=None):
        """
        Get the information of the latest `limit` commits on this table as a Spark DataFrame.
        The information is in reverse chronological order.

        Example::

            fullHistoryDF = deltaTable.history()    # get the full history of the table

            lastOperationDF = deltaTable.history(1) # get the last operation

        :param limit: Optional, number of latest commits to return in the history.
        :return: Table's commit history. See the online Delta Lake documentation for more details.
        :rtype: pyspark.sql.DataFrame

        .. note:: Evolving
        """
        jdt = self._jdt
        if limit is None:
            return DataFrame(jdt.history(), self._spark._wrapped)
        else:
            return DataFrame(jdt.history(limit), self._spark._wrapped)
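Reusing the DeltaTable handle from the vacuum sketch above, a brief usage sketch; the selected columns are part of the standard history schema:

fullHistoryDF = deltaTable.history()
fullHistoryDF.select("version", "timestamp", "operation").show(truncate=False)

lastOperationDF = deltaTable.history(1)   # only the most recent commit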
Example #19
def _map(fetches, dframe, feed_dict, block, trim, initial_variables=_initial_variables_default):
    fetches = _check_fetches(fetches)
    # We are not dealing for now with registered expansions, but this is something we should add later.
    graph = _get_graph(fetches, initial_variables)
    if block:
        builder = _java_api().map_blocks(dframe._jdf, trim)
    else:
        builder = _java_api().map_rows(dframe._jdf)
    _add_graph(graph, builder)
    ph_names = _add_shapes(graph, builder, fetches)
    _add_inputs(builder, feed_dict, ph_names)
    jdf = builder.buildDF()
    return DataFrame(jdf, _sql)
Example #20
    def app4(self,
             df=None,
             groupByCol=None,
             aggCol=None,
             appColName=defaultCol):

        self.__mandatoryArgumentCheck(groupByCol, aggCol)

        if df is None: df = self._df

        return DataFrame(
            self._jApp.app4(df._jdf, groupByCol, aggCol, appColName),
            df.sql_ctx)
Example #21
def openCsv(path, validate=False):
    """Read in a CSV file as a DataFrame

        Args:
            path (str): The path of the CSV file
            validate (bool): If True, validate the CSV before returning the DataFrame (raises an error if malformed)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    app = SmvApp.getInstance()
    jdf = app.j_smvPyClient.shellOpenCsv(path, validate)
    return DataFrame(jdf, SmvApp.getInstance().sqlContext)
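A usage sketch calling the helper above; the file path is hypothetical and an initialized SmvApp is assumed:

df = openCsv("data/input/accounts.csv", validate=True)
df.show()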
Example #22
    def ImageCaption2Embedding(self, imageRootFolder, imageCaptionDF, vocab,
                               captionLength):
        """Get the embedding for the images as well as the captions as a DataFrame.

        :param imageRootFolder: the source folder of the images
        :param imageCaptionDF: the DataFrame with the images as well as the captions
        :param vocab: the vocab object
        :param captionLength: length of the embedding to generate for the caption
        """
        df = self.__dict__.get('conversions').ImageCaption2Embedding(
            imageRootFolder, imageCaptionDF._jdf, vocab.vocabObject,
            captionLength)
        pydf = DataFrame(df, self.__dict__.get('sqlContext'))
        return pydf
Example #23
def main():
    # SparkSession startup
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.3-rc2')
             .appName('constrain-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Constrain verification
    r = (
        VerificationSuite(spark)
        .onData(df)
        .addCheck(
            Check(spark, 'error', 'examples')
            .hasSize(lambda x: x == 8)
            .isUnique('_2')
            .hasCompleteness('_2', lambda x: x >= 0.75)
            .hasUniqueness('_1', lambda x: x == 3 / 8)
            .hasDistinctness('_1', lambda x: x == 5 / 8)
            .hasUniqueValueRatio('_2', lambda x: x == 0.8)
            .hasNumberOfDistinctValues('_2', lambda x: x == 6)
            # .hasHistogram
            .hasEntropy('_3', lambda x: x > 1)
            # .hasMutualInformation('_2', '_3', lambda x: x > 0.5)
            .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
            .hasMinLength('_1', lambda x: x == 6)
            .hasMaxLength('_3', lambda x: x == 10)
            .hasMin('_2', lambda x: x == 1)
            .hasMax('_2', lambda x: x == 20)
            .hasMean('_2', lambda x: x > 10)
            .hasSum('_2', lambda x: x > 50)
            .hasStandardDeviation('_2', lambda x: x > 5)
            .hasApproxCountDistinct('_2', lambda x: x == 5)
            .hasCorrelation('_2', '_5', lambda x: x == 1)
            .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
            # .hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
            # .hasDataType("_1", "string", lambda x: x == 1)
            .isPositive('_2')
            .isNonNegative('_2')
            .isLessThan('_5', '_2', lambda x: x == 0.375)
            .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
            .isGreaterThan('_5', '_2', lambda x: x == 0.125)
            .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
            # .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
            .isInInterval('_5', 1.0, 50.0)
        ).run()
    )
    df = DataFrame(r, spark)
    df.show(df.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
Example #24
    def predict(self, X):
        """
        Invokes the transform method on the Estimator object on the JVM if X is one of the supported data types

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
                pdfX = convertToPandasDF(X)
                df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
                retjDF = self.model.transform(df._jdf)
                retDF = DataFrame(retjDF, self.sqlCtx)
                retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
                if isinstance(X, np.ndarray):
                    return retPDF.as_matrix().flatten()
                else:
                    return retPDF
            else:
                retNumPy = convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
                if isinstance(X, np.ndarray):
                    return retNumPy
                else:
                    return retNumPy # TODO: Convert to Pandas
        elif hasattr(X, '_jdf'):
            if self.featuresCol in X.columns:
                # No need to assemble as input DF is likely coming via MLPipeline
                df = X
            else:
                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
                df = assembler.transform(X)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sqlCtx)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #25
def dedup_with_combiner(df, group_col, order_by_col, desc = True, columns_filter = [], columns_filter_keep = True):
    """
    Used to get the 'latest' record (after ordering according to the provided order columns) in each group.
    :param df: DataFrame to operate on
    :param group_col: column to group by the records
    :param order_by_col: column to order the records according to
    :param desc: have the order as desc
    :param columns_filter: columns to filter
    :param columns_filter_keep: indicates whether to filter the selected columns 'out' of the result or,
                                alternatively, to keep only those columns in the result
    :return: DataFrame representing the data after the operation
    """
    jdf = _get_utils(df).dedupWithCombiner(df._jdf, group_col._jc, order_by_col._jc, desc, columns_filter, columns_filter_keep)
    return DataFrame(jdf, df.sql_ctx)
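A usage sketch that exercises the helper above on made-up data. Note that group_col and order_by_col must be Column objects, since the wrapper dereferences ._jc:

from pyspark.sql import functions as F

events = spark.createDataFrame(
    [(1, "2024-01-01", "a"), (1, "2024-02-01", "b"), (2, "2024-01-15", "c")],
    ["user_id", "updated_at", "payload"])

latest = dedup_with_combiner(events, F.col("user_id"), F.col("updated_at"), desc=True)
latest.show()   # one row per user_id, keeping the most recent updated_at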
Example #26
def broadcast_join_skewed(not_skewed_df, skewed_df, join_col, number_of_custs_to_broadcast):
    """
    Suitable for performing a join when one DataFrame is skewed and the other is not.
    Splits both DataFrames into two parts according to the skewed keys.
    1. Map-join: broadcasts the skewed-keys part of the not skewed DF to the skewed-keys part of the skewed DF
    2. Regular join: between the remaining two parts.
    :param not_skewed_df: not skewed DataFrame
    :param skewed_df: skewed DataFrame
    :param join_col: join column
    :param number_of_custs_to_broadcast: number of custs to broadcast
    :return: DataFrame representing the data after the operation
    """
    jdf = _get_utils(skewed_df).broadcastJoinSkewed(not_skewed_df._jdf, skewed_df._jdf, join_col, number_of_custs_to_broadcast)
    return DataFrame(jdf, not_skewed_df.sql_ctx)
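A companion sketch for the skewed-join helper, again on made-up DataFrames. Passing join_col as a plain column-name string is an assumption based on the call above (it is not dereferenced with ._jc):

skewed = spark.createDataFrame([(1, "x")] * 1000 + [(2, "y")], ["cust_id", "v1"])
small = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["cust_id", "v2"])

joined = broadcast_join_skewed(small, skewed, "cust_id", 1)
joined.show()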
Example #27
def analyze(dframe):
    """ Analyzes a Spark DataFrame for the tensor content, and returns a new dataframe with extra metadata that
     describes the numerical shape of the content.

     This method is useful when a dataframe contains non-scalar tensors, for which the shape must be checked beforehand.

     Note: nullable fields are not accepted.

     The function [print_schema] lets users introspect the information added to the DataFrame.

    :param dframe: a Spark DataFrame
    :return: a Spark DataFrame with metadata information embedded.
    """
    return DataFrame(_java_api().analyze(dframe._jdf), _sql)
Example #28
    def fromJava(self, stuff):
        if stuff.__class__.__name__ == "JavaObject":
            clazz = stuff.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                stuff = stuff.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                return DataFrame(
                    stuff,
                    SQLContext(SparkContext.getOrCreate(), stuff.sqlContext()))
            elif clazz == "org.apache.spark.sql.SQLContext":
                return SQLContext(SparkContext.getOrCreate(), stuff)
        return stuff
Example #29
    def getDataFrame(self, *outputs):
        """
        Parameters
        ----------
        outputs: string, list of strings
            Output variables as defined inside the DML script.
        """
        outs = [
            DataFrame(self._java_results.getDataFrame(out),
                      MLResults.sqlContext) for out in outputs
        ]
        if len(outs) == 1:
            return outs[0]
        return outs
Example #30
async def spark_sql_exe(objs, st, p, tk):

    global __spark
    global __sqlContext

    for obj in objs:
        df = __spark.read.format("edu.berkeley.cs.rise.opaque.EncryptedSource") \
                                .schema(obj.schema) \
                                .load(obj.path)
        qdf = __spark._jvm.org.apache.spark.sql.QShieldDatasetFunctions(
            df._jdf)
        qdfAC = qdf.acPolicyApplied(tk)
        dfAC = DataFrame(qdfAC, __sqlContext)
        dfAC.createOrReplaceTempView(obj.name)

    dfsql = __spark.sql(st)

    qres = __spark._jvm.org.apache.spark.sql.QShieldDatasetFunctions(
        dfsql._jdf)
    qresPrep = qres.resPrepared()
    resPrep = DataFrame(qresPrep, __sqlContext)
    coll_fur = await asyncio.wrap_future(resPrep.collectAsync())
    return coll_fur