Example #1
    def stronglyConnectedComponents(self, maxIter):
        """
        Runs the strongly connected components algorithm on this graph.

        See Scala documentation for more details.

        :param maxIter: the number of iterations to run
        :return: DataFrame with new vertex column "component"
        """
        jdf = self._jvm_graph.stronglyConnectedComponents().maxIter(
            maxIter).run()
        return DataFrame(jdf, self._sqlContext)
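For orientation, here is a minimal usage sketch of this method through the public GraphFrames API. The SparkSession `spark` and the toy vertex/edge data are assumptions added for illustration, not part of the snippet above.

from graphframes import GraphFrame

vertices = spark.createDataFrame([("a",), ("b",), ("c",)], ["id"])
edges = spark.createDataFrame([("a", "b"), ("b", "a"), ("b", "c")],
                              ["src", "dst"])
g = GraphFrame(vertices, edges)

components = g.stronglyConnectedComponents(maxIter=10)  # wraps the call shown above
components.select("id", "component").show()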
Example #2
    def vacuum(self, retentionHours=None):
        """
        Recursively delete files and directories in the table that are not needed by the table for
        maintaining older versions up to the given retention threshold. This method will return an
        empty DataFrame on successful completion.

        Example::

            deltaTable.vacuum()     # vacuum files not required by versions more than 7 days old

            deltaTable.vacuum(100)  # vacuum files not required by versions more than 100 hours old

        :param retentionHours: Optional number of hours to retain history. If not specified, then the
                               default retention period of 168 hours (7 days) will be used.
        """
        jdt = self._jdt
        if retentionHours is None:
            return DataFrame(jdt.vacuum(), self._spark._wrapped)
        else:
            return DataFrame(jdt.vacuum(float(retentionHours)),
                             self._spark._wrapped)
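A minimal usage sketch, assuming the delta-spark package, an active SparkSession `spark`, and a hypothetical table path. Note that current Delta Lake releases refuse a retention shorter than the 168-hour default unless spark.databricks.delta.retentionDurationCheck.enabled is set to false.

from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")  # hypothetical path
deltaTable.vacuum()     # use the default 168-hour (7-day) retention
deltaTable.vacuum(200)  # keep only the last 200 hours of history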
Example #3
    def inDegrees(self):
        """
        The in-degree of each vertex in the graph, returned as a DataFrame with two columns:
         - "id": the ID of the vertex
         - "inDegree" (int) storing the in-degree of the vertex

        Note that vertices with 0 in-edges are not returned in the result.

        :return:  DataFrame with new vertices column "inDegree"
        """
        jdf = self._jvm_graph.inDegrees()
        return DataFrame(jdf, self._sqlContext)
Example #4
    def degrees(self):
        """
        The degree of each vertex in the graph, returned as a DataFrame with two columns:
         - "id": the ID of the vertex
         - "degree" (integer): the degree of the vertex

        Note that vertices with 0 edges are not returned in the result.

        :return:  DataFrame with new vertices column "degree"
        """
        jdf = self._jvm_graph.degrees()
        return DataFrame(jdf, self._sqlContext)
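A short usage sketch covering both degree accessors, reusing the GraphFrame `g` from the earlier sketch. In the released graphframes package these accessors are exposed as @property attributes (the decorators are simply not shown in the snippets above).

g.inDegrees.show()   # columns "id" and "inDegree"; vertices with no in-edges are omitted
g.degrees.show()     # columns "id" and "degree"; isolated vertices are omitted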
Example #5
    def Embedding2Caption(self, embeddingDF, vocab, embeddingColumn,
                          captionColumn):
        """Get the captions from the embeddings.

        :param embeddingDF: the DataFrame which contains the embedding
        :param vocab: the vocab object
        :param embeddingColumn: the column name in embeddingDF which contains the caption embedding
        :param captionColumn: the caption column name
        """
        df = self.__dict__.get('conversions').Embedding2Caption(
            embeddingDF._jdf, vocab.vocabObject, embeddingColumn,
            captionColumn)
        pydf = DataFrame(df, self.__dict__.get('sqlContext'))
        return pydf
Example #6
    def successMetricsAsDataFrame(self):
        try:
            df = self.jvmAnalyzerContext.successMetricsAsDataFrame(
                self._jsparkSession, self.jvmAnalyzerContext,
                getattr(self.jvmAnalyzerContext,
                        "successMetricsAsDataFrame$default$3")())
            out = DataFrame(df, self.spark)
            return out
        except Exception:
            self.spark.sparkContext._gateway.close()
            self.spark.stop()
            raise AttributeError
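The snippet above appears to wrap Deequ's AnalyzerContext. As a hedged sketch, the equivalent flow with the published pydeequ package (assuming a SparkSession `spark` and an input DataFrame `df`; the analyzed column name is illustrative) looks roughly like this:

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Size, Completeness

result = (AnalysisRunner(spark)
          .onData(df)
          .addAnalyzer(Size())
          .addAnalyzer(Completeness("review_id"))
          .run())
metrics_df = AnalyzerContext.successMetricsAsDataFrame(spark, result)
metrics_df.show()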
Example #7
    def countKmers(self, kmerLength):
        """
        Cuts reads into _k_-mers, and then counts the number of occurrences of each _k_-mer.

        :param int kmerLength: The value of _k_ to use for cutting _k_-mers.
        :return: Returns a DataFrame containing k-mer/count pairs.
        :rtype: DataFrame containing "kmer" string and "count" long.
        """

        return DataFrame(
            self._jvmRdd.countKmersAsDataset(kmerLength).toDF(),
            SQLContext(self.sc))
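A usage sketch under the assumption that this method lives on ADAM's alignment dataset wrapper (bdgenomics.adam). The input path is hypothetical, and the ADAMContext constructor has taken either a SparkSession or a SparkContext depending on the ADAM release, so treat this as a sketch rather than a reference:

from bdgenomics.adam.adamContext import ADAMContext

ac = ADAMContext(spark)                  # may be a SparkContext in older releases
reads = ac.loadAlignments("sample.bam")  # hypothetical input file
kmers = reads.countKmers(21)             # DataFrame with "kmer" and "count" columns
kmers.orderBy(kmers["count"].desc()).show()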
Example #8
    def fromJava(self, stuff):
        if stuff.__class__.__name__ == "JavaObject":
            clazz = stuff.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                stuff = stuff.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                if Environment.sparkVersion == 1:
                    return DataFrame(
                        stuff,
                        SQLContext(SparkContext.getOrCreate(),
                                   stuff.sqlContext()))
                else:
                    return DataFrame(
                        stuff,
                        ShellAccess.SparkSession.builder.getOrCreate()._wrapped
                    )
            elif clazz == "org.apache.spark.sql.SQLContext":
                return SQLContext(SparkContext.getOrCreate(), stuff)
        return stuff
Example #9
    def aggProfiles(self,
                    datetime_ts: Optional[datetime] = None,
                    timestamp_ms: Optional[int] = None) -> DataFrame:  # noqa
        if datetime_ts is not None:
            timestamp_ms = int(datetime_ts.timestamp() * 1000)
        elif timestamp_ms is None:
            timestamp_ms = int(
                datetime.now(tz=timezone.utc).timestamp() * 1000)

        jdf = self._create_j_session().aggProfiles(timestamp_ms)

        return DataFrame(jdf=jdf, sql_ctx=self._df.sql_ctx)
Example #10
    def executeCompaction(self) -> DataFrame:
        """
        Compact the small files in selected partitions.

        :return: DataFrame containing the OPTIMIZE execution metrics
        :rtype: pyspark.sql.DataFrame
        """
        return DataFrame(
            self._jbuilder.executeCompaction(),
            getattr(self._spark, "_wrapped",
                    self._spark)  # type: ignore[attr-defined]
        )
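A usage sketch assuming delta-spark 2.x, where DeltaTable.optimize() returns the builder whose executeCompaction() is wrapped above; the table path and partition filter are illustrative:

from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")   # hypothetical path
metrics = (deltaTable.optimize()
           .where("date = '2024-01-01'")                     # optional partition filter
           .executeCompaction())
metrics.show(truncate=False)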
Example #11
    def _convertJavaOutputToPythonObject(self, X, output):
        """
        Converts a Java-side object (either MatrixBlock or Java DataFrame) to a Python object (based on the type of X).

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        output: a java-side object (either MatrixBlock or Java DataFrame)
        """
        if isinstance(X, SUPPORTED_TYPES) and self.transferUsingDF:
            retDF = DataFrame(output, self.sparkSession)
            retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
            return retPDF.as_matrix().flatten() if isinstance(X, np.ndarray) else retPDF
        elif isinstance(X, SUPPORTED_TYPES):
            return convertToNumPyArr(self.sc, output)
        elif hasattr(X, '_jdf'):
            retDF = DataFrame(output, self.sparkSession)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #12
def streamBingImages(self,
                     searchTerms,
                     key,
                     url,
                     batchSize=10,
                     imgsPerBatch=10):
    ctx = SparkContext.getOrCreate()
    reader = ctx._jvm.org.apache.spark.sql.execution.streaming.BingImageSource
    sql_ctx = pyspark.SQLContext.getOrCreate(ctx)
    jsession = sql_ctx.sparkSession._jsparkSession
    jdf = reader(searchTerms, key, url, batchSize, imgsPerBatch).load(jsession)
    return DataFrame(jdf, sql_ctx)
Example #13
    def triplets(self):
        """
        The triplets: (source vertex)-[edge]->(destination vertex) for all edges in the graph.
        Returned as a :class:`DataFrame` with three columns:
         - 'src': source vertex with schema matching 'vertices'
         - 'edge': edge with schema matching 'edges'
         - 'dst': destination vertex with schema matching 'vertices'

        :return:  DataFrame with columns 'src', 'edge', and 'dst'
        """
        jdf = self._jvm_graph.triplets()
        return DataFrame(jdf, self._sqlContext)
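Reusing the GraphFrame `g` from the first sketch (triplets is likewise a property in the released package), a short usage sketch; the filter predicate refers to the nested edge struct and is purely illustrative:

g.triplets.show()                           # struct columns "src", "edge", "dst"
g.triplets.filter("edge.src = 'a'").show()  # filter on a field of the edge struct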
Example #14
def _java2py(gateway, r, encoding="bytes"):
    if isinstance(r, JavaObject):
        clsName = r.getClass().getSimpleName()
        # convert RDD into JavaRDD
        if clsName != 'JavaRDD' and clsName.endswith("RDD"):
            r = r.toJavaRDD()
            clsName = 'JavaRDD'

        if clsName == 'JavaRDD':
            jrdd = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.javaToPython(
                r)
            return RDD(jrdd, get_spark_context())

        if clsName == 'DataFrame':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == 'Dataset':
            return DataFrame(r, get_spark_sql_context(get_spark_context()))

        if clsName == "ImageFrame[]":
            return r

        if clsName in _picklable_classes:
            r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                r)
        elif isinstance(r, (JavaArray, JavaList)) and len(r) != 0 \
                and isinstance(r[0], JavaObject) \
                and r[0].getClass().getSimpleName() in ['DataFrame', 'Dataset']:
            spark = get_spark_sql_context(get_spark_context())
            r = list(map(lambda x: DataFrame(x, spark), r))
        elif isinstance(r, (JavaArray, JavaList, JavaMap)):
            try:
                r = gateway.jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.dumps(
                    r)
            except Py4JJavaError:
                pass  # not pickable

        if isinstance(r, (bytearray, bytes)):
            r = PickleSerializer().loads(bytes(r), encoding=encoding)
    return r
Example #15
    def smvDedupByKey(self, *keys):
        """Remove duplicate records from the DataFrame by arbitrarly selecting the first record from a set of records with same primary key or key combo.

            Args:
                keys (\*string or \*Column): the column names or Columns on which to apply dedup

            Example:
                input DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+
                | 2   | B       | C4      |
                +-----+---------+---------+

                >>> df.dedupByKey("id")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

                >>> df.dedupByKey("id", "product")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

            Returns:
                (DataFrame): a DataFrame without duplicates for the specified keys
        """
        jdf = self._jPythonHelper.smvDedupByKey(self._jdf, smv_copy_array(self._sc, *keys))
        return DataFrame(jdf, self._sql_ctx)
Example #16
    def get_ancestors(self, uri=None, version=None):
        """
        Returns a dataset of all ancestors.
        """
        df = DataFrame(self._jconcept_maps.getAncestors(), self._spark_session)

        if uri is not None:
            df = df.where(df.conceptMapUri == functions.lit(uri))

        if version is not None:
            df = df.where(df.conceptMapVersion == functions.lit(version))

        return df
Example #17
    def readDataset(self,
                    spark,
                    path,
                    delimiter="|",
                    outputPosCol="tags",
                    outputDocumentCol="text"):

        # ToDo Replace with std pyspark
        jSession = spark._jsparkSession

        jdf = self._java_obj.readDataset(jSession, path, delimiter,
                                         outputPosCol, outputDocumentCol)
        return DataFrame(jdf, spark._wrapped)
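If this is the part-of-speech training helper from spark-nlp, a usage sketch with a hypothetical corpus path would look like the following (an active SparkSession `spark` is assumed):

from sparknlp.training import POS

pos_df = POS().readDataset(spark, "anc-pos-corpus/training.txt", delimiter="|")
pos_df.show(5, truncate=False)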
Example #18
    def history(self, limit=None):
        """
        Get the information of the latest `limit` commits on this table as a Spark DataFrame.
        The information is in reverse chronological order.

        Example::

            fullHistoryDF = deltaTable.history()    # get the full history of the table

            lastOperationDF = deltaTable.history(1) # get the last operation

        :param limit: Optional, number of latest commits to return in the history.
        :return: Table's commit history. See the online Delta Lake documentation for more details.
        :rtype: pyspark.sql.DataFrame

        .. note:: Evolving
        """
        jdt = self._jdt
        if limit is None:
            return DataFrame(jdt.history(), self._spark._wrapped)
        else:
            return DataFrame(jdt.history(limit), self._spark._wrapped)
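Reusing the DeltaTable handle from the vacuum sketch above, a brief usage sketch; the selected columns are part of the standard history schema:

fullHistoryDF = deltaTable.history()
fullHistoryDF.select("version", "timestamp", "operation").show(truncate=False)

lastOperationDF = deltaTable.history(1)   # only the most recent commit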
Example #19
def _map(fetches, dframe, feed_dict, block, trim, initial_variables=_initial_variables_default):
    fetches = _check_fetches(fetches)
    # We are not dealing for now with registered expansions, but this is something we should add later.
    graph = _get_graph(fetches, initial_variables)
    if block:
        builder = _java_api().map_blocks(dframe._jdf, trim)
    else:
        builder = _java_api().map_rows(dframe._jdf)
    _add_graph(graph, builder)
    ph_names = _add_shapes(graph, builder, fetches)
    _add_inputs(builder, feed_dict, ph_names)
    jdf = builder.buildDF()
    return DataFrame(jdf, _sql)
Example #20
    def app4(self,
             df=None,
             groupByCol=None,
             aggCol=None,
             appColName=defaultCol):

        self.__mandatoryArgumentCheck(groupByCol, aggCol)

        if df is None: df = self._df

        return DataFrame(
            self._jApp.app4(df._jdf, groupByCol, aggCol, appColName),
            df.sql_ctx)
Example #21
def openCsv(path, validate=False):
    """Read in a CSV file as a DataFrame

        Args:
            path (str): The path of the CSV file
            validate (bool): If True, validate the CSV before returning the DataFrame (raises an error if malformed)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    app = SmvApp.getInstance()
    jdf = app.j_smvPyClient.shellOpenCsv(path, validate)
    return DataFrame(jdf, SmvApp.getInstance().sqlContext)
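A usage sketch calling the helper above; the file path is hypothetical and an initialized SmvApp is assumed:

df = openCsv("data/input/accounts.csv", validate=True)
df.show()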
Example #22
    def ImageCaption2Embedding(self, imageRootFolder, imageCaptionDF, vocab,
                               captionLength):
        """Get the embedding for the images as well as the captions as a DataFrame.

        :param imageRootFolder: the source folder of the images
        :param imageCaptionDF: the DataFrame with the images as well as the captions
        :param vocab: the vocab object
        :param captionLength: length of the embedding to generate for the caption
        """
        df = self.__dict__.get('conversions').ImageCaption2Embedding(
            imageRootFolder, imageCaptionDF._jdf, vocab.vocabObject,
            captionLength)
        pydf = DataFrame(df, self.__dict__.get('sqlContext'))
        return pydf
Example #23
def main():
    # SparkSession startup
    spark = (SparkSession.builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.3-rc2')
             .appName('constrain-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Constrain verification
    r = (
        VerificationSuite(spark)
        .onData(df)
        .addCheck(
            Check(spark, 'error', 'examples')
            .hasSize(lambda x: x == 8)
            .isUnique('_2')
            .hasCompleteness('_2', lambda x: x >= 0.75)
            .hasUniqueness('_1', lambda x: x == 3 / 8)
            .hasDistinctness('_1', lambda x: x == 5 / 8)
            .hasUniqueValueRatio('_2', lambda x: x == 0.8)
            .hasNumberOfDistinctValues('_2', lambda x: x == 6)
            # .hasHistogram
            .hasEntropy('_3', lambda x: x > 1)
            # .hasMutualInformation('_2', '_3', lambda x: x > 0.5)
            .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
            .hasMinLength('_1', lambda x: x == 6)
            .hasMaxLength('_3', lambda x: x == 10)
            .hasMin('_2', lambda x: x == 1)
            .hasMax('_2', lambda x: x == 20)
            .hasMean('_2', lambda x: x > 10)
            .hasSum('_2', lambda x: x > 50)
            .hasStandardDeviation('_2', lambda x: x > 5)
            .hasApproxCountDistinct('_2', lambda x: x == 5)
            .hasCorrelation('_2', '_5', lambda x: x == 1)
            .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
            # .hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
            # .hasDataType("_1", "string", lambda x: x == 1)
            .isPositive('_2')
            .isNonNegative('_2')
            .isLessThan('_5', '_2', lambda x: x == 0.375)
            .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
            .isGreaterThan('_5', '_2', lambda x: x == 0.125)
            .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
            # .isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
            .isInInterval('_5', 1.0, 50.0)
        ).run()
    )
    df = DataFrame(r, spark)
    df.show(df.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
Example #24
    def predict(self, X):
        """
        Invokes the transform method on the Estimator object on the JVM if X is one of the supported data types

        Parameters
        ----------
        X: NumPy ndarray, Pandas DataFrame, scipy sparse matrix or PySpark DataFrame
        """
        if isinstance(X, SUPPORTED_TYPES):
            if self.transferUsingDF:
                pdfX = convertToPandasDF(X)
                df = assemble(self.sqlCtx, pdfX, pdfX.columns, self.featuresCol).select(self.featuresCol)
                retjDF = self.model.transform(df._jdf)
                retDF = DataFrame(retjDF, self.sqlCtx)
                retPDF = retDF.sort('__INDEX').select('prediction').toPandas()
                if isinstance(X, np.ndarray):
                    return retPDF.as_matrix().flatten()
                else:
                    return retPDF
            else:
                retNumPy = convertToNumPyArr(self.sc, self.model.transform(convertToMatrixBlock(self.sc, X)))
                if isinstance(X, np.ndarray):
                    return retNumPy
                else:
                    return retNumPy # TODO: Convert to Pandas
        elif hasattr(X, '_jdf'):
            if self.featuresCol in X.columns:
                # No need to assemble as input DF is likely coming via MLPipeline
                df = X
            else:
                assembler = VectorAssembler(inputCols=X.columns, outputCol=self.featuresCol)
                df = assembler.transform(X)
            retjDF = self.model.transform(df._jdf)
            retDF = DataFrame(retjDF, self.sqlCtx)
            # Return DF
            return retDF.sort('__INDEX')
        else:
            raise Exception('Unsupported input type')
Example #25
def dedup_with_combiner(df, group_col, order_by_col, desc = True, columns_filter = [], columns_filter_keep = True):
    """
    Used to get the 'latest' record (after ordering according to the provided order columns) in each group.
    :param df: DataFrame to operate on
    :param group_col: column to group by the records
    :param order_by_col: column to order the records according to
    :param desc: have the order as desc
    :param columns_filter: columns to filter
    :param columns_filter_keep: indicates whether to filter the selected columns 'out' of the result or,
                                alternatively, to keep only those columns in the result
    :return: DataFrame representing the data after the operation
    """
    jdf = _get_utils(df).dedupWithCombiner(df._jdf, group_col._jc, order_by_col._jc, desc, columns_filter, columns_filter_keep)
    return DataFrame(jdf, df.sql_ctx)
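A usage sketch that exercises the helper above on made-up data. Note that group_col and order_by_col must be Column objects, since the wrapper dereferences ._jc:

from pyspark.sql import functions as F

events = spark.createDataFrame(
    [(1, "2024-01-01", "a"), (1, "2024-02-01", "b"), (2, "2024-01-15", "c")],
    ["user_id", "updated_at", "payload"])

latest = dedup_with_combiner(events, F.col("user_id"), F.col("updated_at"), desc=True)
latest.show()   # one row per user_id, keeping the most recent updated_at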
Example #26
def broadcast_join_skewed(not_skewed_df, skewed_df, join_col, number_of_custs_to_broadcast):
    """
    Suitable for performing a join when one DataFrame is skewed and the other is not.
    Splits both DataFrames into two parts according to the skewed keys.
    1. Map-join: broadcasts the skewed-keys part of the not skewed DF to the skewed-keys part of the skewed DF
    2. Regular join: between the remaining two parts.
    :param not_skewed_df: not skewed DataFrame
    :param skewed_df: skewed DataFrame
    :param join_col: join column
    :param number_of_custs_to_broadcast: number of custs to broadcast
    :return: DataFrame representing the data after the operation
    """
    jdf = _get_utils(skewed_df).broadcastJoinSkewed(not_skewed_df._jdf, skewed_df._jdf, join_col, number_of_custs_to_broadcast)
    return DataFrame(jdf, not_skewed_df.sql_ctx)
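A companion sketch for the skewed-join helper, again on made-up DataFrames. Passing join_col as a plain column-name string is an assumption based on the call above (it is not dereferenced with ._jc):

skewed = spark.createDataFrame([(1, "x")] * 1000 + [(2, "y")], ["cust_id", "v1"])
small = spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")], ["cust_id", "v2"])

joined = broadcast_join_skewed(small, skewed, "cust_id", 1)
joined.show()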
Example #27
def analyze(dframe):
    """ Analyzes a Spark DataFrame for the tensor content, and returns a new dataframe with extra metadata that
     describes the numerical shape of the content.

     This method is useful when a dataframe contains non-scalar tensors, for which the shape must be checked beforehand.

     Note: nullable fields are not accepted.

     The function [print_schema] lets users introspect the information added to the DataFrame.

    :param dframe: a Spark DataFrame
    :return: a Spark DataFrame with metadata information embedded.
    """
    return DataFrame(_java_api().analyze(dframe._jdf), _sql)
Example #28
    def fromJava(self, stuff):
        if stuff.__class__.__name__ == "JavaObject":
            clazz = stuff.getClass().getName()
            if clazz == "org.apache.spark.sql.Dataset":
                stuff = stuff.toDF()
                clazz = "org.apache.spark.sql.DataFrame"

            if clazz == "org.apache.spark.sql.DataFrame":
                return DataFrame(
                    stuff,
                    SQLContext(SparkContext.getOrCreate(), stuff.sqlContext()))
            elif clazz == "org.apache.spark.sql.SQLContext":
                return SQLContext(SparkContext.getOrCreate(), stuff)
        return stuff
Example #29
    def getDataFrame(self, *outputs):
        """
        Parameters
        ----------
        outputs: string, list of strings
            Output variables as defined inside the DML script.
        """
        outs = [
            DataFrame(self._java_results.getDataFrame(out),
                      MLResults.sqlContext) for out in outputs
        ]
        if len(outs) == 1:
            return outs[0]
        return outs
Example #30
async def spark_sql_exe(objs, st, p, tk):

    global __spark
    global __sqlContext

    for obj in objs:
        df = __spark.read.format("edu.berkeley.cs.rise.opaque.EncryptedSource") \
                                .schema(obj.schema) \
                                .load(obj.path)
        qdf = __spark._jvm.org.apache.spark.sql.QShieldDatasetFunctions(
            df._jdf)
        qdfAC = qdf.acPolicyApplied(tk)
        dfAC = DataFrame(qdfAC, __sqlContext)
        dfAC.createOrReplaceTempView(obj.name)

    dfsql = __spark.sql(st)

    qres = __spark._jvm.org.apache.spark.sql.QShieldDatasetFunctions(
        dfsql._jdf)
    qresPrep = qres.resPrepared()
    resPrep = DataFrame(qresPrep, __sqlContext)
    coll_fur = await asyncio.wrap_future(resPrep.collectAsync())
    return coll_fur