def __call__(self, head: RDD):
    """Write *head* to ``self.output`` as a single CSV file with a header row.

    The RDD is converted to a DataFrame and coalesced to one partition so
    that exactly one output file is produced; any existing output is
    overwritten.
    """
    self._log.info("Writing %s", self.output)
    writer = head.toDF().coalesce(1).write
    return writer.option("header", "true").mode("overwrite").csv(self.output)
def __convert_service_format(rdd: RDD) -> RDD:
    """Normalise raw service records for downstream use.

    Enriches rows with neighborhood data, hashes the key columns into id
    columns, and converts the ``opened`` / ``report_datetime`` timestamp
    strings into integer Unix epochs. An empty RDD is returned unchanged.
    """
    if rdd.isEmpty():
        return rdd

    df = rdd.toDF()
    df = add_neighborhoods(df)

    # Same timestamp layout for both date columns.
    ts_fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
    df = (
        df.withColumn("row_id", hasher(df["row_id"]))
        .withColumn("category_id", hasher(df["category"]))
        .withColumn(
            "opened",
            unix_timestamp(to_timestamp("opened", ts_fmt)).cast(IntegerType()),
        )
        .withColumn(
            "report_datetime",
            unix_timestamp(to_timestamp("report_datetime", ts_fmt)).cast(IntegerType()),
        )
        .withColumn("neighborhood_id", hasher(df["neighborhood"]))
    )
    return df.rdd
def __convert_service_format(rdd: RDD) -> RDD:
    """Normalise service records: add neighborhoods, hash keys, parse dates.

    Neighborhoods are derived from lat/lon because the API data is often
    missing them. The ``openedStr`` / ``updatedStr`` string columns are
    parsed into integer Unix epochs (``opened`` / ``updated``) and then
    dropped. An empty RDD is returned as-is.
    """
    if rdd.isEmpty():
        return rdd

    df = rdd.toDF()

    # Fill in neighborhoods from lat/lon — much of the API data lacks them.
    df = add_neighborhoods(df)

    # Both timestamp columns share one format string.
    ts_fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
    df = (
        df.withColumn("category_id", hasher("category"))
        .withColumn("neighborhood_id", hasher("neighborhood"))
        .withColumn(
            "opened",
            unix_timestamp(to_timestamp("openedStr", ts_fmt)).cast(IntegerType()),
        )
        .withColumn(
            "updated",
            unix_timestamp(to_timestamp("updatedStr", ts_fmt)).cast(IntegerType()),
        )
        .drop("openedStr", "updatedStr")
    )
    return df.rdd
def __call__(self, rdd: RDD):
    """Persist *rdd* as Parquet at ``self.save_loc``.

    When ``self.explained`` is set, the RDD's lineage (``toDebugString``)
    is logged first to aid debugging.
    """
    if self.explained:
        lineage = rdd.toDebugString().decode()
        self._log.info("toDebugString():\n%s", lineage)
    rdd.toDF().write.parquet(self.save_loc)
def to_spark(rdd: RDD, init_condition: dict):
    """Return *rdd* as a Spark DataFrame.

    Each element is first passed through the type aligner built from
    *init_condition* so that column types are consistent before the
    DataFrame conversion.
    """
    aligned = rdd.map(align_type(init_condition))
    return aligned.toDF()