Python RDD.isEmpty Examples

Programming Language: Python

Namespace/Package Name: pyspark

Class/Type: RDD

Method/Function: isEmpty

Examples at hotexamples.com: 4

Python RDD.isEmpty - 4 examples found. These are the top-rated real-world Python examples of pyspark.RDD.isEmpty, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

RDD(30)

map(30)

flatMap(16)

count(11)

mapPartitionsWithIndex(10)

getNumPartitions(9)

filter(9)

repartition(6)

mapPartitions(6)

toDF(5)

collect(5)

mapValues(5)

groupByKey(4)

isEmpty(4)

coalesce(3)

cache(3)

take(3)

toDebugString(2)

persist(2)

unpersist(2)

zip(2)

zipWithIndex(2)

__init__(2)

_reserialize(2)

first(2)

distinct(2)

join(2)

sum(1)

_to_java_object_rdd(1)

union(1)

cogroup(1)

countApproxDistinct(1)

sortByKey(1)

subtractByKey(1)

sortBy(1)

sample(1)

randomSplit(1)

foreach(1)

name(1)

groupBy(1)

keys(1)

Example #1

Show file

File: spark_consumer.py Project: eschizoid/jconf-2020

def process_rdd(time: time_, rdd: RDD) -> None:
    """Write a non-empty batch RDD to S3 as JSON; do nothing for an empty one.

    Args:
        time: Value rendered into the banner line logged for this batch.
        rdd: Batch of records; converted to a DataFrame with ``StringType()``
            passed as the schema.
    """
    # Guard clause: an empty batch produces no log line and no output.
    if rdd.isEmpty():
        return

    banner = "----------- %s -----------" % str(time)
    logging.info(banner)

    context = get_sql_context_instance(rdd.context)
    frame = context.createDataFrame(rdd, StringType())

    # NOTE(review): the date segment of the path comes from ``time_.strftime``,
    # not from the ``time`` argument -- confirm that is intended.
    destination = f"""s3a://jconf-2020/bronze/{time_.strftime("%Y-%m-%d")}/{reverse_current_time_millis()}"""
    frame.write.json(destination)

Example #2

Show file

 def __preprocessRdd(self, rdd: RDD):
     """Correct each element of *rdd*, strip tweet tags, and build a cleaned DataFrame.

     Args:
         rdd: RDD whose elements are passed through ``rddCorrector().correct``
             and then ``.replace`` (presumably strings -- confirm against the
             caller). May be ``None``.

     Returns:
         The result of ``CleanText().clean(...)`` applied to the converted
         DataFrame, or ``None`` when *rdd* is ``None`` or empty after
         correction.
     """
     # Guard before any use: rdd.map() below would raise AttributeError on
     # None, which made a later None check unreachable for that case.
     if rdd is None:
         return None

     rddc = rddCorrector()
     rdd = rdd.map(lambda line: rddc.correct(line))
     if rdd.isEmpty():
         return None

     # Remove the opening and closing tweet markers in a single pass.
     rdd = rdd.map(
         lambda line: line.replace("<tweet>", "").replace("</tweet>", "")
     )
     df = DataFrameWorks().convertDataFrame(rdd, self.__spark)
     return CleanText().clean(df, self.__spark)

Example #3

Show file

    def __convert_service_format(rdd: RDD) -> RDD:
        """Derive hashed id columns and integer Unix timestamps for a batch.

        An empty *rdd* is returned unchanged. Otherwise the RDD is converted
        to a DataFrame, passed through ``add_neighborhoods``, and then:

        * ``row_id`` is replaced by ``hasher`` of itself;
        * ``category_id`` and ``neighborhood_id`` are added as ``hasher`` of
          ``category`` and ``neighborhood``;
        * ``opened`` and ``report_datetime`` are parsed with the pattern
          ``yyyy-MM-dd'T'HH:mm:ss.SSS`` and replaced by ``unix_timestamp``
          values cast to ``IntegerType``.

        Returns:
            The RDD backing the transformed DataFrame.
        """
        if rdd.isEmpty():
            return rdd

        timestamp_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS"

        def _to_unix_int(column_name):
            # Parse the named column as a timestamp, then cast to an integer.
            parsed = to_timestamp(column_name, timestamp_pattern)
            return unix_timestamp(parsed).cast(IntegerType())

        enriched = add_neighborhoods(rdd.toDF())

        # Column references are taken from ``enriched`` (the frame as it was
        # before any withColumn call), matching how the chain is built.
        converted = (
            enriched
            .withColumn("row_id", hasher(enriched["row_id"]))
            .withColumn("category_id", hasher(enriched["category"]))
            .withColumn("opened", _to_unix_int("opened"))
            .withColumn("report_datetime", _to_unix_int("report_datetime"))
            .withColumn("neighborhood_id", hasher(enriched["neighborhood"]))
        )

        return converted.rdd

Example #4

Show file

    def __convert_service_format(rdd: RDD) -> RDD:
        """Add hashed key columns and integer Unix timestamps to a batch.

        An empty *rdd* is returned unchanged. Otherwise the RDD is converted
        to a DataFrame, neighborhoods are filled in via ``add_neighborhoods``,
        ``category_id`` / ``neighborhood_id`` are added with ``hasher``, and
        ``openedStr`` / ``updatedStr`` are parsed into ``opened`` / ``updated``
        before the two source string columns are dropped.

        Returns:
            The RDD backing the transformed DataFrame.
        """
        if rdd.isEmpty():
            return rdd

        iso_pattern = "yyyy-MM-dd'T'HH:mm:ss.SSS"

        # Neighborhoods are looked up from lat/lon because much of the data
        # coming from the API lacks neighborhood information.
        frame = add_neighborhoods(rdd.toDF())

        # Key columns.
        frame = frame.withColumn("category_id", hasher("category"))
        frame = frame.withColumn("neighborhood_id", hasher("neighborhood"))

        # Parsed dates: (target column, source string column).
        for target, source in (("opened", "openedStr"), ("updated", "updatedStr")):
            seconds = unix_timestamp(to_timestamp(source, iso_pattern)).cast(IntegerType())
            frame = frame.withColumn(target, seconds)

        return frame.drop("openedStr", "updatedStr").rdd