Example #1
    def __call__(self, head: RDD):
        # Convert the RDD to a DataFrame and write it out as a single CSV file with a header row.
        self._log.info("Writing %s", self.output)
        return head.toDF() \
            .coalesce(1) \
            .write \
            .option("header", "true") \
            .mode("overwrite") \
            .csv(self.output)
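
For context, RDD.toDF() requires an active SparkSession so the schema can be inferred from the RDD's records. The snippet below is a minimal, self-contained sketch of the same write pattern; the SparkSession, sample rows, and output path are illustrative assumptions, not part of the original project.

# Minimal sketch of the same pattern: RDD of Rows -> DataFrame -> single CSV file with header.
# The SparkSession, the sample data, and the output path are assumptions for illustration.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("rdd-todf-csv-sketch").getOrCreate()

rows = spark.sparkContext.parallelize([Row(id=1, name="alpha"), Row(id=2, name="beta")])

rows.toDF() \
    .coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("/tmp/example_csv")  # hypothetical output location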
Example #2
    def __convert_service_format(rdd: RDD) -> RDD:
        if rdd.isEmpty():
            return rdd

        df = rdd.toDF()

        # Fill in neighborhoods from lat/lon, since much of the API data is missing them
        df = add_neighborhoods(df)

        # Hash key columns into integer ids and parse the ISO-8601 timestamps
        # into Unix epoch seconds stored as integers
        df = df.withColumn("row_id", hasher(df["row_id"])) \
            .withColumn("category_id", hasher(df["category"])) \
            .withColumn("opened",
                        unix_timestamp(to_timestamp("opened", "yyyy-MM-dd'T'HH:mm:ss.SSS")).cast(
                            IntegerType())) \
            .withColumn("report_datetime",
                        unix_timestamp(to_timestamp("report_datetime", "yyyy-MM-dd'T'HH:mm:ss.SSS")).cast(
                            IntegerType())) \
            .withColumn("neighborhood_id", hasher(df["neighborhood"]))

        return df.rdd
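
hasher and add_neighborhoods are project-specific helpers whose code is not shown in this example. A plausible reading is that hasher is a UDF turning a string column into a stable integer key; the sketch below is only an assumption of what such a helper could look like, not the project's actual implementation.

# Hypothetical sketch of a hasher-style UDF: map a string column to a stable integer key.
# This is an assumption about what the hasher helper might do, not the project's real code.
import zlib

from pyspark.sql.functions import udf
from pyspark.sql.types import LongType

@udf(returnType=LongType())
def hasher(value):
    # crc32 gives a deterministic 32-bit key; None is passed through so missing data survives
    return zlib.crc32(value.encode("utf-8")) if value is not None else None

With a helper of this shape, hasher(df["category"]) produces an integer surrogate key column such as category_id.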
Example #3
    def __convert_service_format(rdd: RDD) -> RDD:
        if rdd.isEmpty():
            return rdd
        df = rdd.toDF()

        # Find neighborhoods from lat/lon
        # This is necessary, because a lot of the data from the API is missing neighborhood data
        df = add_neighborhoods(df)

        # Add key data and parse dates
        df = df.withColumn("category_id", hasher("category")) \
            .withColumn("neighborhood_id", hasher("neighborhood")) \
            .withColumn("opened",
                        unix_timestamp(to_timestamp("openedStr", "yyyy-MM-dd'T'HH:mm:ss.SSS")).cast(IntegerType())) \
            .withColumn("updated",
                        unix_timestamp(to_timestamp("updatedStr", "yyyy-MM-dd'T'HH:mm:ss.SSS")).cast(IntegerType())) \
            .drop("openedStr", "updatedStr")

        return df.rdd
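
The date handling in Examples #2 and #3 follows one pattern: parse an ISO-8601 string with to_timestamp, convert it to Unix epoch seconds with unix_timestamp, and cast the result to an integer. A self-contained sketch of just that step, using a made-up one-row DataFrame, could look like this.

# Standalone sketch of the timestamp conversion used above:
# ISO-8601 string -> timestamp -> Unix epoch seconds stored as an integer column.
# The SparkSession and the sample row are illustrative, not taken from the original project.
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, unix_timestamp
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("timestamp-parse-sketch").getOrCreate()

df = spark.createDataFrame([("2020-01-15T08:30:00.000",)], ["openedStr"])

df = df.withColumn(
    "opened",
    unix_timestamp(to_timestamp("openedStr", "yyyy-MM-dd'T'HH:mm:ss.SSS")).cast(IntegerType())
).drop("openedStr")

df.show()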
Example #4
    def __call__(self, rdd: RDD):
        # Optionally log the RDD's lineage, then persist it as Parquet via a DataFrame.
        if self.explained:
            self._log.info("toDebugString():\n%s", rdd.toDebugString().decode())
        rdd.toDF().write.parquet(self.save_loc)
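
In PySpark, RDD.toDebugString() returns the lineage as bytes, which is why it is decoded before being logged. A minimal standalone version of the same write step might look like the following; the SparkSession, sample data, and output path are placeholders, not the original project's values.

# Sketch of the same lineage-logging and Parquet write, outside the original class.
# The SparkSession, sample rows, and output path are assumptions for illustration.
from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.appName("rdd-todf-parquet-sketch").getOrCreate()

rdd = spark.sparkContext.parallelize([Row(id=1, value="a"), Row(id=2, value="b")])
print(rdd.toDebugString().decode())  # lineage of the RDD, returned as bytes in PySpark

rdd.toDF().write.mode("overwrite").parquet("/tmp/example_parquet")  # hypothetical path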
Example #5
def to_spark(rdd: RDD, init_condition: dict):
    # Map each record through the type aligner so toDF() sees consistent field types
    type_aligner = align_type(init_condition)
    rdd: RDD = rdd.map(type_aligner)
    return rdd.toDF()
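
align_type is not shown in this example; judging from how it is used, it seems to return a mapper that coerces each record's fields to consistent Python types before toDF() infers a schema. The sketch below is a hypothetical stand-in for that idea, not the original helper.

# Hypothetical stand-in for align_type: build a mapper that coerces each record's
# fields to the Python types found in a reference record (init_condition), so that
# schema inference in toDF() sees consistent types. Name and behaviour are assumptions.
def align_type(init_condition: dict):
    target_types = {key: type(value) for key, value in init_condition.items()}

    def aligner(record: dict) -> dict:
        return {
            key: target_types.get(key, type(value))(value) if value is not None else None
            for key, value in record.items()
        }

    return aligner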