def __call__(self, head: RDD):
    """Write *head* to ``self.output`` as a single CSV file with a header row.

    The RDD is converted to a DataFrame and coalesced to one partition so
    that exactly one output file is produced; any existing output is
    overwritten.
    """
    self._log.info("Writing %s", self.output)
    writer = head.toDF().coalesce(1).write
    return writer.option("header", "true").mode("overwrite").csv(self.output)
def __convert_service_format(rdd: RDD) -> RDD:
    """Normalise raw service records for downstream use.

    Enriches rows with neighborhood data, hashes the key columns into id
    columns, and converts the ``opened`` / ``report_datetime`` timestamp
    strings into integer Unix epochs. An empty RDD is returned unchanged.
    """
    if rdd.isEmpty():
        return rdd

    df = rdd.toDF()
    df = add_neighborhoods(df)

    # Same timestamp layout for both date columns.
    ts_fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
    df = (
        df.withColumn("row_id", hasher(df["row_id"]))
        .withColumn("category_id", hasher(df["category"]))
        .withColumn(
            "opened",
            unix_timestamp(to_timestamp("opened", ts_fmt)).cast(IntegerType()),
        )
        .withColumn(
            "report_datetime",
            unix_timestamp(to_timestamp("report_datetime", ts_fmt)).cast(IntegerType()),
        )
        .withColumn("neighborhood_id", hasher(df["neighborhood"]))
    )
    return df.rdd
def __convert_service_format(rdd: RDD) -> RDD:
    """Normalise service records: add neighborhoods, hash keys, parse dates.

    Neighborhoods are derived from lat/lon because the API data is often
    missing them. The ``openedStr`` / ``updatedStr`` string columns are
    parsed into integer Unix epochs (``opened`` / ``updated``) and then
    dropped. An empty RDD is returned as-is.
    """
    if rdd.isEmpty():
        return rdd

    df = rdd.toDF()

    # Fill in neighborhoods from lat/lon — much of the API data lacks them.
    df = add_neighborhoods(df)

    # Both timestamp columns share one format string.
    ts_fmt = "yyyy-MM-dd'T'HH:mm:ss.SSS"
    df = (
        df.withColumn("category_id", hasher("category"))
        .withColumn("neighborhood_id", hasher("neighborhood"))
        .withColumn(
            "opened",
            unix_timestamp(to_timestamp("openedStr", ts_fmt)).cast(IntegerType()),
        )
        .withColumn(
            "updated",
            unix_timestamp(to_timestamp("updatedStr", ts_fmt)).cast(IntegerType()),
        )
        .drop("openedStr", "updatedStr")
    )
    return df.rdd
def __call__(self, rdd: RDD):
    """Persist *rdd* as Parquet at ``self.save_loc``.

    When ``self.explained`` is set, the RDD's lineage (``toDebugString``)
    is logged first to aid debugging.
    """
    if self.explained:
        lineage = rdd.toDebugString().decode()
        self._log.info("toDebugString():\n%s", lineage)
    rdd.toDF().write.parquet(self.save_loc)
def to_spark(rdd: RDD, init_condition: dict):
    """Return *rdd* as a Spark DataFrame.

    Each element is first passed through the type aligner built from
    *init_condition* so that column types are consistent before the
    DataFrame conversion.
    """
    aligned = rdd.map(align_type(init_condition))
    return aligned.toDF()