def _task_run(self):
    """Run the task, then stop the Spark session if the controller asks for it.

    Delegates the actual work to the parent class; afterwards, when the
    spark controller is configured with ``stop_spark_session_on_finish``,
    fetches the active session and stops it so resources are released.
    """
    super(_InlineSparkTask, self)._task_run()
    if self._get_spark_ctrl().stop_spark_session_on_finish:
        session = get_spark_session()
        # Bug fix: the original log call had a "%s" placeholder but passed
        # no argument, so the message printed a literal "%s".
        logger.info("Stopping spark session: %s", session)
        session.stop()
def word_count_inline(text=parameter.data.txt[spark.DataFrame], counters=output.txt.data):
    # type: (spark.DataFrame, PathStr) -> spark.DataFrame
    """Count word occurrences in *text* and persist them.

    The counts are written as a text file to *counters*, echoed to stdout,
    and returned as a Spark DataFrame.
    """
    from operator import add
    from dbnd._core.commands import get_spark_session

    # First column of each row holds the raw line of text.
    raw_lines = text.rdd.map(lambda row: row[0])
    tokenized = raw_lines.flatMap(lambda line: line.split(" "))
    word_counts = tokenized.map(lambda word: (word, 1)).reduceByKey(add)
    word_counts.saveAsTextFile(str(counters))

    result_df = get_spark_session().createDataFrame(word_counts)
    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))
    return result_df
def target_to_value(self, target, **kwargs):
    """Load *target* into a Spark DataFrame using this marshaller's file format.

    An optional ``schema`` keyword is forwarded to the reader's ``load`` call;
    all keyword arguments (including ``schema``, if present) are also passed
    through as reader options, matching the original behavior.
    """
    path = _target_to_path(target)
    schema = kwargs.get("schema")
    reader = get_spark_session().read.format(self.file_format)
    return reader.options(**kwargs).load(path, schema=schema)