Example 1
    def _task_run(self):
        super(_InlineSparkTask, self)._task_run()

        # Once the inline task body has finished, stop the shared Spark
        # session if the Spark controller is configured to do so.
        if self._get_spark_ctrl().stop_spark_session_on_finish:
            session = get_spark_session()
            logger.info("Stopping Spark session: %s", session)
            session.stop()
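
For context, outside of dbnd the same cleanup amounts to fetching or creating a SparkSession and stopping it when the work is done; the app name and placeholder workload below are illustrative only, not part of dbnd.

from pyspark.sql import SparkSession

# Minimal stand-alone sketch: create (or reuse) a session, run some work,
# and always stop the session at the end.
spark = SparkSession.builder.appName("cleanup-example").getOrCreate()
try:
    spark.range(10).count()  # placeholder workload
finally:
    spark.stop()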
Example 2
import pyspark.sql as spark

from dbnd import output, parameter


def word_count_inline(text=parameter.data.txt[spark.DataFrame],
                      counters=output.txt.data):
    # type: (spark.DataFrame, PathStr) -> spark.DataFrame
    from operator import add
    from dbnd._core.commands import get_spark_session

    # Classic word count over the input DataFrame's underlying RDD:
    # split each line into words and reduce by key to count occurrences.
    lines = text.rdd.map(lambda r: r[0])
    counts = (lines.flatMap(lambda x: x.split(" "))
              .map(lambda x: (x, 1))
              .reduceByKey(add))
    counts.saveAsTextFile(str(counters))

    # Turn the (word, count) pairs into a DataFrame and also print them.
    df = get_spark_session().createDataFrame(counts)
    collected = counts.collect()
    for (word, count) in collected:
        print("%s: %i" % (word, count))

    return df
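
To make the data flow clearer, here is a plain-PySpark sketch of the same word count without dbnd's parameter/output wiring; the input path "input.txt" and the column names are illustrative assumptions.

from operator import add

from pyspark.sql import SparkSession

session = SparkSession.builder.getOrCreate()
text_df = session.read.text("input.txt")  # one row per line of text

lines = text_df.rdd.map(lambda r: r[0])
counts = (lines.flatMap(lambda x: x.split(" "))
          .map(lambda x: (x, 1))
          .reduceByKey(add))

# Build a DataFrame of (word, count) pairs and display it.
word_count_df = session.createDataFrame(counts, ["word", "count"])
word_count_df.show()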
Example 3
    def target_to_value(self, target, **kwargs):
        # Load the target's path into a Spark DataFrame using the configured
        # file format; an optional "schema" kwarg is forwarded to load(), and
        # the kwargs are also passed through as reader options.
        path = _target_to_path(target)
        schema = kwargs.get("schema")
        return (get_spark_session().read.format(self.file_format)
                .options(**kwargs)
                .load(path, schema=schema))
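
For reference, the same call pattern with Spark's DataFrameReader used directly; the format, reader options, and path below are illustrative, not taken from dbnd.

from pyspark.sql import SparkSession

session = SparkSession.builder.getOrCreate()
df = (session.read.format("csv")
      .options(header="true", inferSchema="true")
      .load("/data/events.csv"))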