def word_count_inline(text=parameter.data.txt[spark.DataFrame], counters=output.txt.data):
    # type: (spark.DataFrame, PathStr) -> spark.DataFrame
    """Count word occurrences in the first column of `text`.

    Saves the raw (word, count) pairs as a text file at `counters`,
    prints each pair on the driver, and returns the counts as a new
    Spark DataFrame.
    """
    from operator import add

    from dbnd_spark import get_spark_session

    # Only the first column of each row participates in the count.
    first_column = text.rdd.map(lambda row: row[0])

    word_pairs = first_column.flatMap(lambda line: line.split(" ")).map(
        lambda word: (word, 1)
    )
    counts = word_pairs.reduceByKey(add)

    # Persist the pairs to the task's output path (Spark wants a string path).
    counts.saveAsTextFile(str(counters))

    result_df = get_spark_session().createDataFrame(counts)

    # Bring the result to the driver just to log it.
    collected = counts.collect()
    for (word, count) in collected:
        print("%s: %i" % (word, count))

    return result_df
def data_to_value(self, data):
    """Turn column-major `data` into a Spark DataFrame with generated names.

    The input is transposed (columns become rows) and each resulting
    column is labeled ``test_column_<i>``.
    """
    session = get_spark_session()
    # zip(*data) transposes the nested sequence; materialize it for Spark.
    rows = list(zip(*data))
    # One generated name per field of the first (representative) row.
    names = ["test_column_" + str(idx) for idx in range(len(rows[0]))]
    return session.createDataFrame(rows, names)
def get_spark_session():
    """Return the active Spark session via the dbnd-spark plugin.

    Fails fast (before importing) when the plugin is not enabled.
    """
    assert_plugin_enabled("dbnd-spark")

    # Imported lazily so this module loads even without dbnd-spark installed.
    import dbnd_spark as _dbnd_spark

    return _dbnd_spark.get_spark_session()