Ejemplo n.º 1
0
def word_count_inline(text=parameter.data.txt[spark.DataFrame],
                      counters=output.txt.data):
    # type:  (spark.DataFrame, PathStr) -> spark.DataFrame
    """Count word occurrences in the first column of ``text``.

    The per-word counts are written as a text file to the ``counters``
    output path, printed to stdout, and also returned as a new Spark
    DataFrame of (word, count) rows.
    """
    from operator import add
    from dbnd_spark import get_spark_session

    # Take only the first column of each row as the line of text.
    lines = text.rdd.map(lambda r: r[0])
    counts = (lines.flatMap(lambda x: x.split(" ")).map(
        lambda x: (x, 1)).reduceByKey(add))
    # Persist the raw (word, count) pairs to the task's output target.
    counts.saveAsTextFile(str(counters))
    df = get_spark_session().createDataFrame(counts)
    # NOTE(review): renamed from `output` — the old name shadowed the
    # dbnd `output` object used in the signature default above.
    collected_counts = counts.collect()
    for word, count in collected_counts:
        print(f"{word}: {count}")

    return df
Ejemplo n.º 2
0
 def data_to_value(self, data):
     """Build a Spark DataFrame from row-oriented ``data``.

     Rows are transposed into columns and each column is given a
     synthetic name of the form ``test_column_<i>``.
     """
     columns = list(zip(*data))
     # One generated name per column of the (transposed) data.
     names = ["test_column_" + str(idx) for idx in range(len(columns[0]))]
     session = get_spark_session()
     return session.createDataFrame(columns, names)
Ejemplo n.º 3
0
def get_spark_session():
    """Return the current Spark session via the dbnd-spark plugin.

    Raises if the ``dbnd-spark`` plugin is not enabled.
    """
    assert_plugin_enabled("dbnd-spark")

    # Import lazily so the module loads even without dbnd-spark installed.
    from dbnd_spark import get_spark_session as _plugin_get_spark_session

    return _plugin_get_spark_session()