Code example #1
File: readSpark.py  Project: par1321633/spark-basics
from pyspark.sql import SparkSession

# Project helpers; assumed to live in the project's lib.utils,
# mirroring the imports in code example #2 below.
from lib.utils import get_cmdline_options, get_spark_app_config, read_df


def main():
    print("Starting Spark Read CSV Program")
    options = get_cmdline_options()
    conf = get_spark_app_config(options.job_name)
    print("Create Spark Session")

    spark = SparkSession\
        .builder\
        .config(conf=conf)\
        .getOrCreate()

    # Fetch the effective runtime configuration (uncomment to inspect it).
    conf_out = spark.sparkContext.getConf()
    # print(conf_out.toDebugString())

    # spark.read returns a DataFrameReader object, the gateway for reading
    # data into Apache Spark; read_df wraps it (see the sketch after this listing).
    df = read_df(spark, filename=options.file_name)
    # df.show()

    # Repartition so the transformations below can run in parallel.
    partitioned_df = df.repartition(2)

    # Transformations: filter, project, then aggregate by country.
    cnt_df = partitioned_df.where("Age < 40") \
        .select("Age", "Gender", "Country", "state") \
        .groupBy("Country") \
        .count()

    print("Length of Output", len(cnt_df.collect()))
    spark.stop()


if __name__ == "__main__":
    main()
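The helper functions above (get_cmdline_options, get_spark_app_config, read_df) are project code that this listing does not show. A minimal sketch of what they might look like, assuming argparse-based flags, a spark.conf settings file, and a header-bearing CSV; the flag names, section name, and reader options are illustrative assumptions, not confirmed by the source:

import argparse
import configparser

from pyspark import SparkConf


def get_cmdline_options():
    # Hypothetical CLI flags; the real project may name them differently.
    parser = argparse.ArgumentParser()
    parser.add_argument("--job_name", default="ReadSparkCSV")
    parser.add_argument("--file_name", required=True)
    return parser.parse_args()


def get_spark_app_config(app_name):
    # Assumed: extra settings live in a spark.conf file under one section.
    conf = SparkConf().setAppName(app_name)
    config = configparser.ConfigParser()
    config.read("spark.conf")
    for key, value in config.items("SPARK_APP_CONFIGS"):
        conf.set(key, value)
    return conf


def read_df(spark, filename):
    # spark.read yields a DataFrameReader; header/inferSchema are assumed options.
    return spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(filename)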
Code example #2
import sys

from pyspark.sql import SparkSession
from lib.logger import Log4J
from lib.utils import get_spark_app_config, load_data_file, count_by_country

if __name__ == "__main__":
    conf = get_spark_app_config()
    # appName and master are expected to come from get_spark_app_config();
    # they could also be set inline on the builder:
    #   .appName("Hello Spark")
    #   .master("local[3]")
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

    logger = Log4J(spark)
    # logger.info("Starting Hellospark")

    sample_df = load_data_file(spark=spark, datafile=sys.argv[1])

    logger.info("Using Spark Dataframe (like ORM)")
    partitioned_df = sample_df.repartition(2)
    counted_df = count_by_country(df=partitioned_df)
    # counted_df.show()
    logger.info(counted_df.collect())

    logger.info("Using Spark SQL")
    # createOrReplaceTempView returns None, so its result should not be assigned.
    sample_df.createOrReplaceTempView("sample_tbl")
    counted_df = spark.sql(
        "SELECT Country, COUNT(1) AS count FROM sample_tbl WHERE Age < 40 GROUP BY Country"
    )
    logger.info(counted_df.collect())

    spark.stop()
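The get_spark_app_config helper is sketched after example #1. The other two lib.utils imports might look like the following, assuming the same header-bearing CSV and an Age/Country schema matching the SQL above (both assumptions, not confirmed by the source):

from pyspark.sql import DataFrame, SparkSession


def load_data_file(spark: SparkSession, datafile: str) -> DataFrame:
    # Assumed: the sample data is a CSV with a header row.
    return spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(datafile)


def count_by_country(df: DataFrame) -> DataFrame:
    # Mirrors the SQL branch: filter on Age, then count rows per Country.
    return df.where("Age < 40") \
        .groupBy("Country") \
        .count()

The script takes the data file path as its first argument, so a run might look like spark-submit HelloSpark.py data/sample.csv (both file names here are illustrative).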