Code example #1
File: helloSpark.py  Project: par1321633/spark-basics
from pyspark.sql import SparkSession

from lib.logger import Log4j


def main():
    # Build a local SparkSession running on three worker threads
    spark = SparkSession \
        .builder \
        .appName("hello Spark") \
        .master("local[3]") \
        .getOrCreate()

    logger = Log4j(spark)
    print(logger)
    logger.info("Starting hello Spark Program")

    logger.info("Finished hello Spark Program")
    spark.stop()


if __name__ == "__main__":
    main()
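
These examples import a Log4j helper from lib.logger, which is not shown on this page. As a rough sketch only, and purely as an assumption about what that wrapper might look like, it could delegate to the JVM-side log4j logger exposed through py4j (the logger name and the None handling below are guesses; example #4 constructs Log4j(None) at import time, so the real class apparently tolerates a missing SparkSession):

class Log4j:
    # Hypothetical sketch of lib/logger.py; the project's real implementation is not shown here.

    def __init__(self, spark):
        if spark is None:
            # Assumed: tolerate a missing SparkSession (example #4 builds Log4j(None) at import time)
            self.logger = None
            return
        # Reach into the JVM through py4j and obtain a log4j logger
        log4j = spark._jvm.org.apache.log4j
        self.logger = log4j.LogManager.getLogger("spark-examples")

    def info(self, message):
        if self.logger:
            self.logger.info(message)

    def warn(self, message):
        if self.logger:
            self.logger.warn(message)

    def error(self, message):
        if self.logger:
            self.logger.error(message)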
Code example #2
def process(spark, input_file, output_path):
    # This method holds the Spark transformations required to solve the problem statement.
    global logger
    logger = Log4j(spark)  # set the module-level logger

    if input_file is not None:
        input_df = spark.read.json(str(input_file))
    else:
        # Read the data from the REST API; not recommended in a production environment.
        json_response = requests.get(
            "https://s3-eu-west-1.amazonaws.com/dwh-test-resources/recipes.json"
        )
        input_df = spark.createDataFrame(
            data=[json.loads(line) for line in json_response.iter_lines()])

    # Register the UDFs (a sketch of these helper functions appears after this example)
    udf_get_seconds_from_duration = udf(get_seconds_from_duration, LongType())
    udf_get_duration_from_seconds = udf(get_duration_from_seconds)

    # Clean the input data: convert the ISO-8601 durations to seconds and the date field to DateType
    clean_df = input_df \
        .withColumn("prepSeconds", udf_get_seconds_from_duration("prepTime")) \
        .withColumn("cookSeconds", udf_get_seconds_from_duration("cookTime")) \
        .withColumn("datePublished", to_date(col("datePublished"))) \
        .persist()  # persist() is not strictly needed here, since clean_df is read only once

    # Task 2: bucket each recipe by total cooking time, then average the time per difficulty level
    output_df = clean_df \
        .where("cookSeconds > 0 and prepSeconds > 0") \
        .selectExpr("cookSeconds + prepSeconds as totalSeconds") \
        .withColumn("difficulty",
                    when(col("totalSeconds") < 60 * 30, lit("easy"))
                    .otherwise(when(col("totalSeconds") > 60 * 30, lit("hard"))
                               .otherwise(lit("medium")))) \
        .groupBy("difficulty").avg() \
        .withColumnRenamed("avg(totalSeconds)", "avg_total_cooking_seconds") \
        .withColumn("avg_total_cooking_time", udf_get_duration_from_seconds("avg_total_cooking_seconds")) \
        .drop("avg_total_cooking_seconds") \
        .persist()
    # output_df.coalesce(1).write.mode("overwrite").csv("../output")
    # toPandas() is not recommended when the data is large, since it collects everything to the driver;
    # writing to a relative path is also not recommended.
    output_df.coalesce(1).toPandas().to_csv(output_path, index=False)
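
This example and example #4 register UDFs over two helpers, get_seconds_from_duration and get_duration_from_seconds, whose definitions are not shown. A minimal sketch of what they might look like, assuming the isodate package imported in example #4 is what parses the ISO-8601 durations (e.g. "PT30M") in the recipe data; the bodies below are assumptions rather than the project's actual code:

import datetime

import isodate


def get_seconds_from_duration(duration):
    # Assumed helper: parse an ISO-8601 duration string such as "PT1H30M"
    # and return the total number of seconds; treat missing values as 0.
    if not duration:
        return 0
    return int(isodate.parse_duration(duration).total_seconds())


def get_duration_from_seconds(seconds):
    # Assumed helper: format a number of seconds back into an ISO-8601 duration string.
    return isodate.duration_isoformat(datetime.timedelta(seconds=int(seconds)))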
Code example #3
import os

from pyspark.sql import SparkSession

from lib.logger import Log4j

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("Shuffle Join Demo") \
        .master("local[3]") \
        .getOrCreate()

    logger = Log4j(spark)
    d1 = os.path.expanduser("~/Spark-Programming-In-Python-master/data/d1/")
    d2 = os.path.expanduser("~/Spark-Programming-In-Python-master/data/d2/")

    flight_time_df1 = spark.read.json(d1)

    flight_time_df2 = spark.read.json(d2)

    # Keep the shuffle small for the demo: the join below runs with only 3 shuffle partitions
    spark.conf.set("spark.sql.shuffle.partitions", 3)

    join_expr = flight_time_df1.id == flight_time_df2.id
    join_df = flight_time_df1.join(flight_time_df2, join_expr, "inner")

    join_df.collect()  # force the shuffle join to execute
    join_df.show()
    input("press a key to stop...")  # block so the application (and its Spark UI) stays up for inspection
Code example #4
import datetime
import json
import re

import isodate
import requests
from pyspark.sql.functions import udf, when, col, lit, to_date
from pyspark.sql.types import LongType

from lib.logger import Log4j

__author__ = 'Sumeet Gupta'
logger: Log4j = Log4j(None)


def process(spark, input_file, output_path):
    # This method holds the Spark transformations required to solve the problem statement.
    global logger
    logger = Log4j(spark)  # set the module-level logger

    if input_file is not None:
        input_df = spark.read.json(str(input_file))
    else:
        # Read the data from the REST API; not recommended in a production environment.
        json_response = requests.get(
            "https://s3-eu-west-1.amazonaws.com/dwh-test-resources/recipes.json"
        )
        input_df = spark.createDataFrame(
            data=[json.loads(line) for line in json_response.iter_lines()])

    # registering UDFs
Code example #5
import sys
from pyspark.sql import *
from lib.logger import Log4j
from lib.utils import *

if __name__ == "__main__":
    conf = get_spark_app_config()

    spark = SparkSession \
        .builder \
        .config(conf=conf) \
        .appName("HelloSpark") \
        .master("local[2]") \
        .getOrCreate()

    print(spark)
    logger = Log4j(spark)  # create the application logger
    print(sys.argv)
    if len(sys.argv) != 2:
        logger.error("Usage: HelloSpark <filename>")
        sys.exit(-1)

    logger.info("Starting HelloSpark")

    survey_raw_df = load_survey_df(spark, sys.argv[1])
    partitioned_survey_df = survey_raw_df.repartition(2)
    count_df = count_by_country(partitioned_survey_df)
    count_df.show()

    logger.info("Finished HelloSpark")
    spark.stop()
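
Example #5 also relies on get_spark_app_config, load_survey_df and count_by_country from lib.utils, which is not shown. A minimal sketch under assumed conventions (a spark.conf properties file with a [SPARK_APP_CONFIGS] section, a CSV survey file with a header row, and a Country column); none of these details are confirmed by the source:

import configparser

from pyspark import SparkConf


def get_spark_app_config():
    # Assumed helper: load every key/value pair from the [SPARK_APP_CONFIGS]
    # section of a local spark.conf file into a SparkConf object.
    spark_conf = SparkConf()
    config = configparser.ConfigParser()
    config.read("spark.conf")
    for key, val in config.items("SPARK_APP_CONFIGS"):
        spark_conf.set(key, val)
    return spark_conf


def load_survey_df(spark, data_file):
    # Assumed helper: read the survey CSV with a header row and an inferred schema.
    return spark.read \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .csv(data_file)


def count_by_country(survey_df):
    # Assumed helper: count survey responses per country.
    return survey_df \
        .groupBy("Country") \
        .count()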