Example #1
def create_row_output(original_file,
                      modified_file,
                      source_file=[],
                      target_file=[]):
    """
    :param original_file: df for original file
    :param modified_file: df for modified file
    :param source_file: list of paths for source files
    :param target_file: list of paths for target files
    :return: The funciton from above is used to create the multiple source/target
    df's, which are then merged for comparison purposes.
    """
    try:
        # Build a row-count DataFrame for the source and the target files and
        # tag each with a "Type" label so the two can be told apart.
        df_original = get_row_count(*source_file).withColumn(
            "Type", F.format_string(original_file))
        df_modified = get_row_count(*target_file).withColumn(
            "Type", F.format_string(modified_file))
        combined = df_original.union(df_modified)
        # Move the "Type" column to the front for readability.
        cols = list(combined.columns)
        cols = [cols[-1]] + cols[:-1]
        rowComparisonDf = combined[cols]
    except LookupError as l:
        print(str(l))
        raise

    return rowComparisonDf
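A minimal usage sketch (not part of the original source): the labels and file paths below are hypothetical, and get_row_count is assumed to return a row-count DataFrame for the paths it is given.

# Hypothetical usage; paths and labels are placeholders, and get_row_count is
# assumed to return a DataFrame of row counts for the files it is given.
row_comparison = create_row_output(
    original_file="original",
    modified_file="modified",
    source_file=["/data/source/file_a.csv"],
    target_file=["/data/target/file_a.csv"])
row_comparison.show()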
Example #2
def alert_no_threshold(df, keycols, col_dest, str_format, col_origen,
                       col_threshold):
    col_to_select = keycols + [col_dest]
    dfret = df.filter(col(col_origen) < col(col_threshold)).withColumn(
        col_dest,
        F.when(
            col(col_origen) < col(col_threshold),
            F.format_string(
                str_format,
                col(col_threshold).cast(IntegerType()) - col(col_origen).cast(IntegerType()),
                col(col_origen).cast(IntegerType()),
                col(col_threshold).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
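A small illustrative call (not from the original source), assuming an active SparkSession named spark and the same imports the function relies on (pyspark.sql.functions as F, col, IntegerType); the toy column names and the message template are made up for the demo.

# Hedged usage sketch: the DataFrame, column names and message template are
# assumptions for illustration only; `spark` is an active SparkSession.
toy = spark.createDataFrame(
    [("store_1", 5, 10), ("store_2", 12, 10)],
    ["store", "stock", "threshold"])
alerts = alert_no_threshold(
    toy,
    keycols=["store"],
    col_dest="alert_msg",
    str_format="missing %d units (have %d, threshold %d)",
    col_origen="stock",
    col_threshold="threshold")
alerts.show(truncate=False)  # only store_1 falls below its threshold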
Example #3
    def prepare_df(df):
        num_rows = df.count()

        # Expand dates.
        df = expand_date(df)

        df = df \
            .withColumn('Open', df.Open != '0') \
            .withColumn('Promo', df.Promo != '0') \
            .withColumn('StateHoliday', df.StateHoliday != '0') \
            .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

        # Merge in store information.
        store = store_csv.join(store_states_csv, 'Store')
        df = df.join(store, 'Store')

        # Merge in Google Trend information.
        google_trend_all = prepare_google_trend()
        df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

        # Merge in Google Trend for whole Germany.
        google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
        google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
        df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

        # Merge in weather.
        weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
        df = df.join(weather, ['State', 'Date'])

        # Fix null values.
        df = df \
            .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
            .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
            .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
            .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

        # Days & months competition was open, cap to 2 years.
        df = df.withColumn('CompetitionOpenSince',
                           F.to_date(F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                                     df.CompetitionOpenSinceMonth)))
        df = df.withColumn('CompetitionDaysOpen',
                           F.when(df.CompetitionOpenSinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince))))
                           .otherwise(0))
        df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

        # Days & weeks of promotion, cap to 25 weeks.
        df = df.withColumn('Promo2Since',
                           F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
        df = df.withColumn('Promo2Days',
                           F.when(df.Promo2SinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))))
                           .otherwise(0))
        df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

        # Check that we did not lose any rows through inner joins.
        assert num_rows == df.count(), 'lost rows in joins'
        return df
Example #4
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("parking")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%s, %s\t%d', result.plate_id, result.registration_state,
                      result.total)).write.save("task6-sql.out", format="text")
Example #5
def main(spark, filename1, filename2):
    parkingDF = spark.read.format('csv').options(
        header='true', inferschema='true').load(filename1)
    openDF = spark.read.format('csv').options(
        header='true', inferschema='true').load(filename2)
    parkingDF.createOrReplaceTempView("parking")
    openDF.createOrReplaceTempView("open")
    parkingDF.select('summons_number').subtract(
        openDF.select('summons_number')).createOrReplaceTempView("diffVals")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%d\t%s, %d, %d, %s', result.summons_number,
                      result.plate_id, result.violation_precinct,
                      result.violation_code,
                      date_format(result.issue_date,
                                  'yyyy-MM-dd'))).write.save("task1-sql.out",
                                                             format="text")
Example #6
    def create_edge_id_column(cls, datasource, from_column, to_column):
        """Creates an '~id' column in a DynamicFrame whose values are based on the specified from and to columns.

        Example:
        >>> df = GremlinCsvTransforms.create_edge_id_column(df, 'supplierId', 'productId')
        """
        dataframe = datasource.toDF()
        dataframe = dataframe.withColumn('~id', format_string("%s-%s", dataframe[from_column], dataframe[to_column]))
        return DynamicFrame.fromDF(dataframe, datasource.glue_ctx, 'create_edge_id_column')
Example #7
def replicate_df(df, duplicates):

    if duplicates > 1:
        uniques = _get_uniques(duplicates)

        df = (df.crossJoin(uniques.distinct()).withColumn(
            "customerID", F.format_string("%s-%s", "customerID",
                                          "u_value")).drop("u_value"))

    return df
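replicate_df depends on a _get_uniques helper that is not shown above; the sketch below is only a plausible stand-in (an assumption, not the original implementation) that builds a one-column DataFrame of suffix values.

# Assumed sketch of the missing helper; the original _get_uniques is not
# shown. This stand-in returns a one-column DataFrame ("u_value") with
# `duplicates` distinct suffix values; `spark` is an active SparkSession.
def _get_uniques(duplicates):
    return spark.createDataFrame(
        [(str(i),) for i in range(duplicates)], ["u_value"])

With a helper like that, replicate_df(df, 3) cross-joins df with three suffix values and rewrites customerID as, for example, C001-0, C001-1 and C001-2.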
Example #8
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("open")
    query = getQuery()
    result = spark.sql(query)
    result.select(
        format_string('%s\t%.2f, %.2f', result.license_type, result.Total,
                      result.Average)).write.save("task3-sql.out",
                                                  format="text")
Example #9
def alert_less_sps_than_ss(df, keycols, col_dest, str_format, col_sps, col_ss,
                           col_ss_trashed):
    col_to_select = keycols + [col_dest]
    df = df.withColumn('ss_no_trashed', col(col_ss) - col(col_ss_trashed))
    dfret = df.filter(col(col_sps) < col('ss_no_trashed')).withColumn(
        col_dest,
        F.when(
            col(col_sps) < col('ss_no_trashed'),
            F.format_string(
                str_format,
                col('ss_no_trashed').cast(IntegerType()) - col(col_sps).cast(IntegerType()),
                col(col_sps).cast(IntegerType()),
                col(col_ss).cast(IntegerType()),
                col(col_ss_trashed).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
Example #10
def alert_trashed_shoes(df, keycols, col_dest, str_format, col_origen,
                        col_excep_1, col_excep_2):
    col_to_select = keycols + [col_dest]
    df = df.withColumn('shoes_trashed_to_check',
                       col(col_origen) - col(col_excep_1) - col(col_excep_2))
    dfret = df.filter(col('shoes_trashed_to_check') > 0).withColumn(
        col_dest,
        F.when(
            col('shoes_trashed_to_check') > 0,
            F.format_string(
                str_format,
                col('shoes_trashed_to_check'),
                col(col_origen).cast(IntegerType()),
                col(col_excep_1).cast(IntegerType()),
                col(col_excep_2).cast(IntegerType()))).otherwise(None))
    dfret = dfret.select(col_to_select)
    return dfret
Example #11
    def create_prefixed_columns(cls, datasource, mappings):
        """Creates columns in a DynamicFrame whose values are based on prefixed values from another column in the DynamicFrame.

        Example:
        >>> df = GremlinCsvTransforms.create_prefixed_columns(df, [('~id', 'productId', 'p'),('~to', 'supplierId', 's')])
        """
        dataframe = datasource.toDF()
        for (column_name, source_column, prefix) in mappings:
            dataframe = dataframe.withColumn(column_name, format_string(prefix + "-%s", dataframe[source_column]))
        return DynamicFrame.fromDF(dataframe, datasource.glue_ctx, 'create_vertex_id_columns')
Example #12
def aggregate_columns(dataframe, dataframe_name):
    f = open('get_aggregate_results.sh', 'w')
    for col in dataframe.columns:
        print(col)
        query = 'select `{0}` as Data_Values, count(`{0}`) as count_data from {1} group by `{0}` order by `{0}`'.format(col, dataframe_name)
        data = spark.sql(query)
        data.select(format_string('%s,%s', data.Data_Values, data.count_data)).write.save("Projects/" + str(col) + ".txt", format="text")
        f.write('hfs -getmerge Projects/' + str(col) + '.txt Outputs/' + str(col) + '.txt \n')
        f.flush()
    f.close()
    subprocess.call("get_aggregate_results.sh")
Example #13
    def Run(self):
        print('Running DfCollectList test!!!')
        df = self.spark.read.load('sample-data.csv',
                                  format="csv",
                                  inferSchema="true",
                                  header="true")

        from pyspark.sql.functions import concat, regexp_replace, format_string, format_number, struct, collect_list
        collectDf = df.withColumn('country', regexp_replace(df.rpt_cty, ' ', '_')) \
            .withColumn('yrmon', concat(df.year, format_string('%02d', df.mon))) \
            .groupBy('country', 'yrmon') \
            .agg(collect_list(struct(*df.columns)).alias('my_rows'))
        print(collectDf.count())
        collectDf.printSchema()
        collectDf.show(5)
Example #14
def lyrSizeSpark(COUNT):
  """Count the total number of characters in the lyrics for :COUNT: initial bottles of beer.
  Full PySpark version.
  """
  df = spark.createDataFrame( sc.range(COUNT,0,-1) ,schema=T.IntegerType())

  df = df.withColumn("count", F.when(F.col("value") >0, F.col("value").astype(T.StringType()) ).otherwise("no more")) 
  df = df.withColumn("countm", F.when(F.col("value")-1 >0, (F.col("value")-1).astype(T.StringType()) ).otherwise("no more")) 
  df = df.withColumn("plu", F.when(F.col("count") == "1", "" ).otherwise("s")) 
  df = df.withColumn("plum", F.when(F.col("countm") == "1", "" ).otherwise("s")) 

  df = df.withColumn("lyr",F.format_string("""%s bottle%s of beer on the wall, %s bottle%s of beer.\nTake one down and pass it around, %s bottle%s of beer on the wall.\n\n""",
                              F.col("count"),F.col("plu"),F.col("count"),F.col("plu"),F.col("countm"),F.col("plum"))   )
  
  return df.withColumn("lyrc", F.length(F.col("lyr")) ).select(F.sum(F.col("lyrc")).alias("c")).first()["c"]
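A usage sketch, assuming the active SparkSession (spark) and SparkContext (sc) that the function above already relies on.

# Usage sketch; `spark` and `sc` are assumed to exist as in the function above.
# Returns the total number of characters across all generated verses.
total_chars = lyrSizeSpark(99)
print(total_chars)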
Example #15
def aggregate_columns(dataframe, tableName):
    f = open('get_aggregate_results.sh', 'w')
    for col in dataframe.columns:
        query = "select " + str(col) + " as Data_Values, count(" + str(
            col) + ") as count_data from " + tableName + " group by " + str(
                col) + " order by " + str(col)
        data = spark.sql(query)
        data.select(format_string('%s,%s', data.Data_Values,
                                  data.count_data)).write.save(
                                      "Projects/" + str(col) + ".txt",
                                      format="text")
        f.write('hfs -getmerge Projects/' + str(col) +
                '.txt /home/ak6384/Project/results/' + str(col) + '.txt \n')
        f.flush()
    f.close()
    subprocess.call("get_aggregate_results.sh")
Example #16
def main(spark, filename):
    DF = spark.read.format('csv').options(header='true',
                                          inferschema='true').load(filename)
    DF.createOrReplaceTempView("parking")
    query = getQuery()
    tempResult = spark.sql(query)
    weekendDF = tempResult.filter(
        tempResult.day == "Weekend").createOrReplaceTempView("weekendTable")
    weekdayDF = tempResult.filter(
        tempResult.day == "Weekday").createOrReplaceTempView("weekdayTable")
    query = getQuery2()
    result = spark.sql(query)
    result.select(
        format_string('%s\t%.2f, %.2f', result.violation_code,
                      result.weekend_average,
                      result.weekday_average)).write.save("task7-sql.out",
                                                          format="text")
Example #17
    def Run(self):
        print('Running DfIter test!!!')
        df = self.spark.read.load('sample-data.csv',
                                  format="csv",
                                  inferSchema="true",
                                  header="true")
        #df.printSchema()
        #df.show()

        from pyspark.sql.functions import concat, regexp_replace, format_string, format_number
        df.withColumn('country', regexp_replace(df.rpt_cty, "[ ,/,\\\]", '_')) \
            .withColumn('yrmon', concat(df.year, format_string('%02d', df.mon))) \
            .repartition('country', 'yrmon') \
            .write.partitionBy('country', 'yrmon') \
            .option('header', 'true') \
            .mode('overwrite') \
            .format('csv') \
            .save('./out/as_of_date=latest/type=full/')
Example #18
def add_features(df: pyspark.sql.DataFrame) -> pyspark.sql.DataFrame:
    print("[wrangle.py] adding features")
    max_date = df.select(max("case_closed_date")).first()[0]
    return (
        df.withColumn("num_weeks_late", expr("num_days_late / 7 AS num_weeks_late"))
        .withColumn(
            "council_district",
            format_string("%03d", col("council_district").cast("int")),
        )
        .withColumn("zipcode", regexp_extract("request_address", r"\d+$", 0))
        .withColumn("case_age", datediff(lit(max_date), "case_opened_date"))
        .withColumn("days_to_closed", datediff("case_closed_date", "case_opened_date"))
        .withColumn(
            "case_lifetime",
            when(expr("! case_closed"), col("case_age")).otherwise(
                col("days_to_closed")
            ),
        )
    )
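The council_district step above leans on format_string zero-padding; here is a standalone sketch of just that transformation (the sample values and the SparkSession are assumptions, not data from the original source).

# Standalone sketch of the zero-padding step only; sample values are made up
# and `spark` is assumed to be an active SparkSession.
from pyspark.sql.functions import col, format_string

demo = spark.createDataFrame([(1,), (42,), (105,)], ["council_district"])
demo = demo.withColumn(
    "council_district",
    format_string("%03d", col("council_district").cast("int")))
demo.show()  # -> 001, 042, 105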
Example #19
total_rev_df_add = joined_df.select(
    F.col('pickup_datetime'), (F.col('fare_amount') + F.col('surcharge') +
                               F.col('tip_amount')).alias('total_revenue'),
    F.col('tolls_amount'))

total_rev_df_add = total_rev_df_add.withColumn(
    'pickup_datetime', F.date_format(F.col('pickup_datetime'), 'yyyy-MM-dd'))

total_rev_grouped = total_rev_df_add.groupBy(
    F.col('pickup_datetime').alias('date')).agg(
        F.sum('total_revenue').alias('total_revenue'),
        F.sum('tolls_amount').alias('total_tolls'))

total_rev_grouped = total_rev_grouped.select(
    'date',
    F.regexp_replace(F.format_number(F.round(F.col('total_revenue'), 2), 2),
                     ',', '').alias('total_revenue'),
    F.regexp_replace(F.format_number(F.round(F.col('total_tolls'), 2), 2), ',',
                     '').alias('total_tolls'))

total_rev_grouped = total_rev_grouped.sort('date')

total_rev_grouped.select(
    format_string('%s,%s,%s', total_rev_grouped.date,
                  total_rev_grouped.total_revenue,
                  total_rev_grouped.total_tolls)).write.save('task2c-sql.out',
                                                             format="text")

spark.stop()
Example #20
import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import format_string, date_format
from csv import reader

spark = SparkSession.builder.getOrCreate()

parking = spark.read.format('csv').options(
    header='true', inferschema='true').load(sys.argv[1])

openv = spark.read.format('csv').options(header='true',
                                         inferschema='true').load(sys.argv[2])

parking.createOrReplaceTempView("parking")
openv.createOrReplaceTempView("openv")

# Perform a left join between parking and open violations
result = spark.sql(
    "SELECT parking.summons_number, plate_id, violation_precinct, violation_code, parking.issue_date FROM parking LEFT JOIN openv ON parking.summons_number = openv.summons_number WHERE openv.summons_number IS NULL"
)

result.select(
    format_string('%d\t%s, %d, %d, %s', result.summons_number, result.plate_id,
                  result.violation_precinct, result.violation_code,
                  date_format(result.issue_date,
                              'yyyy-MM-dd'))).write.save("task1-sql.result",
                                                         format="text")
Example #21
import sys
import string
from pyspark.sql.functions import format_string
from csv import reader
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Python Spark SQL basic example").config(
    "spark.some.config.option", "some-value").getOrCreate()

#lines = spark.read.format('csv').options(header='true',inferschema='true').load ('/user/ecc290/HW1data/open-violations-header.csv')
lines = spark.read.format('csv').options(header='true',
                                         inferschema='true').load(sys.argv[1])
lines.createOrReplaceTempView("lines")
result = spark.sql(
    "SELECT CASE WHEN registration_state='NY' THEN 'NY' ELSE 'Other' END AS registration_state, COUNT(*) as counts FROM lines GROUP BY CASE WHEN registration_state='NY' THEN 'NY' ELSE 'Other' END ORDER BY registration_state"
)
result.select(
    format_string('%s \t %d', result.registration_state,
                  result.counts)).write.save("task4-sql.out", format="text")
Example #22
#!/usr/bin/env python

import sys
from pyspark.sql import SparkSession
from csv import reader

from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("task2SQL").getOrCreate()
parking_violation = spark.read.format('csv').options(header='true',
                                                     inferschema='true').load(
                                                         sys.argv[1])
parking_violation.createOrReplaceTempView("parking")

violation_count = spark.sql(
    "SELECT plate_id, registration_state, count(violation_code) violation_code_count FROM parking GROUP BY plate_id, registration_state ORDER BY plate_id"
)
violation_count_temp = violation_count.createOrReplaceTempView(
    "violation_count")
max_violation = spark.sql(
    "SELECT plate_id, registration_state, violation_code_count FROM violation_count ORDER BY violation_code_count DESC LIMIT 20"
)

max_violation.select(
    format_string(
        '%s, %s\t%d', max_violation.plate_id, max_violation.registration_state,
        max_violation.violation_code_count)).write.save("task6-sql.out",
                                                        format="text")
Example #23
#!/usr/bin/env python

import sys
from pyspark.sql import SparkSession
from csv import reader

from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("task2SQL").getOrCreate()
parking_violation = spark.read.format('csv').options(header='true',
                                                     inferschema='true').load(
                                                         sys.argv[1])
parking_violation.createOrReplaceTempView("parking")

result = spark.sql(
    "SELECT violation_code, count(violation_code) violation_count FROM parking GROUP BY violation_code"
)

result.select(
    format_string('%s\t%d', result.violation_code,
                  result.violation_count)).write.save("task2-sql.out",
                                                      format="text")
Example #24
    StructField("surcharge", StringType(),True),
    StructField("mta_tax", StringType(),True),
    StructField("tip_amount", StringType(),True),
    StructField("tolls_amount", StringType(),True),
    StructField("total_amount", StringType(),True),
    StructField("name", StringType(),True),
    StructField("type", StringType(),True),
    StructField("current_status", StringType(),True),
    StructField("DMV_license_plate", StringType(),True),
    StructField("vehicle_VIN_number", StringType(),True),
    StructField("vehicle_type", StringType(),True),
    StructField("model_year", StringType(),True),
    StructField("medallion_type", StringType(),True),
    StructField("agent_number", StringType(),True),
    StructField("agent_name", StringType(),True),
    StructField("agent_telephone_number", StringType(), True),
    StructField("agent_website", StringType(), True),
    StructField("agent_address", StringType(), True),
    StructField("last_updated_date", StringType(), True),
    StructField("last_updated_time", StringType(), True)])
fareLicenses = spark.read.format('csv').schema(fare_licenses).options(header='false',inferschema='true').load(sys.argv[1])
fareLicenses.createOrReplaceTempView("fareLicenses")
trips = spark.sql("select medallion_type, count(*) as total_trips from fareLicenses group by medallion_type")
total_revenue = spark.sql("select medallion_type, sum(fare_amount) as total_revenue from fareLicenses group by medallion_type")
tip = spark.sql("select medallion_type, sum(tip_amount) as total_tip from fareLicenses group by medallion_type")
df1 = trips.join(total_revenue,["medallion_type"])
df2 = df1.join(tip,["medallion_type"])
finaloutput = df2.select(
    "medallion_type", "total_trips", "total_revenue",
    ((df2.total_tip / df2.total_revenue) *
     (100 / df2.total_trips)).alias("avg_tip_percentage")).sort(
         df2.medallion_type.asc())
finaloutput.show()
finaloutput.select(
    format_string('%s,%s,%s,%s', finaloutput.medallion_type,
                  finaloutput.total_trips, finaloutput.total_revenue,
                  finaloutput.avg_tip_percentage)).write.save("task4b-sql.out",
                                                              format="text")
Example #25
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, format_string

spark = SparkSession \
 .builder \
 .appName("Python Spark SQL basic example") \
 .config("spark.some.config.option", "some-value") \
 .getOrCreate()

parking = spark.read.format('csv').options(
    header='true', inferschema='true').load(sys.argv[1])
parking.createOrReplaceTempView("parking")
#weekend_dates=[(5,),(6,),(12,),(13,),(19,),(20,),(26,),(27,)]
weekend_dates = [("2016-03-05", ), ("2016-03-06", ), ("2016-03-12", ),
                 ("2016-03-13", ), ("2016-03-19", ), ("2016-03-20", ),
                 ("2016-03-26", ), ("2016-03-27", )]
weekend = spark.createDataFrame(weekend_dates, ["date"])
weekend.createOrReplaceTempView("weekend")
#result = spark.sql("select violation_code, round(sum(case when w.date is null then 1 else 0 end)/23,2) as wkd, round(count(w.date)/8,2) as ctr_weekend from parking p left outer join weekend w on extract(DAY from parking.issue_date)=w.date group by violation_code")
result = spark.sql(
    "select violation_code, round(sum(case when w.date is null then 1 else 0 end)/23,2) as wkd, round(count(w.date)/8,2) as ctr_weekend from parking p left outer join weekend w on issue_date=w.date group by violation_code"
)
#result.show(result.count(), False)
result.select(
    format_string("%s\t%.2f, %.2f", result.violation_code, result.ctr_weekend,
                  result.wkd)).write.save("task7-sql.out", format="text")
Example #26
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, sum,  format_string

spark = SparkSession \
 .builder \
 .appName("Python Spark SQL basic example") \
 .config("spark.some.config.option", "some-value") \
 .getOrCreate()

open = spark.read.format('csv').options(header='true',inferschema='true').load(sys.argv[1])
#open.select(open["license_type"], open["amount_due"]).groupBy("license_type").count().show()
#open.groupBy("license_type").agg(avg("amount_due"), sum("amount_due")).show()
#result = open.groupBy("license_type").count()
#result.select(format_string("%d",result.count)).show()
open.createOrReplaceTempView("open")
result = spark.sql("select license_type, round(avg(amount_due),2) as avg, round(sum(amount_due),2) as sum from open group by license_type")
#result.printSchema()
#result = result.select(result.license_type, result.sum.cast("float"), result.avg.cast("float"))
result.select(format_string("%s\t%.2f, %.2f",result.license_type, result.sum, result.avg)).write.save("task3-sql.out", format="text")
Example #27
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
import pyspark.sql.functions as F
import sys

spark = SparkSession.builder.appName("my_pp").getOrCreate()

# using joined_df
joined_df = spark.read.format('csv').options(
    header='false', inferschema='true').load(sys.argv[1]).select(
        F.col('_c15').alias('fare_amount'))

invalid_fare_amounts = joined_df.filter(F.col('fare_amount') < 0).select(
    F.count('*').alias('invalid_fare_amounts'))

invalid_fare_amounts.select(
    format_string("%s", invalid_fare_amounts.invalid_fare_amounts)).write.save(
        "task3a-sql.out", format="text")

spark.stop()
Example #28
#!/usr/bin/env python
import sys
import csv
from pyspark.sql import SparkSession
from pyspark.sql.functions import format_string
spark = SparkSession.builder.appName("Python Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()

parking = spark.read.format('csv').options(
    header='true', inferschema='true').load(sys.argv[1])
parking.createOrReplaceTempView("parking")
result = spark.sql(
    "select * from (select registration_state, plate_id, count(violation_code) as total from parking group by plate_id, registration_state) g order by g.total desc, plate_id limit 20"
)
result.select(
    format_string('%s, %s\t%d', result.plate_id, result.registration_state,
                  result.total)).write.save("task6-sql.out", format="text")
Example #29
"""
Task-7 : For each violation code, list the average number of violations with that code issued per day on weekdays and weekend days
Author : Chinmay Wyawahare
"""

import sys
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.functions import format_string
from csv import reader

spark = SparkSession.builder.getOrCreate()

parkingdata = spark.read.format('csv').options(header='true',
                                               inferschema='true').load(
                                                   sys.argv[1])
parkingdata.createOrReplaceTempView("parkingdata")

out = spark.sql(
    "SELECT t1.violation_code, ROUND(t1.Weekend,2) AS Weekend, ROUND(t2.Weekdays,2) AS Weekdays FROM (SELECT violation_code, COUNT(violation_code)/8 AS Weekend FROM parkingdata WHERE (EXTRACT(DAY FROM issue_date)=27 OR EXTRACT(DAY FROM issue_date)=26 OR EXTRACT(DAY FROM issue_date)=20 OR EXTRACT(DAY FROM issue_date)=19 OR EXTRACT(DAY FROM issue_date)=13 OR EXTRACT(DAY FROM issue_date)=12 OR EXTRACT(DAY FROM issue_date)=6 OR EXTRACT(DAY FROM issue_date)=5) GROUP BY violation_code) t1 INNER JOIN (SELECT violation_code, COUNT(violation_code)/23 AS Weekdays FROM parkingdata WHERE NOT (EXTRACT(DAY FROM issue_date)=27 OR EXTRACT(DAY FROM issue_date)=26 OR EXTRACT(DAY FROM issue_date)=20 OR EXTRACT(DAY FROM issue_date)=19 OR EXTRACT(DAY FROM issue_date)=13 OR EXTRACT(DAY FROM issue_date)=12 OR EXTRACT(DAY FROM issue_date)=6 OR EXTRACT(DAY FROM issue_date)=5) GROUP BY violation_code) t2 ON t1.violation_code = t2.violation_code"
)
out.select(
    format_string('%d\t%.2f, %.2f', out.violation_code, out.Weekend,
                  out.Weekdays)).write.save("task7-sql.out", format="text")
Example #30
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the special-event flag columns (Open, Promo, StateHoliday, SchoolHoliday) to booleans: '0' becomes False, anything else True
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn("Promo2SinceYear",
                 F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn(
                     "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek,
                                                   F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df