Example #1
def reformat_v1_0(flight, pqFolder, pqFileName):
	"""
		Read in the original v1.0 dataframe and save as a new parquet file compatible with v1.1
		@params:
			flight        - Required  : original v1.0 data (Spark DataFrame)
			pqFolder      - Required  : folder to save the parquet files into (Str)
			pqFileName    - Required  : parquet file name (Str)
	"""
	flight2 = (flight.withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stay_days')))
					 .drop('stay_days')           
					 .withColumnRenamed('start_date', 'depDate')                 
					 .withColumn('depDate', to_date('depDate'))
					 .selectExpr('*', 'date_add(depDate, stayDays) as retDate')# this is when the return trip starts, might arrive a day later
					 .withColumnRenamed('from_city_name', 'fromCity')
					 .withColumnRenamed('to_city_name', 'toCity')                 
					 .withColumnRenamed('search_date', 'searchDate')                 
					 .withColumn('searchDate', to_date('searchDate'))
					 .withColumnRenamed('company', 'airlineName')                 
					 .withColumnRenamed('dep_time', 'departureTime')                                  
					 .withColumnRenamed('arr_time', 'arrivalTime')                                                   
					 .withColumn('duration_h', split(flight.duration,'h').getItem(0))
					 .withColumn('duration_m', F.substring_index(split(flight.duration,'h').getItem(1), 'm', 1))
	#                  .withColumn('duration', F.struct(col('duration_h'), col('duration_m')))
					 .withColumn('duration_m', (col('duration_h')*60 + col('duration_m')))
					 .drop('duration', 'duration_h', 'flight_number')
					 .withColumnRenamed('price_code', 'currencyCode')                                  
					 .withColumnRenamed('stop', 'stops')
					 .withColumn('stops', col('stops').cast('byte')) 
					 .withColumn('stop_info', split(col('stop_info'), ';'))
	#                  .withColumn('stop_duration', take_all_duration_UDF(col('stop_info')))
					 .withColumn('noOfTicketsLeft', correct_tickets_left_UDF('ticket_left'))
					 .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte')) 
					 .drop('ticket_left')
					 .withColumnRenamed('table_name', 'tableName')
					 .withColumn('task_id', col('task_id').cast('long'))
					 .withColumn('span_days', col('span_days').cast('integer'))
					 .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode',
							 'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
							 'stayDays',
							 'departureTime', 'arrivalTime',
							 'airlineName', 'duration_m',
							 'flight_code', 'plane', 'stops', 'noOfTicketsLeft',
							 'airline_code', 'airline_codes',
							 'stop_info', 'span_days', 'power', 'video', 'wifi')                # 'stop_duration',
			  )

	flight2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
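
# A minimal usage sketch (hypothetical paths; assumes the SparkSession, the pyspark.sql.functions
# imports and the correct_*_UDF helpers used by reformat_v1_0 are already defined in this module):
# flight_v10 = spark.read.parquet('/data/flight_price_v10.parquet')   # hypothetical v1.0 input
# reformat_v1_0(flight_v10, pqFolder='/data/flight_price_v11', pqFileName='flights_v11.parquet')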
Example #2
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date

spark = SparkSession.builder.appName('weathercheck').getOrCreate()
# Load the weather data CSV files
df = spark.read\
    .format("csv")\
    .options(header='True', inferSchema='True', delimiter=',')\
    .load("file:///D:/Data Engineer Test/")
#df.show()
#df.printSchema()
# Set the Parquet row group (block) size to 2 MB
PARQUET_BLOCK_SIZE = 2 * 1024 * 1024
# Save as parquet files
df.repartition(2).write.option(
    "parquet.block.size",
    PARQUET_BLOCK_SIZE).mode("overwrite").parquet("file:///D:/parquet_output/")

#Load parquet files
parq_df = spark.read\
    .format("parquet")\
    .load("file:///D:/parquet_output/")

# Query for the hottest temperature and return its observation date and region
hot_day = parq_df\
    .select("ObservationDate", "ScreenTemperature", "Region")\
    .sort(col("ScreenTemperature").desc())\
    .limit(1)\
    .select(to_date(col("ObservationDate"), "yyyy-MM-dd").alias("hottest_day"),
        col("ScreenTemperature").alias("hottest_Temperature"),
        col("Region").alias("hottest_Region"))
hot_day.show(truncate=False)
Example #3
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

torq = glueContext.create_dynamic_frame.from_catalog(
    database=args['DATABASE_NAME'], table_name=args['TABLE_NAME'])
print("Count: ", torq.count())
torq.printSchema()

torqDF = torq.toDF()

dt = split(torqDF.col0, ' ')

torqDF = torqDF.select(col('col1').alias('job_status'), col('col3').alias('detail'),
                       to_timestamp(torqDF.col0, 'MM/dd/yyyy HH:mm:ss').alias('o_dt'), torqDF.col0).where(torqDF.col1 == 'E') \
    .select('job_status', 'detail', 'o_dt',
            to_date(col("o_dt"), "yyyy-MM-dd").alias('date'),
            year('o_dt').alias('year'),
            month('o_dt').alias('month'),
            dayofmonth('o_dt').alias('day'),
            hour('o_dt').alias('hour'))

with_map = torqDF.withColumn("kvs", map_keys("detail"))

keys = (with_map.select(explode("kvs")).select("key").distinct().rdd.flatMap(
    lambda x: x).collect())

with_map = with_map.select(*["*"] +
                           [col("kvs").getItem(k).alias(k) for k in keys])

# change the data types and column names to be easier to query later
with_map = with_map \
Example #4
])

features2 = (spark.read.format(file_type).option(
    "header", first_row_is_header).option(
        "sep", delimiter).schema(featuresSchema).load(file_location))

display(features2)

# COMMAND ----------

# MAGIC %md
# MAGIC Changing Date type (String -> Date)

# COMMAND ----------

features3 = features2.withColumn("Date", to_date(col("Date"), "dd/MM/yyyy"))
#changedTypedf = joindf.withColumn("show", joindf["show"].cast(DoubleType()))
display(features3)

# COMMAND ----------

features3.printSchema()

# COMMAND ----------

features3.count()

# COMMAND ----------

# MAGIC %md
# MAGIC ### sale data-set.csv
Example #5
                    F.col("articleOffset"),
                    F.col("quotation"),
                    F.col("leftContext"),
                    F.col("rightContext"),
                    F.col("quotationOffset"),
                    F.col("leftOffset").alias("contextStart"),
                    F.col("rightOffset").alias("contextEnd"),
                )).alias("quotes_link"),
            get_website(
                F.sort_array(F.collect_list(F.struct(
                    "date", "website")))).alias("urls"),
        ).withColumn("quotation", longest(F.col("quotations"))).withColumn(
            "row_nb",
            F.row_number().over(
                Window.partitionBy(
                    F.to_date("earliest_date")).orderBy("canonicalQuotation")),
        ).withColumn(
            "quoteID",
            F.concat_ws("-", F.to_date("earliest_date"), pad_int("row_nb")),
        ).withColumn("month", F.month("earliest_date")).withColumn(
            "year", F.year("earliest_date")).drop("quotations", "row_nb"))

joined_df = qc.join(res, on=["articleUID", "articleOffset"])

w = Window.partitionBy("canonicalQuotation")
rank_w = Window.partitionBy("canonicalQuotation").orderBy(F.desc("sum(proba)"))
agg_proba = (joined_df.groupBy(
    lower(F.col("quotation")).alias("canonicalQuotation"),
    "qids").agg(F.sum("proba"),
                F.collect_list("speaker").alias("speakers")).select(
                    "*",
Example #6
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import to_date
from pyspark.sql.functions import split

spark = SparkSession.builder.appName("A20453021 - lab chapter 06").getOrCreate()


# Save this as a .csv uncompressed to the location: .save("hdfs://namenode/user/controller/output/your-hawk-id-here/00/save-file-action") (00 is whichever dataset you were assigned to)

df2 = spark.read.text("hdfs://namenode/user/controller/ncdc/raw/60/60.txt")

df2.withColumn('WeatherStation', df2['value'].substr(5, 6)) \
    .withColumn('WBAN', df2['value'].substr(11, 5)) \
    .withColumn('ObservationDate',to_date(df2['value'].substr(16,8), 'yyyyMMdd')) \
    .withColumn('ObservationHour', df2['value'].substr(24, 4).cast(IntegerType())) \
    .withColumn('Latitude', df2['value'].substr(29, 6).cast('float') / 1000) \
    .withColumn('Longitude', df2['value'].substr(35, 7).cast('float') / 1000) \
    .withColumn('Elevation', df2['value'].substr(47, 5).cast(IntegerType())) \
    .withColumn('WindDirection', df2['value'].substr(61, 3).cast(IntegerType())) \
    .withColumn('WDQualityCode', df2['value'].substr(64, 1).cast(IntegerType())) \
    .withColumn('SkyCeilingHeight', df2['value'].substr(71, 5).cast(IntegerType())) \
    .withColumn('SCQualityCode', df2['value'].substr(76, 1).cast(IntegerType())) \
    .withColumn('VisibilityDistance', df2['value'].substr(79, 6).cast(IntegerType())) \
    .withColumn('VDQualityCode', df2['value'].substr(86, 1).cast(IntegerType())) \
    .withColumn('AirTemperature', df2['value'].substr(88, 5).cast('float') /10) \
    .withColumn('ATQualityCode', df2['value'].substr(93, 1).cast(IntegerType())) \
    .withColumn('DewPoint', df2['value'].substr(94, 5).cast('float')) \
    .withColumn('DPQualityCode', df2['value'].substr(99, 1).cast(IntegerType())) \
    .withColumn('AtmosphericPressure', df2['value'].substr(100, 5).cast('float')/ 10) \
                    .schema(schema) \
                    .load(sys.argv[1])

    data = data.select('Date', 'StringencyIndexForDisplay',
                       'StringencyIndexForDisplay_LogChange',
                       'StringencyLegacyIndexForDisplay',
                       'StringencyLegacyIndexForDisplay_LogChange',
                       'GovernmentResponseIndexForDisplay',
                       'GovernmentResponseIndexForDisplay_LogChange',
                       'ContainmentHealthIndexForDisplay',
                       'ContainmentHealthIndexForDisplay_LogChange',
                       'EconomicSupportIndexForDisplay',
                       'EconomicSupportIndexForDisplay_LogChange')
    data = data.withColumn(
        'Date',
        to_date(unix_timestamp(col('Date'), 'yyyy-MM-dd').cast('timestamp')))

    data = data.toDF(
        'Date', 'USA_StringencyIndex', 'USA_StringencyIndex_LogChange',
        'USA_StringencyLegacyIndex', 'USA_StringencyLegacyIndex_LogChange',
        'USA_GovernmentResponseIndex', 'USA_GovernmentResponseIndex_LogChange',
        'USA_ContainmentHealthIndex', 'USA_ContainmentHealthIndex_LogChange',
        'USA_EconomicSupportIndex', 'USA_EconomicSupportIndex_LogChange')

    # cumulative log return
    cum_window = Window.orderBy(data['Date']).rangeBetween(
        Window.unboundedPreceding, 0)
    cum_sum = data.withColumn(
        'USA_StringencyIndex_Cumulative',
        sum('USA_StringencyIndex_LogChange').over(cum_window))
    cum_sum = cum_sum.withColumn(

# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)

dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)


# COMMAND ----------

from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-01-01"))\
  .select(to_date(col("date"))).show(1)


# COMMAND ----------

from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
    def junk_filter(self, offset=240, should_count=True):
        '''
        junk_filter(integer, boolean) -> void
        offset = daily time offset in minutes (delta from midnight)
        should_count = show debug counts at the end of each filter pass...performance hit
        '''
        print(
            "\n_______________________________________________\nJUNK FILTER\n\n"
        )

        start_ms = time_ms()

        init_cnt_df = {}
        null_cnt_df = {}
        dupe_cnt_df = {}
        acc_cnt_df = {}
        coord_cnt_df = {}
        fnl_df = {}

        # Apply daily time offset in minutes (e.g. 240 offsets each study day to start at 4am instead of midnight)
        for index, dt in enumerate(self.study_dts):
            self.study_dts[index] = dt + timedelta(minutes=offset)

        # Convert study dates into unix timestamps
        unix_dts = []
        for dt in self.study_dts:
            unix_dts.append(calendar.timegm(dt.timetuple()))

        init_cnt_df = self.df

        # Drop records with null values in critical fields
        self.df = self.df.na.drop(subset=[
            'latitude', 'longitude', 'utc_timestamp', 'tz_offset', 'accuracy'
        ])
        null_cnt_df = self.df

        # Drop duplicate records based on device_id and timestamp
        self.df = self.df.dropDuplicates(['utc_timestamp', 'device_id'])
        dupe_cnt_df = self.df

        # Remove records falling outside safe horizontal accuracy thresholds
        self.df = self.df.filter((self.df.accuracy >= 5)
                                 & (self.df.accuracy <= 65))
        acc_cnt_df = self.df

        # Remove records falling outside of a bounding rectangle of the continental US, AK, and HI
        self.df = self.df.filter(((self.df.latitude >= 23.82585) & (self.df.latitude <= 50.107813) \
                                 & ((self.df.longitude >= -125.821901) & (self.df.longitude <= -65.934603))) \
                                 | ((self.df.latitude >= 50.494424) & (self.df.latitude <= 72.113805) \
                                 & ((self.df.longitude >= 172) | (self.df.longitude <= -128))) \
                                 | ((self.df.latitude >= 18.186832) & (self.df.latitude <= 26.499983) \
                                 & (self.df.longitude >= -172.536313) & (self.df.longitude <= -154.039891)))

        coord_cnt_df = self.df

        # Remove records falling outside of the study range scope
        schema = StructType([
            StructField("utc_timestamp", IntegerType(), True),
            StructField("device_id", StringType(), True),
            StructField("os", IntegerType(), True),
            StructField("latitude", FloatType(), True),
            StructField("longitude", FloatType(), True),
            StructField("accuracy", IntegerType(), True),
            StructField("tz_offset", IntegerType(), True),
            StructField("study_dt", StringType(), True)
        ])

        fnl_df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(),
                                            schema)

        for dt in unix_dts:
            fnl_df = fnl_df.union(self.df.filter((self.df['utc_timestamp'] + self.df['tz_offset']).between(dt, dt + 86399)) \
                                  .withColumn("study_dt",py.to_date(py.from_unixtime(lit(dt)))))

        self.df = fnl_df

        init_cnt = -1
        null_cnt = -1
        dupe_cnt = -1
        acc_cnt = -1
        coord_cnt = -1
        tm_cnt = -1

        if should_count is True:
            print("init_cnt")
            init_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            init_cnt_df.foreach(lambda row: init_cnt_accumulator.add(1))
            #init_cnt = init_cnt_df.select("accuracy").count()
            init_cnt = init_cnt_accumulator.value

            print("null_cnt")
            null_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            null_cnt_df.foreach(lambda row: null_cnt_accumulator.add(1))
            #null_cnt = null_cnt_df.select("accuracy").count()
            null_cnt = null_cnt_accumulator.value

            print("dupe_cnt")
            dupe_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            dupe_cnt_df.foreach(lambda row: dupe_cnt_accumulator.add(1))
            #dupe_cnt = dupe_cnt_df.select("accuracy").count()
            dupe_cnt = dupe_cnt_accumulator.value

            print("acc_cnt")
            acc_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            acc_cnt_df.foreach(lambda row: acc_cnt_accumulator.add(1))
            #acc_cnt = acc_cnt_df.select("accuracy").count()
            acc_cnt = acc_cnt_accumulator.value

            print("coord_cnt")
            coord_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            coord_cnt_df.foreach(lambda row: coord_cnt_accumulator.add(1))
            #coord_cnt = coord_cnt_df.select("accuracy").count()
            coord_cnt = coord_cnt_accumulator.value

            print("tm_cnt")
            tm_cnt_accumulator = self.spark.sparkContext.accumulator(0)
            fnl_df.foreach(lambda row: tm_cnt_accumulator.add(1))
            #tm_cnt = fnl_df.select("accuracy").count()
            tm_cnt = tm_cnt_accumulator.value

        else:
            print("fnl_df complete junk_filter")

        tbl_data = [['Initial count', init_cnt, 0, 0, 'Count of pings before junk filtering process'], \
           ['Null values', null_cnt, init_cnt - null_cnt, ((init_cnt - null_cnt) / float(init_cnt)) * 100, \
            'Empty values among latitude, longitude, accuracy, timestamp, tz offset'], \
           ['Duplicates', dupe_cnt, null_cnt - dupe_cnt, ((null_cnt - dupe_cnt) / float(init_cnt)) * 100, \
            'Records with duplicate device_id and utc_timestamps'], \
           ['Accuracy', acc_cnt, dupe_cnt - acc_cnt, ((dupe_cnt - acc_cnt) / float(init_cnt)) * 100, \
            'Horizontal accuracy values exceeding safe thresholds (outside of 5 - 65)'], \
           ['Coordinates', coord_cnt, acc_cnt - coord_cnt, ((acc_cnt - coord_cnt) / float(init_cnt)) * 100, \
            'Pings occurring outside bounding rectangles of the continental US, AK, and HI'], \
           ['Study date(s)', tm_cnt, coord_cnt - tm_cnt, ((coord_cnt - tm_cnt) / float(init_cnt)) * 100, \
            'Pings occurring outside the study date range when tz_offset is applied'], \
           ['Final count', tm_cnt, init_cnt - tm_cnt, ((init_cnt - tm_cnt) / float(init_cnt)) * 100, \
            'Count of pings after junk filtering process']]

        end_ms = time_ms()

        print(tabulate(tbl_data, floatfmt=".2f", headers=['Phase', 'Ping Count', 'Removed Pings', \
                                                          'Percent Reduction', 'Description']))
        print("junk_filter time ms: {ms}".format(ms=end_ms - start_ms))
Example #10
  .getOrCreate()
df = spark.read.format("csv"). \
  options(header='True',inferSchema='True'). \
  load("avdata/*")
df.printSchema()

# get sourcefile name from input_file_name()
df = df.withColumn("path", fun.input_file_name())
regex_str = r"[/\\]([^/\\]+)$"  # regex to extract the text after the last / or \
df = df.withColumn("ticker", fun.regexp_extract("path", regex_str, 1))
df = df.na.drop()
df.show()


# convert time to days ago
df=df.withColumn('timestamp', fun.to_date("timestamp"))
df=df.withColumn('days_ago', fun.datediff(fun.current_date(), "timestamp"))

# Calculate the two-week change (14-day lag window)
windowSpec  = Window.partitionBy("ticker").orderBy("days_ago")
dflag = df.withColumn("lag",fun.lag("open",14).over(windowSpec))
dflag = dflag.withColumn('twoweekdiff', fun.col('lag') - fun.col('open'))
dflag.show()

# Within group (ticker) calculate anomaly time periods
# filter for anomalous events (keep outliers)
statsDF = dflag.groupBy("ticker").agg(fun.mean(dflag.twoweekdiff).alias("mean"), fun.stddev(dflag.twoweekdiff).alias("stddev"))

# add columns with upper and lower limits
statsDF = statsDF.withColumn("UpperLimit", statsDF.mean + statsDF.stddev * 3).withColumn("LowerLimit", statsDF.mean - statsDF.stddev * 3)

Example #12
def to_date_robust(x):
    '''Converts a string column to a date column on a best-effort basis.'''
    return F.to_date(to_iso_string_udf(x))
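
# Hedged sketch of how the (not shown) to_iso_string_udf dependency might look, assuming it
# normalises arbitrary date strings to ISO yyyy-MM-dd before F.to_date is applied; the helper
# below is hypothetical and only illustrates the wiring.
from dateutil import parser as dateutil_parser
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

def _to_iso_string(s):
    # best-effort parse; returns None when the value cannot be interpreted as a date
    try:
        return dateutil_parser.parse(s).strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError):
        return None

to_iso_string_udf = F.udf(_to_iso_string, StringType())

# df.withColumn('event_date', to_date_robust(F.col('raw_date'))) would then yield a DateType column.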
Example #13
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log_data/*/*/*.json')

    # read log data file
    df = spark.read.json(log_data).drop_duplicates()

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    print("********creating {} at {}".format('user_table',
                                             str(datetime.now())))

    user_col = ["userId", "firstName", "lastName", "gender", "level"]
    users_table = df.select(user_col).drop_duplicates()

    # write users table to parquet files
    print("********writing {} to s3 at {}".format('user_table',
                                                  str(datetime.now())))

    users_table.write.parquet(os.path.join(output_data, 'user'), 'overwrite')

    # create timestamp column from original timestamp column
    print("********creating {} at {}".format('time_table',
                                             str(datetime.now())))

    df = df.withColumn("timestamp", F.to_timestamp(df.ts / 1000))

    # create datetime column from original timestamp column
    df = df.withColumn("datetime", F.to_date(df.timestamp))

    # extract columns to create time table
    time_table = df.selectExpr([
        "timestamp as start_time ", "hour(timestamp) as hour",
        "weekofyear(timestamp) as week", "month(timestamp) as month",
        "year(timestamp) as year", "dayofweek(timestamp) as weekday"
    ]).drop_duplicates()

    # write time table to parquet files partitioned by year and month
    print("********writing {} to s3 at {}".format('time_table',
                                                  str(datetime.now())))

    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time_table'), 'overwrite')

    # read in song data to use for songplays table

    song_path = os.path.join(output_data, 'song')
    df_song = spark.read.parquet(song_path)

    # extract columns from joined song and log datasets to create songplays table
    df_songplay_all = df.join(df_song, (df.song == df_song.title))
    songplay_cols = [
        "song_id as songplay_id", "timestamp as start_time ",
        'userId as user_id', 'level', 'song_id', 'artist_id',
        'sessionId as session_id', 'location', 'userAgent as user_agent',
        "month(timestamp) as month", "year(timestamp) as year"
    ]

    print("********createing {} at {}".format('songplays_table',
                                              str(datetime.now())))

    songplays_table = df_songplay_all.selectExpr(
        songplay_cols).drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    print("********writing {} to s3 at {}".format('songplays_table',
                                                  str(datetime.now())))
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplay'), 'overwrite')
Example #14
    def _clean_immigration_data(self):
        """
        Class method to clean immigration data

        Operations:
                    - set column types and rename to appropriate names
                    - remove columns renamed and/or no longer of use
                    - additional fault check: only select columns that will be used/useful
        Returns:
                [dict] - object with source-name: SparkDF key-value pairs
        """
        df = self.data_dict.get('immigration_data', None)
        if df is not None:
            # set column types and rename
            tmp = df\
                .withColumn("cic_id", col("cicid").cast("integer"))\
                .withColumn("visa_code", col("i94visa").cast("integer"))\
                .withColumn("mode_code", col("i94mode").cast("integer"))\
                .withColumn("orig_country_code", col("i94res").cast("integer"))\
                .withColumn("cit_country_code", col("i94cit").cast("integer"))\
                .withColumn("year", col("i94yr").cast("integer"))\
                .withColumn("month", col("i94mon").cast("integer"))\
                .withColumn("birth_year", col("biryear").cast("integer"))\
                .withColumn("age", col("i94bir").cast("integer"))\
                .withColumn("counter", col("count").cast("integer"))\
                .withColumn("sas_date", to_date(lit("01/01/1960"), "MM/dd/yyyy"))\
                .withColumn("arrival_date", expr("date_add(sas_date, arrdate)"))\
                .withColumn("departure_date", expr("date_add(sas_date, depdate)")) \
                .withColumnRenamed("i94addr", "state_code") \
                .withColumnRenamed("i94port", "port_code") \
                .withColumnRenamed("visapost", "visa_post") \
                .withColumnRenamed("visatype", "visa_type")

            # drop original/renamed columns
            tmp = tmp\
                .drop("cicid")\
                .drop("i94visa")\
                .drop("i94mode")\
                .drop("i94res")\
                .drop("i94cit")\
                .drop("i94yr")\
                .drop("i94mon")\
                .drop("biryear")\
                .drop("i94bir")\
                .drop("count")\
                .drop("data_base_sas", "arrdate", "depdate")

            # fault check: select only what will be useful
            data = tmp.select(col("cic_id"), col("port_code"),
                              col("state_code"), col("visa_post"),
                              col("matflag"), col("dtaddto"), col("gender"),
                              col("airline"), col("admnum"), col("fltno"),
                              col("visa_type"), col("mode_code"),
                              col("orig_country_code"),
                              col("cit_country_code"), col("year"),
                              col("month"), col("birth_year"), col("age"),
                              col("counter"), col("arrival_date"),
                              col("departure_date"))

            return dict(immigration_data=data)

        else:
            logger.error(
                ValueError(
                    'No dataset named "immigration_data" found in sources dict.'
                ))
            raise ValueError(
                'No dataset named "immigration_data" found in sources dict.')
Example #15
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, current_timestamp, date_sub, date_add, datediff, months_between, to_date, lit
spark = SparkSession.builder.appName("Pyspark example").getOrCreate()
dateDf = spark.range(10).withColumn("Today", current_date()).withColumn(
    "Now", current_timestamp())
dateDf.createOrReplaceTempView("dftable")
dateDf.show()
dateDf.printSchema()

#'date_sub' and 'date_add' can be used to add and subtract from a particular date
dateDf.select(date_sub(col("Today"), 5), date_add(col("Today"), 4)).show(2)

#'datediff' and 'months_between' give the number of days and the number of months between two dates
dateDf.withColumn("week_ago",
                  date_sub(col("Today"),
                           7)).select(datediff(col("week_ago"),
                                               col("Today"))).show(1)

dateformat = "yyyy-dd-MM"
dateDF1 = spark.range(1).select(
    to_date(lit("2020-27-08"), dateformat).alias("date"),
    to_date(lit("2020-26-08"), dateformat).alias("date1"))
dateDF1.createOrReplaceTempView("dataTable2")
dateDF1.show()
Example #16
from __future__ import print_function
import pyspark
from pyspark.sql import functions as F
import drpyspark


drpyspark.enable_debug_output()
with pyspark.SparkContext() as sc:
    sqlContext = pyspark.sql.SQLContext(sc)
    logs = sc.parallelize([
        {'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2'},
    ])
    logs = logs.map(lambda l: pyspark.sql.Row(**l))
    logs = (sqlContext.createDataFrame(logs)
            .withColumn('timestamp', F.to_date(F.from_unixtime('timestamp')))
            .withColumn('minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH")))
    (logs
     .groupBy(['minute', 'url'])
     .count()
     .show())
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


# COMMAND ----------

df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()


# COMMAND ----------

from pyspark.sql.functions import col, to_date
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")


# COMMAND ----------

from pyspark.sql.window import Window
from pyspark.sql.functions import desc
windowSpec = Window\
  .partitionBy("CustomerId", "date")\
  .orderBy(desc("Quantity"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)


# COMMAND ----------
#Page 228 of E-book
from __future__ import print_function

from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col

from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("Demo Spark Python Cluster Program").getOrCreate()
 
df1 = spark.read.text("hdfs://namenode/output/itmd-521/drp/2000/csv-file")

df2 = df1.withColumn('Weather_Station', df1['value'].substr(5, 6))\
.withColumn('WBAN', df1['value'].substr(11, 5))\
.withColumn('Observation_Date',to_date(df1['value'].substr(16,8),"yyyyMMdd"))\
.withColumn('Observation_Hour', df1['value'].substr(24, 4).cast(IntegerType()))\
.withColumn('Latitude', df1['value'].substr(29, 6).cast('float') / 1000)\
.withColumn('Longitude', df1['value'].substr(35, 7).cast('float') / 1000)\
.withColumn('Elevation', df1['value'].substr(47, 5).cast(IntegerType()))\
.withColumn('Wind_Direction', df1['value'].substr(61, 3).cast(IntegerType()))\
.withColumn('WD_Quality_Code', df1['value'].substr(64, 1).cast(IntegerType()))\
.withColumn('Sky_Ceiling_Height', df1['value'].substr(71, 5).cast(IntegerType()))\
.withColumn('SC_Quality_Code', df1['value'].substr(76, 1).cast(IntegerType()))\
.withColumn('Visibility_Distance', df1['value'].substr(79, 6).cast(IntegerType()))\
.withColumn('VD_Quality_Code', df1['value'].substr(86, 1).cast(IntegerType()))\
.withColumn('Air_Temperature', df1['value'].substr(88, 5).cast('float') /10)\
.withColumn('AT_Quality_Code', df1['value'].substr(93, 1).cast(IntegerType()))\
.withColumn('Dew_Point', df1['value'].substr(94, 5).cast('float'))\
.withColumn('DP_Quality_Code', df1['value'].substr(99, 1).cast(IntegerType()))\
.withColumn('Atmospheric_Pressure', df1['value'].substr(100, 5).cast('float')/ 10)\
Example #19
def _timeliness_todo(columns, value, df, dateFormat=None, timeFormat=None):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param columns:
    :type columns: list
    :param value:
    :type value: str
    :param df:
    :type df: DataFrame
    :param dateFormat:
    :type dateFormat: str
    :param timeFormat:
    :type timeFormat: str
    :return: Pyspark columns representing what to compute.
    """
    assert (dateFormat is None or timeFormat is None) and (
            not dateFormat is None or not timeFormat is None), "Pass either a dateFormat or a timeFormat, " \
                                                               "not both. "
    todo = []
    types = dict(df.dtypes)

    if dateFormat:
        value_date = to_date(lit(value), dateFormat)
        for c in columns:
            if types[c] == "timestamp" or types[c] == "date":
                todo.append(
                    sum(when(datediff(value_date, c) > 0,
                             1).otherwise(0)).alias(c))
            elif types[c] == "string":
                todo.append(
                    sum(
                        when(
                            datediff(value_date, to_date(c, dateFormat)) > 0,
                            1).otherwise(0)).alias(c))
            else:
                print(
                    "Type of a column on which the timeliness metric is run must be either timestamp, "
                    "date or string, if the metric is being run on dateFormat."
                )
                exit()
    elif timeFormat:
        value_long = to_timestamp(lit(value), timeFormat).cast("long")
        # check if value contains a date and not only hours, minutes, seconds
        has_date = _contains_date(timeFormat)
        if has_date:
            for c in columns:
                if types[c] == "timestamp":
                    todo.append(
                        sum(
                            when(value_long - col(c).cast("long") > 0,
                                 1).otherwise(0)).alias(c))
                elif types[c] == "string":
                    todo.append(
                        sum(
                            when(
                                value_long -
                                to_timestamp(col(c), timeFormat).cast("long") >
                                0, 1).otherwise(0)).alias(c))
                else:
                    print(
                        "Type of a column on which the timeliness metric is run must be either timestamp or string, if "
                        "the metric is being run on a timeFormat")
                    exit()
        else:
            for c in columns:
                if types[c] == "timestamp":
                    """
                    If there are no years, months or days in the format, we must ignore the years, months and days in the timestamp.
                    """
                    value_long = to_timestamp(lit(value), timeFormat)
                    # remove years, months, days
                    value_long = value_long.cast("long") - value_long.cast(
                        "date").cast("timestamp").cast("long")

                    # check for difference, but only considering hours, minutes, seconds
                    todo.append(
                        sum(
                            when(
                                value_long -
                                (col(c).cast("long") - col(c).cast("date").
                                 cast("timestamp").cast("long")) > 0,
                                1).otherwise(0)).alias(c))
                elif types[c] == "string":
                    """
                    If there are no years, months or days in the format and the column uses the same format (so it
                    also lacks them), both sides are initialized to the same default year, month and day;
                    the date part is therefore effectively ignored.
                    """
                    todo.append(
                        sum(
                            when(
                                (value_long -
                                 to_timestamp(c, timeFormat).cast("long")) > 0,
                                1).otherwise(0)).alias(c))
                else:
                    print(
                        "Type of a column on which the timeliness metric is run must be either timestamp or string, if "
                        "the metric is being run on a timeFormat")
                    exit()
    return todo
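
# Hedged usage sketch (hypothetical column/data, local SparkSession; assumes the pyspark.sql.functions
# imports this module already relies on): the expressions returned by _timeliness_todo are meant to be
# passed to DataFrame.agg, giving one count per column of values that fall strictly before `value`.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame([("2020-12-25",), ("2021-03-01",)], ["order_date"])
todo = _timeliness_todo(["order_date"], "2021-01-01", demo_df, dateFormat="yyyy-MM-dd")
demo_df.agg(*todo).show()  # order_date counts 1: only 2020-12-25 falls before 2021-01-01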
Example #20
def _freshness_todo(columns, df, dateFormat=None, timeFormat=None):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param columns:
    :type columns: list
    :param df:
    :type df: DataFrame
    :param dateFormat:
    :type dateFormat: str
    :param timeFormat:
    :type timeFormat: str
    :return: Pyspark columns representing what to compute.
    """
    assert (dateFormat is None or timeFormat is None) and (
            not dateFormat is None or not timeFormat is None), "Pass either a dateFormat or a timeFormat, " \
                                                               "not both. "
    types = dict(df.dtypes)
    todo = []

    if dateFormat:
        now = current_date()
        for c in columns:
            if types[c] == "timestamp" or types[c] == "date":
                todo.append(avg(abs(datediff(c, now))).alias(c))
            elif types[c] == "string":
                todo.append(
                    avg(abs(datediff(to_date(c, dateFormat), now))).alias(c))
            else:
                print(
                    "Type of a column on which the freshness metric is run must be either timestamp, "
                    "date or string, if the metric is being run on dateFormat."
                )
                exit()
    elif timeFormat:
        # check if value contains a date and not only hours, minutes, seconds
        has_date = _contains_date(timeFormat)

        current = current_timestamp()
        if has_date:
            """
            If the time format also contains a date it means the user is also interested in comparing years, months, days, 
            etc.
            """
            now = current.cast("long")
            for c in columns:
                if types[c] == "timestamp":
                    todo.append(avg(abs(col(c).cast("long") - now)).alias(c))
                elif types[c] == "string":
                    todo.append(
                        avg(abs(
                            to_timestamp(c, timeFormat).cast("long") -
                            now)).alias(c))
                else:
                    print(
                        "Type of a column on which the freshness metric is run must be either timestamp"
                        "or string, if the metric is being run on timeFormat.")
                    exit()
        else:
            """
            If the timestamp has no date the user is not interested in differences that consider years, months, days, but
            only hours, minutes, seconds.
            """
            now = current
            now = now.cast("long") - now.cast("date").cast("timestamp").cast(
                "long")
            for c in columns:
                if types[c] == "timestamp":
                    todo.append(
                        avg(
                            abs((col(c).cast("long") - col(c).cast(
                                "date").cast("timestamp").cast("long")) -
                                now)).alias(c))
                elif types[c] == "string":
                    """
                    Need to remove seconds from years, months and days here as well because even if the format
                    does not specify anything for those values they are initialized to something by default.
                    """
                    todo.append(
                        avg(
                            abs((to_timestamp(c, timeFormat).cast("long") -
                                 to_timestamp(c, timeFormat).cast("date").cast(
                                     "timestamp").cast("long")) -
                                now)).alias(c))
                else:
                    print(
                        "Type of a column on which the freshness metric is run must be either timestamp"
                        "or string, if the metric is being run on timeFormat.")
                    exit()
    return todo
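
# Hedged usage sketch for _freshness_todo (hypothetical data, local SparkSession; assumes the
# pyspark.sql.functions imports this module relies on): aggregating the returned expressions
# yields the mean absolute difference in days between each date column and the current date.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
fresh_df = spark.createDataFrame([("2024-01-01",), ("2024-06-01",)], ["last_seen"])
fresh_df.agg(*_freshness_todo(["last_seen"], fresh_df, dateFormat="yyyy-MM-dd")).show()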
Example #21
#loading datasets
raw_data_df = spark.read.csv("D:/Training/kaggle_dataset_coronaviru_sample/novel-corona-virus-2019-dataset/covid_19_data_rought.csv" ,\
                        header = True , inferSchema = True )

# rename a few columns
raw_data_df_tmp = raw_data_df.withColumnRenamed(
    'Province/State',
    'ProvinceOrState').withColumnRenamed('Country/Region', 'CountryOrRegion')

# clean and filter the data
drop_cols = ['SNo', 'Recovered', 'Last Update']
filtered_data_df = raw_data_df_tmp.filter(
    (raw_data_df_tmp.Confirmed > raw_data_df_tmp.Recovered)).drop(
        *drop_cols).fillna('unknown', subset=['ProvinceOrState'])

date_correction_df = filtered_data_df.select( when(to_date( F.col("ObservationDate") ,"MM/dd/yyyy").isNotNull() , \
                          to_date( F.col("ObservationDate") ,"MM/dd/yyyy")). \
                          when(to_date( F.col("ObservationDate") ,"MM-dd-yyyy").isNotNull() , \
                          to_date( F.col("ObservationDate") ,"MM-dd-yyyy")).otherwise("Unknown Format").alias("Date") , \
                          filtered_data_df['ProvinceOrState'] ,  filtered_data_df['CountryOrRegion'] , \
                          filtered_data_df['Confirmed'], filtered_data_df['Deaths'])

date_correction_df.createOrReplaceTempView('Covid19_dataset_tbl')

#get the latest entry of every state in a country and get sum of all the cases and deaths
groupby_states = """select CountryOrRegion , ProvinceOrState , Confirmed ,Deaths, latest_date 
                    from (select CountryOrRegion , ProvinceOrState , Confirmed , Deaths, Date , max(Date)
                    over( partition by ProvinceOrState) as Latest_date from Covid19_dataset_tbl ) 
                    as sub where Latest_date = Date """

latest_city_entry_df = spark.sql(groupby_states)
Example #22
# -*- coding: utf-8 -*-

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType


# Initialize the Spark session
spark = SparkSession \
        .builder \
        .appName("Processa arquivo de dados base exemplo") \
        .getOrCreate()

to_value = lambda v: float(v.replace(",","."))
udf_to_value = F.udf(to_value, pyspark.sql.types.FloatType())

df=spark.read.csv("/tmp/spark/201302_Diarias.utf8.csv",header=True,sep="\t")

df2=df.withColumn("value", udf_to_value(df["Valor Pagamento"])) \
    .withColumn("Dtpg", F.to_date(df["Data Pagamento"], format="dd/MM/yyyy"))

df3=df2.select(df2["Nome Órgão Superior"].alias("Orgao"),df2["Data Pagamento"].alias("DtPg"))

df3.show()
Example #23
URL_MAPPING_FILE = os.path.join(MAPPING_DIR, "DOMAINSBYCOUNTRY-ALLLANGUAGES.TXT")
COUNTRY_CODES_MAPPING_FILE = os.path.join(MAPPING_DIR, "mapping_country_codes.csv")
PARQUET_FOLDER_NAME_EVENT = "Event.parquet"
PARQUET_FOLDER_NAME_MENTIONS = "Mentions.parquet"


# put at the head of the file so that the job stops immediately if it cannot read them
mapping_country_codes = spark.read.csv(COUNTRY_CODES_MAPPING_FILE, header=True)
mapping_url = spark.read.csv(URL_MAPPING_FILE, sep="\t", header=True)  # use the first line as column labels


# read the export.csv files
print("Reading the files")
event = spark.read.csv(os.path.join(DATA_DIR, "*.export.CSV"), sep="\t")
event = event.select(event._c0.cast("bigint").alias("Id"),
	to_date(event._c1, "yyyyMMdd").alias("Date"),
	event._c7.alias("Country"),
	event._c12.alias("Type"),
	event._c34.cast("double").alias("AvgTone"))  # selecting columns of interest and casting them

event = event.dropna(how='any', subset=("Id", "Date", "Country", "AvgTone"))  # we accept to not have a Type but the other fields are necessary
event.registerTempTable('event')
print("event treated, {} rows left in event".format(event.count()))


# transform countries code into countries names
# data was taken from https://datahub.io/core/country-codes#data on 7th of november 2018 and then adapted to match the 
# countries names used in the GDELT dataset (some countries did not have exactly the same name as mapping_url)
mapping_country_codes = mapping_country_codes.select(mapping_country_codes.official_name_en, 
	mapping_country_codes["ISO3166-1-Alpha-3"].alias("ISO"))
mapping_country_codes.registerTempTable('mappingcc')
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType, Row, ArrayType, StringType
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql import SparkSession
import json

# COMMAND ----------

spark = SparkSession.builder.appName("Twitter Analysis").getOrCreate()
sc = spark.sparkContext

# COMMAND ----------

tweets=spark.read.parquet("/FileStore/tables/est_tweets/")
tweets=tweets.drop('time','location').withColumnRenamed('new_location','state').withColumn("date_only", F.to_date(F.col("est_time")))

# COMMAND ----------

#tweets = tweets[(tweets['est_time'] > '2020-11-03 07:00:00') & (tweets['est_time'] < '2020-11-08 11:59:00')]
tweets.printSchema()
tweets.count()

# COMMAND ----------

joe_only = tweets.filter((tweets['text'].rlike("[Jj]oe|[Bb]iden") == True) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == False))
# print("Only Biden Tweets \t\t: ", joe_only.count())
trump_only = tweets.filter((tweets['text'].rlike("[Jj]oe|[Bb]iden") == False) & (tweets['text'].rlike("[Dd]onald|[Tt]rump") == True))
# print("Only Donald Trump Tweets \t\t: ", trump_only.count())
joe_and_trump = tweets.filter((tweets['text'].rlike("[Dd]onald|[Tt]rump")) & (tweets['text'].rlike("[Jj]oe|[Bb]iden")))
# print("Both Joe_Biden & Trump Tweets \t\t: ", joe_and_trump.count())
 
 ##
 df0 = spark.read.format('csv').option('header','true').load(infile)
   # columns: msno, payment_method_id, plan_list_price, actual_amount_paid, is_auto_renew, 
   #                transaction_date, membership_expire_date, is_cancel, plan_days
   # a record: 
   #          ++Jv+7YFiQv05MJ+Ep3wKS4QxojHEu78P3JOsR9djlo=,41,149,0,1,20150102,20150301,0,30
   
 ## casting columns
 df1 = df0.select('msno', 
                  expr('payment_method_id AS pay_method'),
                  expr('plan_list_price AS list_price'),
                  expr('actual_amount_paid AS actual_paid'),
                  'is_auto_renew',
                  expr('CAST(is_cancel AS boolean) AS is_cancel'),
                  to_date('transaction_date', 'yyyyMMdd').alias('trans_date'),
                  expr('CAST(plan_days AS int) AS plan_days'),
                  to_date('membership_expire_date', 'yyyyMMdd').alias('exp_date')
                 )
                 
 ## date corrections          
 udf_dates_correction = udf(dates_correction, ArrayType(DateType()))
 df2 = df1.withColumn('corr_start_exp_dates',
                  udf_dates_correction('is_cancel', 'trans_date', 'exp_date', 'plan_days')) \
   .withColumn('corr_start_date', expr('corr_start_exp_dates[0]')) \
   .withColumn('corr_exp_date', expr('corr_start_exp_dates[1]')) \
   .drop('corr_start_exp_dates')
   
 df3 = df2.where(~col('corr_exp_date').isNull()) 
 
 ## casting columns
Example #26
def load_data():
    """Load the data from the different files and join the togheter
    """

    users_schema = StructType([
        StructField('timestamp', TimestampType(), nullable=False),
        StructField('userId', IntegerType(), False),
        StructField('nick', StringType(), False),
        StructField('twitter', StringType(), False),
        StructField('dob', DateType(), False),
        StructField('country', StringType(), False)
    ])

    users = spark.read.load('data/users.csv',
                            format='csv',
                            schema=users_schema,
                            header=True)

    users = users.withColumn(
        'age',
        F.datediff(F.to_date(F.lit('2016-06-16'), 'yyyy-MM-dd'), users.dob) /
        365)

    ages = users.select(['userId', 'age'])

    buy_click = spark.read.load('data/buy-clicks.csv',
                                format='csv',
                                inferSchema=True,
                                header=True,
                                timestampFormat="yyyy-MM-dd HH:mm:ss")

    revenue_by_session_by_user = buy_click.groupBy([
        'userId', 'userSessionId'
    ]).agg(F.sum('price').alias('revenue')).select(['userId', 'revenue'])
    revenues=revenue_by_session_by_user.groupBy('userId').agg(F.mean('revenue').alias('avg_buy'), F.min('revenue')\
            .alias('min_buy'), F.max('revenue').alias('max_buy'))

    game_click = spark.read.load('data/game-clicks.csv',
                                 format='csv',
                                 inferSchema=True,
                                 header=True,
                                 timestampFormat="yyyy-MM-dd HH:mm:ss")

    avg_ishit = game_click.groupBy(['userId'
                                    ]).agg(F.mean('isHit').alias('avg_isHit'))

    user_session = spark.read.load('data/user-session.csv',
                                   format='csv',
                                   inferSchema=True,
                                   header=True,
                                   timestampFormat="yyyy-MM-dd HH:mm:ss")

    team = spark.read.load('data/team.csv',
                           format='csv',
                           inferSchema=True,
                           header=True,
                           timestampFormat="yyyy-MM-dd HH:mm:ss")

    strengths = team.join(user_session, on='teamId',
                          how='inner').select(['userId',
                                               'strength']).dropDuplicates()

    data = ages.join(revenues, on='userId',
                     how='inner').join(avg_ishit, on='userId',
                                       how='inner').join(strengths,
                                                         on='userId',
                                                         how='left').na.fill(0)

    return data
Example #27
    filter("ICUSTAY_ID >= 1 and ICUSTAY_ID <= 1").\
    drop('ICUSTAY_ID')
stays_df = stays_df.join(to_keep, on='HADM_ID')
# x = stays_df.count()
if args.verbose:
    print(
        'REMOVE MULTIPLE STAYS PER ADMIT:\n\tICUSTAY_IDs: {}\n\tHADM_IDs: {}\n\tSUBJECT_IDs: {}'
        .format(stays.ICUSTAY_ID.unique().shape[0],
                stays.HADM_ID.unique().shape[0],
                stays.SUBJECT_ID.unique().shape[0]))

stays = add_age_to_icustays(stays)

# stays_df.withColumn('AGE', stays_df['INDATE'] - stays_df["DOBDATE"])
stays_df = stays_df.select("*",
                           psql.to_date(stays_df["INTIME"]).alias("INDATE"))
stays_df = stays_df.select(
    "*",
    psql.to_date(stays_df["DOB"]).alias("DOBDATE"))  #.alias("DOBDATE")
stays_df = stays_df.select(
    "*",
    psql.floor((psql.datediff(stays_df['INDATE'], stays_df["DOBDATE"]) /
                365.0)).alias("AGE"))
# alias("AGE"))#.alias("AGE")
# print(stays_df.filter("AGE > 250").count())
stays_df = stays_df.withColumn(
    "AGE",
    psql.when(stays_df.AGE > 250, 90).otherwise(stays_df.AGE))
# print(stays_df.filter("AGE < 0").count())

stays = add_inunit_mortality_to_icustays(stays)
Example #28
        .appName("Agg Demo") \
        .master("local[2]") \
        .getOrCreate()

    logger = Log4j(spark)

    invoice_df = spark.read \
        .format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load("data/invoices.csv")

    NumInvoices = f.countDistinct("InvoiceNo").alias("NumInvoices")
    TotalQuantity = f.sum("Quantity").alias("TotalQuantity")
    InvoiceValue = f.expr("round(sum(Quantity * UnitPrice),2) as InvoiceValue")

    exSummary_df = invoice_df \
        .withColumn("InvoiceDate", f.to_date(f.col("InvoiceDate"), "dd-MM-yyyy H.mm")) \
        .where("year(InvoiceDate) == 2010") \
        .withColumn("WeekNumber", f.weekofyear(f.col("InvoiceDate"))) \
        .groupBy("Country", "WeekNumber") \
        .agg(NumInvoices, TotalQuantity, InvoiceValue)

    exSummary_df.coalesce(1) \
        .write \
        .format("parquet") \
        .mode("overwrite") \
        .save("output")

    exSummary_df.sort("Country", "WeekNumber").show()
Example #29
def compile_date(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.to_date(src_column).cast('timestamp')
Example #30
tmp = tmp \
    .withColumn('pickup_longitude', func.lit(None)) \
    .withColumn('pickup_latitude', func.lit(None)) \
    .withColumn('dropoff_longitude', func.lit(None)) \
    .withColumn('dropoff_latitude', func.lit(None)) \
    .withColumn('with_areas', func.lit(True)) \
    .withColumnRenamed('VendorID', 'vendor_id') \
    .withColumnRenamed('tpep_pickup_datetime', 'pickup_datetime') \
    .withColumnRenamed('tpep_dropoff_datetime', 'dropoff_datetime') \
    .withColumnRenamed('RatecodeID', 'rate_code') \
    .select(new_columns_names)
data_final = data.union(tmp)

print("Casting columns")
data_final = data_final \
    .withColumn("pickup_datetime", func.to_date("pickup_datetime")) \
    .withColumn("dropoff_datetime", func.to_date("dropoff_datetime"))

data_final = data_final \
    .withColumn("passenger_count", data_final.passenger_count.cast("int")) \
    .withColumn("trip_distance", data_final.trip_distance.cast("float")) \
    .withColumn("payment_type", data_final.payment_type.cast("float")) \
    .withColumn("pickup_longitude", data_final.pickup_longitude.cast("float")) \
    .withColumn("pickup_latitude", data_final.pickup_latitude.cast("float")) \
    .withColumn("dropoff_longitude", data_final.dropoff_longitude.cast("float")) \
    .withColumn("dropoff_latitude", data_final.dropoff_latitude.cast("float")) \
    .withColumn("fare_amount", data_final.fare_amount.cast("float")) \
    .withColumn("extra", data_final.extra.cast("float")) \
    .withColumn("mta_tax", data_final.mta_tax.cast("float")) \
    .withColumn("tip_amount", data_final.tip_amount.cast("float")) \
    .withColumn("tolls_amount", data_final.tolls_amount.cast("float")) \
              ("community_area", "int", "community_area", "int"),
              ("fbi_code", "string", "fbi_code", "string"),
              ("x_coordinate", "double", "x_coordinate", "double"),
              ("y_coordinate", "double", "y_coordinate", "double"),
              ("year", "int", "year", "int"),
              ("updated_on", "string", "updated_on", "timestamp"),
              ("latitude", "double", "latitude", "double"),
              ("longitude", "double", "longitude", "double"),
              ("location", "string", "location", "string")])

#convert to spark dataframe
df = dynamic_frame.toDF()
df.show()

# convert date columns to day & month
df = df.withColumn("date_added", to_date(split(df["date"], " ").getItem(0).cast("string"), 'MM/dd/yyyy')) \
    .withColumn("month", split(col("date_added"), "-").getItem(1)) \
    .withColumn("day", split(col("date_added"), "-").getItem(2)) \
    .orderBy('date_added')
print("Dataframe sorted")

partitioned_dataframe = df.repartition("day")

# Convert back to dynamic frame
dynamic_frame2 = DynamicFrame.fromDF(partitioned_dataframe,
                                     glue_context,
                                     "dynamic_frame_write",
                                     transformation_ctx="applymapping1")
# resolve discrepancy in column data types
resolvechoice = ResolveChoice.apply(frame=dynamic_frame2,
                                    choice="make_struct",
def date_to_year(data_frame, date_format):
    data_frame = data_frame.\
        withColumn("first_payment_year", functions.year(functions.to_date(data_frame.first_payment_date, date_format)))
    return data_frame
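
# Hedged usage sketch (hypothetical loan data; assumes this module imports pyspark.sql.functions
# as `functions`, as date_to_year does):
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
loans = spark.createDataFrame([("03/01/2019",)], ["first_payment_date"])
date_to_year(loans, "MM/dd/yyyy").show()  # adds first_payment_year = 2019
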
weekday_udf = udf(lambda date_time: date_time.weekday(), IntegerType())
is_holiday_udf = udf(lambda date_time: date_time.date() in holidays.UnitedStates(), BooleanType())

date_df = date_df.withColumn('Hour', func.hour(date_df.Time))
date_df = date_df.withColumn('Day_Of_Week', weekday_udf(date_df.Time))
date_df = date_df.withColumn('Day_Of_Year', func.dayofyear(date_df.Time))
date_df = date_df.withColumn('Is_Holiday', is_holiday_udf(date_df.Time))


# Aggregate events happening in last and next 3 hours for each hour
event_3h_df = event_df.withColumnRenamed('Venues', 'Venues_0h')
for i in range(-3, 4):
    if i != 0:
        add_hours_udf = udf(lambda date_time: date_time + datetime.timedelta(hours=i), TimestampType())
        event_3h_df = event_3h_df.join(event_df.withColumn('Time', add_hours_udf(event_df.Time)).withColumnRenamed('Venues', 'Venues_%sh' % str(i)), 'Time')


# Join single feature groups
features_df = taxi_df.select(index_columns + [taxi_df.Pickup_Count]) \
                     .join(taxi_dis_1h_df, index_columns) \
                     .join(taxi_dis_4h_df, index_columns) \
                     .join(taxi_nb_1h_df, index_columns) \
                     .join(taxi_nb_4h_df, index_columns) \
                     .join(taxi_nyc_1h_df, 'Time') \
                     .join(taxi_nyc_4h_df, 'Time') \
                     .join(date_df, 'Time') \
                     .join(weather_df, func.to_date(taxi_df.Time) == weather_df.Date).drop(weather_df.Date) \
                     .join(event_3h_df, 'Time')

features_df.write.parquet(output_file)
def date_to_month(data_frame, date_format):
    data_frame = data_frame.\
        withColumn("first_payment_month", functions.month(functions.to_date(data_frame.first_payment_date, date_format)))
    return data_frame
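
# A minimal usage sketch for the two helpers above (the sample data, the
# "MM/dd/yyyy" format, and the active SparkSession named spark are assumptions,
# not taken from the original pipeline):
sample_df = spark.createDataFrame([("03/01/2020",)], ["first_payment_date"])
sample_df = date_to_month(date_to_year(sample_df, "MM/dd/yyyy"), "MM/dd/yyyy")
sample_df.show()  # adds first_payment_year=2020 and first_payment_month=3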
Exemple #35
0
def test_datetime_functions(self):
    from pyspark.sql import functions
    from datetime import date
    df = self.spark.range(1).selectExpr("'2017-01-22' as dateCol")
    parse_result = df.select(functions.to_date(functions.col("dateCol"))).first()
    self.assertEquals(date(2017, 1, 22), parse_result['to_date(`dateCol`)'])
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types
from pyspark.ml.feature import StopWordsRemover, Tokenizer

if len(sys.argv) != 2:
    print("Number of arguments: ", len(sys.argv))
    raise Exception("Exactly ONE pyspark job file is required!")

PATH = sys.argv[-1]

spark = SparkSession.builder.appName("acledPyspark").getOrCreate()
df = spark.read.csv(path=PATH, sep=",", quote='"', header=True)
print(df.count(), len(df.columns))
df.printSchema()

# Column transformations
df = df.withColumn("event_date", f.to_date("event_date", "dd MMMM yyyy"))
df = df.withColumn("year", f.to_date("year", "yyyy"))
df = df.withColumn("latitude", df["latitude"].cast(types.DoubleType()))
df = df.withColumn("longitude", df["longitude"].cast(types.DoubleType()))
df = df.withColumn("fatalities", df["fatalities"].cast(types.IntegerType()))

# WRITING TABLES TO BQ
bucket = 'acled-pyspark-bucket'
spark.conf.set('temporaryGcsBucket', bucket)

# 1. write historic parquet file into bigquery (more compact)
df.write.format("bigquery") \
    .option('table', 'acled_dataset.historic_fact_table') \
    .save()
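
# Note: the "bigquery" format requires the spark-bigquery connector on the
# classpath (preinstalled on Dataproc, or supplied manually); the exact jar
# location below is an assumption, shown only as an example:
# spark-submit --jars gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar acled_job.py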

# 2. events per day 2020
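# A sketch of one way to implement this step (the aggregation and the target
# table name are assumptions, not the original author's code):
events_2020 = df.filter(f.year("event_date") == 2020).groupBy("event_date").count()
events_2020.write.format("bigquery") \
    .option('table', 'acled_dataset.events_per_day_2020') \
    .save()
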
from pyspark.sql import SparkSession

from pyspark.sql import functions as F
from pyspark.sql.types import FloatType

spark = SparkSession \
            .builder \
            .appName("Python Spark SQL basic example") \
            .getOrCreate()

to_value=lambda v: float(v.replace(",", "."))
udf_to_value = F.udf(to_value, FloatType())

df=spark.read.csv("201707_Diarias.utf8.truncated.csv", header=True, sep="\t")

df2=df.withColumn("Valor", udf_to_value(df["Valor Pagamento"])) \
      .withColumn("DtPg", F.to_date(df["Data Pagamento"], format="dd/MM/yyyy"))

df3=df2.select(df2["Nome Órgão Superior"].alias("Orgao"), 
            df2["Nome Função"].alias("Funcao"),
            df2["Nome Programa"].alias("Programa"), 
            df2["Nome Favorecido"].alias("Favorecido"), 
            df2["Valor"], df2["DtPg"])

df3.write.parquet("governo/viagens/2017/07")
Exemple #38
0
def txtToPq_v2(inputFolder, pqFolder, pqFileName, searchString = "*.txt", append = True):
    """
    Read in all txt files in a folder, convert to parquet, and either append parquet or create new parquet
	This version is compatible with some of the v1.1 files inside s3://flight.price.11
	Main difference: leg1 is renamed to leg1	
    @params:
        inputFolder   - Required  : input folder that contains json line txt files (Str)        
        pqFolder      - Required  : folder to save the parquet files into (Str)        
        pqFileName    - Required  : parquet file name (Bool)        
        append        - Optional  : append to existing parquet or create new parquet 
        searchString  - Optional  : search string that identifies all the json line text files (Str)        
    """
    
    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))
    
    flightv1_1_2 = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                            .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))                    
                            .withColumn('depDate', to_date('depDate'))
                            .withColumn('searchDate', to_date('searchDate'))
                            .selectExpr('*', 'date_add(depDate, stayDays) as retDate')  # this is when the return trip starts, might arrive a day later
                            .withColumn('airline_code', flightv1_1.leg1.carrierSummary.airlineCodes.getItem(0))                   
                            .withColumn('airline_codes', flightv1_1.leg1.carrierSummary.airlineCodes)                    
                            .withColumn('airline_codes_leg2', flightv1_1.leg2.carrierSummary.airlineCodes)                    
                            .withColumn('departureTime', flightv1_1.leg1.departureTime.isoStr)
                            .withColumn('departureTime_leg2', flightv1_1.leg2.departureTime.isoStr)
                            .withColumn('arrivalTime', flightv1_1.leg1.arrivalTime.isoStr)
                            .withColumn('arrivalTime_leg2', flightv1_1.leg2.arrivalTime.isoStr)
        #                 .withColumn('check_bag_inc', flightv1_1.leg1.arrivalTime)
                            .withColumn('airlineName', flightv1_1.leg1.carrierSummary.airlineName)
                            .withColumn('airlineName_leg2', flightv1_1.leg2.carrierSummary.airlineName)
                            .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime', format=timeFmt))/60)                    
                        .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) - 
                                                       F.unix_timestamp('departureTime_leg2', format=timeFmt))/60)                    
        #                     .withColumn('duration', flightv1_1.timeline1.getItem(1).duration)
                        .withColumn('airlineCode', flightv1_1.timeline1.getItem(0).carrier.airlineCode)
                        .withColumn('flightNumber', flightv1_1.timeline1.getItem(0).carrier.flightNumber.cast('string'))                
                        .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                        .drop('airlineCode', 'flightNumber')
                        .withColumn('plane', flightv1_1.timeline1.getItem(0).carrier.plane)                
                        .withColumn('stops', flightv1_1.leg1.stops.cast('byte'))                                
                        .withColumn('stops_leg2', flightv1_1.leg2.stops.cast('byte'))                

        #                 .withColumn('stop_list', flightv1_1.leg1.stop_list)# need to do more work                
                        .withColumn('stop_airport', take_all_level1_str(flightv1_1.leg1.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration', take_all_level1_str(flightv1_1.leg1.stop_list, lit('duration')))                                               

        #                 .withColumn('stop_list_leg2', flightv1_1.leg2.stop_list)               
                        .withColumn('stop_airport_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('airport')))                                               
                        .withColumn('stop_duration_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('duration')))                                               


                        .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.leg1.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))                
                        .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.leg2.carrierSummary.noOfTicketsLeft))
                        .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
                        .withColumn('fromCityAirportCode', flightv1_1.leg1.departureLocation.airportCode)                
                        .withColumn('toCityAirportCode', flightv1_1.leg1.arrivalLocation.airportCode)
                        .withColumn('fromCityAirportCode_leg2', flightv1_1.leg2.departureLocation.airportCode)
                        .withColumn('toCityAirportCode_leg2', flightv1_1.leg2.arrivalLocation.airportCode)

                        # carrier leg 1
                        .withColumn('carrierAirProviderId', flightv1_1.leg1.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName', flightv1_1.leg1.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass', flightv1_1.leg1.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop', flightv1_1.leg1.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival', flightv1_1.leg1.carrierSummary.nextDayArrival)

                        # carrier leg 2
                        .withColumn('carrierAirProviderId_leg2', flightv1_1.leg2.carrierSummary.airProviderId)
                        .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.leg2.carrierSummary.airlineImageFileName)
                        .withColumn('carrierMixedCabinClass_leg2', flightv1_1.leg2.carrierSummary.mixedCabinClass)
                        .withColumn('carrierMultiStop_leg2', flightv1_1.leg2.carrierSummary.multiStop)
                        .withColumn('carrierNextDayArrival_leg2', flightv1_1.leg2.carrierSummary.nextDayArrival)

                        ### Leg 1
                        ## Leg 1 departure
        #                 .withColumn('timeline_departureAirport', take_all_airport(flightv1_1.timeline1, lit('departureAirport')))                               
                        .withColumn('timeline_departureAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime', take_all_level2_str(flightv1_1.timeline1, lit('departureTime'), lit('isoStr')))



                        ## Leg 1 arrival
                        .withColumn('timeline_arrivalAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime', take_all_level2_str(flightv1_1.timeline1, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance', take_all_level2_str(flightv1_1.timeline1, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane', take_all_level2_str(flightv1_1.timeline1, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName', take_all_level1_str(flightv1_1.timeline1, lit('brandedFareName')))                               

                        # type
                        .withColumn('timeline_type', take_all_level1_str(flightv1_1.timeline1, lit('type')))                               

                        ### Leg 2
                        ## Leg 2 departure
                        .withColumn('timeline_departureAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('airportCityState')))
                        .withColumn('timeline_departureAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('city')))
                        .withColumn('timeline_departureAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('code')))
                        .withColumn('timeline_departureAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('localName')))
                        .withColumn('timeline_departureAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('longName')))
                        .withColumn('timeline_departureAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('name')))

                        .withColumn('timeline_departureTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureTime'), lit('isoStr')))                


                        ## Leg 2 arrival
                        .withColumn('timeline_arrivalAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('airportCityState')))
                        .withColumn('timeline_arrivalAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('city')))
                        .withColumn('timeline_arrivalAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('code')))
                        .withColumn('timeline_arrivalAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('localName')))
                        .withColumn('timeline_arrivalAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('longName')))
                        .withColumn('timeline_arrivalAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('name')))                

                        .withColumn('timeline_arrivalTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalTime'), lit('isoStr')))

                        # distance
                        .withColumn('timeline_distance_leg2', take_all_level2_str(flightv1_1.timeline2, lit('distance'), lit('formattedTotal')))

                        # carrier
                        .withColumn('timeline_plane_leg2', take_all_level2_str(flightv1_1.timeline2, lit('carrier'), lit('plane')))

                        # brandedFareName
                        .withColumn('timeline_brandedFareName_leg2', take_all_level1_str(flightv1_1.timeline2, lit('brandedFareName')))                           

                        # type
                        .withColumn('timeline_type_leg2', take_all_level1_str(flightv1_1.timeline2, lit('type')))                               

                        # create variables dropped from v1.0
                        .withColumn('span_days', lit(99))
                        .withColumn('power', lit(False))
                        .withColumn('video', lit(False))
                        .withColumn('wifi', lit(False))
                        .withColumn('stop_info', col('stop_airport'))  # placeholder: reuses stop_airport because building a struct literal here is not straightforward


                        .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 
                                'fromCity', 'toCity', 'trip', 'depDate', 'retDate',
                                'stayDays', 
                               'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2',
                                'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2',                
                                'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2',
                                'noOfTicketsLeft', 'noOfTicketsLeft_leg2',
                               'airline_code', 'airline_codes', 'airline_codes_leg2', 
                                'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2',
                               'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival',
                                'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2',
                                #'url',

                                ## leg 1
                                # departure
                                'timeline_departureAirport_cityState', 'timeline_departureAirport_city', 'timeline_departureAirport_code', 'timeline_departureAirport_localName', 
                                'timeline_departureAirport_longName', 'timeline_departureAirport_name',

                                'timeline_departureTime',

                                # arrival
                                'timeline_arrivalAirport_cityState', 'timeline_arrivalAirport_city', 'timeline_arrivalAirport_code', 'timeline_arrivalAirport_localName', 
                                'timeline_arrivalAirport_longName', 'timeline_arrivalAirport_name',

                                'timeline_arrivalTime',

                                'timeline_distance',
                                'timeline_plane',
                                'timeline_brandedFareName',
                                'timeline_type',

                                ## leg 2                        
                                # departure
                                'timeline_departureAirport_cityState_leg2', 'timeline_departureAirport_city_leg2', 'timeline_departureAirport_code_leg2', 'timeline_departureAirport_localName_leg2', 
                                'timeline_departureAirport_longName_leg2', 'timeline_departureAirport_name_leg2',

                                'timeline_departureTime_leg2',

                                # arrival
                                'timeline_arrivalAirport_cityState_leg2', 'timeline_arrivalAirport_city_leg2', 'timeline_arrivalAirport_code_leg2', 'timeline_arrivalAirport_localName_leg2', 
                                'timeline_arrivalAirport_longName_leg2', 'timeline_arrivalAirport_name_leg2',

                                'timeline_arrivalTime_leg2',

                                'timeline_distance_leg2',
                                'timeline_plane_leg2',
                                'timeline_brandedFareName_leg2',
                                'timeline_type_leg2',

                                # variables dropped from v1.0
                                'span_days', 'power', 'video', 'wifi', 'stop_info'
                               )                
                       )


    if append:
        flightv1_1_2.repartition(1).write.mode('append').parquet(os.path.join(pqFolder, pqFileName))        
    else:
        flightv1_1_2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
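

# A minimal usage sketch (the folder paths and file name are placeholders, not
# taken from the original pipeline):
txtToPq_v2('/data/flight_v11/txt', '/data/flight_v11/pq', 'flight_v11.parquet', append=False)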