def test_math_functions(self):
    df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
    from pyspark.sql import functions
    import math

    def get_values(l):
        return [j[0] for j in l]

    def assert_close(a, b):
        c = get_values(b)
        diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)]
        # Assert instead of returning a boolean: callers discard the return
        # value, so a bare return would let mismatches pass silently.
        assert sum(diff) == len(a)

    assert_close([math.cos(i) for i in range(10)],
                 df.select(functions.cos(df.a)).collect())
    assert_close([math.cos(i) for i in range(10)],
                 df.select(functions.cos("a")).collect())
    assert_close([math.sin(i) for i in range(10)],
                 df.select(functions.sin(df.a)).collect())
    assert_close([math.sin(i) for i in range(10)],
                 df.select(functions.sin(df['a'])).collect())
    assert_close([math.pow(i, 2 * i) for i in range(10)],
                 df.select(functions.pow(df.a, df.b)).collect())
    assert_close([math.pow(i, 2) for i in range(10)],
                 df.select(functions.pow(df.a, 2)).collect())
    assert_close([math.pow(i, 2) for i in range(10)],
                 df.select(functions.pow(df.a, 2.0)).collect())
    assert_close([math.hypot(i, 2 * i) for i in range(10)],
                 df.select(functions.hypot(df.a, df.b)).collect())
    assert_close([math.hypot(i, 2 * i) for i in range(10)],
                 df.select(functions.hypot("a", "b")).collect())
    assert_close([math.hypot(i, 2) for i in range(10)],
                 df.select(functions.hypot("a", 2)).collect())
    assert_close([math.hypot(i, 2) for i in range(10)],
                 df.select(functions.hypot(df.a, 2)).collect())
from pyspark.sql import functions as F

def add_haversine_distances(df, lat_a, lon_a, lat_b, lon_b):
    """
    Although the Haversine distance is readily available from
    sklearn.metrics.pairwise.haversine_distances, the implementation below
    uses only built-in PySpark functions, so it should be more efficient than
    passing the sklearn function in as a UDF.

    :param df: input DataFrame
    :param lat_a: name of the latitude column of the first point (degrees)
    :param lon_a: name of the longitude column of the first point (degrees)
    :param lat_b: name of the latitude column of the second point (degrees)
    :param lon_b: name of the longitude column of the second point (degrees)
    :return: DataFrame with an added 'distance_km' column
    """
    earth_radius_km = 6371.0
    return (
        df
        .withColumn('dist_lat', F.radians(lat_a) - F.radians(lat_b))
        .withColumn('dist_lon', F.radians(lon_a) - F.radians(lon_b))
        .withColumn('area',
                    (F.sin(F.col('dist_lat') / 2) ** 2)
                    + (F.cos(F.radians(lat_a))
                       * F.cos(F.radians(lat_b))
                       * (F.sin(F.col('dist_lon') / 2) ** 2)))
        .withColumn('central_angle', 2 * F.asin(F.sqrt(F.col('area'))))
        .withColumn('distance_km', F.col('central_angle') * F.lit(earth_radius_km))
        .drop('dist_lat', 'dist_lon', 'area', 'central_angle'))
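# A minimal usage sketch for add_haversine_distances, assuming an active
# SparkSession; the coordinates (Paris -> London) are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_pairs = spark.createDataFrame([(48.85, 2.35, 51.51, -0.13)],
                                 ['lat_a', 'lon_a', 'lat_b', 'lon_b'])
add_haversine_distances(df_pairs, 'lat_a', 'lon_a', 'lat_b', 'lon_b').show()
# distance_km should come out near 344 km for these coordinates.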
from math import pi
from pyspark.sql.functions import asin, col, cos, dayofyear, degrees, radians, sin
# timezone_from_date is a project-specific helper defined elsewhere.

def add_solar_features(df):
    return (
        df
        .withColumn('declination_angle',
                    radians(-23.45 * cos(((2 * pi) / 365) * (dayofyear('date') + 10))))
        .withColumn('diff_local_time_UTC', timezone_from_date('date'))
        .withColumn('d', (2 * pi * dayofyear('date')) / 365)
        .withColumn('equation_of_time',
                    -7.655 * sin(col('d')) + 9.873 * sin(2 * col('d') + 3.588))
        .drop('d')
        .withColumn('time_correction',
                    4 * (col('loc_long') - (15 * col('diff_local_time_UTC')))
                    + col('equation_of_time'))
        .withColumn('local_solar_hour',
                    col('hour') + 0.5 + col('time_correction') / 60)
        .withColumn('hour_angle', 0.2618 * (col('local_solar_hour') - 12))
        .drop('diff_local_time_UTC', 'equation_of_time',
              'time_correction', 'local_solar_hour')
        .withColumn('solar_elevation',
                    degrees(asin(sin('declination_angle') * sin(radians('loc_lat'))
                                 + cos('declination_angle') * cos(radians('loc_lat'))
                                 * cos('hour_angle'))))
        .drop('declination_angle', 'hour_angle'))
def distance(long1, lat1, long2, lat2):
    radius = 6371  # km
    diff_lat = radians(lat2 - lat1)
    diff_long = radians(long2 - long1)
    # The latitudes inside the cosine terms must also be converted to radians;
    # passing raw degrees to cos() skews the distance.
    a = sin(diff_lat / 2) ** 2 \
        + cos(radians(lat1)) * cos(radians(lat2)) * sin(diff_long / 2) ** 2
    c = 2 * atan2(a ** 0.5, (1 - a) ** 0.5)
    return radius * c
def test_math_functions(self):
    df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
    from pyspark.sql import functions

    SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                              df.select(functions.cos(df.a)).collect())
    SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                              df.select(functions.cos("a")).collect())
    SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                              df.select(functions.sin(df.a)).collect())
    SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                              df.select(functions.sin(df["a"])).collect())
    SQLTestUtils.assert_close([math.pow(i, 2 * i) for i in range(10)],
                              df.select(functions.pow(df.a, df.b)).collect())
    SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                              df.select(functions.pow(df.a, 2)).collect())
    SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                              df.select(functions.pow(df.a, 2.0)).collect())
    SQLTestUtils.assert_close(
        [math.hypot(i, 2 * i) for i in range(10)],
        df.select(functions.hypot(df.a, df.b)).collect(),
    )
    SQLTestUtils.assert_close(
        [math.hypot(i, 2 * i) for i in range(10)],
        df.select(functions.hypot("a", "b")).collect(),
    )
    SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                              df.select(functions.hypot("a", 2)).collect())
    SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                              df.select(functions.hypot(df.a, 2)).collect())
from pyspark.sql import functions as f

def distance(CLat, CLon, data, col_name):
    return (
        data
        .withColumn('CLon', f.lit(CLon))
        .withColumn('CLat', f.lit(CLat))
        .withColumn('dlon', f.radians(f.col('CLon')) - f.radians(f.col('longitude')))
        .withColumn('dlat', f.radians(f.col('CLat')) - f.radians(f.col('latitude')))
        .withColumn(col_name,
                    f.asin(f.sqrt(
                        f.sin(f.col('dlat') / 2) ** 2
                        + f.cos(f.radians(f.col('latitude')))
                        * f.cos(f.radians(f.col('CLat')))
                        * f.sin(f.col('dlon') / 2) ** 2
                    )) * 2 * 6371 * 1000)  # meters
        .drop('dlon', 'dlat', 'CLon', 'CLat'))
from pyspark.sql.functions import col, cos, pow, radians, sin

def distance_intermediate_formula(lat1, long1, lat2, long2):
    """Returns a Spark expression computing the intermediate result used to
    compute the distance between two GPS coordinates.

    Source: https://www.movable-type.co.uk/scripts/latlong.html
    """
    return (
        pow(sin(radians(col(lat1) - col(lat2)) / 2), 2)
        + (pow(sin(radians(col(long1) - col(long2)) / 2), 2)
           * cos(radians(col(lat1)))
           * cos(radians(col(lat2)))))
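# For context: the intermediate term above is the haversine "a". A minimal
# sketch of turning it into a great-circle distance, with EARTH_RADIUS_KM as
# an assumed constant (c = 2 * asin(sqrt(a)), d = R * c):
from pyspark.sql.functions import asin, sqrt

EARTH_RADIUS_KM = 6371.0  # mean Earth radius; adjust if your data assumes another value

def distance_km(lat1, long1, lat2, long2):
    a = distance_intermediate_formula(lat1, long1, lat2, long2)
    return 2 * EARTH_RADIUS_KM * asin(sqrt(a))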
def km(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the earth in km
    dLat = deg2rad(lat2 - lat1)  # deg2rad defined below
    dLon = deg2rad(lon2 - lon1)
    a = sin(dLat / 2) * sin(dLat / 2) \
        + cos(deg2rad(lat1)) * cos(deg2rad(lat2)) * sin(dLon / 2) * sin(dLon / 2)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    d = R * c  # Distance in km
    return d
import numpy as np
from pyspark.sql.functions import cos, hour, month, sin

def compute_time_cyclical_features(data):
    """ Extract month/hour and apply cyclical calculation """
    # `key` (the columns to keep) comes from the enclosing scope.
    # Hours run 0-23, so the period is 24; dividing by 23 would map hour 0
    # and hour 23 to the same point on the circle.
    return data.select(key) \
        .withColumn("Cos_Month", cos(2 * np.pi * month("TIMESTAMP") / 12)) \
        .withColumn("Sin_Month", sin(2 * np.pi * month("TIMESTAMP") / 12)) \
        .withColumn("Cos_Hour", cos(2 * np.pi * hour("TIMESTAMP") / 24)) \
        .withColumn("Sin_Hour", sin(2 * np.pi * hour("TIMESTAMP") / 24))
def compute_time_cyclical_features_v2(data):
    """ Extract month/hour, apply cyclical calculation, and flag weekend days """
    # As above, hours run 0-23, so the cyclical period is 24.
    return data.select(key) \
        .withColumn("Cos_Month", cos(2 * np.pi * month("TIMESTAMP") / 12)) \
        .withColumn("Sin_Month", sin(2 * np.pi * month("TIMESTAMP") / 12)) \
        .withColumn("Cos_Hour", cos(2 * np.pi * hour("TIMESTAMP") / 24)) \
        .withColumn("Sin_Hour", sin(2 * np.pi * hour("TIMESTAMP") / 24)) \
        .withColumn("is_weekend", dayofweek("TIMESTAMP").isin([1, 7]).cast("int"))
from pyspark.sql.functions import asin, cos, sin, sqrt, toRadians

def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(toRadians, [
        lon1.cast("float"), lat1.cast("float"),
        lon2.cast("float"), lat2.cast("float")
    ])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    c = 2 * asin(sqrt(a))
    m = 6367000 * c  # meters
    return m.cast("decimal(10,2)")
from pyspark.sql import Column, functions as f

def haversine(lng1: Column, lat1: Column, lng2: Column, lat2: Column):
    radius = 6378137  # equatorial Earth radius in meters
    # Convert degrees to radians
    radLng1 = f.radians(lng1)
    radLat1 = f.radians(lat1)
    radLng2 = f.radians(lng2)
    radLat2 = f.radians(lat2)
    result = f.asin(
        f.sqrt(
            f.pow(f.sin((radLat1 - radLat2) / 2.0), 2)
            + f.cos(radLat1) * f.cos(radLat2)
            * f.pow(f.sin((radLng1 - radLng2) / 2.0), 2))) * 2.0 * radius
    return result
from pyspark.sql import functions as F

def calculate_bearing_degrees(latitude_1, longitude_1, latitude_2, longitude_2):
    # Standard initial-bearing formula:
    #   theta = atan2(sin(dlon) * cos(lat2),
    #                 cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dlon))
    diff_longitude = F.radians(longitude_2 - longitude_1)
    r_latitude_1 = F.radians(latitude_1)
    r_latitude_2 = F.radians(latitude_2)
    y = F.sin(diff_longitude) * F.cos(r_latitude_2)  # cos of the latitude, not the longitude
    x = (F.cos(r_latitude_1) * F.sin(r_latitude_2)
         - F.sin(r_latitude_1) * F.cos(r_latitude_2) * F.cos(diff_longitude))
    # atan2 takes y first; the original call had the arguments swapped
    return F.degrees(F.atan2(y, x))
from pyspark.sql import functions as f

def dist(lat1, long1, lat2, long2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lat1, long1, lat2, long2 = map(f.toRadians, [lat1, long1, lat2, long2])

    # haversine formula
    dlon = long2 - long1
    dlat = lat2 - lat1
    a = f.sin(dlat / 2) ** 2 + f.cos(lat1) * f.cos(lat2) * f.sin(dlon / 2) ** 2
    c = 2 * f.asin(f.sqrt(a))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return f.round(km, 3)
from math import pi
from pyspark.sql.functions import cos, lit, sin

def add_cyclic_feature(df, column, col_name, period):
    period_scale = (2 * pi) / period
    return (
        df.withColumn(col_name + "_cos", cos(column * lit(period_scale)))
        .withColumn(col_name + "_sin", sin(column * lit(period_scale)))
        .drop(col_name)
    )
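# A hedged usage sketch for add_cyclic_feature: encode an hour-of-day column
# (0-23) on the unit circle with period 24. The helper drops the column named
# col_name, so pass the name of the source column.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df_hours = spark.createDataFrame([(0,), (6,), (12,), (18,)], ["hour"])
df_encoded = add_cyclic_feature(df_hours, col("hour"), "hour", 24)
df_encoded.show()  # has hour_cos / hour_sin; 'hour' itself is dropped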
from pyspark.sql import Window, functions as F
from pyspark.sql.functions import col
# EARTH_RADIUS is a module-level constant (Earth radius, e.g. in meters).

def add_distance_column(dfs, order_column='timestamp'):
    # Radians lat/lon
    dfs = dfs.withColumn('latitude2', F.radians('latitude')) \
             .withColumn('longitude2', F.radians('longitude'))

    # Groups GPS locations into chunks. A chunk is formed by groups of points
    # that are distant no more than roam_dist.
    w = Window.partitionBy(['userID']).orderBy(order_column)
    dfs = dfs.withColumn('next_lat', F.lead('latitude2', 1).over(w))
    dfs = dfs.withColumn('next_lon', F.lead('longitude2', 1).over(w))

    # Haversine distance
    dfs = dfs.withColumn(
        'distance_next',
        EARTH_RADIUS * 2 * F.asin(F.sqrt(
            F.pow(F.sin((col('next_lat') - col('latitude2')) / 2.0), 2)
            + F.cos('latitude2') * F.cos('next_lat')
            * F.pow(F.sin((col('next_lon') - col('longitude2')) / 2.0), 2))))
    dfs = dfs.withColumn('distance_prev',
                         F.lag('distance_next', default=0).over(w)) \
        .drop('latitude2').drop('longitude2').drop('next_lon') \
        .drop('next_lat').drop('distance_next')
    return dfs
import numpy as np
from pyspark.sql import functions as F

def add_fourier_terms(df, period, col, degree_fourier):
    """Add Fourier terms for the given column and period to model its
    periodicity, up to the requested degree.
    """
    for i in range(1, degree_fourier + 1):
        df = df.withColumn(col + '_fourier_sin_' + str(i),
                           F.sin((2 * np.pi * F.col(col) / period) * i))
        df = df.withColumn(col + '_fourier_cos_' + str(i),
                           F.cos((2 * np.pi * F.col(col) / period) * i))
    return df
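# A small usage sketch for add_fourier_terms, assuming an annual cycle over a
# hypothetical day_of_year column; the period and degree are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_days = spark.createDataFrame([(1,), (100,), (200,), (300,)], ["day_of_year"])
# Adds day_of_year_fourier_{sin,cos}_{1,2}: two harmonics of the annual cycle.
df_fourier = add_fourier_terms(df_days, period=365.25, col="day_of_year",
                               degree_fourier=2)
df_fourier.show()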
import numpy as np
from pyspark.sql import functions as F

def add_cyclical_features(df, source_column, target_column):
    df = df.withColumn(target_column, F.from_utc_timestamp(F.col(source_column), "UTC"))
    hour = target_column + "_hour"
    weekday = target_column + "_weekday"
    month = target_column + "_month"
    year = target_column + "_year"
    df = df.withColumn(hour, F.hour(F.col(target_column)))
    df = df.withColumn(weekday, F.dayofweek(F.col(target_column)) - 1)
    df = df.withColumn(month, F.month(F.col(target_column)) - 1)
    # sin/cos are shifted by +1 so the encoded features are non-negative
    df = df.withColumn("DerivedTime_cos_" + hour, F.cos(F.col(hour) * (2.0 * np.pi / 24)) + 1)
    df = df.withColumn("DerivedTime_sin_" + hour, F.sin(F.col(hour) * (2.0 * np.pi / 24)) + 1)
    df = df.withColumn("DerivedTime_cos_" + weekday, F.cos(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
    df = df.withColumn("DerivedTime_sin_" + weekday, F.sin(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
    df = df.withColumn("DerivedTime_cos_" + month, F.cos(F.col(month) * (2.0 * np.pi / 12)) + 1)
    df = df.withColumn("DerivedTime_sin_" + month, F.sin(F.col(month) * (2.0 * np.pi / 12)) + 1)
    df = df.withColumn("DerivedTime_" + year, F.year(F.col(target_column)))
    drop_cols = [source_column, hour, weekday, month]
    df = df.drop(*drop_cols)
    return df
import numpy as np
from pyspark.sql import functions as f

def complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']
    # Degree-to-radian conversion belongs inside the trig calls; the original
    # parenthesization converted the cosine/sine results instead.
    temp = (f.cos(theta_1 * np.pi / 180) * f.cos(theta_2 * np.pi / 180)
            * f.sin((phi_2 - phi_1) / 2 * np.pi / 180) ** 2)
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1 - temp))
    df.select(f.mean(expression)).collect()
from pyspark.sql.functions import cos, sin
# Omega_udf and Kappa_udf are UDFs defined elsewhere in the project.

def elem2coord(df):
    df = df.withColumn('e_sin_M', df['e'] * sin(df['M']))
    df = df.withColumn('e_cos_M', df['e'] * cos(df['M']))
    df = df.withColumn('r', df['a'] * (1 - df['e_cos_M']))
    df = df.withColumn('Omega', Omega_udf(df['a']))
    df = df.withColumn('Kappa', Kappa_udf(df['a']))
    df = df.withColumn('t', (df['Omega'] / df['Kappa'])
                       * (df['M'] + 2.0 * df['e_sin_M']) + df['wt'])
    df = df.withColumn('vr', df['a'] * df['Kappa'] * df['e_sin_M'])
    df = df.withColumn('vt', df['a'] * df['Omega'] * (1.0 + df['e_cos_M']))
    cols = ['id', 'timestep', 'streamline', 'a', 'r', 't', 'vr', 'vt']
    return df.select(cols)
from pyspark.sql import functions as py

def distance(lat1, lon1, lat2, lon2, unit='miles'):
    '''
    Measure simple haversine distance between two points.
    Default unit = miles.
    '''
    units = {
        'miles': 3963.19,
        'kilometers': 6378.137,
        'meters': 6378137,
        'feet': 20902464
    }
    phi_1 = py.radians(lat1)
    phi_2 = py.radians(lat2)
    delta_phi = py.radians(lat2 - lat1)
    delta_lambda = py.radians(lon2 - lon1)
    area = py.sin(delta_phi / 2.0) ** 2 \
        + py.cos(phi_1) * py.cos(phi_2) * py.sin(delta_lambda / 2.0) ** 2
    central_angle = 2 * py.asin(area ** 0.5)
    radius = units[unit.lower()]
    return py.abs(py.round(central_angle * radius, 4))
from pyspark.sql import Window, functions as F

def join_and_analyze(df_poi, df_sample):
    """
    Joins the Requests data and POI list data, calculates the distance to the
    POI centers, and retains the record with the minimum distance to a
    particular POI center.

    Parameters:
    df_poi: POI list dataframe
    df_sample: Requests dataframe
    """
    # Since there are no matching fields between the data, a cartesian product
    # is used to combine the datasets
    df_joined = df_sample.crossJoin(df_poi)
    # Caching to memory
    df_joined.cache()
    # Applying the haversine formula to determine the distance between coordinate pairs
    df_joined = df_joined.withColumn("a", (
        F.pow(F.sin(F.radians(F.col("POI_Latitude") - F.col("Latitude")) / 2), 2)
        + F.cos(F.radians(F.col("Latitude"))) * F.cos(F.radians(F.col("POI_Latitude")))
        * F.pow(F.sin(F.radians(F.col("POI_Longitude") - F.col("Longitude")) / 2), 2)
    )).withColumn("distance",
                  F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 2 * 6371)
    # Applying a window function to retain the records with the least distance
    # to a POI center (parentheses keep the chained calls in one statement)
    w = Window.partitionBy('_ID')
    df_joined = (df_joined.withColumn('min', F.min('distance').over(w))
                 .where(F.col('distance') == F.col('min'))
                 .drop('min').drop('a'))
    return df_joined
import numpy as np
from pyspark.sql.functions import col, cos, sin
from pyspark.sql.types import DoubleType

def createCyclicalFeatures(data, cyclical_variables, drop_orig_vars=False,
                           verbose=False, logger=None):
    try:
        if verbose and logger:
            logger.info('create_cyclical_features() start')
        for i in range(len(cyclical_variables)):
            distinct_values_count = data.select(
                cyclical_variables[i]).distinct().count()
            data = data.withColumn(
                cyclical_variables[i] + '_sin',
                sin(2 * np.pi * col(cyclical_variables[i]).cast(DoubleType())
                    / distinct_values_count))
            data = data.withColumn(
                cyclical_variables[i] + '_cos',
                cos(2 * np.pi * col(cyclical_variables[i]).cast(DoubleType())
                    / distinct_values_count))
        if drop_orig_vars:
            data = data.drop(*cyclical_variables)
        if verbose and logger:
            logger.info('create_cyclical_features() end')
    except Exception:
        # logger may be None (the original default of False would crash here),
        # so guard before logging and always re-raise
        if logger:
            logger.exception("Fatal error in create_cyclical_features()")
        raise
    return data
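# A brief usage sketch for createCyclicalFeatures. Note the helper infers each
# period from the distinct-value count, which only matches the true period
# when every value of the cycle is present (24 hours, 12 months here).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_time = spark.createDataFrame([(h, m) for h in range(24) for m in range(1, 13)],
                                ["hour", "month"])
df_cyc = createCyclicalFeatures(df_time, ["hour", "month"], drop_orig_vars=True)
df_cyc.show(5)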
from pyspark.sql.functions import atan2, cos, pow, sin, toRadians

def distance(lat, lon, lat2, lon2):
    '''
    Uses the "haversine" formula to calculate the distance between two points
    from their latitude and longitude.

    Parameters
    ----------
    lat: latitude in signed decimal degrees (no compass direction) for the first location
    lon: longitude in signed decimal degrees (no compass direction) for the first location
    lat2: latitude in signed decimal degrees (no compass direction) for the second location
    lon2: longitude in signed decimal degrees (no compass direction) for the second location

    Returns
    -------
    Returns the distance between the two points

    Notes
    -----
    Haversine formula

        Δφ = φ1 - φ2
        Δλ = λ1 - λ2
        a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
        c = 2 ⋅ atan2( √a, √(1−a) )
        d = R ⋅ c

    φ -> latitude
    λ -> longitude
    R -> 6371
    '''
    R = 6371
    delta_lat = lat - lat2
    delta_lon = lon - lon2
    a = pow(sin(toRadians(delta_lat / 2)), 2) \
        + cos(toRadians(lat)) * cos(toRadians(lat2)) * pow(sin(toRadians(delta_lon / 2)), 2)
    c = 2 * atan2(pow(a, 0.5), pow(1 - a, 0.5))
    d = R * c
    return d
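# A quick sanity-check sketch for the haversine distance above; the
# coordinates (Warsaw -> Poznan) are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df_cities = spark.createDataFrame([(52.2296756, 21.0122287, 52.406374, 16.9251681)],
                                  ["lat", "lon", "lat2", "lon2"])
df_cities.select(distance(col("lat"), col("lon"), col("lat2"), col("lon2"))
                 .alias("distance_km")).show()
# Expect roughly 279 km for this pair.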
from pyspark.sql import SparkSession, Window, functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import (StructType, StructField, StringType,
                               DoubleType, TimestampType)
# EARTH_RADIUS and get_stop_location come from the surrounding module.

def main():
    # Parameters for the algorithm
    roam_dist = 100  # meters
    min_stay = 10  # minutes

    # Parameters for the paths
    input_path = '/path_to_parquet'  # parquet file
    output_path = '/path_to_parquet_out'  # parquet file

    spark = SparkSession \
        .builder \
        .appName("Stop locations") \
        .getOrCreate()

    # Read data
    source_df = spark.read.parquet(input_path)
    source_df = source_df.select(
        'user_id', 'timestamp',
        F.radians('latitude').alias('lat'),
        F.radians('longitude').alias('lon')).orderBy('timestamp')
    source_df.cache()

    # Filter out all the data that is not necessary (e.g. positions equal to
    # others within a time-distance less than min_stay)
    w = Window.partitionBy(['user_id']).orderBy('timestamp')
    source_df = source_df.select(
        "user_id", "timestamp", "lat", "lon",
        F.lead("lat", 1).over(w).alias("next_lat"),
        F.lead("lon", 1).over(w).alias("next_lon"))
    dist_df = source_df.withColumn(
        "distance_next",
        EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col("next_lat") - col("lat")) / 2.0), 2)
                + F.cos("lat") * F.cos("next_lat")
                * F.pow(F.sin((col("next_lon") - col("lon")) / 2.0), 2))))
    dist_df = dist_df.withColumn("distance_prev", F.lag("distance_next").over(w))
    exclude_df = dist_df.where(
        ((col("distance_next") < 5) & (col("distance_prev") < 5))
        | ((col("distance_next") > roam_dist) & (col("distance_prev") > roam_dist)))
    df = source_df.join(exclude_df, ['user_id', 'timestamp'],
                        "left_anti").select("user_id", "timestamp", "lat", "lon")

    # Transform to RDD, in order to apply the function get_stop_location
    # RDD that contains: (user_id, [[timestamp, lat, lon]])
    df_rdd = df.orderBy(['user_id', 'timestamp']).rdd.map(tuple)
    df_rdd = df_rdd.map(lambda x: (x[0], [[x[1], x[2], x[3]]]))

    # RDD that contains: (user_id, [[timestamp, lat, lon], ..., [timestamp, lat, lon]]),
    # sorted by timestamp
    grouped_rdd = df_rdd.reduceByKey(lambda x, y: x + y)
    stop_locations_rdd = grouped_rdd.map(lambda x: (
        x[0],
        get_stop_location(x[1], min_stay_duration=min_stay,
                          roaming_distance=roam_dist)))
    stop_locations_rdd = stop_locations_rdd.flatMapValues(lambda x: x).map(
        lambda x: (x[0], x[1][0], x[1][1], x[1][2], x[1][3]))

    # Output schema
    schema = StructType([
        StructField('user_id', StringType(), False),
        StructField('lat', DoubleType(), False),
        StructField('lon', DoubleType(), False),
        StructField('from', TimestampType(), False),
        StructField('to', TimestampType(), False)
    ])
    result_df = spark.createDataFrame(stop_locations_rdd, schema)
    result_df = result_df.withColumn('lat', F.degrees('lat'))
    result_df = result_df.withColumn('lon', F.degrees('lon'))
    result_df.write.save(output_path)
yellow_tripdata_1m = yellow_tripdata_1m.filter(
    ((col('Start_Longitude') != col("End_Longitude")) &
     (col('Start_Latitude') != col("End_Latitude"))) &
    (col('Start_Longitude') > -80) &
    (col('Start_Longitude') < -70) &
    (col('Start_Latitude') > 40) &
    (col('Start_Latitude') < 46) &
    (col('End_Longitude') > -80) &
    (col('End_Longitude') < -70) &
    (col('End_Latitude') > 40) &
    (col('End_Latitude') < 46) &
    (col('Cost') > 0))

# Haversine needs radians: the coordinate differences and the latitudes are
# converted before the trig calls (the original used raw degrees).
yellow_tripdata_1m = yellow_tripdata_1m.withColumn("Duration", ((unix_timestamp(col("End_Datetime")) - unix_timestamp(col("Start_Datetime"))) / 60))\
    .withColumn("Diff_Longitude", F.radians(col("End_Longitude") - col("Start_Longitude")))\
    .withColumn("Diff_Latitude", F.radians(col("End_Latitude") - col("Start_Latitude")))\
    .withColumn("a", F.pow(F.sin(col("Diff_Latitude") / 2), 2) +
                F.cos(F.radians(col("Start_Latitude"))) * F.cos(F.radians(col("End_Latitude")))
                * F.pow(F.sin(col("Diff_Longitude") / 2), 2))\
    .withColumn("Distance", 2 * 6371 * F.atan2(F.sqrt(col("a")), F.sqrt(1.0 - col("a"))))\
    .drop("Diff_Longitude").drop("Diff_Latitude").drop("Start_Datetime")\
    .drop("End_Datetime").drop("Start_Longitude").drop("Start_Latitude")\
    .drop("End_Longitude").drop("End_Latitude").drop("a").drop("Cost")

yellow_trip_joined = yellow_tripdata_1m.join(yellow_tripvendors_1m, "ID", "inner").drop("ID")
yellow_trip_joined.createOrReplaceTempView("yellow_trip_joined")

window = Window.partitionBy("Vendor")
res = yellow_trip_joined.withColumn("Max_Distance", F.max("Distance").over(window))\
    .where(col("Distance") == col("Max_Distance"))\
    .drop("Max_Distance").select(["Vendor", "Distance", "Duration"])
res.show()
from df_tools import *
from histfile import *
from tools import *
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import sys

nside = int(sys.argv[1])

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("nside{}.parquet".format(nside))

# Tangent-plane offsets (arcminutes) of each point from its center
df = df.withColumn(
    "dx",
    F.degrees(
        F.sin((df["theta"] + df["theta_c"]) / 2) * (df["phi"] - df["phi_c"])) * 60)
df = df.withColumn("dy", F.degrees(df["theta"] - df["theta_c"]) * 60)
#df=df.withColumn("r",F.hypot(df["dx"],df["dy"]))

# Cartesian coordinates on the unit sphere for the point and its center;
# the "zc" term was truncated in the source and is completed here by symmetry
# with "z" (an assumption).
df = df.withColumn("x", F.sin(df["theta"]) * F.cos(df["phi"])) \
       .withColumn("y", F.sin(df["theta"]) * F.sin(df["phi"])) \
       .withColumn("z", F.cos(df["theta"])).drop("theta", "phi")
df = df.withColumn("xc", F.sin(df["theta_c"]) * F.cos(df["phi_c"])) \
       .withColumn("yc", F.sin(df["theta_c"]) * F.sin(df["phi_c"])) \
       .withColumn("zc", F.cos(df["theta_c"]))
# Relies on module-level imports and helpers from the surrounding project:
# get_args, getSparkConfig, get_stop_location, and the constants VERSION,
# LOCAL_PATH, CONN_STRING, EARTH_RADIUS, US_STATES.
def main():
    """Main function"""
    # Get args
    args = get_args()

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    container_in = args.container_in
    container_out = args.container_out

    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Geohash file path
    geohash_path = args.geohashpath

    # Date, country, prefix
    country = args.country
    date_string = args.date
    prefix = args.prefix

    # Set date variables
    day_time = datetime.strptime(date_string, "%Y-%m-%d")
    year = day_time.year
    month = day_time.month
    day = day_time.day

    # stop config
    seconds = 60
    accuracy = args.accuracy
    roam_dist = args.roam_dist
    min_stay = args.min_stay
    overlap_hours = args.overlap_hours

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/"
    path_out = f"stoplocation-v{VERSION}_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"
    if prefix:
        path_out = f"stoplocation-v{VERSION}_prefix_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts)

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    # build keys, date is mandatory, prefix opt
    partition_key = "year={}/month={}/day={}".format(year, month, day)
    if prefix:
        partition_key = "year={}/month={}/day={}/prefix={}".format(
            year, month, day, prefix)

    blob_base = "{}/{}".format(path_out, partition_key)

    # # check for skip
    # TODO
    # skip = False

    print("process " + partition_key + " to " + blob_base)
    start_time = time.time()
    local_dir = LOCAL_PATH + partition_key
    print("write temp to " + local_dir)

    # cleanup local if exists (note: a bare map() is lazy in Python 3 and
    # would never run the unlink calls, so a loop is used instead)
    if os.path.isdir(local_dir):
        for f in os.listdir(local_dir):
            os.unlink(os.path.join(local_dir, f))

    # TODO cleanup remote if exists

    # Output schema
    schema = ArrayType(
        StructType([
            #StructField('device_type', IntegerType(), False),
            StructField('serial', IntegerType(), False),
            StructField('latitude', DoubleType(), False),
            StructField('longitude', DoubleType(), False),
            StructField('begin', TimestampType(), False),
            StructField('end', TimestampType(), False),
            StructField('personal_area', BooleanType(), False),
            StructField('distance', DoubleType(), False),
            StructField('geohash6', StringType(), False),
            StructField('after_stop_distance', DoubleType(), False)
        ]))
    spark_get_stop_location = udf(
        lambda z: get_stop_location(z, roam_dist, min_stay), schema)

    # Geohash file
    print("read geohash parquet")
    csv_time = time.time()
    dfs_us_states = spark.read.format("parquet").load(geohash_path)
    # states = [s.STUSPS for s in dfs_us_states.select(
    #     'STUSPS').distinct().collect()]
    dfs_us_states = dfs_us_states.select(
        col('STUSPS').alias('state'),
        col('geohash').alias('geohash5'))
    dfs_us_states = dfs_us_states.drop_duplicates(subset=['geohash5'])

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    # dfs = spark.read.format("parquet").load(blob_in)
    # # apply partition filter
    # dfs_partition = dfs.where(
    #     f"(year = {year} AND month = {month} AND day = {day} AND prefix = '{prefix}')")

    # read only the target partition to reduce browse time
    dfs_cur_partition = spark.read.format("parquet").load(
        f"{blob_in}/{partition_key}")

    # lit partition filters as data
    dfs_cur_partition = dfs_cur_partition.withColumn('year', F.lit(year))
    dfs_cur_partition = dfs_cur_partition.withColumn('month', F.lit(month))
    dfs_cur_partition = dfs_cur_partition.withColumn('day', F.lit(day))
    if prefix:
        dfs_cur_partition = dfs_cur_partition.withColumn('prefix', F.lit(prefix))

    # read next day for overlap
    next_day = day_time + timedelta(days=1)
    next_partition_key = "year={}/month={}/day={}".format(
        next_day.year, next_day.month, next_day.day)
    if prefix:
        next_partition_key = "year={}/month={}/day={}/prefix={}".format(
            next_day.year, next_day.month, next_day.day, prefix)

    dfs_next_partition = spark.read.format("parquet").load(
        f"{blob_in}/{next_partition_key}")
    dfs_next_partition = dfs_next_partition.where(
        F.hour("timestamp") <= (overlap_hours - 1))

    # lit partition filters as data
    dfs_next_partition = dfs_next_partition.withColumn('year', F.lit(next_day.year))
    dfs_next_partition = dfs_next_partition.withColumn('month', F.lit(next_day.month))
    dfs_next_partition = dfs_next_partition.withColumn('day', F.lit(next_day.day))
    if prefix:
        dfs_next_partition = dfs_next_partition.withColumn('prefix', F.lit(prefix))

    # union with overlap
    dfs_partition = dfs_cur_partition.unionAll(dfs_next_partition)

    print("process with spark")
    spark_time = time.time()

    # select columns
    dfs_partition = dfs_partition.select(
        'prefix', 'userID', 'timestamp', 'latitude', 'longitude',
        (F.when(col('opt1') == 'PERSONAL_AREA',
                True).otherwise(False)).alias('personal_area'),
        'accuracy')

    # keep only data with required accuracy
    dfs_partition = dfs_partition.where(
        (col('accuracy') <= accuracy) & (col('accuracy') >= 0))

    # stats - enable only for debug!
    # num_inputs = dfs_partition.count()
    # print(f"read {num_inputs} rows from "+partition_key)

    # Lowering the granularity to 1 minute
    # explicitly convert to timestamp
    #dfs_partition = dfs_partition.withColumn('timestamp', col('timestamp').cast('timestamp'))
    seconds_window = F.unix_timestamp('timestamp') - F.unix_timestamp('timestamp') % seconds
    w = Window.partitionBy('userID', seconds_window).orderBy('accuracy')
    dfs_partition = dfs_partition.withColumn(
        'rn', F.row_number().over(w).cast('int')).where(col('rn') == 1).drop('rn')

    # Radians lat/lon
    dfs_partition = dfs_partition.withColumn(
        'latitude', F.radians('latitude')).withColumn(
        'longitude', F.radians('longitude'))

    # Groups GPS locations into chunks. A chunk is formed by groups of points
    # that are distant no more than roam_dist
    w = Window.partitionBy(['prefix', 'userID']).orderBy('timestamp')
    dfs_partition = dfs_partition.withColumn('next_lat', F.lead('latitude', 1).over(w))
    dfs_partition = dfs_partition.withColumn('next_lon', F.lead('longitude', 1).over(w))

    # Haversine distance
    dfs_partition = dfs_partition.withColumn(
        'distance_next',
        EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col('next_lat') - col('latitude')) / 2.0), 2)
                + F.cos('latitude') * F.cos('next_lat')
                * F.pow(F.sin((col('next_lon') - col('longitude')) / 2.0), 2))))
    dfs_partition = dfs_partition.withColumn(
        'distance_prev', F.lag('distance_next', default=0).over(w))

    # Chunks
    dfs_partition = dfs_partition.withColumn(
        'chunk', F.when(col('distance_prev') > roam_dist, 1).otherwise(0))
    windowval = (Window.partitionBy('prefix', 'userID').orderBy('timestamp')
                 .rangeBetween(Window.unboundedPreceding, 0))
    dfs_partition = dfs_partition.withColumn(
        'chunk', F.sum('chunk').over(windowval).cast('int'))

    # Remove chunks of the next day
    w = Window.partitionBy(['prefix', 'userID', 'chunk'])
    dfs_partition = dfs_partition.withColumn(
        'min_timestamp', F.dayofmonth(F.min('timestamp').over(w)))
    dfs_partition = dfs_partition.where(
        col('min_timestamp') == day).drop('min_timestamp')

    # Get the stops
    result_df = dfs_partition.groupBy('prefix', 'userID', 'chunk').agg(
        F.array_sort(
            F.collect_list(
                F.struct('timestamp', 'latitude', 'longitude',
                         'distance_prev', 'personal_area'))).alias('gpsdata'),
        F.sum('distance_prev').alias('dist_sum'))
    result_df = result_df.withColumn('gpsdata', spark_get_stop_location('gpsdata'))
    result_df = result_df.select('userID', 'chunk',
                                 F.explode_outer('gpsdata').alias('e'),
                                 'dist_sum')
    result_df = result_df.select(
        'userID', 'chunk',
        col('e.latitude').alias('latitude'),
        col('e.longitude').alias('longitude'),
        col('e.begin').alias('begin'),
        col('e.end').alias('end'),
        col('e.personal_area').alias('personal_area'),
        col('e.geohash6').alias('geohash6'),
        col('e.serial').alias('serial'),
        col('e.distance').alias('stop_distance'),
        col('e.after_stop_distance').alias('after_stop_distance'),
        'dist_sum')
    result_df = result_df.fillna(0, subset=['after_stop_distance'])

    # Remove all those stops that start the next day
    result_df = result_df.where(
        (col('begin').isNull()) | (F.dayofmonth('begin') != next_day.day))

    result_df = result_df.withColumn(
        'isStop', F.when(col('serial').isNotNull(), 1).otherwise(0))
    result_df = result_df.withColumn(
        'dist_sum',
        F.when(col('isStop') == 1,
               col('stop_distance')).otherwise(col('dist_sum')))

    windowval = (Window.partitionBy('userID').orderBy('chunk', 'serial')
                 .rowsBetween(Window.currentRow, Window.unboundedFollowing))
    result_df = result_df.withColumn('isStop_cum', F.sum('isStop').over(windowval))

    result_df = result_df.groupBy('userID', 'isStop_cum').agg(
        F.first('latitude', ignorenulls=True).alias('latitude'),
        F.first('longitude', ignorenulls=True).alias('longitude'),
        F.first('begin', ignorenulls=True).alias('begin'),
        F.first('end', ignorenulls=True).alias('end'),
        F.first('personal_area', ignorenulls=True).alias('personal_area'),
        F.first('geohash6', ignorenulls=True).alias('geohash6'),
        F.sum('dist_sum').alias('prev_travelled_distance'),
        F.sum('after_stop_distance').alias('after_stop_distance'))

    # compute next distance, which is null if it's the last
    windowval = Window.partitionBy('userID').orderBy(F.desc('isStop_cum'))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.lead('prev_travelled_distance').over(windowval))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.when((col('next_travelled_distance').isNull())
               & (col('after_stop_distance') > 0),
               col('after_stop_distance')).otherwise(
                   col('next_travelled_distance')))

    # Drop nulls
    result_df = result_df.dropna(subset=['latitude']).drop('isStop_cum')

    # Transform latitude and longitude back to degrees
    result_df = result_df.withColumn('latitude', F.degrees('latitude'))
    result_df = result_df.withColumn('longitude', F.degrees('longitude'))

    # US states
    result_df = result_df.withColumn(
        "geohash5", F.expr("substring(geohash6, 1, length(geohash6)-1)"))
    result_df = result_df.join(F.broadcast(dfs_us_states),
                               on="geohash5", how="inner").drop('geohash5')

    # lit partition data - enable only if added to partitionBy
    # result_df = result_df.withColumn('year', F.lit(year))
    # result_df = result_df.withColumn('month', F.lit(month))
    # result_df = result_df.withColumn('day', F.lit(day))

    # write
    out_partitions = len(US_STATES)
    result_df.repartition(out_partitions, "state").write.partitionBy(
        "state").format('parquet').mode("overwrite").save(local_dir + "/")

    # stats - enable only for debug!
    # num_records = result_df.count()
    # print(f"written {num_records} rows to "+local_dir)
    # if num_records == 0:
    #     raise Exception("Zero rows output")

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts over states
    for state in US_STATES:
        print(f"upload files for {state}")
        state_dir = local_dir + "/state=" + state
        state_key = f"{partition_key}/state={state}/"
        if os.path.isdir(state_dir):
            files = [
                filename for filename in os.listdir(state_dir)
                if filename.startswith("part-")
            ]
            if len(files) > 0:
                for file_local in files:
                    file_path = state_dir + "/" + file_local
                    part_num = int(file_local.split('-')[1])
                    part_key = '{:05d}'.format(part_num)
                    # fix name as static hash to be reproducible
                    filename_hash = hashlib.sha1(
                        str.encode(state_key + part_key)).hexdigest()
                    blob_key = "{}/state={}/part-{}-{}.snappy.parquet".format(
                        blob_base, state, part_key, filename_hash)
                    print("upload " + file_path + " to " + container_out + ":" + blob_key)
                    blob_client = blob_service_client.get_blob_client(
                        container_out, blob_key)
                    with open(file_path, "rb") as data:
                        blob_client.upload_blob(data, overwrite=True)
                    # cleanup
                    os.remove(file_path)
            else:
                print(f"no files to upload for {state}")
        else:
            print(f"missing partition for {state}")

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    stop_time = time.time()
    spark.stop()
    end_time = time.time()
    print("Done in {} seconds (csv:{} read:{} spark:{} upload:{} stop:{})".format(
        int(end_time - start_time), int(read_time - csv_time),
        int(spark_time - read_time), int(upload_time - spark_time),
        int(stop_time - upload_time), int(end_time - stop_time)))
    print('Done.')
from pyspark.sql.functions import acos, cos, lit, radians, sin

def dist(long_x, lat_x, long_y, lat_y):
    # Spherical law of cosines; returns the great-circle distance in km
    return acos(
        sin(radians(lat_x)) * sin(radians(lat_y))
        + cos(radians(lat_x)) * cos(radians(lat_y))
        * cos(radians(long_x) - radians(long_y))
    ) * lit(6371.0)
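# Unlike the haversine variants above, dist uses the spherical law of cosines,
# which is algebraically simpler but loses precision for very small
# separations (the acos argument approaches 1). A hedged usage sketch with
# illustrative coordinates (Paris -> London again):
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
df_pair = spark.createDataFrame([(2.35, 48.85, -0.13, 51.51)],
                                ["long_x", "lat_x", "long_y", "lat_y"])
df_pair.select(dist(col("long_x"), col("lat_x"),
                    col("long_y"), col("lat_y")).alias("km")).show()
# Should agree closely with the haversine result (~344 km).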