Example no. 1
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions
        import math

        def get_values(l):
            return [j[0] for j in l]

        def assert_close(a, b):
            c = get_values(b)
            diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)]
            return sum(diff) == len(a)

        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos(df.a)).collect())
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos("a")).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df.a)).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df['a'])).collect())
        assert_close([math.pow(i, 2 * i) for i in range(10)],
                     df.select(functions.pow(df.a, df.b)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2.0)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot(df.a, df.b)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot("a", u"b")).collect())
        assert_close([math.hypot(i, 2) for i in range(10)],
                     df.select(functions.hypot("a", 2)).collect())
        assert_close([math.hypot(i, 2) for i in range(10)],
                     df.select(functions.hypot(df.a, 2)).collect())
Example no. 2
def add_haversine_distances(df, lat_a, lon_a, lat_b, lon_b):
    """
    Although the Haversine distance is readily available from sklearn.metrics.pairwise.haversine_distances, the
    implementation below uses only built-in PySpark functions, so it should be more efficient than passing the
    sklearn function in as a UDF.

    :param df: input DataFrame
    :param lat_a: latitude of the first point, in degrees (column name or Column)
    :param lon_a: longitude of the first point, in degrees (column name or Column)
    :param lat_b: latitude of the second point, in degrees (column name or Column)
    :param lon_b: longitude of the second point, in degrees (column name or Column)
    :return: the input DataFrame with an added `distance_km` column
    """

    earth_radius_km = 6371.0

    return \
        df \
            .withColumn('dist_lat', F.radians(lat_a) - F.radians(lat_b)) \
            .withColumn('dist_lon', F.radians(lon_a) - F.radians(lon_b)) \
            .withColumn('area',
                        (F.sin(F.col('dist_lat') / 2) ** 2)
                        + (F.cos(F.radians(lat_a))
                           * F.cos(F.radians(lat_b))
                           * (F.sin(F.col('dist_lon') / 2) ** 2)
                           )
                        ) \
            .withColumn('central_angle', 2 * F.asin(F.sqrt(F.col('area')))) \
            .withColumn('distance_km', F.col('central_angle') * F.lit(earth_radius_km)) \
            .drop('dist_lat', 'dist_lon', 'area', 'central_angle')
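A minimal usage sketch for add_haversine_distances (hypothetical data and column names; it assumes an active SparkSession and `F` aliased to `pyspark.sql.functions`, as in the function body):

# Hypothetical usage of add_haversine_distances; coordinates are in degrees.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
trips = spark.createDataFrame(
    [(52.5200, 13.4050, 48.8566, 2.3522)],           # Berlin -> Paris
    ['pickup_lat', 'pickup_lon', 'dropoff_lat', 'dropoff_lon'])

add_haversine_distances(
    trips,
    F.col('pickup_lat'), F.col('pickup_lon'),
    F.col('dropoff_lat'), F.col('dropoff_lon')
).select('distance_km').show()                        # roughly 878 km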
Example no. 3
def add_solar_features(df):
    return (df.withColumn(
        "declination_angle",
        radians(-23.45 * cos(((2 * pi) / 365) * (dayofyear("date") + 10))),
    ).withColumn("diff_local_time_UTC", timezone_from_date("date")).withColumn(
        "d", (2 * pi * dayofyear("date")) / 365).withColumn(
            "equation_of_time",
            -7.655 * sin(col("d")) + 9.873 * sin(2 * col("d") + 3.588),
        ).drop("d").withColumn(
            "time_correction",
            4 * (col("loc_long") - (15 * col("diff_local_time_UTC"))) +
            col("equation_of_time"),
        ).withColumn(
            "local_solar_hour",
            col("hour") + 0.5 + col("time_correction") / 60).withColumn(
                "hour_angle", 0.2618 * (col("local_solar_hour") - 12)).drop(
                    "diff_local_time_UTC",
                    "equation_of_time",
                    "time_correction",
                    "local_solar_hour",
                ).withColumn(
                    "solar_elevation",
                    degrees(
                        asin(
                            sin("declination_angle") *
                            sin(radians("loc_lat")) +
                            cos("declination_angle") *
                            cos(radians("loc_lat")) * cos("hour_angle"))),
                ).drop("declination_angle", "hour_angle"))
def add_solar_features(df):
    return \
        (df
         .withColumn('declination_angle',
                     radians(-23.45
                             * cos(((2 * pi)/365) * (dayofyear('date') + 10))))
         .withColumn('diff_local_time_UTC', timezone_from_date('date'))
         .withColumn('d', (2 * pi * dayofyear('date')) / 365)
         .withColumn('equation_of_time',
                     -7.655 * sin(col('d'))
                     + 9.873 * sin(2 * col('d') + 3.588))
         .drop('d')
         .withColumn('time_correction',
                     4 * (col('loc_long') - (15 * col('diff_local_time_UTC')))
                     + col('equation_of_time'))
         .withColumn('local_solar_hour',
                     col('hour') + 0.5 + col('time_correction') / 60)
         .withColumn('hour_angle', 0.2618 * (col('local_solar_hour') - 12))
         .drop('diff_local_time_UTC', 'equation_of_time', 'time_correction',
               'local_solar_hour')
         .withColumn('solar_elevation',
                     degrees(asin(sin('declination_angle')
                                  * sin(radians('loc_lat'))
                                  + cos('declination_angle')
                                  * cos(radians('loc_lat'))
                                  * cos('hour_angle'))))
         .drop('declination_angle', 'hour_angle'))
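Both listings build the same pipeline. A rough usage sketch, assuming the input carries `date`, `hour`, `loc_lat` and `loc_long` columns, a user-supplied `timezone_from_date` helper returning the local UTC offset in hours, and the trig helpers (`sin`, `cos`, `radians`, `degrees`, `asin`, `dayofyear`, `col`) imported from `pyspark.sql.functions` together with `pi` from `math`:

# Sketch only: `readings` is a hypothetical DataFrame with columns
# date, hour, loc_lat, loc_long; timezone_from_date() must be defined elsewhere.
df_sun = add_solar_features(readings)
df_sun.select('date', 'hour', 'solar_elevation').show(5)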
Example no. 5
def distance(CLat, CLon, data, col_name):
    return data \
        .withColumn('CLon', f.lit(CLon)) \
        .withColumn('CLat', f.lit(CLat)) \
        .withColumn("dlon", f.radians(f.col("CLon")) - f.radians(f.col("longitude"))) \
        .withColumn("dlat", f.radians(f.col("CLat")) - f.radians(f.col("latitude"))) \
        .withColumn(col_name,
                    f.asin(f.sqrt(
                        f.sin(f.col("dlat") / 2) ** 2
                        + f.cos(f.radians(f.col("latitude")))
                        * f.cos(f.radians(f.col("CLat")))
                        * f.sin(f.col("dlon") / 2) ** 2
                    )) * 2 * 6371 * 1000) \
        .drop("dlon", "dlat", 'CLon', 'CLat')
Example no. 6
def distance(long1, lat1, long2, lat2):
    # Haversine distance in kilometres; inputs are in degrees, so the latitudes
    # must also be converted to radians before taking their cosines.
    radius = 6371  # mean Earth radius in km
    diff_lat = radians(lat2 - lat1)
    diff_long = radians(long2 - long1)
    a = sin(diff_lat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(diff_long / 2)**2
    c = 2 * atan2(a**0.5, (1 - a)**0.5)
    return radius * c
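Read with the trig helpers coming from `pyspark.sql.functions`, the function above returns a Column and can be used directly in `withColumn`; a sketch with illustrative data:

# Sketch: Column-based use of distance(); the trig helpers are imported from
# pyspark.sql.functions and the column names are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import radians, sin, cos, atan2, col

spark = SparkSession.builder.getOrCreate()
trips = spark.createDataFrame(
    [(-0.1278, 51.5074, 2.3522, 48.8566)],   # London -> Paris, degrees
    ['start_lon', 'start_lat', 'end_lon', 'end_lat'])
trips.withColumn(
    'trip_km',
    distance(col('start_lon'), col('start_lat'), col('end_lon'), col('end_lat'))).show()  # roughly 344 km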
Example no. 7
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions

        SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                                  df.select(functions.cos(df.a)).collect())
        SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                                  df.select(functions.cos("a")).collect())
        SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                                  df.select(functions.sin(df.a)).collect())
        SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                                  df.select(functions.sin(df["a"])).collect())
        SQLTestUtils.assert_close([math.pow(i, 2 * i) for i in range(10)],
                                  df.select(functions.pow(df.a,
                                                          df.b)).collect())
        SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                                  df.select(functions.pow(df.a, 2)).collect())
        SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                                  df.select(functions.pow(df.a,
                                                          2.0)).collect())
        SQLTestUtils.assert_close(
            [math.hypot(i, 2 * i) for i in range(10)],
            df.select(functions.hypot(df.a, df.b)).collect(),
        )
        SQLTestUtils.assert_close(
            [math.hypot(i, 2 * i) for i in range(10)],
            df.select(functions.hypot("a", "b")).collect(),
        )
        SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                                  df.select(functions.hypot("a", 2)).collect())
        SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                                  df.select(functions.hypot(df.a,
                                                            2)).collect())
Example no. 8
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions
        import math

        def get_values(l):
            return [j[0] for j in l]

        def assert_close(a, b):
            c = get_values(b)
            diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)]
            return sum(diff) == len(a)
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos(df.a)).collect())
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos("a")).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df.a)).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df['a'])).collect())
        assert_close([math.pow(i, 2 * i) for i in range(10)],
                     df.select(functions.pow(df.a, df.b)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2.0)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot(df.a, df.b)).collect())
Example no. 9
def distance_intermediate_formula(lat1, long1, lat2, long2):
    """Returns spark expression computing intermediate result
    to compute the distance between to GPS coordinates
    Source: https://www.movable-type.co.uk/scripts/latlong.html
    """
    return pow(sin(radians(col(lat1) - col(lat2)) / 2),
               2) + (pow(sin(radians(col(long1) - col(long2)) / 2), 2) *
                     cos(radians(col(lat1))) * cos(radians(col(lat2))))
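The expression returned is the haversine term `a`; a sketch of completing it into a distance column, using the 6371 km Earth radius that the neighbouring examples use (DataFrame and column names are illustrative):

# Sketch: finish the haversine from the intermediate term.
from pyspark.sql.functions import pow, sin, cos, radians, asin, sqrt, col

a = distance_intermediate_formula('lat1', 'lon1', 'lat2', 'lon2')
df_dist = df.withColumn('distance_km', 2 * 6371 * asin(sqrt(a)))   # df is a hypothetical DataFrame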
def km(lat1, lon1, lat2, lon2):
    R = 6371  # Radius of the earth in km
    dLat = deg2rad(lat2 - lat1)  # deg2rad below
    dLon = deg2rad(lon2 - lon1)
    a = sin(dLat / 2) * sin(dLat / 2) + cos(deg2rad(lat1)) * cos(
        deg2rad(lat2)) * sin(dLon / 2) * sin(dLon / 2)
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    d = R * c  # Distance in km
    return d
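`deg2rad` is not a PySpark built-in; for `km` to operate on Columns it is presumably an alias or thin wrapper around `pyspark.sql.functions.radians`, e.g.:

# Assumption: deg2rad aliases pyspark.sql.functions.radians so km() works on Columns.
from pyspark.sql.functions import radians as deg2rad, sin, cos, sqrt, atan2, col

df_km = df.withColumn('dist_km', km(col('lat1'), col('lon1'), col('lat2'), col('lon2')))  # df is hypothetical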
def compute_time_cyclical_features(data):
  """
  Extract month/hour and apply cyclical calculation
  """
  return data.select(key) \
              .withColumn("Cos_Month", cos(2*np.pi*month("TIMESTAMP")/12)) \
              .withColumn("Sin_Month", sin(2*np.pi*month("TIMESTAMP")/12)) \
              .withColumn("Cos_Hour", cos(2*np.pi*hour("TIMESTAMP")/23)) \
              .withColumn("Sin_Hour", sin(2*np.pi*hour("TIMESTAMP")/23))
Example no. 12
def complicated_arithmetic_operation(df):
    theta_1 = df['pickup_longitude']
    phi_1 = df['pickup_latitude']
    theta_2 = df['dropoff_longitude']
    phi_2 = df['dropoff_latitude']

    temp = (f.cos(theta_1) * np.pi / 180) * (f.cos(theta_2) * np.pi / 180) * (
        f.sin(phi_2 - phi_1) / 2 * np.pi / 180)**2
    expression = 2 * f.atan2(f.sqrt(temp), f.sqrt(1 - temp))
    df.select(f.mean(expression)).collect()
def compute_time_cyclical_features_v2(data):
  """
  Extract month/hour, apply the cyclical encoding, and flag whether the day is a weekend
  """
  return data.select(key) \
              .withColumn("Cos_Month", cos(2*np.pi*month("TIMESTAMP")/12)) \
              .withColumn("Sin_Month", sin(2*np.pi*month("TIMESTAMP")/12)) \
              .withColumn("Cos_Hour", cos(2*np.pi*hour("TIMESTAMP")/23)) \
              .withColumn("Sin_Hour", sin(2*np.pi*hour("TIMESTAMP")/23)) \
              .withColumn("is_weekend", dayofweek("TIMESTAMP").isin([1,7]).cast("int"))
def computeDistances(spark, dataframe):
    df = dataframe.withColumn("lat1",F.radians(F.col("start_station_latitude"))).withColumn("lat2",F.radians(F.col("end_station_latitude")))\
        .withColumn("lon1",F.radians(F.col("start_station_longitude"))).withColumn("lon2",F.radians(F.col("end_station_longitude")))\
        .withColumn("distance",F.round(F.asin(F.sqrt(
            (-F.cos(F.col("lat2") - F.col("lat1"))*0.5 + 0.5) +
              F.cos(F.col("lat1"))*
                F.cos(F.col("lat2"))*
                (-F.cos(F.col("lon2") - F.col("lon1"))*0.5 + 0.5)))
                  *(2*earth_radius_in_meters/meters_per_mile),2))
    df = df.drop(F.col("lat1")).drop(F.col("lat2")).drop(F.col("lon1")).drop(
        F.col("lon2"))
    return df
Example no. 15
def haversine(lon1, lat1, lon2, lat2):
    lon1, lat1, lon2, lat2 = map(toRadians, [
        lon1.cast("float"),
        lat1.cast("float"),
        lon2.cast("float"),
        lat2.cast("float")
    ])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * asin(sqrt(a))
    m = 6367000 * c
    return m.cast("decimal(10,2)")  # meters
Example no. 16
def haversine(lng1: Column, lat1: Column, lng2: Column, lat2: Column):
    radius = 6378137
    # convert degrees to radians
    radLng1 = f.radians(lng1)
    radLat1 = f.radians(lat1)
    radLng2 = f.radians(lng2)
    radLat2 = f.radians(lat2)

    result = f.asin(
        f.sqrt(
            f.pow(f.sin((radLat1 - radLat2) / 2.0), 2) +
            f.cos(radLat1) * f.cos(radLat2) *
            f.pow(f.sin((radLng1 - radLng2) / 2.0), 2))) * 2.0 * radius
    return result
def calculate_bearing_degrees(latitude_1, longitude_1, latitude_2,
                              longitude_2):
    # Initial bearing (forward azimuth) from point 1 to point 2:
    #   theta = atan2(sin(dlon) * cos(lat2),
    #                 cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dlon))
    diff_longitude = F.radians(longitude_2 - longitude_1)

    r_latitude_1 = F.radians(latitude_1)
    r_latitude_2 = F.radians(latitude_2)

    y = F.sin(diff_longitude) * F.cos(r_latitude_2)
    x = (F.cos(r_latitude_1) * F.sin(r_latitude_2) -
         F.sin(r_latitude_1) * F.cos(r_latitude_2) * F.cos(diff_longitude))

    return F.degrees(F.atan2(y, x))
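This follows the standard initial-bearing (forward azimuth) formula. Since atan2 yields values in (-180°, 180°], a sketch that also normalises the result to 0–360° (hypothetical DataFrame and column names):

# Sketch: bearing from point 1 to point 2, normalised to [0, 360) degrees.
from pyspark.sql import functions as F

df_b = df.withColumn(                                   # df is a hypothetical DataFrame
    'bearing_deg',
    (calculate_bearing_degrees(F.col('lat1'), F.col('lon1'),
                               F.col('lat2'), F.col('lon2')) + 360) % 360)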
Example no. 18
def dist(lat1, long1, lat2, long2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lat1, long1, lat2, long2 = map(f.toRadians, [lat1, long1, lat2, long2])
    # haversine formula 
    dlon = long2 - long1 
    dlat = lat2 - lat1 
    a = f.sin(dlat/2)**2 + f.cos(lat1) * f.cos(lat2) * f.sin(dlon/2)**2
    c = 2 * f.asin(f.sqrt(a)) 
    # Radius of earth in kilometers is 6371
    km = 6371* c
    return f.round(km, 3)
Example no. 19
def add_cyclic_feature(df, column, col_name, period):
    period_scale = (2 * pi) / period
    return (
        df.withColumn(col_name + "_cos", cos(column * lit(period_scale)))
        .withColumn(col_name + "_sin", sin(column * lit(period_scale)))
        .drop(col_name)
    )
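A sketch encoding hour-of-day on a 24-hour cycle; `pi` from `math` and `lit`/`cos`/`sin` from `pyspark.sql.functions` are assumed to be in scope where the function is defined, and the column names are illustrative:

# Sketch: adds hour_cos and hour_sin, then drops the raw hour column.
from math import pi
from pyspark.sql.functions import col, lit, cos, sin, hour

df2 = df.withColumn('hour', hour('timestamp'))          # df is a hypothetical DataFrame
df2 = add_cyclic_feature(df2, col('hour'), 'hour', 24)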
Example no. 20
def add_fourier_terms(df, period, col, degree_fourier):
    """根据输入的周期和列的值,添加相应的傅里叶项来描述周期性
    """
    for i in range(1, degree_fourier + 1):
        df = df.withColumn(col + '_fourier_sin_' + str(i),
                           F.sin((2 * np.pi * F.col(col) / period) * i))
        df = df.withColumn(col + '_fourier_cos_' + str(i),
                           F.cos((2 * np.pi * F.col(col) / period) * i))
    return df
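A sketch adding two pairs of Fourier terms over a weekly cycle (the `dow` column and `df` are illustrative):

# Sketch: produces dow_fourier_sin_1/cos_1 and dow_fourier_sin_2/cos_2.
import numpy as np
from pyspark.sql import functions as F

df2 = df.withColumn('dow', F.dayofweek('date'))         # df is a hypothetical DataFrame
df2 = add_fourier_terms(df2, period=7, col='dow', degree_fourier=2)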
Example no. 21
def add_cyclical_features(df, source_column, target_column):
	df = df.withColumn(target_column, F.from_utc_timestamp(F.col(source_column), "UTC"))
	hour = target_column + "_hour"
	weekday = target_column + "_weekday"
	month = target_column + "_month"
	year = target_column + "_year"
	df = df.withColumn(hour, F.hour(F.col(target_column)))
	df = df.withColumn(weekday, F.dayofweek(F.col(target_column))-1)
	df = df.withColumn(month, F.month(F.col(target_column))-1)
	df = df.withColumn("DerivedTime_cos_" + hour, F.cos(F.col(hour) * (2.0 * np.pi / 24)) + 1)
	df = df.withColumn("DerivedTime_sin_" + hour, F.sin(F.col(hour) * (2.0 * np.pi / 24)) + 1)
	df = df.withColumn("DerivedTime_cos_" + weekday, F.cos(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
	df = df.withColumn("DerivedTime_sin_" + weekday, F.sin(F.col(weekday) * (2.0 * np.pi / 7)) + 1)
	df = df.withColumn("DerivedTime_cos_" + month, F.cos(F.col(month) * (2.0 * np.pi / 12)) + 1)
	df = df.withColumn("DerivedTime_sin_" + month, F.sin(F.col(month) * (2.0 * np.pi / 12)) + 1)
	df = df.withColumn("DerivedTime_" + year, F.year(F.col(target_column)))
	drop_cols = [source_column, hour, weekday, month]
	df = df.drop(*drop_cols)
	return df
Example no. 22
def elem2coord(df):
    df = df.withColumn('e_sin_M', df['e'] * sin(df['M']))
    df = df.withColumn('e_cos_M', df['e'] * cos(df['M']))
    df = df.withColumn('r', df['a'] * (1 - df['e_cos_M']))
    df = df.withColumn('Omega', Omega_udf(df['a']))
    df = df.withColumn('Kappa', Kappa_udf(df['a']))
    df = df.withColumn('t', (df['Omega'] / df['Kappa']) *
                       (df['M'] + 2.0 * df['e_sin_M']) + df['wt'])
    df = df.withColumn('vr', df['a'] * df['Kappa'] * df['e_sin_M'])
    df = df.withColumn('vt', df['a'] * df['Omega'] * (1.0 + df['e_cos_M']))
    cols = ['id', 'timestep', 'streamline', 'a', 'r', 't', 'vr', 'vt']
    return df.select(cols)
def add_distance_column(dfs, order_column='timestamp'):
    # Radians lat/lon
    dfs = dfs.withColumn('latitude2', F.radians('latitude')).withColumn(
        'longitude2', F.radians('longitude'))

    # Groups GPS locations into chunks. A chunk is formed by groups of points that are no more than roam_dist apart
    w = Window.partitionBy(['userID']).orderBy(order_column)
    dfs = dfs.withColumn('next_lat', F.lead('latitude2', 1).over(w))
    dfs = dfs.withColumn('next_lon', F.lead('longitude2', 1).over(w))

    # Haversine distance
    dfs = dfs.withColumn(
        'distance_next', EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col('next_lat') - col('latitude2')) / 2.0), 2) +
                F.cos('latitude2') * F.cos('next_lat') *
                F.pow(F.sin((col('next_lon') - col('longitude2')) / 2.0), 2))))
    dfs = dfs.withColumn(
        'distance_prev',
        F.lag('distance_next',
              default=0).over(w)).drop('latitude2').drop('longitude2').drop(
                  'next_lon').drop('next_lat').drop('distance_next')
    return dfs
    def distance(lat, lon, lat2, lon2):
        '''
        Uses the "haversine" formula to calculate the distance between two points
        using they latitude and longitude

        Parameters
        ----------
        lat: latitude co-ordinate using signed decimal degrees without compass direction for first location 
        lon: longitude co-ordinate using signed decimal degrees without compass direction for first location 
        lat2: latitude co-ordinate using signed decimal degrees without compass direction for second location 
        lon2: longitude co-ordinate using signed decimal degrees without compass direction for second location 

        Returns
        -------
        Returns distance between two points
    
    
        Notes
        -----
        Haversine formula
        Δφ = φ1 - φ2
        Δλ = λ1 - λ2
        a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
        c = 2 ⋅ atan2( √a, √(1−a) )
        d = R ⋅ c
        φ -> latitude 
        λ -> longitude
        R -> 6371
        '''
        
        R = 6371
        delta_lat = lat - lat2
        delta_lon = lon - lon2
        a = pow(sin(toRadians(delta_lat/2)),2) + cos(toRadians(lat)) * cos(toRadians(lat2)) * pow(sin(toRadians(delta_lon/2)),2)
        c = 2 * atan2(pow(a,0.5) , pow(1-a, 0.5) )
        d = R * c
        return d
Example no. 25
def distance(lat1, lon1, lat2, lon2, unit='miles'):
    '''
    Measure simple haversine distance between two points. Default unit = miles.
    '''
    units = {
        'miles': 3963.19,
        'kilometers': 6378.137,
        'meters': 6378137,
        'feet': 20902464
    }

    phi_1 = py.radians(lat1)
    phi_2 = py.radians(lat2)
    delta_phi = py.radians(lat2 - lat1)
    delta_lambda = py.radians(lon2 - lon1)

    area = py.sin(delta_phi/2.0) ** 2 \
    + py.cos(phi_1) * py.cos(phi_2) * \
    py.sin(delta_lambda / 2.0) ** 2

    central_angle = 2 * py.asin((area**0.5))
    radius = units[unit.lower()]

    return py.abs(py.round((central_angle * radius), 4))
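Here `py` presumably aliases `pyspark.sql.functions`, so the inputs can be Columns; a sketch computing the distance in two units (hypothetical DataFrame and column names):

# Sketch: haversine distance in miles (default) and kilometers.
from pyspark.sql import functions as py

df2 = df.withColumn('dist_miles',                       # df is a hypothetical DataFrame
                    distance(py.col('lat1'), py.col('lon1'), py.col('lat2'), py.col('lon2')))
df2 = df2.withColumn('dist_km',
                     distance(py.col('lat1'), py.col('lon1'), py.col('lat2'), py.col('lon2'),
                              unit='kilometers'))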
Example no. 26
def join_and_analyze(df_poi,df_sample):
    """ Joins the Requests data and POI list data, calculates distance between POI Centers
    and retains the record with the minimum distance to a particular POI center
    
    Parameters: df_poi: POI List dataframe
                df_sample: Requests dataframe
    
    """
    # Since there are no matching fields between the datasets, a cartesian product (cross join) is used to combine them
    df_joined = df_sample.crossJoin(df_poi)
    # Caching to memory
    df_joined.cache()
    # Applying the Haversine formula to determine distance between coordinate pairs
    df_joined = df_joined.withColumn("a", (
    F.pow(F.sin(F.radians(F.col("POI_Latitude") - F.col("Latitude")) / 2), 2) +
    F.cos(F.radians(F.col("Latitude"))) * F.cos(F.radians(F.col("POI_Latitude"))) *
    F.pow(F.sin(F.radians(F.col("POI_Longitude") - F.col("Longitude")) / 2), 2)
    )).withColumn("distance", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 2 * 6371)
    
    # Applying window function to retain the records with the least distance to a POI center
    w = Window.partitionBy('_ID')
    df_joined = df_joined.withColumn('min', F.min('distance').over(w)) \
        .where(F.col('distance') == F.col('min')) \
        .drop('min').drop('a')

    return df_joined
Example no. 27
def createCyclicalFeatures(data,
                           cyclical_variables,
                           drop_orig_vars=False,
                           verbose=False,
                           logger=False):
    try:
        if verbose:
            logger.info('create_cyclical_features() start')
        for i in range(len(cyclical_variables)):
            distinct_values_count = data.select(
                cyclical_variables[i]).distinct().count()
            data = data.withColumn(cyclical_variables[i]+'_sin', sin(2*np.pi*col(cyclical_variables[i]).\
                                                                     cast(DoubleType())/distinct_values_count))
            data = data.withColumn(cyclical_variables[i]+'_cos', cos(2*np.pi*col(cyclical_variables[i]).\
                                                                     cast(DoubleType())/distinct_values_count))
        if drop_orig_vars:
            data = data.drop(*cyclical_variables)
        if verbose:
            logger.info('create_cyclical_features() end')
    except Exception:
        logger.exception("Fatal error in create_cyclical_features()")
        raise
    return data
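A usage sketch; the scale of each sin/cos pair is derived from the number of distinct values in the column, so it suits genuinely cyclical integer features such as hour or month (imports match what the function body expects; the DataFrame is hypothetical):

# Sketch: cyclical encoding of hour and month columns.
import numpy as np
from pyspark.sql.functions import col, sin, cos
from pyspark.sql.types import DoubleType

data = createCyclicalFeatures(data, ['hour', 'month'], drop_orig_vars=True)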
Example no. 28
def main():
    # Parameters for the algorithm
    roam_dist = 100  # meters
    min_stay = 10  # minutes
    # Parameters for the paths
    input_path = '/path_to_parquet'  # parquet file
    output_path = '/path_to_parquet_out'  # parquet file

    spark = SparkSession \
     .builder \
     .appName("Stop locations") \
     .getOrCreate()

    # Read data
    source_df = spark.read.parquet(input_path)
    source_df = source_df.select(
        'user_id', 'timestamp',
        F.radians('latitude').alias('lat'),
        F.radians('longitude').alias("lon")).orderBy('timestamp')
    source_df.cache()

    # Filter out data that is not necessary (e.g. positions equal to others within a time distance of less than min_stay)
    w = Window.partitionBy(['user_id']).orderBy('timestamp')

    source_df = source_df.select("user_id", "timestamp", "lat", "lon",
                                 F.lead("lat", 1).over(w).alias("next_lat"),
                                 F.lead("lon", 1).over(w).alias("next_lon"))

    dist_df = source_df.withColumn(
        "distance_next", EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col("next_lat") - col("lat")) / 2.0), 2) +
                F.cos("lat") * F.cos("next_lat") *
                F.pow(F.sin((col("next_lon") - col("lon")) / 2.0), 2))))

    dist_df = dist_df.withColumn("distance_prev",
                                 F.lag("distance_next").over(w))
    exclude_df = dist_df.where((
        (col("distance_next") < 5) & (col("distance_prev") < 5))
                               | ((col("distance_next") > roam_dist)
                                  & (col("distance_prev") > roam_dist)))

    df = source_df.join(exclude_df, ['user_id', 'timestamp'],
                        "left_anti").select("user_id", "timestamp", "lat",
                                            "lon")

    # Transform to RDD, in order to apply the function get_stop_location
    # RDD that contains: (user_id, [timestamp, lat, lon, lat_degrees, lon_degrees])
    df_rdd = df.orderBy(['user_id', 'timestamp']).rdd.map(tuple)
    df_rdd = df_rdd.map(lambda x: (x[0], [[x[1], x[2], x[3]]]))
    # RDD that contains: (user_id, [[timestamp, lat, lon], ..., [timestamp, lat, lon]]), sorted by timestamp
    grouped_rdd = df_rdd.reduceByKey(lambda x, y: x + y)

    stop_locations_rdd = grouped_rdd.map(lambda x: (
        x[0],
        get_stop_location(
            x[1], min_stay_duration=min_stay, roaming_distance=roam_dist)))

    stop_locations_rdd = stop_locations_rdd.flatMapValues(lambda x: x).map(
        lambda x: (x[0], x[1][0], x[1][1], x[1][2], x[1][3]))

    # Output schema
    schema = StructType([
        StructField('user_id', StringType(), False),
        StructField('lat', DoubleType(), False),
        StructField('lon', DoubleType(), False),
        StructField('from', TimestampType(), False),
        StructField('to', TimestampType(), False)
    ])
    result_df = spark.createDataFrame(stop_locations_rdd, schema)

    result_df = result_df.withColumn('lat', F.degrees('lat'))
    result_df = result_df.withColumn('lon', F.degrees('lon'))

    result_df.write.save(output_path)
                                	      (col('Start_Latitude') != col("End_Latitude"))) &\
                          	              (col('Start_Longitude') > -80) &\
                                 	      (col('Start_Longitude') < -70) &\
                                	      (col('Start_Latitude') > 40) &\
                               		      (col('Start_Latitude') < 46) &\
                             		      (col('End_Longitude') > -80) &\
                             		      (col('End_Longitude') < -70) &\
                             		      (col('End_Latitude') > 40) &\
                               		      (col('End_Latitude') < 46) &\
                                              (col('Cost') > 0))

yellow_tripdata_1m = yellow_tripdata_1m \
    .withColumn("Duration", (unix_timestamp(col("End_Datetime")) - unix_timestamp(col("Start_Datetime"))) / 60) \
    .withColumn("Diff_Longitude", col("End_Longitude") - col("Start_Longitude")) \
    .withColumn("Diff_Latitude", col("End_Latitude") - col("Start_Latitude")) \
    .withColumn("a", F.pow(F.sin(col("Diff_Latitude") / 2), 2) +
                     F.cos(col("Start_Latitude")) * F.cos(col("End_Latitude")) * F.pow(F.sin(col("Diff_Longitude") / 2), 2)) \
    .withColumn("Distance", 2 * 6371 * F.atan2(F.sqrt(col("a")), F.sqrt(1.0 - col("a")))) \
    .drop("Diff_Longitude").drop("Diff_Latitude").drop("Start_Datetime") \
    .drop("End_Datetime").drop("Start_Longitude").drop("Start_Latitude") \
    .drop("End_Longitude").drop("End_Latitude").drop("a").drop("Cost")

yellow_trip_joined = yellow_tripdata_1m.join(yellow_tripvendors_1m, "ID", "inner").drop("ID")
yellow_trip_joined.createOrReplaceTempView("yellow_trip_joined")

window = Window.partitionBy("Vendor")
res = yellow_trip_joined.withColumn("Max_Distance", F.max("Distance").over(window))\
                        .where(col("Distance") == col("Max_Distance"))\
                        .drop("Max_Distance").select(["Vendor", "Distance", "Duration"])
   
res.show() 
print("Time of Q2 using SQL with parquet is: %s seconds" % (time.time() - start_time_parquet)) 
Example no. 30
def main():
    """Main function"""

    # Get args
    args = get_args()

    # Azure credentials
    sas_token = args.sas
    storage_account_name = args.storage
    container_in = args.container_in
    container_out = args.container_out

    azure_accounts = list()
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_in
    })
    azure_accounts.append({
        "storage": storage_account_name,
        "sas": sas_token,
        "container": container_out
    })

    # VM
    cores = args.vm_cores
    ram = args.vm_ram
    shuffle_partitions = args.shuffle_partitions

    # Geohash file path
    geohash_path = args.geohashpath

    # Date, country, prefix
    country = args.country
    date_string = args.date
    prefix = args.prefix

    # Set date variables
    day_time = datetime.strptime(date_string, "%Y-%m-%d")
    year = day_time.year
    month = day_time.month
    day = day_time.day

    # stop config
    seconds = 60
    accuracy = args.accuracy
    roam_dist = args.roam_dist
    min_stay = args.min_stay
    overlap_hours = args.overlap_hours

    # Path in - path out
    blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/"
    path_out = f"stoplocation-v{VERSION}_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"

    if prefix:
        path_out = f"stoplocation-v{VERSION}_prefix_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}"

    # config spark
    conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts)

    # Create spark session
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)
    spark = sqlContext.sparkSession

    # Init azure client
    blob_service_client = BlobServiceClient.from_connection_string(
        CONN_STRING.format(storage_account_name, sas_token))

    # build keys; date is mandatory, prefix is optional
    partition_key = "year={}/month={}/day={}".format(year, month, day)
    if prefix:
        partition_key = "year={}/month={}/day={}/prefix={}".format(
            year, month, day, prefix)

    blob_base = "{}/{}".format(path_out, partition_key)

    #
    # check for skip
    # TODO
    #
    skip = False

    print("process " + partition_key + " to " + blob_base)
    start_time = time.time()
    local_dir = LOCAL_PATH + partition_key
    print("write temp to " + local_dir)

    # cleanup local if exists
    if os.path.isdir(local_dir):
        # map() is lazy in Python 3, so iterate explicitly to actually remove the files
        for f in os.listdir(local_dir):
            os.unlink(os.path.join(local_dir, f))

    # TODO cleanup remote if exists

    # Output schema
    schema = ArrayType(
        StructType([
            #StructField('device_type', IntegerType(), False),
            StructField('serial', IntegerType(), False),
            StructField('latitude', DoubleType(), False),
            StructField('longitude', DoubleType(), False),
            StructField('begin', TimestampType(), False),
            StructField('end', TimestampType(), False),
            StructField('personal_area', BooleanType(), False),
            StructField('distance', DoubleType(), False),
            StructField('geohash6', StringType(), False),
            StructField('after_stop_distance', DoubleType(), False)
        ]))

    spark_get_stop_location = udf(
        lambda z: get_stop_location(z, roam_dist, min_stay), schema)

    # Geohash file
    print("read geohash parquet")
    csv_time = time.time()
    dfs_us_states = spark.read.format("parquet").load(geohash_path)
    # states = [s.STUSPS for s in dfs_us_states.select(
    #     'STUSPS').distinct().collect()]

    dfs_us_states = dfs_us_states.select(
        col('STUSPS').alias('state'),
        col('geohash').alias('geohash5'))
    dfs_us_states = dfs_us_states.drop_duplicates(subset=['geohash5'])

    # Input dataset
    print("read dataset table")
    read_time = time.time()

    # dfs = spark.read.format("parquet").load(blob_in)

    # # apply partition filter
    # dfs_partition = dfs.where(
    #     f"(year = {year} AND month = {month} AND day = {day}  AND prefix = '{prefix}')")

    # read only partition to reduce browse time
    dfs_cur_partition = spark.read.format("parquet").load(
        f"{blob_in}/{partition_key}")

    # lit partition filters as data
    dfs_cur_partition = dfs_cur_partition.withColumn('year', F.lit(year))
    dfs_cur_partition = dfs_cur_partition.withColumn('month', F.lit(month))
    dfs_cur_partition = dfs_cur_partition.withColumn('day', F.lit(day))
    if prefix:
        dfs_cur_partition = dfs_cur_partition.withColumn(
            'prefix', F.lit(prefix))

    # read next day for overlap
    next_day = day_time + timedelta(days=1)
    next_partition_key = "year={}/month={}/day={}".format(
        next_day.year, next_day.month, next_day.day)
    if prefix:
        next_partition_key = "year={}/month={}/day={}/prefix={}".format(
            next_day.year, next_day.month, next_day.day, prefix)

    dfs_next_partition = spark.read.format("parquet").load(
        f"{blob_in}/{next_partition_key}")
    dfs_next_partition = dfs_next_partition.where(
        F.hour("timestamp") <= (overlap_hours - 1))

    # lit partition filters as data
    dfs_next_partition = dfs_next_partition.withColumn('year',
                                                       F.lit(next_day.year))
    dfs_next_partition = dfs_next_partition.withColumn('month',
                                                       F.lit(next_day.month))
    dfs_next_partition = dfs_next_partition.withColumn('day',
                                                       F.lit(next_day.day))
    if prefix:
        dfs_next_partition = dfs_next_partition.withColumn(
            'prefix', F.lit(prefix))

    # union with overlap
    dfs_partition = dfs_cur_partition.unionAll(dfs_next_partition)

    print("process with spark")
    spark_time = time.time()

    # select columns
    dfs_partition = dfs_partition.select(
        'prefix', 'userID', 'timestamp', 'latitude', 'longitude',
        (F.when(col('opt1') == 'PERSONAL_AREA',
                True).otherwise(False)).alias('personal_area'), 'accuracy')

    # keep only data with required accuracy
    dfs_partition = dfs_partition.where((col('accuracy') <= accuracy)
                                        & (col('accuracy') >= 0))

    # stats - enable only for debug!
    # num_inputs = dfs_partition.count()
    # print(f"read {num_inputs} rows from "+partition_key)

    # Lowering the granularity to 1 minute

    # explicitly convert to timestamp
    #dfs_partition = dfs_partition.withColumn('timestamp', col('timestamp').cast('timestamp'))
    seconds_window = F.unix_timestamp(
        'timestamp') - F.unix_timestamp('timestamp') % seconds
    w = Window().partitionBy('userID', seconds_window).orderBy('accuracy')
    dfs_partition = dfs_partition.withColumn(
        'rn',
        F.row_number().over(w).cast('int')).where(col('rn') == 1).drop('rn')

    # Radians lat/lon
    dfs_partition = dfs_partition.withColumn('latitude',
                                             F.radians('latitude')).withColumn(
                                                 'longitude',
                                                 F.radians('longitude'))

    # Groups GPS locations into chunks. A chunk is formed by groups of points that are no more than roam_dist apart
    w = Window.partitionBy(['prefix', 'userID']).orderBy('timestamp')
    dfs_partition = dfs_partition.withColumn('next_lat',
                                             F.lead('latitude', 1).over(w))
    dfs_partition = dfs_partition.withColumn('next_lon',
                                             F.lead('longitude', 1).over(w))

    # Haversine distance
    dfs_partition = dfs_partition.withColumn(
        'distance_next', EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col('next_lat') - col('latitude')) / 2.0), 2) +
                F.cos('latitude') * F.cos('next_lat') *
                F.pow(F.sin((col('next_lon') - col('longitude')) / 2.0), 2))))
    dfs_partition = dfs_partition.withColumn(
        'distance_prev',
        F.lag('distance_next', default=0).over(w))

    # Chunks
    dfs_partition = dfs_partition.withColumn(
        'chunk',
        F.when(col('distance_prev') > roam_dist, 1).otherwise(0))

    windowval = (Window.partitionBy(
        'prefix',
        'userID').orderBy('timestamp').rangeBetween(Window.unboundedPreceding,
                                                    0))
    dfs_partition = dfs_partition.withColumn(
        'chunk',
        F.sum('chunk').over(windowval).cast('int'))

    # Remove chunks of the next day
    w = Window.partitionBy(['prefix', 'userID', 'chunk'])
    dfs_partition = dfs_partition.withColumn(
        'min_timestamp', F.dayofmonth(F.min('timestamp').over(w)))
    dfs_partition = dfs_partition.where(
        col('min_timestamp') == day).drop('min_timestamp')

    # Get the stops
    result_df = dfs_partition.groupBy('prefix', 'userID', 'chunk').agg(
        F.array_sort(
            F.collect_list(
                F.struct('timestamp', 'latitude', 'longitude', 'distance_prev',
                         'personal_area'))).alias('gpsdata'),
        F.sum('distance_prev').alias('dist_sum'))
    result_df = result_df.withColumn('gpsdata',
                                     spark_get_stop_location('gpsdata'))

    result_df = result_df.select('userID', 'chunk',
                                 F.explode_outer('gpsdata').alias('e'),
                                 'dist_sum')
    result_df = result_df.select(
        'userID', 'chunk',
        col('e.latitude').alias('latitude'),
        col('e.longitude').alias('longitude'),
        col('e.begin').alias('begin'),
        col('e.end').alias('end'),
        col('e.personal_area').alias('personal_area'),
        col('e.geohash6').alias('geohash6'),
        col('e.serial').alias('serial'),
        col('e.distance').alias('stop_distance'),
        col('e.after_stop_distance').alias('after_stop_distance'), 'dist_sum')
    result_df = result_df.fillna(0, subset=['after_stop_distance'])

    # Remove all those stop that start the next day
    result_df = result_df.where((col('begin').isNull())
                                | (F.dayofmonth('begin') != next_day.day))

    result_df = result_df.withColumn(
        'isStop',
        F.when(col('serial').isNotNull(), 1).otherwise(0))

    result_df = result_df.withColumn(
        'dist_sum',
        F.when(col('isStop') == 1,
               col('stop_distance')).otherwise(col('dist_sum')))

    windowval = (Window.partitionBy('userId').orderBy(
        'chunk', 'serial').rowsBetween(Window.currentRow,
                                       Window.unboundedFollowing))
    result_df = result_df.withColumn('isStop_cum',
                                     F.sum('isStop').over(windowval))

    result_df = result_df.groupBy('userId', 'isStop_cum').agg(
        F.first('latitude', ignorenulls=True).alias('latitude'),
        F.first('longitude', ignorenulls=True).alias('longitude'),
        F.first('begin', ignorenulls=True).alias('begin'),
        F.first('end', ignorenulls=True).alias('end'),
        F.first('personal_area', ignorenulls=True).alias('personal_area'),
        F.first('geohash6', ignorenulls=True).alias('geohash6'),
        F.sum('dist_sum').alias('prev_travelled_distance'),
        F.sum('after_stop_distance').alias('after_stop_distance'))

    # compute next distance, which is null if it's the last
    windowval = Window.partitionBy('userId').orderBy(F.desc('isStop_cum'))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.lead('prev_travelled_distance').over(windowval))
    result_df = result_df.withColumn(
        'next_travelled_distance',
        F.when((col('next_travelled_distance').isNull()) &
               (col('after_stop_distance') > 0),
               col('after_stop_distance')).otherwise(
                   col('next_travelled_distance')))

    # Drop nulls
    result_df = result_df.dropna(subset=['latitude']).drop('isStop_cum')

    # Transform latitude and longitude back to degrees
    result_df = result_df.withColumn('latitude', F.degrees('latitude'))
    result_df = result_df.withColumn('longitude', F.degrees('longitude'))

    # US states
    result_df = result_df.withColumn(
        "geohash5", F.expr("substring(geohash6, 1, length(geohash6)-1)"))
    result_df = result_df.join(F.broadcast(dfs_us_states),
                               on="geohash5",
                               how="inner").drop('geohash5')

    # lit partition data - enable only if added to partitionBy
    # result_df = result_df.withColumn('year', F.lit(year))
    # result_df = result_df.withColumn('month', F.lit(month))
    # result_df = result_df.withColumn('day', F.lit(day))

    # write
    out_partitions = len(US_STATES)
    result_df.repartition(out_partitions, "state").write.partitionBy(
        "state").format('parquet').mode("overwrite").save(local_dir + "/")

    # stats - enable only for debug!
    # num_records = result_df.count()
    # print(f"written {num_records} rows to "+local_dir)

    # if num_records == 0:
    #     raise Exception("Zero rows output")

    print("upload local data to azure")
    upload_time = time.time()

    # upload parts over states
    for state in US_STATES:
        print(f"upload files for {state}")
        state_dir = local_dir + "/state=" + state
        state_key = f"{partition_key}/state={state}/"

        if (os.path.isdir(state_dir)):
            files = [
                filename for filename in os.listdir(state_dir)
                if filename.startswith("part-")
            ]

            if len(files) > 0:

                for file_local in files:
                    file_path = state_dir + "/" + file_local
                    part_num = int(file_local.split('-')[1])
                    part_key = '{:05d}'.format(part_num)
                    # fix name as static hash to be reproducible
                    filename_hash = hashlib.sha1(
                        str.encode(state_key + part_key)).hexdigest()

                    blob_key = "{}/state={}/part-{}-{}.snappy.parquet".format(
                        blob_base, state, part_key, filename_hash)

                    print("upload " + file_path + " to " + container_out +
                          ":" + blob_key)

                    blob_client = blob_service_client.get_blob_client(
                        container_out, blob_key)

                    with open(file_path, "rb") as data:
                        blob_client.upload_blob(data, overwrite=True)

                    # cleanup
                    os.remove(file_path)
            else:
                print(f"no files to upload for {state}")

        else:
            print(f"missing partition for {state}")

    print("--- {} seconds elapsed ---".format(int(time.time() - start_time)))
    print()
    stop_time = time.time()
    spark.stop()

    end_time = time.time()
    print("Done in {} seconds (csv:{} read:{} spark:{} upload:{} stop:{})".
          format(int(end_time - start_time), int(read_time - csv_time),
                 int(spark_time - read_time), int(upload_time - spark_time),
                 int(stop_time - upload_time), int(end_time - stop_time)))
    print('Done.')
Example no. 31
def dist(long_x, lat_x, long_y, lat_y):
    return acos(
        sin(radians(lat_x)) * sin(radians(lat_y)) + 
        cos(radians(lat_x)) * cos(radians(lat_y)) * 
            cos(radians(long_x) - radians(long_y))
    ) * lit(6371.0)
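Unlike the other examples, this uses the spherical law of cosines rather than the haversine form; a usage sketch, assuming the trig helpers are imported from `pyspark.sql.functions` and the DataFrame and column names are illustrative:

# Sketch: great-circle distance in km via the spherical law of cosines.
from pyspark.sql.functions import acos, sin, cos, radians, lit, col

df_km = df.withColumn('dist_km', dist(col('lon1'), col('lat1'), col('lon2'), col('lat2')))  # df is hypothetical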