Example no. 1
import numpy as np
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import avg, lag, log, pow, sqrt


def calculate_volatility(rolling_windows=20):
    spark = SparkSession.builder.master('local[*]').appName('Volatility').getOrCreate()
    df = spark.read.format('csv')\
                     .option('header', 'true')\
                     .load('/media/guolewen/research_data/compustats/*.csv')
    # adjust price with stock/dividend split ratio
    df = df.withColumn('adjprccd', df['prccd'] / df['ajexdi'])
    df = df.withColumn('adjprchd', df['prchd'] / df['ajexdi'])
    df = df.withColumn('adjprcld', df['prcld'] / df['ajexdi'])
    # create window
    win_spec = Window.partitionBy('isin').orderBy('datadate')
    # lag price
    df = df.withColumn('ladjprccd', lag('adjprccd').over(win_spec))
    # compute squared daily log returns as the square of natural logarithm of
    # the current closing price divided by previous closing price.
    df = df.withColumn('retsq', pow(log(df['adjprccd'] / df['ladjprccd']), 2))
    # construct a 20-trading-day rolling window
    win_rolling = Window.partitionBy('isin').orderBy('datadate').rowsBetween(-rolling_windows, -1)
    # traditional volatility approach: square root of the average squared daily log returns over a 20-day rolling window
    df = df.withColumn('volatility', sqrt(avg('retsq').over(win_rolling)))
    # compute squared daily log high low as the square of natural logarithm
    # of daily high price divided by low price.
    # fill na values with 0 (handles days with no trading)
    df = df.withColumn('loghlsq', pow(log(df['adjprchd'] / df['adjprcld']), 2)).fillna(0, subset=['loghlsq'])
    # Parkinson's extreme value method: square root of 1/(4*ln 2) times the average of the squared daily
    # log high-low range over a 20-day rolling window
    df = df.withColumn('Parkinsonvol', sqrt((1/(4*np.log(2))) * avg('loghlsq').over(win_rolling)))
    return df.selectExpr('datadate as Date', 'isin as ISIN', 'volatility', 'Parkinsonvol').toPandas()
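For reference, a sketch of the two estimators the code above implements (with the default rolling_windows=20), where $r_s=\ln(P_s/P_{s-1})$ is the daily log return on adjusted closing prices and $H_s$, $L_s$ are the adjusted daily high and low:

$$\sigma^{\text{trad}}_t=\sqrt{\frac{1}{20}\sum_{s=t-20}^{t-1} r_s^{2}},\qquad \sigma^{\text{Parkinson}}_t=\sqrt{\frac{1}{4\ln 2}\cdot\frac{1}{20}\sum_{s=t-20}^{t-1}\Bigl(\ln\frac{H_s}{L_s}\Bigr)^{2}}$$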
Example no. 2
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions
        import math

        def get_values(l):
            return [j[0] for j in l]

        def assert_close(a, b):
            c = get_values(b)
            diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)]
            return sum(diff) == len(a)

        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos(df.a)).collect())
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos("a")).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df.a)).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df['a'])).collect())
        assert_close([math.pow(i, 2 * i) for i in range(10)],
                     df.select(functions.pow(df.a, df.b)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2.0)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot(df.a, df.b)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot("a", u"b")).collect())
        assert_close([math.hypot(i, 2) for i in range(10)],
                     df.select(functions.hypot("a", 2)).collect())
        assert_close([math.hypot(i, 2) for i in range(10)],
                     df.select(functions.hypot(df.a, 2)).collect())
Example no. 3
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions
        import math

        def get_values(l):
            return [j[0] for j in l]

        def assert_close(a, b):
            c = get_values(b)
            diff = [abs(v - c[k]) < 1e-6 for k, v in enumerate(a)]
            return sum(diff) == len(a)
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos(df.a)).collect())
        assert_close([math.cos(i) for i in range(10)],
                     df.select(functions.cos("a")).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df.a)).collect())
        assert_close([math.sin(i) for i in range(10)],
                     df.select(functions.sin(df['a'])).collect())
        assert_close([math.pow(i, 2 * i) for i in range(10)],
                     df.select(functions.pow(df.a, df.b)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2)).collect())
        assert_close([math.pow(i, 2) for i in range(10)],
                     df.select(functions.pow(df.a, 2.0)).collect())
        assert_close([math.hypot(i, 2 * i) for i in range(10)],
                     df.select(functions.hypot(df.a, df.b)).collect())
Example no. 4
    def _get_cosine_similarity(self, n_partitions=200):
        """Cosine similarity metric from

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

        The item indexes in the result are such that i1 <= i2.
        """
        if self.df_cosine_similarity is None:
            pairs = self._get_pairwise_items(df=self.train_df)
            item_count = self.train_df.groupBy(self.col_item).count()

            self.df_cosine_similarity = (pairs.groupBy(
                "i1", "i2").count().join(
                    item_count.select(
                        F.col(self.col_item).alias("i1"),
                        F.pow(F.col("count"), 0.5).alias("i1_sqrt_count"),
                    ),
                    on="i1",
                ).join(
                    item_count.select(
                        F.col(self.col_item).alias("i2"),
                        F.pow(F.col("count"), 0.5).alias("i2_sqrt_count"),
                    ),
                    on="i2",
                ).select(
                    "i1",
                    "i2",
                    (F.col("count") /
                     (F.col("i1_sqrt_count") * F.col("i2_sqrt_count"))).alias(
                         self.sim_col),
                ).repartition(n_partitions, "i1", "i2"))
        return self.df_cosine_similarity
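A sketch of the quantity this pipeline computes, assuming _get_pairwise_items returns one row per (user, i1, i2) co-rating: with $U_i$ the set of users who interacted with item $i$,

$$\text{sim}(i_1,i_2)=\frac{|U_{i_1}\cap U_{i_2}|}{\sqrt{|U_{i_1}|}\,\sqrt{|U_{i_2}|}}$$

i.e. the pair count divided by the product of the square roots of the two item counts.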
Example no. 5
def main(keyspace, table_name):
    df = spark.read.format('org.apache.spark.sql.cassandra').options(table = table_name, keyspace = keyspace).load()
    df.createOrReplaceTempView('nasa_weblogs')
    query = """SELECT host, 
                      COUNT(1) AS no_of_requests,
                      SUM(bytes) AS sum_request_bytes
                 FROM nasa_weblogs
             GROUP BY host"""
    df1 = spark.sql(query)
    df2 = df1.withColumn('squared_requests',functions.pow(df1.no_of_requests,2))
    df3 = df2.withColumn('squared_bytes',functions.pow(df2.sum_request_bytes,2)).drop(df2.host)
    df4 = df3.withColumn('request_mul_bytes',(df3.no_of_requests * df3.sum_request_bytes))
    df5 = df4.withColumn('sq_request_mul_bytes', functions.pow(df4.request_mul_bytes,2))
    df5.createOrReplaceTempView('corr_c')
    query1 = """SELECT SUM(no_of_requests) as xi, 
                      SUM(sum_request_bytes) AS yi, 
                      SUM(squared_requests) AS xi2,
                      SUM(squared_bytes) AS yi2,
                      SUM(request_mul_bytes) AS xiyi
                 FROM corr_c"""
    df6 = spark.sql(query1)
    df7 = df6.withColumn('corr_num', ((df1.count() * df6.xiyi) - (df6.xi * df6.yi)))
    df8 = df7.withColumn(
        'corr_den',
        (functions.sqrt((df1.count() * df6.xi2) - functions.pow(df6.xi, 2)) *
         functions.sqrt((df1.count() * df6.yi2) - functions.pow(df6.yi, 2))))
    df_corr = df8.withColumn('correlation', df8.corr_num / df8.corr_den)
    value = df_corr.select("correlation").collect()[0][0]
    print(f'r = {value}')
    print(f'r^2 = {value**2}')
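The five sums plus the row count df1.count() are exactly the terms of the textbook Pearson correlation that the last withColumn calls assemble, with $x_i$ the request count and $y_i$ the total bytes per host:

$$r=\frac{n\sum x_i y_i-\sum x_i\sum y_i}{\sqrt{n\sum x_i^{2}-\bigl(\sum x_i\bigr)^{2}}\;\sqrt{n\sum y_i^{2}-\bigl(\sum y_i\bigr)^{2}}}$$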
Example no. 6
    def test_math_functions(self):
        df = self.sc.parallelize([Row(a=i, b=2 * i) for i in range(10)]).toDF()
        from pyspark.sql import functions

        SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                                  df.select(functions.cos(df.a)).collect())
        SQLTestUtils.assert_close([math.cos(i) for i in range(10)],
                                  df.select(functions.cos("a")).collect())
        SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                                  df.select(functions.sin(df.a)).collect())
        SQLTestUtils.assert_close([math.sin(i) for i in range(10)],
                                  df.select(functions.sin(df["a"])).collect())
        SQLTestUtils.assert_close([math.pow(i, 2 * i) for i in range(10)],
                                  df.select(functions.pow(df.a,
                                                          df.b)).collect())
        SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                                  df.select(functions.pow(df.a, 2)).collect())
        SQLTestUtils.assert_close([math.pow(i, 2) for i in range(10)],
                                  df.select(functions.pow(df.a,
                                                          2.0)).collect())
        SQLTestUtils.assert_close(
            [math.hypot(i, 2 * i) for i in range(10)],
            df.select(functions.hypot(df.a, df.b)).collect(),
        )
        SQLTestUtils.assert_close(
            [math.hypot(i, 2 * i) for i in range(10)],
            df.select(functions.hypot("a", "b")).collect(),
        )
        SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                                  df.select(functions.hypot("a", 2)).collect())
        SQLTestUtils.assert_close([math.hypot(i, 2) for i in range(10)],
                                  df.select(functions.hypot(df.a,
                                                            2)).collect())
Example no. 7
    def update(self, df, value_col, count_col, mean_col):
        past_value = F.col(self.past_features_column)[value_col]
        current_value = F.col(self.current_features_column)[value_col]
        past_count = F.col(self.past_features_column)[count_col]
        current_count = F.col(self.current_features_column)[count_col]
        past_mean = F.col(self.past_features_column)[mean_col]
        current_mean = F.col(self.current_features_column)[mean_col]

        updated_mean = (
            past_count * past_value + current_count * current_value
        ) / (past_count + current_count)

        updated_past_counts = (past_count - 1) * F.pow(past_value, F.lit(2))
        updated_current_counts = (current_count - 1) * F.pow(
            current_value, F.lit(2)
        )
        updated_past_means = past_count * F.pow(
            (past_mean - updated_mean), F.lit(2)
        )
        updated_current_means = current_count * F.pow(
            (current_mean - updated_mean), F.lit(2)
        )
        population_size = past_count + current_count - 1

        return df.withColumn(
            self.updated_feature_col_name,
            ((
                updated_past_counts +
                updated_current_counts +
                updated_past_means +
                updated_current_means
            ) / population_size).cast('float')
        )
Example no. 8
def main(keyspace, table):
    df = spark.read.format("org.apache.spark.sql.cassandra") \
    .options(table=table, keyspace=keyspace).load()
    logs_df = df.select(df['host'],
                        df['bytes'].cast(types.IntegerType())).withColumn(
                            'count', functions.lit(1))
    logs_df_sum = logs_df.groupBy('host').sum().withColumnRenamed(
        'sum(bytes)', 'y').withColumnRenamed('sum(count)', 'x')
    six_values = logs_df_sum.withColumn(
        'x_square', functions.pow(logs_df_sum['x'], 2)).withColumn(
            'y_square', functions.pow(logs_df_sum['y'], 2)).withColumn(
                'xy', logs_df_sum['x'] * logs_df_sum['y']).withColumn(
                    '1', functions.lit(1))
    six_sums = six_values.groupBy().sum().cache()
    numerator = six_sums.select(
        (six_sums['sum(xy)'] * six_sums['sum(1)'] -
         six_sums['sum(x)'] * six_sums['sum(y)'])).collect()
    denominator_1 = six_sums.select(
        functions.sqrt(six_sums['sum(1)'] * six_sums['sum(x_square)'] -
                       functions.pow(six_sums['sum(x)'], 2))).collect()
    denominator_2 = six_sums.select(
        functions.sqrt(six_sums['sum(1)'] * six_sums['sum(y_square)'] -
                       functions.pow(six_sums['sum(y)'], 2))).collect()
    result = numerator[0][0] / (denominator_1[0][0] * denominator_2[0][0])
    # result=logs_df_sum.corr('x','y')
    result_sqr = result**2
    print('r=', result)
    print('r^2=', result_sqr)
Example no. 9
def benchmarkCalculatePiUsingDF(spark, samples, parallelism, jobLogger):
    def inside(p):
        x, y = random.random(), random.random()
        return x * x + y * y < 1

    jobLogger.info(
        '****************************************************************')
    jobLogger.info(
        'Starting benchmark test calculating Pi via dataframe manipulations '
        'with {0:,} samples'.format(samples))

    start_time = timer()

    # Note that the random seed for each of the columns must be different otherwise
    # each column will have identical values on each row
    pi_df = (spark.range(0, samples, numPartitions=parallelism).withColumn(
        'x', F.rand(seed=8675309)
    ).withColumn('y', F.rand(seed=17760704)).withColumn(
        'within_circle',
        F.when(
            (F.pow(F.col('x'), F.lit(2)) + F.pow(F.col('y'), F.lit(2)) <= 1.0),
            F.lit(1).cast(T.LongType())).otherwise(
                F.lit(0).cast(T.LongType()))).agg(
                    F.sum('within_circle').alias('count_within_circle'),
                    F.count('*').alias('count_samples')))
    res = pi_df.collect()
    pi_val = 4.0 * (res[0].count_within_circle) / (res[0].count_samples)
    end_time = timer()
    return (end_time - start_time), pi_val
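The returned estimate is the standard Monte Carlo identity: the fraction of uniformly random points in the unit square that land inside the quarter circle $x^2+y^2\le 1$ approximates its area $\pi/4$, so

$$\pi\approx 4\cdot\frac{\text{count\_within\_circle}}{\text{count\_samples}}$$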
Example no. 10
def add_datediff(df, date_col, start_date):
    """添加日期序号
    """
    df = df.withColumn('datediff',
                       F.datediff(F.col(date_col), F.lit(start_date)))
    df = df.withColumn('datediff_square', F.pow(F.col('datediff'), 2))
    df = df.withColumn('datediff_square_root', F.pow(F.col('datediff'), 0.5))
    return df
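A minimal usage sketch; the SparkSession named spark and the sale_date column are assumptions for illustration, not part of the original module:

# hypothetical input with a single date string column
df = spark.createDataFrame([("2021-01-11",)], ["sale_date"])
add_datediff(df, "sale_date", "2021-01-01").show()
# expected: datediff=10, datediff_square=100.0, datediff_square_root~3.162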
Example no. 11
def haversine(lng1: Column, lat1: Column, lng2: Column, lat2: Column):
    radius = 6378137
    # convert degrees to radians
    radLng1 = f.radians(lng1)
    radLat1 = f.radians(lat1)
    radLng2 = f.radians(lng2)
    radLat2 = f.radians(lat2)

    result = f.asin(
        f.sqrt(
            f.pow(f.sin((radLat1 - radLat2) / 2.0), 2) +
            f.cos(radLat1) * f.cos(radLat2) *
            f.pow(f.sin((radLng1 - radLng2) / 2.0), 2))) * 2.0 * radius
    return result
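A minimal usage sketch, assuming an active SparkSession named spark and that f is pyspark.sql.functions as in the snippet; the coordinates and column names are made up for illustration:

# one row with two (longitude, latitude) pairs in decimal degrees
df = spark.createDataFrame([(116.40, 39.90, 121.47, 31.23)],
                           ["lng1", "lat1", "lng2", "lat2"])
df.select(haversine(f.col("lng1"), f.col("lat1"),
                    f.col("lng2"), f.col("lat2")).alias("distance_m")).show()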
Example no. 12
def dailytrades(symbol, date_string, tsunit):
    '''return table of trades'''
    s = '''SELECT
            time,
            trade_size AS quantity,
            price,
            buy_broker,
            sell_broker,
            trade_condition,
            time,
            date_trunc('%s', time) as timed,
            ROW_NUMBER() OVER (ORDER BY time) row
        FROM trades 
        WHERE symbol = '%s' 
            AND date_string = '%s' 
            AND price > 0
        ORDER BY time ASC'''
    sargs = (tsunit, symbol, date_string)
    spark.sql(s % sargs).createOrReplaceTempView('tradetemp')

    s = '''SELECT
            a.*, 
            a.price - b.price AS price_diff,
            LOG(a.price/b.price) AS logreturn
        FROM tradetemp a
        LEFT JOIN tradetemp b
            ON a.row = b.row-1'''
    df = spark.sql(s)
    df = df.withColumn('price_diff2', F.pow(df.price_diff, 2))
    return df
Example no. 13
def addErrorCols(transformedFull,
                   col_target,
                   col_predict,
                   verbose,
                   logger):    
    try:
        if verbose:
            logger.info('Add error columns to spark df start, function add_error_cols()')
        transformedFull = transformedFull\
                         .select('*', abs((transformedFull[col_target] - transformedFull[col_predict])
                                         /transformedFull[col_target]*100)\
                         .alias(col_target+'_APE'))
        transformedFull = transformedFull\
                         .select('*', abs((transformedFull[col_target] - transformedFull[col_predict]))\
                         .alias(col_target+'_AE'))
        transformedFull = transformedFull\
                         .select('*', pow(transformedFull[col_target] - transformedFull[col_predict],2)\
                         .alias(col_target+'_SE'))

        if verbose:
            logger.info('Add error columns to spark df end')
    except Exception:
        logger.exception("Fatal error in add_error_cols()")
        raise
    return transformedFull
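The three appended columns are per-row error metrics, with $y$ the target and $\hat y$ the prediction:

$$\text{APE}=\Bigl|\frac{y-\hat y}{y}\Bigr|\cdot 100,\qquad \text{AE}=|y-\hat y|,\qquad \text{SE}=(y-\hat y)^{2}$$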
Example no. 14
def geo_density(df):
    productOfCardinalities = multiply_list(get_cardinalities(new_subtensor))
    if productOfCardinalities == 0:
        return -1

    return df.select(
        F.sum('measure') / F.pow(F.lit(productOfCardinalities), 1.0 /
                                 (len(cols) - 1))).collect()[0][0]
Example no. 15
def add_median_salary(df, *groups, sort_field="-avg_salary"):
    window = Window.partitionBy(*groups)
    rank_spec = window.orderBy(sort(sort_field))
    df = df.withColumn("percent_rank", F.percent_rank().over(rank_spec))
    # order by distance from the group median (percent_rank closest to 0.5)
    median_spec = window.orderBy(F.pow(df.percent_rank - 0.5, 2))
    df = df.withColumn("avg_salary", F.first("avg_salary").over(median_spec))
    return df
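Why ordering by the squared distance from 0.5 picks the median: within each partition percent_rank runs from 0 to 1, so the row whose rank is closest to 0.5 is the (approximate) median row. For example, with five rows the ranks are 0, 0.25, 0.5, 0.75 and 1, the values of $(\text{percent\_rank}-0.5)^2$ are 0.25, 0.0625, 0, 0.0625 and 0.25, and F.first over that ordering therefore returns the middle row's avg_salary.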
Example no. 16
def squares(range_):
    return (
        range_
        .withColumn(
            'squares',
            F.pow(F.col('id'), F.lit(2))
        )
    )
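A minimal usage sketch, assuming an active SparkSession named spark (F.pow returns doubles):

squares(spark.range(5)).show()
# id: 0..4, squares: 0.0, 1.0, 4.0, 9.0, 16.0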
Example no. 17
def df(train: DataFrame):
    """
    Some experiments
    """
    cols = train.columns
    print(cols)
    t1 = train.withColumn("mse",
                          F.pow((F.col("Sales_Pred") - F.col("sales")), 2))
    t1.show()
Example no. 18
def wgs84_to_rds(latitude_col: str,
                 longitude_col: str) -> [pyspark.sql.column.Column]:
    """
    Creates two column definitions based on lat/lon column names

    Example usage:
    ```
    # df = yourdataframe_with_columns_lat_and_lon
    x, y = wgs84_to_rds('lat', 'lon')

    # method 1: select
    df.select(
        '*',
        x.alias('rd_x'),
        y.alias('rd_y')
    )

    # method 2: withColumn
    df \
        .withColumn('rd_x', x) \
        .withColumn('rd_y', y)

    # method 3: one-liner
    df.select('*', *wgs84_to_rds('lat', 'lon'))
    ```

    :param latitude_col: name of latitude column
    :param longitude_col: name of longitude column
    :returns: tuple column definitions for x and y
    """
    d_lat = 0.36 * (f.col(latitude_col) - phi0)
    d_lon = 0.36 * (f.col(longitude_col) - lam0)

    x = x0
    for i, v in enumerate(Rpq):
        x += v * f.pow(d_lat, Rp[i]) * f.pow(d_lon, Rq[i])

    y = y0
    for i, v in enumerate(Spq):
        y += v * f.pow(d_lat, Sp[i]) * f.pow(d_lon, Sq[i])

    return x, y
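The loops build what appears to be the RD (Rijksdriehoek) power-series approximation; a sketch of what the returned column expressions compute, assuming the coefficient tables Rpq, Rp, Rq, Spq, Sp, Sq and the base point phi0, lam0, x0, y0 are defined at module level:

$$X=X_0+\sum_i R_{pq}[i]\,\Delta\varphi^{R_p[i]}\,\Delta\lambda^{R_q[i]},\qquad Y=Y_0+\sum_i S_{pq}[i]\,\Delta\varphi^{S_p[i]}\,\Delta\lambda^{S_q[i]}$$

with $\Delta\varphi=0.36(\varphi-\varphi_0)$ and $\Delta\lambda=0.36(\lambda-\lambda_0)$.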
Example no. 19
def AvgSqError(spark, df, geolevels, queries, schema):
    # This function calculates the average squared error for levels at each geounit and geolevel
    u = sdftools.getAnswers(spark, df, geolevels, schema, queries)
    print("u is")
    u.show()
    # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
    u = u.withColumn('diff', sf.col('priv') - sf.col('orig'))
    u = u.withColumn('sq', sf.lit(2))
    u = u.withColumn('sq diff', sf.pow(sf.col('diff'), sf.col('sq')))
    u = u.groupBy(['geocode', 'geolevel', 'level']).avg()
    return u
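Per (geocode, geolevel, level) group, the value averaged in the final groupBy(...).avg() is the squared error, so the result is the mean squared error over the $k$ rows of each group:

$$\text{AvgSqErr}=\frac{1}{k}\sum_{j=1}^{k}\bigl(\text{priv}_j-\text{orig}_j\bigr)^{2}$$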
Example no. 20
def main(topic):
    messages = spark.readStream.format('kafka') \
        .option('kafka.bootstrap.servers', '199.60.17.210:9092,199.60.17.193:9092') \
        .option('subscribe', topic).load()
    values = messages.select(messages['value'].cast('string'))
    values = values.withColumn('tmp', f.split(values['value'], " "))
    xy = values.select(values['tmp'].getItem(0).alias('x'),
                       values['tmp'].getItem(1).alias('y'))
    sums = xy.select(f.sum('x').alias('sum_x'), f.sum('y').alias('sum_y'), (f.sum(xy['x']*xy['y'])).alias('sum_xy'), \
        f.count('x').alias('n'), f.sum(f.pow(xy['x'],2)).alias('sum_x_square'))
    results = sums.withColumn(
        'slope',
        ((sums['sum_xy'] - (1 / sums['n']) * sums['sum_x'] * sums['sum_y']) /
         (sums['sum_x_square'] - (1 / sums['n']) * (f.pow(sums['sum_x'], 2)))))
    results = results.withColumn(
        'intercept', (results['sum_y'] / results['n']) - results['slope'] *
        (results['sum_x'] / results['n']))
    final = results.drop('sum_x', 'sum_y', 'sum_xy', 'n', 'sum_x_square')
    stream = final.writeStream.format('console').option(
        "truncate", "false").outputMode('complete').start()
    stream.awaitTermination(60)
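The aggregated sums feed the closed-form simple-linear-regression estimates that the last two withColumn calls compute:

$$\hat\beta=\frac{\sum xy-\tfrac{1}{n}\sum x\sum y}{\sum x^{2}-\tfrac{1}{n}\bigl(\sum x\bigr)^{2}},\qquad \hat\alpha=\bar y-\hat\beta\,\bar x$$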
Example no. 21
def main(in_directory):
    logs = spark.createDataFrame(create_row_rdd(in_directory))
    num_points = logs.count()

    grouped_host = logs.groupBy('hostname').agg(functions.count('hostname').alias('xi'),\
                                                functions.sum('bytes').alias('yi'),\
                                                functions.pow(functions.count('hostname'), 2).alias('xi^2'),\
                                                functions.pow(functions.sum('bytes'), 2).alias('yi^2'),\
                                                (functions.sum('bytes')*functions.count('hostname')).alias('xiyi')).cache()
    grouped_host = grouped_host.groupBy().agg(functions.count('hostname').alias('n'),\
                                    functions.sum('xi').alias('sum_xi'),\
                                    functions.sum('yi').alias('sum_yi'),\
                                    functions.sum('xi^2').alias('sum_xi^2'),\
                                    functions.sum('yi^2').alias('sum_yi^2'),\
                                    functions.sum('xiyi').alias('sum_xiyi'))

    # TODO: calculate r.

    n, sum_x, sum_y, sum_x_2, sum_y_2, sum_xy = grouped_host.first() 
    r = (n*sum_xy - sum_x*sum_y) / (((n*sum_x_2-sum_x*sum_x) ** 0.5) * ((n*sum_y_2 - sum_y*sum_y) ** 0.5))
    print("r = %g\nr^2 = %g" % (r, r**2))
Example no. 22
def main(inputs):

    rdd = sc.textFile(inputs)
    rdd_filtered = rdd.flatMap(reg_exp)
    rdd_1 = rdd_filtered.map(lambda x: (x[0], x[3]))
    df = spark.createDataFrame(rdd_1, ['hostname', 'bytes'])
    df.createOrReplaceTempView('nasa_weblogs')
    query = """SELECT hostname, 
                      COUNT(1) AS no_of_requests,
                      SUM(bytes) AS sum_request_bytes
                 FROM nasa_weblogs
             GROUP BY hostname"""
    df1 = spark.sql(query)
    df2 = df1.withColumn('squared_requests',
                         functions.pow(df1.no_of_requests, 2))
    df3 = df2.withColumn('squared_bytes',
                         functions.pow(df2.sum_request_bytes,
                                       2)).drop(df2.hostname)
    df4 = df3.withColumn('request_mul_bytes',
                         (df3.no_of_requests * df3.sum_request_bytes))
    df5 = df4.withColumn('sq_request_mul_bytes',
                         functions.pow(df4.request_mul_bytes, 2))
    df5.createOrReplaceTempView('corr_c')
    query1 = """SELECT SUM(no_of_requests) as xi, 
                      SUM(sum_request_bytes) AS yi, 
                      SUM(squared_requests) AS xi2,
                      SUM(squared_bytes) AS yi2,
                      SUM(request_mul_bytes) AS xiyi
                 FROM corr_c"""
    df6 = spark.sql(query1)
    df7 = df6.withColumn('corr_num',
                         ((df1.count() * df6.xiyi) - (df6.xi * df6.yi)))
    df8 = df7.withColumn(
        'corr_den',
        (functions.sqrt((df1.count() * df6.xi2) - functions.pow(df6.xi, 2)) *
         functions.sqrt((df1.count() * df6.yi2) - functions.pow(df6.yi, 2))))
    df_corr = df8.withColumn('correlation', df8.corr_num / df8.corr_den)
    value = df_corr.select("correlation").collect()[0][0]
    print(f'r = {value}')
    print(f'r^2 = {value**2}')
Example no. 23
def queryLp(df, p, groupby=[AC.GEOLEVEL, AC.GEOCODE, AC.QUERY, AC.RUN_ID, AC.PLB, AC.BUDGET_GROUP]):
    """
    Calculates the L^p-norm for each unique (GEOLEVEL, GEOCODE, QUERY, RUN_ID, PLB, BUDGET_GROUP) group
    """
    sdftools.show(p, "Value of p in the L^p metric")
    if p == "inf":
        # 'priv' means "protected via the differential privacy routines in this code base" variable to be renamed after P.L.94-171 production
        df = df.withColumn("L^inf_norm", sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG))).persist()
        sdftools.show(df, "L^inf_norm as | protected - orig | before taking the max")
        df = df.groupBy(groupby).agg(sf.max(sf.col("L^inf_norm"))).persist()
        sdftools.show(df, "L^inf_norm after taking the max per group")
        df = sdftools.stripSQLFromColumns(df).persist()
    else:
        df = df.withColumn(f"L^{p}", sf.pow(sf.abs(sf.col(AC.PRIV) - sf.col(AC.ORIG)), sf.lit(p))).persist()
        sdftools.show(df, f"L^{p} after taking | protected - orig | ^ {p}")
        df = df.groupBy(groupby).sum().persist()
        sdftools.show(df, f"L^{p} after groupby and sum")
        df = sdftools.stripSQLFromColumns(df).persist()
        df = df.withColumn(f"L^{p}_norm", sf.pow(sf.col(f"L^{p}"), sf.lit(1/p))).persist()
        sdftools.show(df, f"L^{p} after taking {p}-th root of the sum")
        df = sdftools.stripSQLFromColumns(df).persist()
    return df
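In formula form, for each group the non-infinite branch computes

$$L^{p}=\Bigl(\sum_i\lvert\text{priv}_i-\text{orig}_i\rvert^{p}\Bigr)^{1/p},$$

while the "inf" branch computes $L^{\infty}=\max_i\lvert\text{priv}_i-\text{orig}_i\rvert$.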
Example no. 24
def cal_movies_similarities(movie_ratings_df: DataFrame):
    """
    Calculate similarity of movie pairs based on their co-occurrence
    and the cosine similarity of their ratings when watched by same person.
    :param movie_ratings_df:
    :return:
    """
    # Find all pair of different movies watched by the same person.
    # func.col('mr1.movie_id') < func.col('mr2.movie_id') to avoid duplication.
    # Parentheses are mandatory for compound conditions (e.g. combining with & or |).
    ratings_pairs_df = movie_ratings_df.alias('mr1'). \
        join(movie_ratings_df.alias('mr2'),
             (func.col('mr1.user_id') == func.col('mr2.user_id')) & (
                         func.col('mr1.movie_id') < func.col('mr2.movie_id'))). \
        select(
        func.col('mr1.movie_id').alias('movie_id_1'),
        func.col('mr2.movie_id').alias('movie_id_2'),
        func.col('mr1.rating').alias('rating_1'),
        func.col('mr2.rating').alias('rating_2')
    )

    # Calculate dot product (numerator) and magnitude (denominator) of cosine similarity equation.
    # Each movie is considered a vector of its ratings.
    ratings_pairs_df = ratings_pairs_df.groupBy('movie_id_1', 'movie_id_2'). \
        agg(func.sum(func.col('rating_1') * func.col('rating_2')).alias('sim_dot_product'),
            (func.sqrt(func.sum(func.pow(func.col('rating_1'), 2))) * func.sqrt(
                func.sum(func.pow(func.col('rating_2'), 2)))).alias('sim_magnitude'),
            func.count(func.col('movie_id_1')).alias('co_occurrence_count')
            )

    # Calculate cosine similarity as a new column:
    # (dot product of the two rating vectors / product of their magnitudes)
    movies_similarities_df = ratings_pairs_df. \
        withColumn('similarity_score',
                   func.when(func.col('sim_magnitude') != 0,
                             func.col('sim_dot_product') / func.col('sim_magnitude')).otherwise(0)
                   ).select('movie_id_1', 'movie_id_2', 'similarity_score', 'co_occurrence_count')

    return movies_similarities_df
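With each movie of a pair treated as the vector of ratings given by the users who rated both, the score assembled above is the cosine similarity

$$\text{sim}(m_1,m_2)=\frac{\sum_u r_{u,m_1}\,r_{u,m_2}}{\sqrt{\sum_u r_{u,m_1}^{2}}\;\sqrt{\sum_u r_{u,m_2}^{2}}}$$

with a fallback of 0 when the denominator is 0.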
Example no. 25
def main(inputs, output):
    observation_schema = types.StructType([
        types.StructField('hostname', types.StringType(), False),
        types.StructField('datetime', types.StringType(), False),
        types.StructField('path', types.StringType(), False),
        types.StructField('bytecount', types.IntegerType(), False),
        ])

    web_logs = sc.textFile(inputs)
    web_logs1 = web_logs.map(match_pattern)
    web_logs2 = web_logs1.filter(lambda x: x is not None)
    #web_logs2.saveAsTextFile(output)
    web_df = spark.createDataFrame(web_logs2, schema = observation_schema)
    web_df1 = web_df.select('hostname','bytecount')
    #web_df1.show()
    #web_df1.createOrReplaceTempView('web_df1')
    #print(web_df1.printSchema())
    web_df2 = web_df1.groupBy('hostname').agg(functions.count('hostname').alias('x'))
    web_df2 = web_df2.withColumnRenamed('hostname', 'hostname1')
    #web_df2.show()
    web_df3 = web_df1.groupBy('hostname').agg(functions.sum('bytecount').alias('y'))
    #web_df3.show()
    web_df_join = web_df2.join(web_df3, web_df2.hostname1 == web_df3.hostname)
    #web_df_join.show()
    web_df4 = web_df_join.select('hostname','x','y').withColumn('one_val',functions.lit(1))
    #web_df4.show()
    web_df5 = web_df4.groupBy('hostname', 'x', 'y', 'one_val').agg(
        functions.pow(web_df4.x, 2).alias('x2'),
        functions.pow(web_df4.y, 2).alias('y2')).cache()
    web_df5 = web_df5.withColumn('xy', web_df5.x * web_df5.y).cache()
    #web_df5.show()
    sum_one = web_df5.groupBy().sum('one_val','x','y','x2','y2','xy').collect()
    #print(sum_one)
    one_val = sum_one[0][0]
    print('sum_1s: ',one_val)
    x = sum_one[0][1]
    print('sum_x: ', x)
    y = sum_one[0][2]
    print('sum_y: ', y)
    x2 = sum_one[0][3]
    print('sum_x2: ' ,x2)
    y2 = sum_one[0][4] 
    print('sum_y2: ', y2)  
    xy = sum_one[0][5]
    print('sum_xy: ',xy)
    r1 = ((one_val * xy) - (x * y))
    #print('r1: ', r1)
    r2 = (sqrt((one_val * x2) - (x ** 2))) * (sqrt((one_val * y2) - (y ** 2)))
    #print('r2: ',r2)
    r = r1 / r2
    print('r: ', r)
    r2 = r ** 2
    print('r^2: ',r2)
Example no. 26
def dailycbbo(symbol, date_string, tsunit):
    '''return table of consolidated order
    Args:
        tsunit: one of ["HOUR", "MINUTE", "SECOND"]
            https://spark.apache.org/docs/2.3.0/api/sql/#date_trunc
    '''
    s = '''SELECT
            bid_price,
            bid_size,
            ask_price,
            ask_size,
            ask_price - bid_price AS spread,
            (bid_price + ask_price) / 2 AS mid_price,
            ((ask_size * bid_price) + (bid_size * ask_price)) / (ask_size + bid_size) AS weighted_price,
            time,
            date_trunc('%s', time) AS timed,
            ROW_NUMBER() OVER (ORDER BY time) row
        FROM cbbo
        WHERE symbol = '%s' 
            AND date_string = '%s' 
        ORDER BY time ASC'''
    sargs = (tsunit, symbol, date_string)
    dftemp = spark.sql(s%sargs)
    dftemp.createOrReplaceTempView('cbbotemp')

    s = '''SELECT
            a.*, 
            a.mid_price - b.mid_price AS mid_price_diff,
            LOG(a.mid_price/b.mid_price) AS mid_price_logreturn,
            a.weighted_price - b.weighted_price AS weighted_price_diff,
            LOG(a.weighted_price/b.weighted_price) AS weighted_price_logreturn
        FROM cbbotemp a
        LEFT JOIN cbbotemp b
            ON a.row = b.row-1'''
    dftemp = spark.sql(s)
    dftemp = dftemp.withColumn('mid_price_diff2', F.pow(dftemp.mid_price_diff, 2))
    dftemp = dftemp.withColumn('weighted_price_diff2', F.pow(dftemp.weighted_price_diff, 2))
    return dftemp
Example no. 27
def main(inputs):
    server_logs = sc.textFile(inputs)
    server_logs_dis = server_logs.map(disassemble)
    server_logs_dis = server_logs_dis.filter(lambda x: len(x) == 6)
    log_schema = types.StructType([
        types.StructField('empty_1', types.StringType()),
        types.StructField('host_name', types.StringType()),
        types.StructField('datetime', types.StringType()),
        types.StructField('requested_path', types.StringType()),
        types.StructField('bytes', types.StringType()),
        types.StructField('empty_2', types.StringType()),
    ])
    df = spark.createDataFrame(server_logs_dis, schema=log_schema)
    logs_df = df.select(df['host_name'],
                        df['bytes'].cast(types.IntegerType())).withColumn(
                            'count', functions.lit(1))
    logs_df_sum = logs_df.groupBy('host_name').sum().withColumnRenamed(
        'sum(bytes)', 'y').withColumnRenamed('sum(count)', 'x')
    six_values = logs_df_sum.withColumn(
        'x_square', functions.pow(logs_df_sum['x'], 2)).withColumn(
            'y_square', functions.pow(logs_df_sum['y'], 2)).withColumn(
                'xy', logs_df_sum['x'] * logs_df_sum['y']).withColumn(
                    '1', functions.lit(1))
    six_sums = six_values.groupBy().sum().cache()
    numerator = six_sums.select(
        (six_sums['sum(xy)'] * six_sums['sum(1)'] -
         six_sums['sum(x)'] * six_sums['sum(y)'])).collect()
    denominator_1 = six_sums.select(
        functions.sqrt(six_sums['sum(1)'] * six_sums['sum(x_square)'] -
                       functions.pow(six_sums['sum(x)'], 2))).collect()
    denominator_2 = six_sums.select(
        functions.sqrt(six_sums['sum(1)'] * six_sums['sum(y_square)'] -
                       functions.pow(six_sums['sum(y)'], 2))).collect()
    result = numerator[0][0] / (denominator_1[0][0] * denominator_2[0][0])
    # result=logs_df_sum.corr('x','y')
    result_sqr = result**2
    print('r=', result)
    print('r^2=', result_sqr)
Example no. 28
def add_distance_column(dfs, order_column='timestamp'):
    # Radians lat/lon
    dfs = dfs.withColumn('latitude2', F.radians('latitude')).withColumn(
        'longitude2', F.radians('longitude'))

    # Group GPS locations into chunks. A chunk is formed by groups of points that are no more than roam_dist apart
    w = Window.partitionBy(['userID']).orderBy(order_column)
    dfs = dfs.withColumn('next_lat', F.lead('latitude2', 1).over(w))
    dfs = dfs.withColumn('next_lon', F.lead('longitude2', 1).over(w))

    # Haversine distance
    dfs = dfs.withColumn(
        'distance_next', EARTH_RADIUS * 2 * F.asin(
            F.sqrt(
                F.pow(F.sin((col('next_lat') - col('latitude2')) / 2.0), 2) +
                F.cos('latitude2') * F.cos('next_lat') *
                F.pow(F.sin((col('next_lon') - col('longitude2')) / 2.0), 2))))
    dfs = dfs.withColumn(
        'distance_prev',
        F.lag('distance_next',
              default=0).over(w)).drop('latitude2').drop('longitude2').drop(
                  'next_lon').drop('next_lat').drop('distance_next')
    return dfs
Example no. 29
def main(in_directory):
    logs = spark.createDataFrame(create_row_rdd(in_directory))
    # TODO: calculate r.
    groups = logs.groupby(logs.hostname)
    hosts1 = groups.agg({'bytes': 'count'})
    hosts1 = hosts1.cache()
    hosts2 = groups.agg({'bytes': 'sum'})
    hosts2 = hosts2.cache()
    hosts = hosts1.join(hosts2, on='hostname')
    hosts.show()
    groups = hosts.groupby().agg(
        functions.count(hosts.hostname), functions.sum(hosts['count(bytes)']),
        functions.sum(functions.pow(hosts['count(bytes)'], 2)),
        functions.sum(hosts['sum(bytes)']),
        functions.sum(functions.pow(hosts['sum(bytes)'], 2)),
        functions.sum(hosts['count(bytes)'] * hosts['sum(bytes)']))

    groups.show()

    # n = logs.count()
    # x = functions.sum(hosts['count(bytes)']).first()
    # x2 = functions.sum(functions.pow(hosts['count(bytes)'], 2)).first()
    # y = functions.sum(hosts['sum(bytes)']).first()
    # y2 = functions.sum(functions.pow(hosts['sum(bytes)'], 2)).first()
    # xy = functions.sum(hosts['count(bytes)'] * hosts['sum(bytes)']).first()

    a = groups.first()
    n = a[0]
    x = a[1]
    x2 = a[2]
    y = a[3]
    y2 = a[4]
    xy = a[5]

    r = (n * xy - x * y) / (math.sqrt(n * x2 - x**2) *
                            math.sqrt(n * y2 - y**2))
    print("r = %g\nr^2 = %g" % (r, r**2))
Example no. 30
    def distance(lat, lon, lat2, lon2):
        '''
        Uses the "haversine" formula to calculate the distance between two points
        using their latitude and longitude

        Parameters
        ----------
        lat: latitude co-ordinate using signed decimal degrees without compass direction for first location 
        lon: longitude co-ordinate using signed decimal degrees without compass direction for first location 
        lat2: latitude co-ordinate using signed decimal degrees without compass direction for second location 
        lon2: longitude co-ordinate using signed decimal degrees without compass direction for second location 

        Returns
        -------
        Returns distance between two points
    
    
        Notes
        -----
        Haversine formula
        Δφ = φ1 - φ2
        Δλ = λ1 - λ2
        a = sin²(Δφ/2) + cos φ1 ⋅ cos φ2 ⋅ sin²(Δλ/2)
        c = 2 ⋅ atan2( √a, √(1−a) )
        d = R ⋅ c
        φ -> latitude 
        λ -> longitude
        R -> 6371
        '''
        
        R = 6371
        delta_lat = lat - lat2
        delta_lon = lon - lon2
        a = pow(sin(toRadians(delta_lat/2)),2) + cos(toRadians(lat)) * cos(toRadians(lat2)) * pow(sin(toRadians(delta_lon/2)),2)
        c = 2 * atan2(pow(a,0.5) , pow(1-a, 0.5) )
        d = R * c
        return d
Example no. 31
def join_and_analyze(df_poi,df_sample):
    """ Joins the Requests data and POI list data, calculates distance between POI Centers
    and retains the record with the minimum distance to a particular POI center
    
    Parameters: df_poi: POI List dataframe
                df_sample: Requests dataframe
    
    """
    # Since there are no matching fields between the datasets, a cartesian product (cross join) is used to combine them
    df_joined = df_sample.crossJoin(df_poi)
    # Caching to memory
    df_joined.cache()
    # Applying the Haversine formula to determine distance between coordinate pairs
    df_joined = df_joined.withColumn("a", (
    F.pow(F.sin(F.radians(F.col("POI_Latitude") - F.col("Latitude")) / 2), 2) +
    F.cos(F.radians(F.col("Latitude"))) * F.cos(F.radians(F.col("POI_Latitude"))) *
    F.pow(F.sin(F.radians(F.col("POI_Longitude") - F.col("Longitude")) / 2), 2)
    )).withColumn("distance", F.atan2(F.sqrt(F.col("a")), F.sqrt(-F.col("a") + 1)) * 2 * 6371)
    
    # Applying window function to retain the records with the least distance to a POI center
    w = Window.partitionBy('_ID')
    df_joined = df_joined.withColumn('min', F.min('distance').over(w)) \
        .where(F.col('distance') == F.col('min')) \
        .drop('min').drop('a')

    return df_joined
  .where("isExpensive")\
  .select("unitPrice", "isExpensive").show(5)


# COMMAND ----------

from pyspark.sql.functions import expr
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
  .where("isExpensive")\
  .select("Description", "UnitPrice").show(5)


# COMMAND ----------

from pyspark.sql.functions import expr, pow, col
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)


# COMMAND ----------

df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, round, bround

df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)
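As a note on the expected output (general Spark behaviour, not specific to this snippet): round rounds half up while bround uses banker's rounding (round half to even), so for the literal "2.5" the two calls above return 3.0 and 2.0 respectively.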