Example 1
    def test_optimize_zorder_by_w_partition_filter(self) -> None:
        # write an unoptimized delta table
        writer = self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
            .withColumn("col1", floor(col("value") % 7)) \
            .withColumn("col2", floor(col("value") % 27)) \
            .withColumn("p", floor(col("value") % 10)) \
            .repartition(4).write.partitionBy("p")

        writer.format("delta").save(self.tempFile)

        # create DeltaTable
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # execute Z-OrderBy
        optimizer = dt.optimize().where("p = 2")
        result = optimizer.executeZOrderBy(["col1", "col2"])
        metrics = result.select("metrics.*").head()

        # assertions (partition 'p = 2' has four files)
        self.assertTrue(metrics.numFilesAdded == 1)
        self.assertTrue(metrics.numFilesRemoved == 4)
        self.assertTrue(metrics.totalFilesSkipped == 0)
        self.assertTrue(metrics.totalConsideredFiles == 4)
        self.assertTrue(metrics.zOrderStats.strategyName == 'all')
        self.assertTrue(metrics.zOrderStats.numOutputCubes == 1)
Example 2
    def test_optimize_zorder_by(self) -> None:
        # write an unoptimized delta table
        self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
            .withColumn("col1", floor(col("value") % 7)) \
            .withColumn("col2", floor(col("value") % 27)) \
            .withColumn("p", floor(col("value") % 10)) \
            .repartition(4).write.partitionBy("p").format("delta").save(self.tempFile)

        # create DeltaTable
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # execute Z-Order Optimization
        optimizer = dt.optimize()
        result = optimizer.executeZOrderBy(["col1", "col2"])
        metrics = result.select("metrics.*").head()

        self.assertTrue(metrics.numFilesAdded == 10)
        self.assertTrue(metrics.numFilesRemoved == 37)
        self.assertTrue(metrics.totalFilesSkipped == 0)
        self.assertTrue(metrics.totalConsideredFiles == 37)
        self.assertTrue(metrics.zOrderStats.strategyName == 'all')
        self.assertTrue(metrics.zOrderStats.numOutputCubes == 10)

        # negative test: Z-Order on partition column
        def optimize() -> None:
            dt.optimize().where("p = 1").executeZOrderBy(["p"])

        self.__intercept(
            optimize, "p is a partition column. "
            "Z-Ordering can only be performed on data columns")
Example 3
def skipp_attributes(df):
    song = udf(lambda x: int(x == 'NextSong'), IntegerType())
    skipped = udf(lambda x: int(x != 0), IntegerType())
    session = Window.partitionBy("userId", "sessionId").orderBy(desc("ts"))
    return (df.select('userId', 'page', 'ts', 'length', 'sessionId', 'itemInSession')
            .where((df.page != 'Thumbs Up') & (df.page != 'Thumbs Down'))
            .withColumn('song', song('page'))
            .orderBy('userId', 'sessionId', 'itemInSession')
            .withColumn('nextActSong', lag(col('song')).over(session))
            .withColumn('tsDiff', (lag('ts').over(session) - col('ts')) / 1000)
            .withColumn('timeSkipped', floor('length') - col('tsDiff'))
            .withColumn('roundedLength', floor('length'))
            .where((col('song') == 1) & (col('nextActSong') != 0) & (col('timeSkipped') >= 0))
            .withColumn('skipped', skipped('timeSkipped'))
            .select('userId', 'timeSkipped', 'skipped', 'length', 'ts', 'tsDiff')
            .groupBy('userId')
            .agg({'skipped': 'avg', 'timeSkipped': 'avg'})
            .withColumnRenamed('avg(skipped)', 'skipRate')
            .withColumnRenamed('avg(timeSkipped)', 'avgTimeSkipped'))
Example 4
def convert_time(data_df, min_time):
    data_df = data_df.withColumn(
        'day',
        f.floor((f.col('time') - min_time) / (3600 * 24)).cast('integer'))
    data_df = data_df.withColumn('week', f.col('day') % 7)
    data_df = data_df.withColumn(
        'hour',
        f.floor((f.col('time') - min_time) / 3600).cast('integer') % 24)
    #data_df = data_df.withColumn('hour', f.round(f.col('hour')).cast('integer'))
    return data_df
Example 5
def calculation(criteria):
    """
    This function does column function and calculations on all rows of the
    dataframe through column operations.
    @type  criteria: dataframe
    @param criteria: Joined table of user information and stock information
                     which needs to be calculated 
    """
    # Buy-shares
    criteria = criteria.withColumn('numb_share',
                                   when(col('previous_price') - col('price') > col('buy'),
                                        col('numb_share') + floor(col('cash') / col('price')))
                                   .otherwise(col('numb_share')))
    # Buy-total value adjustment
    criteria = criteria.withColumn('total_value',
                                   when(col('previous_price') - col('price') > col('buy'),
                                        col('total_value') + floor(col('cash') / col('price')) * col('price'))
                                   .otherwise(col('total_value')))
    # Buy-cash adjustment
    criteria = criteria.withColumn('cash',
                                   when(col('previous_price') - col('price') > col('buy'),
                                        col('cash') - floor(col('cash') / col('price')) * col('price'))
                                   .otherwise(col('cash')))
    # sell-Profit Calculation
    criteria = criteria.withColumn('profit',
                                   when(col('price') - col('previous_price') > col('sell'),
                                        col('profit') + (col('numb_share')*col('price')) - col('total_value'))
                                       .otherwise(col('profit')))
    # sell-cash adjustment
    criteria = criteria.withColumn('cash',
                                   when(col('price') - col('previous_price') > col('sell'),
                                        col('cash') + col('total_value')).otherwise(col('cash')))
    # sell-Total Value adjustment
    criteria = criteria.withColumn('total_value',
                                   when(col('price') - col('previous_price') > col('sell'),
                                        0).otherwise(col('total_value')))  
    # Sell-shares adjustment 
    criteria = criteria.withColumn('numb_share',
                                   when(col('price') - col('previous_price') > col('sell'),
                                        0).otherwise(col('numb_share')))
    # time adjustment
    criteria = criteria.withColumn('time',
                                   when((col('previous_price') - col('price') > col('buy'))
                                        | (col('price') - col('previous_price') > col('sell')),
                                        col('time_new')).otherwise(col('time')))
    # previous price adjustment
    criteria = criteria.withColumn('previous_price',
                                   when((col('previous_price') - col('price') > col('buy'))
                                        | (col('price') - col('previous_price') > col('sell')),
                                        col('price')).otherwise(col('previous_price')))
    criteria = criteria.drop('time_new', 'volume', 'price')
    combine(criteria, cass_data)
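The buy rule above can be exercised in isolation. A minimal, self-contained sketch; the column values and the SparkSession name `spark` are illustrative assumptions, not taken from the original pipeline:

from pyspark.sql.functions import col, floor, when

# one row where the price dropped enough to trigger a buy, one where it did not
rows = [(110.0, 100.0, 5.0, 0, 1000.0), (101.0, 100.0, 5.0, 0, 1000.0)]
sketch = spark.createDataFrame(rows, ['previous_price', 'price', 'buy', 'numb_share', 'cash'])
sketch = sketch.withColumn('numb_share',
                           when(col('previous_price') - col('price') > col('buy'),
                                col('numb_share') + floor(col('cash') / col('price')))
                           .otherwise(col('numb_share')))
sketch.show()  # first row buys floor(1000/100) = 10 shares, second row is unchanged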
Example 6
def percentiles(df,
                c,
                by=None,
                p=[10, 25, 50, 75, 90],
                index='_idx',
                result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))

    return r
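A hypothetical invocation of `percentiles`, assuming an active SparkSession named `spark` and the `F`/`Window` imports already used above:

sdf = spark.createDataFrame(
    [(grp, float(v)) for grp in ("a", "b") for v in range(1, 101)], ["grp", "val"])
percentiles(sdf, "val", by="grp").show(truncate=False)
# one row per group; column '_res' holds a map such as {10 -> 10.0, 25 -> 25.0, ...}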
Example 7
def create_train_data():

    w1 = Window.orderBy("uid")
    w2 = Window.partitionBy("seg").orderBy("uid")
    df_train = spark.read.csv(
        os.path.join("datasets", "train.csv"), header=True, schema=schema) \
        .withColumn("uid", monotonically_increasing_id()) \
        .withColumn("idx", row_number().over(w1).cast(IntegerType())) \
        .withColumn("seg", fn.floor((fn.col("idx") - 1) / 150000).cast(IntegerType())) \
        .withColumn("no", row_number().over(w2).cast(IntegerType())) \
        .withColumn("name", fn.concat(lit("raw_"),
                                      fn.lpad(fn.col("seg"), 4, "0").cast(StringType()))) \
        .withColumn("set", lit(0))

    df_train.createOrReplaceTempView("data")
    df_train_f = spark.sql("""
    SELECT uid, set, seg, no, name, x, y FROM data 
    ORDER BY set, seg, no, uid
    """)

    df_train_f = df_train_f.repartition(1)
    df_train_f.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.parquet"))
Example 8
def granularityPartition(dataDF, N, aggMode='mean'):
    '''
    For a numeric data table, aggregate records in fixed-size blocks; the block
    size and the per-field aggregation mode must be specified. Only purely
    numeric tables are supported.
    :param dataDF: data table to process
    :param N: block size
    :param aggMode: aggregation mode, one of mean/min/max/count
    :return: aggregated data table
    '''
    mode2index = {"mean": 1, "max": 3, "min": 2, "count": 0}
    if aggMode not in mode2index:
        raise ValueError("aggMode must be one of mean/min/max/count")
    dataDF = addIdCol(dataDF, idFieldName="id_temp_bob")
    dataDF = dataDF.withColumn("group_id_temp_bob",
                               floor(dataDF.id_temp_bob / N))
    dataDF = eval("dataDF.groupby('group_id_temp_bob')." + aggMode + "()")
    dataDF = dataDF.drop("id_temp_bob").drop("group_id_temp_bob")
    dataDF = changeFieldName(dataDF, "count", "countN")
    for each in dataDF.columns:
        dataDF = changeFieldName(dataDF, each,
                                 each.replace('(', '_').replace(')', '_'))
    if aggMode == 'mean':
        return dataDF.drop('avg_id_temp_bob_').drop("avg_group_id_temp_bob_")
    else:
        return dataDF.drop(aggMode +
                           "_id_temp_bob_").drop(aggMode +
                                                 "_group_id_temp_bob_")
Example 9
 def floordiv(left, right):
     return F.when(F.lit(right is np.nan), np.nan).otherwise(
         F.when(
             F.lit(right != 0) | F.lit(right).isNull(),
             F.floor(left.__div__(right))).otherwise(
                 F.when(
                     F.lit(left == np.inf) | F.lit(left == -np.inf),
                     left).otherwise(F.lit(np.inf).__div__(left))))
Example 10
 def floordiv(left: Column, right: Any) -> Column:
     return F.when(SF.lit(right is np.nan), np.nan).otherwise(
         F.when(
             SF.lit(right != 0) | SF.lit(right).isNull(),
             F.floor(left.__div__(right))).otherwise(
                 F.when(
                     SF.lit(left == np.inf) | SF.lit(left == -np.inf),
                     left).otherwise(SF.lit(np.inf).__div__(left))))
Example 11
    def _discretize_time(self, column: sf.Column) -> sf.Column:
        days_since_study_start = sf.datediff(column, sf.lit(self.study_start))
        bucket = sf.floor(days_since_study_start /
                          self.bucket_size).cast("int")

        if self.bucket_rounding == "floor":
            bucket = (sf.when(
                (bucket < self.n_buckets) | bucket.isNull(),
                bucket).otherwise(self.n_buckets - 1).cast("int"))
        return bucket
Example 12
def main(conf):
    spark_session = SparkSession.builder.appName("TopMoviesPerDecade")\
        .getOrCreate()

    movies_df, ratings_df = load_data(spark_session)

    ratings_decade_wise = ratings_df.withColumn(
        'decade',
        func.floor(func.year(func.from_unixtime('time_stamp')
                             .cast(DateType())) / 10) * 10).drop('time_stamp')

    movie_data_tmp = movies_df.drop('movie_name')

    ratings_w_movies = ratings_decade_wise.join(
        func.broadcast(movie_data_tmp),
        ratings_decade_wise.movie_id == movie_data_tmp.movie_id,
        how='left').drop(movie_data_tmp.movie_id)

    ratings_w_movies = ratings_w_movies.withColumn(
        'categories', func.explode(func.split(ratings_w_movies["genre"],
                                              "\\|"))).drop('genre', 'rating')

    ratings_agg = (ratings_w_movies
                   .groupBy("decade", "categories", "movie_id")
                   .agg({'categories': 'count'})
                   .withColumnRenamed('count(categories)', 'freq'))

    window_spec = Window.partitionBy("decade",
                                     "categories").orderBy(func.desc("freq"))

    ratings_agg = ratings_agg.withColumn("rank", func.rank().over(window_spec))

    top10 = ratings_agg.where(ratings_agg["rank"] <= 10)

    top10.show(100)
    categories = [('Crime', 1), ('Romance', 2), ('Thriller', 3),
                  ('Adventure', 4), ('Drama', 5), ('War', 6),
                  ('Documentary', 7), ('Fantasy', 8), ('Mystery', 9),
                  ('Musical', 10), ('Animation', 11), ('Film-Noir', 12),
                  ('(no genres listed)', 13), ('IMAX', 14), ('Horror', 15),
                  ('Western', 16), ('Comedy', 17), ('Children', 18),
                  ('Action', 19), ('Sci-Fi', 20)]
    category_df = spark_session.createDataFrame(categories,
                                                ['categories', 'category_id'])
    top10 = top10.join(func.broadcast(category_df), ['categories'])
    movie_data = movies_df.drop('genre')
    top10 = top10.join(func.broadcast(movie_data), ['movie_id'],
                       how='left').drop('categories', 'freq')
    top10 = top10.withColumnRenamed('r', 'rank')

    top10.show(1000)
    pg_cred = conf.pg_db['pg_data_lake']
Example 13
def gts_from_impute(infile):
  # Get the main data and put a unique index on each variant
  maindata = infile.filter(infile.data[0:1] != "#")
  splitdata = maindata.select("filename",
                              f.split(maindata.data, "[\t ]+").alias("split_data"),
                              maindata.lineid.alias("VAR_IDX"))

  gtdata1 = splitdata.select("filename", "VAR_IDX", f.posexplode(splitdata.split_data)) \
      .toDF("filename", "VAR_IDX", "COLUMN_IDX", "GTPROB").filter("COLUMN_IDX > 4")
  # Now, get subject ID and which GT
  gtdata2 = gtdata1.select("filename", "VAR_IDX", "GTPROB", "COLUMN_IDX",
                           f.floor((gtdata1.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX"),
                           ((gtdata1.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX"))
  gtdata3 = rkutil.withColumnsRenamed(
      gtdata2.groupBy("filename", "VAR_IDX", "SAMPLE_IDX").pivot("GT_IDX", ["0", "1", "2"])
             .agg(f.collect_list("GTPROB")),
      ["0", "1", "2"], ["c0", "c1", "c2"])
  gtdata4 = gtdata3.select("filename", "VAR_IDX", "SAMPLE_IDX",
                           f.element_at(gtdata3.c0, 1).cast(FloatType()).alias("P11"),
                           f.element_at(gtdata3.c1, 1).cast(FloatType()).alias("P12"),
                           f.element_at(gtdata3.c2, 1).cast(FloatType()).alias("P22"))
  return gtdata4
Example 14
    def __init__(self):
        super(FeatureResponse5xxTotal, self).__init__()

        self.group_by_aggs = {
            '5xx': F.count(F.when(F.col('5xx') == True, F.col('5xx')))  # noqa
        }
        self.pre_group_by_calcs = {
            'response_code_category':
            F.floor(F.col('http_response_code') / 100.),
            '5xx': F.col('response_code_category') == 5,
        }
Example 15
    def __init__(self):
        super(FeatureResponse4xxToRequestRatio, self).__init__()

        self.group_by_aggs = {
            '4xx': F.count(F.when(F.col('4xx') == True, F.col('4xx'))),  # noqa
            'num_requests': F.count(F.col('@timestamp')).cast('float'),
        }
        self.pre_group_by_calcs = {
            'response_code_category': F.floor(
                F.col('http_response_code') / 100.),
            '4xx': F.col('response_code_category') == 4,
        }
Example 16
def GetFirstDate(df, _unitoftime):
    #find the minimum date
    df_grouped = df.groupby([df['STUDYID'],
                             df["CODE"]]).agg(min(df['DAYS_INDEX']))
    #convert to first date
    df_grouped = df_grouped.withColumn(
        "DAYS_INDEX",
        floor((df_grouped["min(DAYS_INDEX)"].cast(FloatType())) / _unitoftime))
    #drop the minimum days
    df_grouped = df_grouped.drop(df_grouped["min(DAYS_INDEX)"])
    #Filter out all diagnoses that occurred at time 0 AND also filter out the diagnosis we are looking for
    return df_grouped
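A hypothetical call, assuming a SparkSession named `spark` and the star-imports the function above relies on (`min` and `floor` from pyspark.sql.functions, `FloatType` from pyspark.sql.types):

sdf = spark.createDataFrame(
    [("s1", "C10", 95), ("s1", "C10", 40), ("s2", "C11", 10)],
    ["STUDYID", "CODE", "DAYS_INDEX"])
GetFirstDate(sdf, 30).show()  # earliest DAYS_INDEX per (STUDYID, CODE), expressed in 30-day units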
Example 17
    def __calc_stats(self, df, resolution):
        """
        Calculates statistics for every column in the Spark DF and returns a separate DF with the results.
        Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.
        :param df: DF containing the columns that you want to run your statistics calculations on
        :param resolution: int resolution in milli or microseconds OR string '5m'/'1h'/'1d'
        :return: aggregation dataframe containing statistics
        """

        if type(resolution) is str:
            # resolution to microseconds
            res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
            agg_interval = res_dict[resolution]

        elif type(resolution) is int:
            if len(str(resolution)) < 16:
                resolution = int(str(resolution).ljust(16, '0'))
            agg_interval = resolution

        ts_col = F.col('timestamp')
        df_ori_cols = list(set(df.columns) - set(['timestamp']))

        df = df.withColumn('interval_start',
                           (F.floor(ts_col / agg_interval) * agg_interval))  #\
        #.withColumn('interval_stop', F.ceil(ts_col/agg_interval) * agg_interval)\
        #.orderBy(F.col('interval_start'))
        agg_df = df.groupBy('interval_start').agg(
            F.max(ts_col).alias('max_ts'))

        # TODO Column type checking: string columns are automatically ignored and parse as NaN, so
        # TODO drop NaN columns?

        # TODO: interval_stop ignore, as well as drop max_ts
        # TODO: filter out NaN columns

        # TODO: question: run the statistics job as a separate job without having to make a udf script

        stat_cols = df_ori_cols  #[c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
        for column in stat_cols:
            grouped_df = df.groupBy('interval_start')\
                           .agg(F.sum(column).alias('sum_%s' % column),
                                F.min(column).alias('min_%s' % column),
                                F.max(column).alias('max_%s' % column),
                                F.count(column).alias('count_%s' % column),
                                F.kurtosis(column).alias('kurtosis_%s' % column),
                                F.mean(column).alias('mean_%s' % column),
                                F.skewness(column).alias('skewness_%s' % column),
                                F.stddev(column).alias('stddev_%s' % column),
                                F.variance(column).alias('var_%s' % column))
            agg_df = grouped_df.join(agg_df, on='interval_start')
        #agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()

        return agg_df
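The interval bucketing used above (`floor(ts / agg_interval) * agg_interval`) can be illustrated on its own. A minimal sketch with microsecond timestamps and an assumed SparkSession named `spark`:

from pyspark.sql import functions as F

agg_interval = 300000000  # 5 minutes in microseconds, as in res_dict['5m']
sdf = spark.createDataFrame(
    [(1_600_000_000_000_000 + i * 60_000_000, float(i)) for i in range(10)],
    ["timestamp", "value"])
sdf.withColumn("interval_start",
               F.floor(F.col("timestamp") / agg_interval) * agg_interval) \
   .groupBy("interval_start") \
   .agg(F.mean("value").alias("mean_value"),
        F.count("value").alias("count_value")) \
   .orderBy("interval_start").show(truncate=False)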
Example 18
 def __init__(self):
     super(FeatureResponse4xxRate, self).__init__()
     self.group_by_aggs.update({
         '4xx':
         F.count(F.when(F.col('4xx') == True, F.col('4xx'))),  # noqa
     })
     self.pre_group_by_calcs.update({
         'response_code_category':
         F.floor(F.col('http_response_code') / 100.),
         '4xx':
         F.col('response_code_category') == 4,
     })
Example 19
def add_decade_column(df: DataFrame, date_col: str = 'date') -> DataFrame:
    """
    Add year and decade columns from date column.
    :param df: dataframe including date column
    :param date_col: column name of date
    :return: dataframe
    """
    df = df.withColumn('year', F.year(date_col))
    df = df.withColumn('decade',
                       (F.floor(F.col('year') / 10) * 10).cast('string'))
    df = df.withColumn('decade', F.concat('decade', F.lit('s')))
    logging.info("Decade and year columns are generated from date column")
    return df
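A hypothetical usage, assuming a SparkSession named `spark` and the `F` alias for pyspark.sql.functions used above:

sdf = spark.createDataFrame([("1994-06-01",), ("2003-11-20",)], ["date"]) \
           .withColumn("date", F.to_date("date"))
add_decade_column(sdf).show()  # year=1994/2003, decade='1990s'/'2000s'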
Example 20
 def get_distribucion_de_clientes_por_facturas_emitidas(data):
     df = None
     if(data is not None):
         df = (data
                 #InvoiceID is equivalent to InvoiceNo
                 .select('CustomerID', 'InvoiceNo', 'Total')
                 .groupBy('CustomerID')
                 .agg(count('InvoiceNo').alias('facturas'), sum('Total').alias('Total'))
                 .withColumn('facturas', floor(col('facturas') / 10) * 10)
                 .groupBy('facturas')
                 .agg(count('CustomerID').alias('Clientes'))
                 .sort('facturas'))
         return df
Example 21
def add_features(df):
    df = df.withColumn("hour", hour(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("year", year(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("month", month(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("day", dayofmonth(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("day_of_week", dayofweek(df["pickupDatetime"]).cast("int"))

    df = df.withColumn(
        "diff", datediff(df["dropoffDatetime"], df["pickupDatetime"]).cast("int")
    )

    df = df.withColumn(
        "startLatr", (F.floor(df["startLat"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "startLonr", (F.floor(df["startLon"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "endLatr", (F.floor(df["endLat"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "endLonr", (F.floor(df["endLon"] / (0.01)) * 0.01).cast("double")
    )

    # df = df.drop('pickup_datetime', axis=1)
    # df = df.drop('dropoff_datetime', axis=1)

    import numpy

    # df.withColumn("h_distance",haversine_distance(
    #     df.select("startLat"),
    #     df.select("startLon"),
    #     df.select("endLat"),
    #     df.select("endLon"),
    # ).cast('double'))

    df = df.withColumn("is_weekend", (df["day_of_week"] > 5).cast("int"))
    return df
Example 22
 def get_distribucion_de_beneficios_por_facturas_emitidas(data):
     df = None
     if(data is not None):
         df = (data
                     .select('CustomerID', 'InvoiceNo', 'Total')
                     .groupBy('CustomerID')
                     .agg(count('InvoiceNo').alias('facturas'),
                         sum('Total').alias('Total'))
                     .withColumn('facturas', floor(col('facturas') / 10) * 10)
                     .groupBy('facturas')
                     .agg(sum('Total').alias('Importe Total'))
                     .sort('facturas')
             )
     return df
Example 23
def cut(infile, QE_info, sorted_res=True):
    spark = SparkSession.builder.master('local').appName("slice").getOrCreate()
    dataschema = StructType([ StructField("H", FloatType(), False), \
                              StructField("K", FloatType(), False), \
                              StructField("L", FloatType(), False), \
                              StructField("E", FloatType(), False), \
                              StructField("I", FloatType(), False)])
    df = spark.read.csv(infile, sep=",", schema=dataschema)
    starts, ends, steps = convert_to_ses(QE_info)
    heads = ['H', 'K', 'L', 'E']

    res_heads, res_shape = [], []

    df_in_range = df.filter((df.H>=starts[0]) & (df.H<ends[0]) & \
                            (df.K>=starts[1]) & (df.K<ends[1]) & \
                            (df.L>=starts[2]) & (df.L<ends[2]) & \
                            (df.E>=starts[3]) & (df.E<ends[3]))

    for col_ix, col_name in enumerate(heads):
        if steps[col_ix] != 0 and steps[col_ix] != ends[col_ix] - starts[col_ix]:
            res_heads.append(col_name + '_bin_ix')
            res_shape.append(
                ceil((ends[col_ix] - starts[col_ix]) / steps[col_ix]))
            #find_ix = UserDefinedFunction(lambda x: floor( (x-starts[col_ix])/steps[col_ix] ), IntegerType())
            df_in_range = df_in_range.withColumn(col_name+'_bin_ix', \
                                                 func.floor( (col(col_name)-starts[col_ix])/steps[col_ix] ))

    if not res_heads:  # means 0-Dimension
        spark.stop()
        return np.array(df.groupBy().avg('I').collect())

    raw_res = np.array(
        df_in_range.groupBy(*res_heads).agg({
            'I': 'mean'
        }).collect())
    spark.stop()

    if sorted_res:
        res = np.full(res_shape, np.nan)
        if len(res_shape) == 1:  # means 1-Dimension
            for row in raw_res:
                res[int(row[0])] = row[1]
        else:
            for row in raw_res:
                res[tuple(row[:-1].astype(int))] = row[-1]
        return res
    else:
        return raw_res
Example 24
def compute_precision_recall_graph(predictions, n_points):
    inf_cumulative_window = \
        (Window
         .partitionBy('label')
         .orderBy('id_bucket')
         .rowsBetween(Window.unboundedPreceding, Window.currentRow))
    sup_cumulative_window = \
        (Window
         .partitionBy('label')
         .orderBy('id_bucket')
         .rowsBetween(1, Window.unboundedFollowing))

    def prob_positive(v):
        try:
            return float(v[1])
        except ValueError:
            return None

    prob_positive = udf(prob_positive, DoubleType())

    return \
        (predictions
         .select('label',
                 floor(prob_positive('probability') * n_points)
                 .alias('id_bucket'))
         .groupBy('label', 'id_bucket').count()
         .withColumn('count_negatives',
                     sum('count').over(inf_cumulative_window))
         .withColumn('count_positives',
                     sum('count').over(sup_cumulative_window))
         .groupBy('id_bucket').pivot('label', [0, 1])
         .sum('count_negatives', 'count_positives')
         .select(((col('id_bucket') + 1) / n_points).alias('threshold'),
                 col('0_sum(count_negatives)').alias('true_negative'),
                 col('0_sum(count_positives)').alias('false_positive'),
                 col('1_sum(count_negatives)').alias('false_negative'),
                 col('1_sum(count_positives)').alias('true_positive'))
         .select(col('threshold').alias('Threshold'),
                 (col('true_positive')
                 / (col('true_positive') + col('false_positive')))
                 .alias('Precision'),
                 (col('true_positive')
                 / (col('true_positive') + col('false_negative')))
                 .alias('Recall'),
                 (col('false_positive')
                 / (col('false_positive') + col('true_negative')))
                 .alias('FPR'))
         .orderBy('Threshold')
         .toPandas())
Example 25
def windowing(df, batch_size):
    """
    Args:
        df: dataframe to perform windowing on
        batch_size: number of rows per batch
    """
    if "timestamp" not in df.columns:
        raise ValueError("timestamp column not found!")
    df = df.withColumn("timestamp_1", F.unix_timestamp(F.col("timestamp")))
    window_spec = Window.orderBy("timestamp_1")
    return df.withColumn(
        "batch_id",
        F.floor(
            (F.row_number().over(window_spec) - F.lit(1)) / int(batch_size)),
    )
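A minimal usage sketch, assuming a SparkSession named `spark` (the timestamps here are illustrative):

from pyspark.sql import functions as F

sdf = spark.createDataFrame(
    [("2021-01-01 00:0%d:00" % i,) for i in range(6)], ["timestamp"]) \
    .withColumn("timestamp", F.to_timestamp("timestamp"))
windowing(sdf, batch_size=2).show()  # rows land in batch_id 0, 0, 1, 1, 2, 2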
Example 26
    def load(self, df):

        # if df.rdd.isEmpty():
        #     return

        column_timestamp = col('_time_updated').cast('bigint')
        column_period = floor(column_timestamp / self.period_seconds)

        df = df \
            .withColumn('_time_updated', current_timestamp()) \
            .withColumn('_time_updated_period', column_period)

        function.write_delta(df,
                             self.path_target,
                             name_column_partition='_time_updated_period')
Example 27
    def dist_buss_toronto_stars(self, output):
        df_business = self.df_business

        df_business.createOrReplaceTempView('business')
       
        df_business_clean=spark.sql("select *,regexp_replace(PostalCode,' ','') as ZipCode from business")

        df_toronto_data = df_business_clean.select(df_business_clean['BusinessID'], df_business_clean['Name'],\
                                            df_business_clean['ZipCode'],df_business_clean['Latitude'],\
                                           df_business_clean['Longitude'],functions.floor(df_business_clean['BusinessStars']).alias('Stars'))\
                          .where(df_business_clean['City']=='Toronto')

        
        # write data to output
        df_toronto_data.write.csv(output,header=True)
Example 28
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 /
         4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(
            get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice *
                 ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR)) +
                (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise((SENIOR_CUTOFF * DAYS_IN_YEAR) +
                        (DAYS_IN_YEAR *
                         (-F.log1p(-customerMetaRaw.choice) *
                          EXPONENTIAL_DIST_SCALE)))).cast("int"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))

    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
Example 29
    def gen_star_counts(self, output):
        df_users = self.df_users

        # get users stars base number
        df_stars_range = df_users.select(df_users['UserID'], functions.floor(df_users['AverageStars']).alias('Stars'))

        # group by user stars
        df_stars_groups = df_stars_range.groupBy(df_stars_range['Stars'])

        # get count of each group
        df_stars_count = df_stars_groups.agg(functions.count(df_stars_range['UserID']).alias('UsersCount'))

        # sort data
        df_sorted = df_stars_count.orderBy(df_stars_count['Stars'])
        
        # write data to output
        df_sorted.write.csv(output, header=True)
Example 30
def process_data(raw_data_sdf, bert_layer):
    """
    Performs the bulk of the work of tokenization and other cleanups. Returns a reduced spark data frame including ids, masks, and segments, and other helpful elements

    :param raw_data_sdf: spark dataframe, the news stories to be processed
    :param bert_layer: tensorflow Keras layer for the BERT model being used.
    """
    global stop_words_bc, tokenizer, domains_bc
    # add weeks column
    clean_data_sdf = raw_data_sdf.withColumn(
        'weeks',
        f.floor(f.datediff(f.col('published'), f.lit('2010-01-01')) / 7))
    log_time("Begin regex")

    clean_data_sdf = clean_data_sdf.withColumn('regex',
                                               udf_add_regex('source_domain'))
    # remove all the identifying text from stories
    clean_data_sdf = clean_data_sdf.withColumn(
        'clean_text', udf_clean_text(f.array('text_or_desc', 'regex')))

    clean_data_sdf.take(100)

    log_time("Begin tokenizer")
    tokenizer = get_tokenizer(bert_layer)

    clean_data_sdf = clean_data_sdf.withColumn('tokens',
                                               udf_get_tokens('clean_text'))

    log_time("Begin masks, etc.")

    clean_data_sdf = clean_data_sdf.withColumn('masks',
                                               udf_get_masks('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn('segments',
                                               udf_get_segments('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn('ids', udf_get_ids('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn(
        'source_index',
        udf_source_index('source_domain').cast('int'))
    # let's slim down the dataframe before we save it to disk.
    clean_data_sdf = clean_data_sdf[[
        'source_domain', 'text_or_desc', 'clean_text', 'published', 'year',
        'title', 'url', 'weeks', 'tokens', 'masks', 'segments', 'ids',
        'source_index'
    ]]

    return clean_data_sdf
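The week bucketing at the top of `process_data` does not depend on the BERT tokenizer or the UDFs; a standalone sketch, assuming a SparkSession named `spark`:

import pyspark.sql.functions as f

sdf = spark.createDataFrame([("2010-01-15",), ("2011-03-01",)], ["published"])
sdf.withColumn(
    "weeks",
    f.floor(f.datediff(f.col("published"), f.lit("2010-01-01")) / 7)).show()
# 2010-01-15 -> week 2, 2011-03-01 -> week 60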