Example 1
def prepare_firmware_cve_counts(firmware_cves_df: DataFrame,
                                firmware_hashes_df: DataFrame) -> DataFrame:
    # yapf: disable
    # Ensure that the windows for each of low, med, hi, and crit are over the entire firmware space instead of just
    # those for which a CVE is known to exist
    firmware_cves_full_df = firmware_hashes_df.join(
        firmware_cves_df,
        'firmware_hash',
        'left'
    ).na.fill(0)

    low_window = Window.orderBy(firmware_cves_full_df['low'])
    med_window = Window.orderBy(firmware_cves_full_df['medium'])
    high_window = Window.orderBy(firmware_cves_full_df['high'])
    crit_window = Window.orderBy(firmware_cves_full_df['critical'])
    # sum(wi*xi)/sum(wi)
    cve_composite_score = (percent_rank().over(low_window) + (2 * percent_rank().over(med_window)) + (3 * percent_rank().over(high_window)) + (4 * percent_rank().over(crit_window))) / 10
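    # i.e. a weighted mean of the four percent ranks with weights 1, 2, 3 and 4;
    # the divisor 10 is just the weight sum, so the composite stays in [0, 1].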

    fwc_with_score_df = firmware_cves_full_df.withColumn(
        'firmware_cve_component_score', cve_composite_score
    ).select(
        'firmware_hash',
        'firmware_cve_component_score'
    )
    # yapf: enable
    return fwc_with_score_df
Example 2
def split_user_interactions(interactions):
    interactions_rand = interactions.withColumn(
        'rand_num', F.rand()
    )
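    # NOTE: F.rand() is unseeded here, so the 50/50 split differs between runs;
    # F.rand(seed=42) (or any fixed seed) would make the split reproducible.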
    
    interactions_with_ranks = interactions_rand.select(
        'user_id',
        'book_id',
        'rating',
        F.percent_rank().over(
            Window.partitionBy(
                interactions_rand['user_id']
            ).orderBy(
                interactions_rand['rand_num']
            )
        ).alias('book_id_rank')
    )
    
    interactions_train = interactions_with_ranks.filter(
        interactions_with_ranks['book_id_rank'] >= 0.5
    ).select(
        'user_id',
        'book_id',
        'rating'
    )
    
    interactions_val = interactions_with_ranks.filter(
        interactions_with_ranks['book_id_rank'] < 0.5
    ).select(
        'user_id',
        'book_id',
        'rating'
    )
    
    return interactions_train, interactions_val
def modify_group_for_dim(model_dict, df, d, colname):
    '''given a DF with groups already assigned (in the column named by colname),
    apply a dictionary to post-process the groups according 
    to that one dimension. returns original DF with modified
    colname column.
    e.g. move specific seasons to the holdout or throwaway sets.
    '''
    dim_props = model_dict['dimensional_dataset_proportions'].items()
    for grp, grp_dict_list in dim_props:
        for grp_dict in grp_dict_list:
            window = Window.orderBy('dim_rnd')\
                           .partitionBy(grp_dict['dim'], colname)
            df = df.withColumn('dim_rk', F.percent_rank().over(window))

            ## if (1) the column is within the set values,
            ## (2) the pre-existing group falls within those set values, and
            ## (3) the random value is below the set threshold,
            ## then override and modify the group membership
            if grp_dict['prop_to_move'] > 0:
                df = df.withColumn(
                    colname,
                    F.when((col(grp_dict['dim']).isin(grp_dict['vals']))
                           & (col(colname).isin(grp_dict['from_groups']))
                           & (col('dim_rk') >= 1 - grp_dict['prop_to_move']),
                           grp).otherwise(col(colname)))
    return df
Example 4
def percentiles(df,
                c,
                by=None,
                p=[10, 25, 50, 75, 90],
                index='_idx',
                result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))

    return r
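# Hypothetical usage of the helper above (df, 'amount' and 'country' are stand-ins,
# not names from the original code):
#   out = percentiles(df, 'amount', by='country', p=[10, 50, 90])
#   out.show(truncate=False)  # one row per country; '_res' maps 10/50/90 -> approximate value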
Example 5
def sort_volume_stocks(df, n=10):
    # NOTE: this function should be revised when using data from more than one trading day.
    # return: a list of PySpark DataFrames with the stocks sorted into n volume buckets
    window_spec1 = Window.partitionBy().orderBy('STOCKD_SUM_YENVOL')
    grouped = df.groupBy(df['Ticker']).agg(
        expr("sum(YenVolume) as STOCKD_SUM_YENVOL"))
    prc_rank = grouped.select(
        'Ticker',
        percent_rank().over(window_spec1).alias('percent_rank')).toPandas()

    def sort_con(_n=n):
        # construct percentiles
        unit = 1 / _n
        first = unit
        last = unit * (_n - 1)
        conditions = []
        for i in np.arange(first, 1 + unit, unit):
            if i == first:
                conditions.append(prc_rank['percent_rank'] <= i)
            elif i == 1:
                conditions.append(prc_rank['percent_rank'] > i - unit)
            else:
                conditions.append((prc_rank['percent_rank'] <= i)
                                  & (prc_rank['percent_rank'] > i - unit))
        return conditions

    conds = sort_con(n)
    stock_lists = [prc_rank[cond]['Ticker'].tolist() for cond in conds]
    assert len(stock_lists) == n, "number of stock lists is not equal to n"
    return [df.filter(df.Ticker.isin(l)) for l in stock_lists]
def send_df_to_model(df):
    df = df.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("timestamp")))
    train_df = df.where("rank <= .5").drop("rank")
    print("Train Set")
    train_df.show()
    test_df = df.where("rank > .5").drop("rank")
    print("Test Set")
    test_df.show()
    vectorAssembler = VectorAssembler(inputCols=['num_mentions'],
                                      outputCol='features')
    train_vdf = vectorAssembler.transform(train_df)
    train_vdf = train_vdf.select(['features', 'avg_price'])
    test_vdf = vectorAssembler.transform(test_df)
    test_vdf = test_vdf.select(['features', 'avg_price'])
    lr = LinearRegression(featuresCol='features',
                          labelCol='avg_price',
                          maxIter=10,
                          regParam=0.3,
                          elasticNetParam=0.8)
    lr_model = lr.fit(train_vdf)
    lr_predictions = lr_model.transform(test_vdf)

    lr_evaluator = RegressionEvaluator(predictionCol="prediction",
                                       labelCol="avg_price",
                                       metricName="r2")
    print("R Squared (R2) on test data = %g" %
          lr_evaluator.evaluate(lr_predictions))
    test_result = lr_model.evaluate(test_vdf)
    print("Root Mean Squared Error (RMSE) on test data = %g" %
          test_result.rootMeanSquaredError)

    return lr_predictions.select("prediction", "avg_price", "features")
Example 7
def train_val_test_split(data):
    data = data.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("Year")))
    train_df = data.where("rank <= .6").drop("rank")
    val_df = data.where("rank > 0.6 AND rank <= 0.8").drop("rank")
    test_df = data.where("rank > 0.8").drop("rank")
    return train_df, val_df, test_df
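# Note: Window.partitionBy() with no columns moves every row into a single partition
# (Spark logs a warning about this), so "rank" is a global percent_rank over "Year"
# and the 60/20/20 split above is chronological rather than random.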
Example 8
def add_median_salary(df, *groups, sort_field="-avg_salary"):
    window = Window.partitionBy(*groups)
    # `sort` is presumably a project helper that maps a leading "-" (e.g. "-avg_salary")
    # to a descending sort order
    rank_spec = window.orderBy(sort(sort_field))
    df = df.withColumn("percent_rank", F.percent_rank().over(rank_spec))
    # order by distance from the median (rows whose percent_rank is closest to 0.5 come first)
    median_spec = window.orderBy(F.pow(df.percent_rank - 0.5, 2))
    df = df.withColumn("avg_salary", F.first("avg_salary").over(median_spec))
    return df
 def _cal_percent_rank(self, dfBase, col_name, order):
     print "sort column[{0}],order by[{1}]".format(col_name, order)
     if order == "asc":
         window = Window.orderBy(col_name)
     else:
         window = Window.orderBy(fn.desc(col_name))
     df = dfBase.where("%s is not null" % col_name) \
         .withColumn(col_name + "_rank", fn.percent_rank().over(window)) \
         .persist(storageLevel=StorageLevel.DISK_ONLY) \
         .union(dfBase.where("%s is null" % col_name)
                .withColumn(col_name + "_rank", fn.lit(None))) \
         .repartition(20).persist(storageLevel=StorageLevel.DISK_ONLY)
     df.count()
     return df
Example 10
def random_split_users_int(interactions, users, splitProportion):
    """ Returns a split of the interactions of the users received.
        Used to include interactions from validation and test users in the training set. """

    w = Window.partitionBy('user_id').orderBy('book_id')
    ranked_interactions = interactions.join(users, 'user_id', 'leftsemi').\
        select("user_id", "book_id", "rating", F.percent_rank().over(w).alias("percent_rank"))

    split1 = ranked_interactions.filter(
        ranked_interactions.percent_rank <= splitProportion).drop(
            'percent_rank')
    split2 = ranked_interactions.filter(
        ranked_interactions.percent_rank > splitProportion).drop(
            'percent_rank')

    return (split1, split2)
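# Hypothetical call: keep the first ~70% of each validation user's interactions
# (by book_id order) for training and hold out the rest
# ('interactions' and 'val_users' are illustrative DataFrames):
#   train_part, holdout_part = random_split_users_int(interactions, val_users, 0.7)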
Example 11
def partitioned_split_by_cols(df, partit_col, split_col, parts_fractions):
    window = Window.partitionBy(partit_col).orderBy(split_col)
    df_tmp = df\
        .withColumn('rank', F.percent_rank().over(window))

    parts = []
    prev_p = 0.
    sum_p = 0.
    for i, p in enumerate(parts_fractions):
        sum_p += p
        # use an inclusive upper bound for the last part so the final row of each
        # partition (percent_rank == 1.0) is not dropped
        upper_bound = (F.col('rank') <= sum_p) if i == len(parts_fractions) - 1 \
            else (F.col('rank') < sum_p)
        part = df_tmp \
            .filter((F.col('rank') >= prev_p) & upper_bound) \
            .drop('rank')
        parts.append(part)
        prev_p = sum_p

    return parts
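# Hypothetical usage: a 60/20/20 split within each user, ordered by a timestamp column
# ('user_id' and 'ts' are illustrative column names):
#   train, val, test = partitioned_split_by_cols(df, 'user_id', 'ts', [0.6, 0.2, 0.2])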
Example 12
    def get_percentile_no_recurrence(no_recurrents_work):
        """
        Get the percentile to assign the amount that corresponds to non recurrent payments
        :param no_recurrents_work: Dataframe
        :return: Dataframe
        """

        filtered_window = Window.partitionBy(
            fields.customer_id).orderBy("sum_importe")
        ranged_window = Window.partitionBy(fields.customer_id).rowsBetween(
            -sys.maxsize, sys.maxsize)
        no_recurrents = no_recurrents_work.select(
            '*',
            percent_rank().over(filtered_window).alias("percentile"),
            count('*').over(ranged_window).alias('num_rows'))

        return no_recurrents.select(
            '*',
            when(
                c("percentile") >= (1 - (1 / c("num_rows"))),
                c("sum_importe")).otherwise(None).alias("percentile_im"))
def assign_group(model_dict, df, d, strata_cols, colname):
    '''given (1) a dictionary of ranges,
    (2) a DF with random values ranked 
    by random block, and 
    (3) a name for the grouped columns,
    return DF with a new column that 
    assigns group membership'''
    window = Window.orderBy('dataset_rnd')\
                   .partitionBy(*model_dict['strata_cols'])
    df = df.withColumn('dataset_rk', F.percent_rank().over(window))
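    # 'd' is expected to map group name -> (lower, upper] bounds on dataset_rk, e.g.
    # (illustrative values): {'train': (0, 0.6), 'val': (0.6, 0.8), 'holdout': (0.8, 1.0)}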
    for i, (k, v) in enumerate(d.items()):
        ## if the bottom is 0, make it -1 so that rows with rank exactly 0 are included
        min_val = -1 if v[0] == 0 else v[0]
        if i == 0:
            group_assign_cond = F.when((col('dataset_rk') > min_val)
                                       & (col('dataset_rk') <= v[1]), F.lit(k))
        else:
            group_assign_cond = group_assign_cond.when(
                (col('dataset_rk') > min_val)
                & (col('dataset_rk') <= v[1]), F.lit(k))

    return df.withColumn(colname, group_assign_cond)
Example 14
def benjamini_hochberg_filter(sdf, alpha=0.001, filter=True):
    """
  Input Spark dataframe must contain columns item1_count, both_count, item2_prevalence as produced by `get_item_pair_stats`
  """
    import pandas as pd
    from scipy.stats import norm
    m = sdf.count()
    z_to_pval_pudf = fn.pandas_udf(lambda z: pd.Series(norm.sf(abs(z)) * 2),
                                   DoubleType())  # two-sided p-value
    sdf2 = sdf\
      .withColumn("Z", fn.expr("(both_count - item1_count*item2_prevalence)/sqrt(item1_count*item2_prevalence*(1-item2_prevalence))")) \
      .withColumn("absZ", fn.expr("abs(Z)")) \
      .withColumn("Z_rank_fraction", fn.percent_rank().over(Window.orderBy(fn.desc("absZ")))) \
      .withColumn("p_value", z_to_pval_pudf(fn.col("Z"))) \
      .withColumn('benjamini_hochberg_criterion', fn.col('p_value') <= fn.col('Z_rank_fraction') * alpha)
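    # Note: percent_rank() yields (rank - 1) / (m - 1), which is 0 for the most significant
    # pair, so this is slightly more conservative than the textbook Benjamini-Hochberg
    # threshold p_(k) <= (k / m) * alpha. An exact-k/m variant (sketch, same ordering) would
    # swap in:  fn.row_number().over(Window.orderBy(fn.desc("absZ"))) / fn.lit(m)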

    if filter:
        sdf2 = sdf2.filter('benjamini_hochberg_criterion')
        drop_cols = set(sdf.columns) - set(sdf2.columns)
        for col in drop_cols:
            sdf2 = sdf2.drop(col)

    return sdf2
Example 15
This function leaves gaps in rank when there are ties.
'''
df.withColumn("rank",rank().over(window_spec)) \
    .show()


#dense_rank Window Function
'''
dense_rank() window function is used to get the rank of rows within a window partition without any gaps.
This is similar to the rank() function, the difference being that rank() leaves gaps in the ranking when there are ties.
'''
df.withColumn("dense_rank",dense_rank().over(window_spec)) \
    .show()

#percent_rank Window Function
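'''
percent_rank() window function returns the relative rank of each row within a window partition,
computed as (rank - 1) / (number of rows in the partition - 1), so its values range from 0.0 to 1.0.
'''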
df.withColumn("percent_rank",percent_rank().over(window_spec)) \
    .show()


#ntile Window Function
'''
ntile() window function distributes the rows of a window partition into n buckets and returns the bucket number of each row.
In the example below we pass 2 as the argument to ntile, so it assigns each row one of 2 values (1 or 2).
'''
df.withColumn("ntile",ntile(2).over(window_spec)) \
    .show()


#PySpark Window Analytic functions
#cume_dist Window Function
'''
Example 16
sql_ctx = pyspark.SQLContext(sc)

# Explicitly setting the schema type - may be unnecessary
# Notes: https://stackoverflow.com/questions/44706398/spark-csv-reader-quoted-numerics
schema = StructType([
    StructField("RecId", IntegerType(), True),
    StructField("Name", StringType(), True),
    StructField("PassengerClass", StringType(), True),
    StructField("Age", IntegerType(), True),
    StructField("Sex", StringType(), True),
    StructField("Survived", IntegerType(), True),
    StructField("SexCode", IntegerType(), True),
])

# Print the average age
titanic = sql_ctx.read.option("delimiter", ",").option(
    "header", True).schema(schema).csv('titanic.csv')
titanic.groupBy().avg('Age').show()

# Print the passengers at or above the 75th percentile in age
# Notes: https://databricks.com/blog/2015/07/15/introducing-window-functions-in-spark-sql.html
# https://stackoverflow.com/questions/40885266/how-to-take-the-top-percentage-from-a-relatively-large-spark-dataframe-and-save

window = Window.partitionBy().orderBy(titanic['Age'].asc())
titanic.select('*',
               percent_rank().over(window).alias('age_rank')).filter(
                   col('age_rank') >= 0.75).show(1000)

# Saving our DF to Parquet file
titanic.write.parquet('titanic_p', mode='overwrite')
Example 17
 def with_window_column(df):
     from pyspark.sql.window import Window
     from pyspark.sql.functions import percent_rank
     windowSpec = Window.partitionBy(df['id']).orderBy(df['forecast'])
     return df.withColumn("r", percent_rank().over(windowSpec))
Example 18
    def num_iqr_filter(self,
                       data_frame,
                       inputCol="VALUE",
                       labelCol="ITEMID",
                       REPARTITION_CONST=None,
                       sc=None):
        from pyspark.sql.window import Window
        from pyspark.sql.functions import abs, percent_rank, row_number, collect_list, udf, struct, count, avg, stddev_pop
        from pyspark.sql.types import MapType, StringType, DoubleType
        self.logger.debug("[NUM_IQR_FILTER] IN")
        self.logger.debug("[NUM_IQR_FILTER] BEFORE QUANTILE TAGGED")
        if REPARTITION_CONST is None:
            value_order = Window.partitionBy(labelCol).orderBy(
                col(inputCol).cast("float"))
            Q1_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.25 - col("percentile")))
            Q2_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.5 - col("percentile")))
            Q3_percentile = Window.partitionBy(labelCol).orderBy(
                abs(0.75 - col("percentile")))
            percent_data_frame = data_frame.select(
                labelCol, inputCol,
                percent_rank().over(value_order).alias("percentile"))
            Q1_data_frame = percent_data_frame.withColumn("Q1_rn",row_number().over(Q1_percentile)).where("Q1_rn == 1")\
                            .select(labelCol,inputCol,lit("Q1").alias("quantile"))
            Q2_data_frame = percent_data_frame.withColumn("Q2_rn",row_number().over(Q2_percentile)).where("Q2_rn == 1")\
                            .select(labelCol,inputCol,lit("Q2").alias("quantile"))
            Q3_data_frame = percent_data_frame.withColumn("Q3_rn",row_number().over(Q3_percentile)).where("Q3_rn == 1")\
                            .select(labelCol,inputCol,lit("Q3").alias("quantile"))
            self.logger.debug(
                "[NUM_IQR_FILTER] REPARTITION_CONST Not Asserted")
            merge_all = Q1_data_frame.unionAll(Q2_data_frame).unionAll(
                Q3_data_frame).persist()

            self.logger.debug("[NUM_IQR_FILTER] Qs Merged")

            udf_parse_list_to_map = udf(
                lambda maps: dict(list(tuple(x) for x in maps)),
                MapType(StringType(), StringType()))

            self.logger.debug("[NUM_IQR_FILTER] Before merge quantiles")
            aggregate_quantiles = merge_all.groupBy(labelCol).agg(
                collect_list(struct("quantile", inputCol)).alias("quant_info"))
            self.logger.debug("AGG ONLY")
            aggregate_quantiles = aggregate_quantiles.select(
                labelCol,
                udf_parse_list_to_map("quant_info").alias("quant_info"))
            self.logger.debug("TRANSFORM")
            iqr_data_frame = aggregate_quantiles.withColumn("Q1",col("quant_info").getItem("Q1").cast("float"))\
                .withColumn("Q2",col("quant_info").getItem("Q2").cast("float"))\
                .withColumn("Q3",col("quant_info").getItem("Q3").cast("float"))
            self.logger.debug("QUANTILE_EXTRACTION")
        else:
            cur_label_list = data_frame.select(
                labelCol).distinct().rdd.flatMap(list).collect()
            cur_iqr_list = list()
            cnt = -1
            for cur_item in cur_label_list:
                cnt = cnt + 1
                data_frame.where(
                    col(labelCol) == cur_item).registerTempTable("cur_table")
                self.logger.debug("{0}/{1},::{2}".format(
                    cnt, len(cur_label_list),
                    sc.sql(
                        "select {0} from cur_table".format(labelCol)).count()))
                cur_iqr = sc.sql(
                    "select {0}, percentile_approx({1},0.25) as Q1, percentile_approx({2},0.5) as Q2, percentile_approx({3},0.75) as Q3 from cur_table group by {4}"
                    .format(labelCol, inputCol, inputCol, inputCol,
                            labelCol)).first().asDict()
                cur_iqr_list.append(cur_iqr)
                sc.catalog.dropTempView("cur_table")
                #percent_data_frame = data_frame.select(labelCol, inputCol, percent_rank().over(value_order).alias("percentile")).repartition(REPARTITION_CONST).cache().checkpoint()
            iqr_data_frame = sc.createDataFrame(cur_iqr_list).repartition(
                REPARTITION_CONST)

        if REPARTITION_CONST is None:
            iqr_data_frame = iqr_data_frame.withColumn("IQR",col("Q3")-col("Q1"))\
                                       .withColumn("LB",col("Q1")-1.5*col("IQR"))\
                                       .withColumn("UB",col("Q3")+1.5*col("IQR"))\
                                       .select(labelCol,"LB","UB")
        else:
            iqr_data_frame = iqr_data_frame.withColumn("IQR",col("Q3")-col("Q1"))\
                                       .withColumn("LB",col("Q1")-1.5*col("IQR"))\
                                       .withColumn("UB",col("Q3")+1.5*col("IQR"))\
                                       .select(labelCol,"LB","UB").repartition(REPARTITION_CONST).persist()

            self.logger.debug("CUR_ITEMID_ALL_COUNT:{0}".format(
                iqr_data_frame.count()))

        self.logger.debug("[NUM_IQR_FILTER] iqr_data_frame merged")
        if REPARTITION_CONST is None:
            self.logger.debug(
                "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST NOT ASSERTED")
            ret_data_frame = data_frame.repartition(labelCol).join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\
                                                                 .drop("LB").drop("UB").persist()
            ref_df = ret_data_frame.repartition(labelCol).groupBy(labelCol)\
                               .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).persist()
            self.logger.debug("CHECK DF")
            self.logger.debug(ref_df.count())
            self.logger.debug(ret_data_frame.count())

            return (ret_data_frame, ref_df)
        else:
            self.logger.debug(
                "[NUM_IQR_FILTER] RETURN_PREP, REPARTITION_CONST ASSERTED: {0}"
                .format(REPARTITION_CONST))
            ret_data_frame = data_frame.join(iqr_data_frame,labelCol).where((col("LB").cast("float") <= col(inputCol).cast("float")) & (col("UB").cast("float")>=col(inputCol).cast("float")))\
                                                                 .drop("LB").drop("UB").repartition(REPARTITION_CONST)
            ref_df = ret_data_frame.groupBy(labelCol)\
                               .agg(count(inputCol).alias("ref_count"),avg(inputCol).alias("ref_avg"),stddev_pop(inputCol).alias("ref_std")).repartition(REPARTITION_CONST)

            return (ret_data_frame, ref_df)
Example 19
    'wnd_direction', 'dest_wnd_direction', 'OP_UNIQUE_CARRIER', 'OP_CARRIER',
    'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST', 'cig_code', 'cig_cavok_code',
    'dest_cig_code', 'dest_cig_cavok_code', 'dest_vis_var_code', 'unique_tail',
    'DAY_OF_WEEK', 'DEP_DEL15_PREV', 'MONTH', 'QUARTER', 'DAY_OF_MONTH',
    'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER_FL_NUM', 'DISTANCE_GROUP', 'OD_GROUP')

# drop some problematic NAs
df = df.na.drop(subset=["DEP_DEL15", 'unique_tail'])

# limit for small batch testing
#df = df.limit(130459)

# create a variable that will be used to split the data into train/valid/test later
df = df.withColumn(
    "rank",
    percent_rank().over(Window.partitionBy().orderBy("departure_time")))

# COMMAND ----------

# DBTITLE 1,Convert String and Int. Categorical to Zero-min Categorical
# list of str features
str_features = [
    'DEST_STATE_ABR', 'wnd_direction', 'dest_wnd_direction',
    'OP_UNIQUE_CARRIER', 'OP_CARRIER', 'ORIGIN', 'ORIGIN_STATE_ABR', 'DEST',
    'cig_code', 'cig_cavok_code', 'dest_cig_code', 'dest_cig_cavok_code',
    'dest_vis_var_code', 'unique_tail'
]

# list of int categorical features
int_categorical = [
    'DAY_OF_WEEK', 'DEP_DEL15_PREV', 'MONTH', 'QUARTER', 'DAY_OF_MONTH',
Example 20
case_status_history.show()

escalation_starts = (case_status_history.filter(
    F.col('is_escalate') == 'Y').groupby('reference_id').agg(
        F.min('seconds_since_case_start').alias('escalation_start'),
        F.max('seconds_since_case_start').alias('case_end')))

escalation_starts.count()  # 646
escalation_starts.filter(F.col('reference_id') == 100087).show()

escalation_points_distribution = (escalation_starts.withColumn(
    'escalation_point_relative_to_case_duration',
    F.col('escalation_start') / F.col('case_end')).withColumn(
        'percent_rank_escalation_point_relative_to_case_duration',
        F.percent_rank().over(Window.partitionBy().orderBy(
            F.asc("escalation_point_relative_to_case_duration")))
    ).withColumn(
        'percentile',
        F.floor(
            F.col('percent_rank_escalation_point_relative_to_case_duration') *
            100)).groupBy('percentile').agg(
                F.avg('escalation_point_relative_to_case_duration').alias(
                    'average_percentile_escalation_point')))
escalation_points_distribution.withColumn(
    'r', F.round(F.col("average_percentile_escalation_point"), 2)).show(n=100)

escalation_training_ids = (escalation_starts.select('reference_id').distinct())
escalation_training_ids.count()

escalation_training_targets = (case_status_history.join(
    escalation_starts,
# Create a simple window specification:
from pyspark.sql.window import Window
from pyspark.sql.functions import desc
ws = Window.orderBy(desc("score"))

from pyspark.sql.functions import row_number, cume_dist, ntile
df.select("score", row_number().over(ws).alias("row_number")).show()
df.select("score", cume_dist().over(ws).alias("cume_dist")).show()
df.select("score", ntile(2).over(ws).alias("ntile(2)")).show()

from pyspark.sql.functions import rank, dense_rank, percent_rank
df.select("score",
          rank().over(ws).alias("rank"),
          dense_rank().over(ws).alias("dense_rank")).show()
df.select("score", percent_rank().over(ws).alias("percent_rank")).show()

from pyspark.sql.functions import lag, lead
df.select("score",
          lag("score", count=1).over(ws).alias("lag"),
          lead("score", count=2).over(ws).alias("lead")).show()

# ## Compute mean star rating over last five rides

ws = Window.partitionBy("driver_id").orderBy("date_time").rowsBetween(-4, 0)

from pyspark.sql.functions import count, mean, last
rides \
  .select("driver_id", "date_time", "star_rating", mean("star_rating").over(ws).alias("moving_average")) \
  .groupby("driver_id") \
  .agg(count("driver_id"), mean("star_rating"), last("moving_average")) \
Example 22
def data_split(spark, sample_size = 0.1, seed = 1):
    '''
    Function to Data Splitting and Sub sampling
    
    Parameters
    ----------
    spark : spark session object
    sample_size: type-float: Data subsampling 1 for complete data 
    seed: Seed for sampling
    
    Return
    ----------
    train_final_data: type-Dataframe:Training data
    val_data: type-Dataframe: Validation data
    test_data: type-Dataframe: Testing data
    '''

    #Load Dataset
    interactions = spark.read.csv('hdfs:/user/bm106/pub/goodreads/goodreads_interactions.csv',\
                                header=True, schema="user_id INT, book_id INT,is_read INT,rating INT,is_reviewed INT")
    #Create temp view 
    interactions.createOrReplaceTempView('interactions')

    #Remove Rows with is_read = 0, is_reviewed = 0 and ratings = 0
    filtered_interactions = spark.sql(
        "SELECT * FROM interactions WHERE is_read=1 AND is_reviewed=1 AND rating > 0")
    filtered_interactions.createOrReplaceTempView('filtered_interactions')

    # keep only users with more than 10 interactions
    more_than_ten_interactions = spark.sql("SELECT D.book_id, D.is_read, D.rating, D.user_id, D.is_reviewed from filtered_interactions D INNER JOIN \
        (SELECT user_id, count(user_id) FROM filtered_interactions GROUP BY user_id having count(user_id) > 10) R ON D.user_id = R.user_id")
    more_than_ten_interactions.createOrReplaceTempView('more_than_ten_interactions')

    #Get all the unique user ids from the generated dataframe
    user_id_sampled = more_than_ten_interactions.select(more_than_ten_interactions.user_id).distinct()
    #Sub sampling
    if sample_size < 1:
        #Get-sub sampled unique user id list
        user_id_sampled = user_id_sampled.sample(withReplacement = False, fraction = sample_size, seed = seed)
        #Get interactions corresponding to sub-sampled unique user id list
        more_than_ten_interactions = more_than_ten_interactions.join(user_id_sampled, more_than_ten_interactions.user_id == user_id_sampled.user_id,'inner')\
                                .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id,more_than_ten_interactions.rating )


    #Split unique user id  data in the ratio Train:0.6, Val:0.2, Test:0.2
    train_users, val_users, test_users = user_id_sampled.randomSplit([0.6, 0.2, 0.2])

    #Get corresponding Train/ Val/Test Interactions
    train_interactions = more_than_ten_interactions.join(train_users, more_than_ten_interactions.user_id == train_users.user_id,'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id,more_than_ten_interactions.rating )
    val_interactions = more_than_ten_interactions.join(val_users, more_than_ten_interactions.user_id == val_users.user_id,'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id,more_than_ten_interactions.rating )
    test_interactions = more_than_ten_interactions.join(test_users, more_than_ten_interactions.user_id == test_users.user_id,'inner')\
        .select(more_than_ten_interactions.user_id, more_than_ten_interactions.book_id,more_than_ten_interactions.rating )

    #Grouping
    val_interactions = (val_interactions.select('user_id','book_id','rating',percent_rank().over(Window.partitionBy(val_interactions['user_id']).orderBy(val_interactions['book_id'])).alias('percent_50')))
    #reserve 50 percent into training 
    val_to_train = val_interactions.filter(col('percent_50') < 0.5).select('user_id','book_id','rating')
    #Take remaining 50 percent as validation
    val_data = val_interactions.filter(col('percent_50') >= 0.5).select('user_id','book_id','rating')

    #Grouping
    test_interactions = (test_interactions.select('user_id','book_id','rating',percent_rank().over(Window.partitionBy(test_interactions['user_id']).orderBy(test_interactions['book_id'])).alias('percent_50')))
    #reserve 50 percent into training 
    test_to_train = test_interactions.filter(col('percent_50') < 0.5).select('user_id','book_id','rating')
    #Take remaining 50 percent as Testing
    test_data = test_interactions.filter(col('percent_50') >= 0.5).select('user_id','book_id','rating')

    #Append validation extracted rows into training
    train_val_data = train_interactions.unionByName(val_to_train)
    #Append Testing extracted rows into training
    train_final_data = train_val_data.unionByName(test_to_train)

    return train_final_data, val_data, test_data
Example 23
File: train.py Project: araji/mlfow

# In[5]:



sales = order.join(customer, order.o_custkey == customer.c_custkey, how = 'inner')
sales = sales.join(lineitem, lineitem.l_orderkey == sales.o_orderkey, how = 'full')
sales = sales.where('c_mktsegment == "BUILDING"').select('l_quantity','o_orderdate')

sales = (sales.groupBy('o_orderdate')
              .agg({'l_quantity': 'sum'})
              .withColumnRenamed("sum(l_quantity)", "TOTAL_SALES")
              .withColumnRenamed("o_orderdate", "ORDERDATE"))


sales = (sales.withColumn('DATE', F.unix_timestamp(sales.ORDERDATE))
              .withColumn('DAY', F.dayofmonth(sales.ORDERDATE))
              .withColumn('WDAY', F.dayofweek(sales.ORDERDATE))
              .withColumn('YDAY', F.dayofyear(sales.ORDERDATE))
              .withColumn('WEEK', F.weekofyear(sales.ORDERDATE)))

sales = sales.withColumn("rank", percent_rank().over(Window.partitionBy().orderBy("DATE")))
training = sales.where("rank <= .8").drop("rank").drop("ORDERDATE")
testing  = sales.where("rank > .8").drop("rank").drop("ORDERDATE")


# In[7]:


featuresCols = training.columns
featuresCols.remove('TOTAL_SALES')
va = VectorAssembler(inputCols = featuresCols, outputCol = "features")


# In[8]:

with mlflow.start_run() as run:
Example 24
    -1, Window.currentRow))

df = (data.withColumn(
    "list_salary",
    F.collect_list(F.col("salary")).over(windowSpec)).withColumn(
        "average_salary",
        F.avg(F.col("salary")).over(windowSpec)).withColumn(
            "total_salary",
            F.sum(F.col("salary")).over(windowSpec)))

df.show()

# In[14]:

windowSpec = Window.partitionBy("dept").orderBy(F.asc("salary"))

df = (data.withColumn("average_salary",
                      F.avg(F.col("salary")).over(windowSpec)).withColumn(
                          "total_salary",
                          F.sum(F.col("salary")).over(windowSpec)).withColumn(
                              "rank",
                              F.rank().over(windowSpec)).withColumn(
                                  "dense_rank",
                                  F.dense_rank().over(windowSpec)).withColumn(
                                      "perc_rank",
                                      F.percent_rank().over(windowSpec)))

df.show()

# In[ ]:
Example 25
from pyspark.sql.types import FloatType

udf_median = func.udf(median, FloatType())

group_df = df.groupby(['a', 'd'])
df_grouped = group_df.agg(udf_median(func.collect_list(col('c'))).alias('median'))
df_grouped.show()


from pyspark.sql.window import Window
from pyspark.sql.functions import percent_rank,col,count

w =  Window.partitionBy('Reviewer_Nationality').orderBy(df_nationality.Reviewer_Score )


df_median_countries = df_nationality.select('Reviewer_Nationality',
                      'Reviewer_Score',
                      percent_rank().over(w).alias("percentile"),
                      count(col('Reviewer_Nationality')).over(w).alias('N'))\
.orderBy('N', ascending = False)\
.where('percentile>0.48 and percentile<0.52')


spark.catalog.dropTempView("medians")

df_median_countries.createTempView('medians')

spark.sql("SELECT * FROM medians").show()

spark.sql("SELECT Reviewer_Nationality, \ 
AVG(percentile) AS PERC \
FROM medians\
GROUP BY Reviewer_Nationality\
        'Itau_open', 'BVSP_open', 'USDBRL_open', 'lag_1', 'lag_2', 'lag_3'
    ]
    vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1,
                                      outputCol="features")
    vdf = vectorAssembler.transform(df.na.drop())
    vdf = vdf.select(
        ['Date', 'Itau_Close', 'features', 'class_N', 'class_NN', 'class_P'])
    scale_features = MinMaxScaler(inputCol='features',
                                  outputCol='scaled_features')
    model_scale = scale_features.fit(vdf)
    df_scaled = model_scale.transform(vdf)
    FEATURES_COL1 = ['scaled_features', 'class_N', 'class_NN', 'class_P']
    vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1,
                                      outputCol="Col_features")
    df_completed = vectorAssembler.transform(df_scaled)
    df_completed = df_completed.select(['Date', 'Itau_Close', 'Col_features'])
    df_completed = df_completed.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("Date")))
    train_df = df_completed.where("rank <= .95").drop("rank")
    test_df = df_completed.where("rank > .95").drop("rank")
    lr = LinearRegression(featuresCol='Col_features', labelCol='Itau_Close')
    lr_model = lr.fit(train_df)
    lr_predictions = lr_model.transform(test_df)
    lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                     labelCol="Itau_Close",metricName="r2")
    test_result = lr_model.evaluate(test_df)
    predictions = lr_model.transform(test_df)
    predictions.select("prediction", "Itau_close").write.option(
        "header", "true").mode("overwrite").parquet('pred.parquet')
Example 27
def score_hive(hc, conf):
    with Timer('[score] command'):
        rdd, all_colnames = read_hive_scoring_data(hc, conf['hive_scoring_data']['input_db_name'],
                                                   conf['hive_scoring_data']['input_table_name'],
                                                   os.path.join(conf['path_to_output'], 'scoring', 'colnames_for_scoring.txt'),
                                                   conf['colname_for_id'], log_start_time=False, indent=2, level='debug')
        rdd = rdd.cache()

        label_encoders = import_object(os.path.join(conf['path_to_output'], 'scoring', 'label_encoders', 'label_encoders.p'),
                                       log_start_time=False, indent=2, level='debug')
        label_decoders = {cat_colname: {value: key for key, value in label_encoder.items()} for
                          cat_colname, label_encoder in label_encoders.items()}

        best_model_names = pd.read_csv(os.path.join(conf['path_to_output'], 'scoring', 'best_models.csv')).set_index('target')['best_model']

        id_index = all_colnames.index(conf['colname_for_id'].upper())
        targets = conf['targets']

        estimators, feature_info, bias_adjusters = prepare_scoring(all_colnames, label_decoders, targets,
                                                                   best_model_names, conf['path_to_output'],
                                                                   conf['train_event_rate'],
                                                                   conf['fill_values_categorical_missing'],
                                                                   log_start_time=False, indent=2, level='debug')

        estimators_bc = sc.broadcast(estimators)
        feature_info_bc = sc.broadcast(feature_info)
        bias_adjusters_bc = sc.broadcast(bias_adjusters)

        def extract_feature(row, index, label, fill_value):
            variable_raw = row[index]

            if variable_raw == None or variable_raw == u'':
                variable_raw = fill_value

            if label is None:
                variable = float(variable_raw)
            else:
                variable = 1.0 if variable_raw == label else 0.0

            return variable

        def extract_records(row):
            records = [[extract_feature(row, index, label, fill_value) for index, label, fill_value in feature_info_bc.value[i]] for i in range(len(targets))]
            return (row[id_index], records)

        def score_chunck(chunck):
            ids, records = zip(*chunck)
            records = zip(*records)
            results = [ids]
            for i in range(len(targets)):
                results.append(estimators_bc.value[i].predict_proba(list(records[i]))[:, 1])
            return zip(*results)

        def adjust_bias(row):
            row_adjusted = list(row)
            for i in range(len(row) - 1):
                bias_adjuster = bias_adjusters_bc.value[i]
                row_adjusted[i + 1] = bias_adjuster[0] * row[i + 1] / (bias_adjuster[1] - bias_adjuster[2] * row[i + 1])
            return row_adjusted

        def make_initial_schema(targets):
            schema = StructType()
            schema_str_list = list()

            for header in targets:
                if header.upper() == conf['colname_for_id'].upper():
                    schema.add(header, data_type=StringType())
                    schema_str_list.append(header + " STRING")
                else:
                    schema.add(header + "_SCORE", data_type=FloatType())
                    schema_str_list.append(header + "_SCORE FLOAT")

            return schema, schema_str_list

        headers = [conf['colname_for_id'].upper()] + conf['targets']
        initial_schema, initial_schema_str_list = make_initial_schema(headers)
        score_cols = [col for col in initial_schema.names if col.endswith("_SCORE")]

        def break_ties(target):
            return (funcs.col(target) + (funcs.randn(conf['random_state']) / funcs.lit(10000000000))).alias(target)

        scores_df = (rdd.map(extract_records)
                     .mapPartitions(score_chunck)
                     .map(adjust_bias)
                     .map(lambda x: [z.item() if type(z) is pd.np.float64 else z for z in x])
                     .toDF(initial_schema)
                     .repartition(conf['colname_for_id'])
                     .select(conf['colname_for_id'], *(break_ties(target) for target in score_cols))
                     .cache()
                     )

        if conf['score_analysis_flag'] > 0:
            final_df = None
            for target in score_cols:
                w = Window.orderBy(scores_df[target].desc())
                perc_rank_func = (funcs.percent_rank().over(w))
                rank_func = (funcs.dense_rank().over(w))
                temp = (scores_df.select(conf['colname_for_id'],
                                         target,
                                         rank_func.alias(target + "_rank"),
                                         perc_rank_func.alias(target + "_percent_rank"))
                        .repartition(conf['colname_for_id'])
                        .cache()
                        )
                if final_df is None:
                    final_df = temp
                else:
                    final_df = final_df.join(temp, on=conf['colname_for_id'], how='outer')
        else:
            final_df = scores_df

        full_table_name = conf['hive_scoring_data']['output_db_name'] + '.' + conf['hive_scoring_data']['output_table_name']

        drop_sql = "DROP TABLE IF EXISTS " + full_table_name
        hc.sql(drop_sql)

        final_df.registerTempTable('final_table')
        create_sql = "CREATE TABLE " + full_table_name + " ROW FORMAT DELIMITED FIELDS TERMINATED BY '\\t' STORED AS SEQUENCEFILE AS SELECT * FROM final_table"
        hc.sql(create_sql)