Example no. 1
    def test_bucketizer(self):
        values = [(0.1, ), (0.4, ), (1.2, ), (1.5, )]
        data = self.spark.createDataFrame(values, ["features"])
        model = Bucketizer(splits=[-float("inf"), 0.5, 1.4,
                                   float("inf")],
                           inputCol="features",
                           outputCol="buckets")

        feature_count = len(data.select('features').first())
        model_onnx = convert_sparkml(
            model, 'Sparkml Bucketizer',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)
        # run the model
        predicted = model.setHandleInvalid("error").transform(data)
        expected = predicted.select("buckets").toPandas().values.astype(
            numpy.float32)
        data_np = [data.toPandas().values.astype(numpy.float32)]
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlBucketizer")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['buckets'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
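Example no. 2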
def create_buckets(percentage_of_missing_ctus_per_partyid):
    """
    Divide party ids by percentage of missing ctus into 5 buckets:
    [0, 0.25)
    [0.25, 0.5)
    [0.5, 0.75)
    [0.75, 0.99)
    [0.99, Inf)
    Output:
    +--------+-----------------------+-------+
    |party_id|percentage_missing_ctus|buckets|
    +--------+-----------------------+-------+
    |       1|                    0.2|    0.0|
    |       2|                   0.33|    1.0|
    |       3|                    1.0|    4.0|
    |       4|                   0.75|    3.0|
    |       5|                    0.6|    2.0|
    |       6|                    0.6|    2.0|
    +--------+-----------------------+-------+
    """

    bucketizer = Bucketizer(splits=[0, 0.25, 0.5, 0.75, 0.99, float('Inf')],
                            inputCol="percentage_missing_ctus",
                            outputCol="buckets")
    df_of_buckets_ratio_between_imputed_distinct_ctus = \
        bucketizer.setHandleInvalid("keep").transform(percentage_of_missing_ctus_per_partyid)
    return df_of_buckets_ratio_between_imputed_distinct_ctus
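
# A minimal usage sketch (not part of the scraped example): it assumes an
# active SparkSession named `spark` and a hypothetical toy DataFrame matching
# the docstring's schema.
toy_df = spark.createDataFrame(
    [(1, 0.2), (2, 0.33), (3, 1.0), (4, 0.75)],
    ["party_id", "percentage_missing_ctus"])
create_buckets(toy_df).show()
Example no. 3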
    def get_column_hist(self, column, bins):
        """return a list of counts corresponding to bins"""
        bins = list(copy.deepcopy(bins))  # take a copy since we are inserting and popping
        if bins[0] == -np.inf or bins[0] == -float("inf"):
            added_min = False
            bins[0] = -float("inf")
        else:
            added_min = True
            bins.insert(0, -float("inf"))

        if bins[-1] == np.inf or bins[-1] == float("inf"):
            added_max = False
            bins[-1] = float("inf")
        else:
            added_max = True
            bins.append(float("inf"))

        temp_column = self.spark_df.select(column).where(col(column).isNotNull())
        bucketizer = Bucketizer(
            splits=bins, inputCol=column, outputCol="buckets")
        bucketed = bucketizer.setHandleInvalid("skip").transform(temp_column)

        # Bucketizer cannot handle values outside its split range, hence the
        # -inf/+inf splits added above.

        # It also follows the numpy convention of lower_bound <= x < upper_bound
        # for every bucket except the last one, whose upper bound is inclusive.

        # When we appended +infinity ourselves (added_max), values exactly equal
        # to the caller's upper bound fall into the overflow bucket rather than
        # the last real bin, so we count them here and add them back below;
        # asking for that count now doubles as a small optimization.
        if added_max:
            upper_bound_count = temp_column.select(column).filter(
                col(column) == bins[-2]).count()
        else:
            upper_bound_count = 0

        hist_rows = bucketed.groupBy("buckets").count().collect()
        # Spark only returns buckets that have nonzero counts.
        hist = [0] * (len(bins) - 1)
        for row in hist_rows:
            hist[int(row["buckets"])] = row["count"]

        hist[-2] += upper_bound_count

        if added_min:
            below_bins = hist.pop(0)
            bins.pop(0)
            if below_bins > 0:
                logger.warning("Discarding histogram values below lowest bin.")

        if added_max:
            above_bins = hist.pop(-1)
            bins.pop(-1)
            if above_bins > 0:
                logger.warning("Discarding histogram values above highest bin.")

        return hist
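
# A quick check of the interval convention relied on above (a sketch, assuming
# an active SparkSession named `spark` and the Bucketizer import used in these
# examples): each bucket is [lower, upper) except the last, whose upper split
# is inclusive.
demo = spark.createDataFrame([(0.0,), (1.0,), (2.9,), (3.0,)], ["x"])
demo_bucketizer = Bucketizer(splits=[0.0, 1.0, 2.0, 3.0], inputCol="x", outputCol="b")
demo_bucketizer.transform(demo).show()
# x=1.0 falls in bucket 1 ([1, 2)); x=3.0 falls in bucket 2 because the last
# bucket also includes its upper split.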
Example no. 4
def model_train(zipcode, complaint, day):
    print("Loading Data ...")
    data311 = spark.read.format("csv").option("header",
                                              "true").load("Data_Final/*.csv")
    infer_schema = "true"
    first_row_is_header = "true"
    delimiter = ","
    data311.registerTempTable("data311")
    data311 = data311.withColumn("ResTimeH",
                                 data311.Resolution_Time_Hours.cast('int'))
    data311 = data311.withColumn('day_of_week',
                                 dayofweek(data311['Created Date']))
    data311 = data311.withColumn("Zip", data311["Incident Zip"].cast('int'))
    data311 = data311.filter(data311.ResTimeH > 0)
    data311 = data311.filter(data311.ResTimeH < 99)
    bucketizer = Bucketizer(splits=[0, 2, 6, float('Inf')],
                            inputCol="ResTimeH",
                            outputCol="categories")
    data311 = bucketizer.setHandleInvalid("keep").transform(data311)
    X = data311['Zip', 'Complaint_Type_Groups', 'day_of_week', 'categories']
    X = X.filter(X["Zip"].isNotNull())
    X = X.filter(X["Complaint_Type_Groups"].isNotNull())
    X = X.filter(X["day_of_week"].isNotNull())

    stage_1 = StringIndexer(inputCol="Complaint_Type_Groups",
                            outputCol="categoryIndex")
    stage_2 = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])
    stage_3 = VectorAssembler(inputCols=['Zip', 'day_of_week', 'categoryVec'],
                              outputCol="features")
    stage_4 = StandardScaler().setInputCol("features").setOutputCol(
        "Scaled_ip_features")
    stage_5 = LogisticRegression(labelCol="categories",
                                 featuresCol="Scaled_ip_features")
    # setup the pipeline
    pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, stage_4, stage_5])
    # fit the pipeline model and transform the data as defined
    pipeline_model = pipeline.fit(X)

    zipcode = int(zipcode)
    day = int(day)
    input_variables = pd.DataFrame(
        [[zipcode, complaint, day]],
        columns=['Zip', 'Complaint_Type_Groups', 'day_of_week'])
    input_variables = spark.createDataFrame(input_variables)

    transformed = pipeline_model.transform(input_variables)
    ans = transformed.select(collect_list('prediction')).first()[0]

    if ans[0] == 0.0:
        prediction = "Your complaint will be resolved within 2 hours."
    elif ans[0] == 1.0:
        prediction = "Your complaint will be resolved within 2-6 hours."
    else:
        prediction = "Your complaint will be resolved after 6 hours."
    return prediction
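
# Hypothetical call (a sketch only; it assumes the Data_Final/*.csv files and
# the `spark` session used inside model_train are available, and the complaint
# group name below is made up):
print(model_train("10001", "Noise - Residential", 2))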
Example no. 5
def age_recoder(spark_df, age_col):
    """
    :param spark_df:
    :param age_col:
    :return:
    """

    age1 = Bucketizer(splits=[0, 2, 6, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age1")
    age7 = Bucketizer(splits=[0, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age7")
    age8 = Bucketizer(splits=[0, 12, 18, 25, 35, 45, 55, 65, 150],
                      inputCol=age_col,
                      outputCol="age8")

    sdf_1 = age1.setHandleInvalid("keep").transform(spark_df)
    sdf_2 = age7.setHandleInvalid("keep").transform(sdf_1)
    res_sdf = age8.setHandleInvalid("keep").transform(sdf_2)

    return res_sdf
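
# A minimal usage sketch (not from the original), assuming an active
# SparkSession named `spark`:
people = spark.createDataFrame([(4.0,), (30.0,), (70.0,)], ["age"])
age_recoder(people, "age").show()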
Example no. 6
    def _bucketize_age_column(
            self, dataframe: DataFrame, input_col: str,
            output_col: str) -> Tuple[DataFrame, int, List[str]]:
        bucketizer = Bucketizer(splits=self.age_groups,
                                inputCol=input_col,
                                outputCol=output_col)
        output = bucketizer.setHandleInvalid("keep").transform(dataframe)
        splits = list(bucketizer.getSplits())
        mapping = [
            "[{}, {})".format(splits[i], splits[i + 1])
            for i in range(len(splits) - 1)
        ]
        n_age_groups = len(mapping)
        return output, n_age_groups, mapping
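
# For illustration (a sketch, not from the original class): with hypothetical
# age_groups splits, the mapping built above looks like this. Note that
# Bucketizer's last bucket actually also includes its upper split.
splits = [0.0, 18.0, 65.0, 100.0]
mapping = ["[{}, {})".format(splits[i], splits[i + 1])
           for i in range(len(splits) - 1)]
# mapping == ['[0.0, 18.0)', '[18.0, 65.0)', '[65.0, 100.0)']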
Example no. 7
def add_age_id(spark, df, logger):
    """Calculate the age_id by splitting the visitor age into buckets"""
    agebucketizer = Bucketizer(splits=[float('-Inf'), 0, 2, 11, 16, 21,
                                       26, 36, 46, 56, 66, float('Inf')],
                               inputCol="i94bir",
                               outputCol="agebuckets")
    agebuck_df = agebucketizer.setHandleInvalid("keep").transform(df)
    age_id_df = agebuck_df.withColumn(
        "age_id",
        when(col("i94bir") == -1, 999)
        .otherwise(col("agebuckets").cast(IntegerType())))
    logger.info("Added age_id")
    age_id_df.persist()
    return age_id_df
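
# Hedged usage sketch (assumes the pyspark.sql.functions and IntegerType
# imports used above plus a `spark` session; the i94bir values are made up):
import logging
visitors = spark.createDataFrame([(25.0,), (-1.0,), (71.0,)], ["i94bir"])
add_age_id(spark, visitors, logging.getLogger(__name__)).show()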
Example no. 8
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(splits=[float('-Inf'), 0, 4, 8, 11, 15, 22,
                                      29, float('Inf')],
                              inputCol="duration_days",
                              outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn(
        "visitduration_id",
        when(isnull(col("arrdate")) | isnull(col("depdate")), 999)
        .otherwise(col("ddbuckets").cast(IntegerType())))
    logger.info("Added duration_id")
    return dur_id_df
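
# Hedged usage sketch (same assumptions as above, with made-up arrival and
# departure dates):
import datetime
import logging
visits = spark.createDataFrame(
    [(datetime.date(2016, 4, 1), datetime.date(2016, 4, 5)),
     (datetime.date(2016, 4, 1), None)],
    ["arrdate", "depdate"])
add_duration_id(spark, visits, logging.getLogger(__name__)).show()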
Example no. 9
df_rb_short = df_rb.drop('helpful').drop('reviewerName').drop('unixReviewTime')

df_join = df_rb_short.join(df_mb_short, df_rb_short.asin == df_mb_short.asin,
                           'left').drop(df_mb_short.asin)
cols_todrop = ['reviewText', 'summary', 'description', 'imUrl', 'title']
df_join_short = df_join.drop(*cols_todrop)
df_join_short = df_join_short.withColumn('reviewYear', F.year('reviewTime'))
max_rank = df_join_short.agg({
    'salesRank_Books': 'max'
}).collect()[0]["max(salesRank_Books)"]
bucketizer = Bucketizer(
    splits=[i * 5000 for i in list(range(int(max_rank / 5000) + 2))],
    inputCol="salesRank_Books",
    outputCol="salesRank_Books_buckets")
df_join_short = bucketizer.setHandleInvalid("keep").transform(df_join_short)
df_join_short = df_join_short.withColumn(
    'salesRank_ranged', df_join_short.salesRank_Books_buckets * 5000)
df_join_short = df_join_short.drop('salesRank_Books_buckets')

#Visualizations
#viz1
viz1_df = df_join_short.groupBy(
    'overall').count().toPandas()  #distribution of ratings
plt.bar(viz1_df['overall'], viz1_df['count'])
plt.xlabel('Rating')
plt.ylabel('Count')
plt.title('Rating Distribution')
plt.show()
#viz2
viz2_df = df_join_short.groupBy('reviewYear').agg(
Example no. 10
data.describe('age').show()
data_group_month = data.groupBy('month').count()
data_group_weekday = data.groupBy('weekday').count()

data.createOrReplaceTempView('datas')
path = spark.sql(
    "select dpstation, rcstation, count(*) as num from datas group by dpstation, rcstation order by num desc"
)

# Grouped statistics should be computed from the passenger table
# agegroup
from pyspark.ml.feature import Bucketizer
splits = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
bucketizer = Bucketizer(splits=splits, inputCol='age', outputCol='agegroup')
cust_group_age = bucketizer.setHandleInvalid('keep').transform(cust)
cust_group_age = cust_group_age.groupBy('agegroup').count()
cust_group_age.coalesce(1).write.format('csv').option('header',
                                                      'true').save('custage')

# sex
cust_group_sex = cust.groupBy('sex').count()
cust_group_sex.coalesce(1).write.format('csv').option('header',
                                                      'true').save('custsex')

# freq
data = data.withColumn('year', lit('2020'))
data = data.withColumn(
    'dpdate', concat_ws('-', data['year'], data['month'], data['day']))
data = data.withColumn('dpdate', to_date(data['dpdate']))
data.createOrReplaceTempView('datas')
Example no. 11
df_demo = df_demo.withColumn('islamic', islamic_udf(df_demo.DEMO_CAPTUREDATE))

df_demo = df_demo.withColumn('islamic_month', get_month_udf(df_demo.islamic))

# df_demo = df_demo.withColumn('day_week', day_of_week_udf(df_demo.DEMO_CAPTUREDATE))

df_demo = df_demo.withColumn('age', age_udf(df_demo.birth_Year))

df_demo = df_demo.withColumn('location', location_udf(df_demo.cityTel))

bucketizer = Bucketizer(splits=[0, 10, 20, 30, 40, 60,
                                float('Inf')],
                        inputCol="age",
                        outputCol="age_Category")

df_cat = bucketizer.setHandleInvalid("keep").transform(df_demo)

#df_cat = df_cat.withColumn("age_Category", df_cat["age_Category"].cast("integer")) # converting to integer type

#df_cat = df_cat.withColumn("age_Category", df_cat["age_Category"].cast("string")) # converting to string type

df = df_cat.filter((df_cat.islamic_month == '09'))

dfCount = df.groupBy('ID',
                     'DEMO_HOUR').count().select('ID', 'DEMO_HOUR',
                                                 f.col('count').alias('count'))

dfMax = dfCount.groupBy('ID').agg(
    f.last("count").alias("maxCount"),
    f.last("DEMO_HOUR").alias("interest_HOUR"))
Example no. 12
qc2 = sqlContext.sql("select StateCodes, TotalC2014, 2014_GDP_per_Capita, t1.State from Consumption as t1 join income as t2 where t1.State = t2.State")
qc2.registerTempTable("incomeConsumption")
qc2.coalesce(1).write.format('com.databricks.spark.csv').save('incomeConsumption',header = 'true')

#join the incomeConsumption with homelessness
qc3 = sqlContext.sql("select TotalC2014, 2014_GDP_per_Capita, Count, t1.StateCodes from incomeConsumption as t1 join states as t2 where t1.StateCodes = t2.StateCodes")
qc3.registerTempTable("homelessconsumption")
qc3.coalesce(1).write.format('com.databricks.spark.csv').save('consumptionHomeless',header = 'true')

#binning consumption to see if more homeless people are in high consumption areas
qc4 = sqlContext.sql("select TotalC2014, Count, StateCodes from homelessconsumption order by TotalC2014")
crdd = qc4.rdd.map(tuple)
crdd = crdd.map(lambda x: (int(x[0]), x[1], x[2]))
consumption_df = spark.createDataFrame(crdd, ["TotalC2014", "Count", "StateCodes"])
bucketizer = Bucketizer(splits=[ 0, 500000, 2000000, float('Inf') ], inputCol="TotalC2014", outputCol="buckets")
consumption_df_buck = bucketizer.setHandleInvalid("keep").transform(consumption_df)
t = {0.0:"low Consumption", 1.0: "Medium Consumption", 2.0:"High Consumption"}
udf_foo = udf(lambda x: t[x], StringType())
consumption_df_buck.withColumn("consumption_bucket", udf_foo("buckets")).show()
c_df_bucket = consumption_df_buck.withColumn("consumption_bucket", udf_foo("buckets"))
c_df_bucket.registerTempTable("consumptionHomelessBuck")
qc5 = sqlContext.sql("select consumption_bucket, Count, buckets from consumptionHomelessBuck order by consumption_bucket")
qc5.registerTempTable("binnedC")
qc5.coalesce(1).write.format('com.databricks.spark.csv').save('binnedconsumptionHomeless',header = 'true')

#analysis for average
qc6 = sqlContext.sql("select consumption_bucket, Count from binnedC")
crdd2 = qc6.rdd.map(tuple)
crdd2 = crdd2.mapValues(lambda x: int(x)).reduceByKey(lambda x,y: x+y)
mapped_binnedc_df = sqlc.createDataFrame(crdd2, ['consumption_bucket', 'Count'])
mapped_binnedc_df.registerTempTable("mappedbinnedc")
Example no. 13
# calculate outlier, 3 sigma, standard deviation
# df.select(
#     [F.mean(c).alias(c+'_mean') for c in outlier_cols]+[F.stddev(c).alias(c+'_std') for c in outlier_cols]
# ).collect()
df.select(
    [(F.mean(c)+3*F.stddev(c)).alias(c+'_ceil') for c in outlier_cols]
).show()

# Label the data according to the conditions (outlier flag per column)
df.select(*bounds,
          *[F.when(~F.col(c).between(bounds[c]['min'], bounds[c]['max']), "yes").otherwise("no").alias(c+'_outlier') for c in bounds])

# cut feature into bins, bucketizer, buckets, discretization
from pyspark.ml.feature import Bucketizer
bucketizer = Bucketizer(splits=[0, 6, 18, 60, float('Inf')], inputCol="ages", outputCol="buckets")
df_buck = bucketizer.setHandleInvalid("keep").transform(df)

# bins, method2
from functools import reduce
splits = [0, 5, 9, 10, 11]
splits = list(enumerate(splits))  # [(0, 0), (1, 5), (2, 9), (3, 10), (4, 11)]
bins = reduce(lambda c, i: c.when(F.col('Age') <= i[1], i[0]), splits, F.when(F.col('Age') < splits[0][0], None)).otherwise(splits[-1][0] + 1).alias('bins')
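# The reduce above expands into a chained expression equivalent to:
# F.when(F.col('Age') < 0, None).when(F.col('Age') <= 0, 0)
#  .when(F.col('Age') <= 5, 1).when(F.col('Age') <= 9, 2)
#  .when(F.col('Age') <= 10, 3).when(F.col('Age') <= 11, 4).otherwise(5)
# Unlike Bucketizer, each bin's upper bound is inclusive here.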
df = df.select('age', bins)

# Cluster-based feature processing: cluster the feature values according to
# the characteristics of the data itself.
# Remove outliers first, then cluster.

from pyspark.ml.evaluation import ClusteringEvaluator

data_len = new_df.count()
print("数据量:{}".format(data_len))
# num_tweets_each_hour.show()
# num_tweets_each_hour_list = list(map(lambda row: row.asDict(), num_tweets_each_hour.collect()))

# Now check sentiment wise tweet count
logging.info(
    f"Calculating total sentiment wise tweets by each user at hour level...")
# First flatten the dataframe
df_flattened = flatten_df(df2)

# first bucketed sentiment scores
bucketizer = Bucketizer(splits=[-1, -0.25, 0.25, 1],
                        inputCol="sentiment_score",
                        outputCol="sentiment_score_integer")

df_bucketized = bucketizer.setHandleInvalid("keep").transform(df_flattened)

sentiment_labels = ["negative", "neutral", "positive"]
label_array = array(*(lit(label) for label in sentiment_labels))

df_bucketized_with_sentiment_label = df_bucketized.withColumn(
    "sentiment_score_label",
    label_array.getItem(col("sentiment_score_integer").cast("integer")))
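
# E.g. a sentiment_score of -0.5 falls in bucket 0 -> "negative", 0.0 in
# bucket 1 -> "neutral", and 0.9 in bucket 2 -> "positive" (the last bucket
# also includes the upper split of 1).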

# df_bucketized_with_sentiment_label.toPandas()

num_tweets_each_hour_sentimental = df_bucketized_with_sentiment_label.groupBy("username", "author_id",
                                                                              window("created_at", "1 hour"),
                                                                              "sentiment_score_label") \
    .count() \
    .select(col("username").alias("username_b"), col("author_id").alias("author_id_b"), col("window").alias("window_b"),
Example no. 15
############################
# Deriving/Transforming Variables with Bucketizer
############################

from pyspark.ml.feature import Bucketizer

spark_df.select('age').describe().toPandas().transpose()
"""
             0        1                   2    3    4
summary  count     mean              stddev  min  max
age      10000  38.9218  10.487806451704587   18   92

"""
bucketizer = Bucketizer(splits=[0, 35, 45, 65], inputCol="age", outputCol="age_cat")
spark_df = bucketizer.setHandleInvalid("keep").transform(spark_df)
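# With handleInvalid("keep"), ages above 65 (max age in this data is 92) do not
# raise an error; they are kept in an extra bucket 3.0.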
spark_df.show(20)

spark_df = spark_df.withColumn('age_cat', spark_df.age_cat + 1)
spark_df.groupby("age_cat").count().show()
spark_df.groupby("age_cat").agg({'geography': "mean"}).show()

spark_df = spark_df.withColumn("age_cat", spark_df["age_cat"].cast("integer"))
#Out[89]: DataFrame[rownumber: int, customerid: int, surname: string, creditscore: int, geography: string, gender: string, age: int, tenure: int, balance: double, numofproducts: int, hascrcard: int, isactivemember: int, estimatedsalary: double, exited: int, age_cat: int]

############################
# Deriving Variables with when (segment)
############################

spark_df = spark_df.withColumn('segment', when(spark_df['tenure'] < 5, "segment_b").otherwise("segment_a"))