Code Example #1
    def _recency_aggregation(self, save_results, eval_path):
        print('[ '+str(datetime.utcnow())+' ] : Calculating recency aggregations for predictions by model = '+self.name)
        df = self.pred_df \
            .filter((F.col('recency_x') >= 0) & (F.col('recency_x') <= 365)) \
            .withColumn('days_until_deact', (F.lit(365) - F.col('recency_x')).cast(IntegerType())) \
            .withColumn('log_loss', F.when(F.col('deactivated') == 1, -F.log(F.col('prob_deact')))
                        .otherwise(-F.log(F.lit(1.0) - F.col('prob_deact')))) \
            .groupBy('days_until_deact') \
            .agg(F.count('consumer_id').alias('count_users'),
                 F.sum('deactivated').alias('deacts_actual'),
                 F.sum('prob_deact').cast(IntegerType()).alias('deacts_pred'),
                 F.avg('log_loss').alias('avg_log_loss'),
                 F.avg('prob_deact').alias('avg_prob_deact'),
                 # Accuracy uses the "prediction" column, which assigns consumers to a class using a 0.50 cutoff
                 F.avg((F.col('deactivated') == F.col('prediction')).cast(IntegerType())).alias('accuracy')) \
            .withColumn('pct_deact_actual', F.col('deacts_actual') / F.col('count_users')) \
            .withColumn('pct_deact_pred', F.col('deacts_pred') / F.col('count_users')) \
            .withColumn('pred_over_actual_deacts', F.col('deacts_pred') / F.col('deacts_actual')) \
            .withColumn('diff_pct_deact', F.col('pct_deact_pred') - F.col('pct_deact_actual')) \
            .sort('days_until_deact') \
            .toPandas()
        self.recency_agg = df

        if save_results:
            df.to_csv(eval_path + 'model=' + self.name + '/recency_aggregation.tsv', sep='\t', index=False)
Code Example #2
File: helper.py Project: bps10/github-churn
def create_KMeans_features(df, original=True):
    if original:
        df = df.withColumn(
            'non_passive_events',
            F.log(df.frequency -
                  (df.DeleteEvent_count + df.GollumEvent_count +
                   df.IssueCommentEvent_count + df.MemberEvent_count +
                   df.WatchEvent_count + 1)))

        df = df.withColumn(
            'public_repos_gists',
            F.log(df.public_repos_count + df.public_gists_count + 1))

        # Assemble pipeline
        stages = [
            VectorAssembler(
                inputCols=['non_passive_events', 'public_repos_gists'],
                outputCol="KMeans_features").setHandleInvalid("skip")
        ]
    else:
        # Assemble pipeline
        stages = [
            VectorAssembler(
                inputCols=['frequency', 'recency'],
                outputCol="KMeans_features").setHandleInvalid("skip")
        ]

    pipeline = Pipeline(stages=stages)
    pipelineModel = pipeline.fit(df)
    df = pipelineModel.transform(df)
    #selectedCols = ['label', 'features']
    #churn_data = churn_data.select(selectedCols)
    #churn_data.printSchema()
    return df
Code Example #3
def prepare_data():
    """Commodity function to read the data from the files and prepare the features for the kmeans model fit.
    """
    # Read data from files.
    _data = load_data()

    # The following features are not normally distributed, so they are log-scaled to bring their
    # distributions closer to normal, which helps the k-means algorithm perform better.
    _data = _data.withColumn('log_age', F.log('age')).withColumn('log_avg_buy', F.log('avg_buy'))\
        .withColumn('log_min_buy', F.log('min_buy')).withColumn('log_max_buy', F.log('max_buy'))

    # Select the features to use in k-means. The features will also be standard-scaled, i.e. mean-centered
    # and scaled to unit standard deviation.
    features = _data.columns[4:]

    assembler = VectorAssembler(inputCols=features,
                                outputCol='features_unscaled')
    assembled = assembler.transform(_data)

    scaler = StandardScaler(inputCol='features_unscaled',
                            outputCol='features',
                            withStd=True,
                            withMean=True)
    scaler_model = scaler.fit(assembled)
    scaled_data = scaler_model.transform(assembled)

    return scaled_data, features
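
As a rough illustration of how the prepared frame could be consumed downstream (a minimal sketch, not part of the original project: the KMeans call, the value of k, and the seed are assumptions):

from pyspark.ml.clustering import KMeans

# Sketch only: assumes prepare_data() above is importable and a SparkSession is active.
scaled_data, features = prepare_data()

# Fit k-means on the standard-scaled vector column produced by the StandardScaler.
kmeans = KMeans(featuresCol='features', k=4, seed=42)  # k=4 is an arbitrary choice for illustration
model = kmeans.fit(scaled_data)

# transform() appends a 'prediction' column with the assigned cluster id.
model.transform(scaled_data).groupBy('prediction').count().show()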
Code Example #4
def popularity_based_metrics(ratings, tips):
    total_reviews = ratings.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_reviews"))

    all_pairs = ratings.join(
        ren(ratings, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    all_pairs = all_pairs.join(total_reviews, "business_id")

    adamic_ratings = all_pairs.groupBy("user_id", "user_id_2").agg(
        F.sum(1 /
              F.log("total_reviews")).cast("float").alias("aa_pop_ratings"))

    tips = tips.join(ratings.select("user_id").distinct(), "user_id", "right")

    total_tips = tips.groupBy("business_id").agg(
        F.count(F.lit(1)).alias("total_tips"))

    all_pairs = ratings.join(
        ren(ratings, ["business_id"]),
        "business_id").filter(col("user_id") < col("user_id_2"))

    all_pairs = all_pairs.join(total_tips, "business_id")

    adamic_tips = all_pairs.groupBy("user_id", "user_id_2").agg(
        F.sum(1 / F.log("total_tips")).cast("float").alias("aa_pop_tips"))

    return adamic_ratings.join(adamic_tips, ["user_id", "user_id_2"], "outer")
Code Example #5
File: deact.py Project: PipCourbois/miscellaneous
    def recency_aggregation(self):
        print('[ {0} ] : Calculating aggregations by recency for model predictions'.format(datetime.utcnow()))
        prob_col = 'masked_prob' if self.masked else 'prob_deact_cal' if self.calibrated else 'prob_deact'
        df = self.prediction_df \
            .filter((F.col('recency_x') >= 0) & (F.col('recency_x') <= 365)) \
            .withColumn('days_until_deact', (F.lit(365) - F.col('recency_x')).cast(IntegerType())) \
            .withColumn('log_loss', F.when(F.col('deactivated') == 1, -F.log(F.col(prob_col)))
                        .otherwise(-F.log(F.lit(1.0) - F.col(prob_col)))) \
            .groupBy('days_until_deact') \
            .agg(F.count('consumer_id').alias('count_users'),
                 F.sum('deactivated').alias('deacts_actual'),
                 F.sum(prob_col).cast(IntegerType()).alias('deacts_pred'),
                 F.avg('log_loss').alias('avg_log_loss'),
                 F.avg(prob_col).alias('avg_prob_deact'),
                 F.avg((F.col('deactivated') == F.round(F.col(prob_col), 0)).cast(IntegerType())).alias('accuracy'),
                 ) \
            .withColumn('pct_deact_actual', F.col('deacts_actual') / F.col('count_users')) \
            .withColumn('pct_deact_pred', F.col('deacts_pred') / F.col('count_users')) \
            .withColumn('pred_over_actual_deacts', F.col('deacts_pred') / F.col('deacts_actual')) \
            .withColumn('diff_pct_deact', F.col('pct_deact_pred') - F.col('pct_deact_actual')) \
            .sort('days_until_deact') \
            .toPandas()
        self.recency_df = df

        if self.save_results:
            df.to_csv(self.eval_path + 'recency_aggregation.tsv', sep='\t', index=False)

        self._recency_plots()
Code Example #6
def calculate_volatility(rolling_windows=20):
    spark = SparkSession.builder.master('local[*]').appName('Volatility').getOrCreate()
    df = spark.read.format('csv')\
                     .option('header', 'true')\
                     .load('/media/guolewen/research_data/compustats/*.csv')
    # adjust price with stock/dividend split ratio
    df = df.withColumn('adjprccd', df['prccd'] / df['ajexdi'])
    df = df.withColumn('adjprchd', df['prchd'] / df['ajexdi'])
    df = df.withColumn('adjprcld', df['prcld'] / df['ajexdi'])
    # create window
    win_spec = Window.partitionBy('isin').orderBy('datadate')
    # lag price
    df = df.withColumn('ladjprccd', lag('adjprccd').over(win_spec))
    # compute the squared daily log return as the square of the natural logarithm of
    # the current closing price divided by the previous closing price.
    df = df.withColumn('retsq', pow(log(df['adjprccd'] / df['ladjprccd']), 2))
    # construct a 20-trading-day rolling window
    win_rolling = Window.partitionBy('isin').orderBy('datadate').rowsBetween(-rolling_windows, -1)
    # traditional volatility approach: square root of the average squared daily log return over a 20-day rolling window
    df = df.withColumn('volatility', sqrt(avg('retsq').over(win_rolling)))
    # compute the squared daily log high-low range as the square of the natural logarithm
    # of the daily high price divided by the daily low price.
    # fill NA values with 0 (this handles days with no trading)
    df = df.withColumn('loghlsq', pow(log(df['adjprchd'] / df['adjprcld']), 2)).fillna(0, subset=['loghlsq'])
    # Parkinson's extreme value method: square root of 1/(4*ln 2) times the average squared daily log high-low
    # over a 20-day rolling window
    df = df.withColumn('Parkinsonvol', sqrt((1/(4*np.log(2))) * avg('loghlsq').over(win_rolling)))
    return df.selectExpr('datadate as Date', 'isin as ISIN', 'volatility', 'Parkinsonvol').toPandas()
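
Written out, the two estimators computed above appear to be the following, with daily log return r_t = ln(P_t / P_{t-1}) and daily high/low H_t, L_t taken over the 20-day rolling window:

\sigma_{\text{traditional}} = \sqrt{\frac{1}{20}\sum_{t=1}^{20} r_t^{2}},
\qquad
\sigma_{\text{Parkinson}} = \sqrt{\frac{1}{4\ln 2}\cdot\frac{1}{20}\sum_{t=1}^{20}\left(\ln\frac{H_t}{L_t}\right)^{2}}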
Code Example #7
def _cross_entropy(y_true, y_prob, df=None, normalize=False):
    """Function to calculate cross entropy
       If y_true or y_prob is of shape (num_samples, ),
       the labels are assumed to be binary
    """
    eps = 1e-15

    if df is None:
        # Pre-processing
        if y_prob.ndim == 1:
            y_prob = np.vstack((1 - y_prob, y_prob)).T

        if y_true.ndim == 1:
            y_true = np.vstack((1 - y_true, y_true)).T

        y_prob = np.clip(y_prob, eps, 1 - eps)

        # Re-normalize and calculate entropy
        y_prob /= y_prob.sum(axis=1)[:, np.newaxis]

        entropy_arr = -(y_true * np.log(y_prob)).sum(axis=1)

        return entropy_arr.mean() if normalize else entropy_arr.sum()

    else:
        df = df.withColumn(y_prob, F.when(F.col(y_prob) < eps, eps)
                                    .when(F.col(y_prob) > (1 - eps), 1 - eps)
                                    .otherwise(F.col(y_prob)))
        df = df.withColumn('entropy', -F.col(y_true) * F.log(F.col(y_prob)) -
                           (1 - F.col(y_true)) * F.log(1 - F.col(y_prob)))

        if normalize:
            return df.agg(F.avg('entropy').alias('loss')).select('loss')
        else:
            return df.agg(F.sum('entropy').alias('loss')).select('loss')
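
For the binary case handled in the Spark branch above, the per-row 'entropy' column is the usual cross-entropy, with the probability clipped to the interval [eps, 1 - eps]:

\ell(y, p) = -\,y \ln p - (1 - y)\ln(1 - p), \qquad p \in [\varepsilon,\ 1-\varepsilon],\ \varepsilon = 10^{-15}

With normalize=True the function returns the mean of this quantity over rows, otherwise the sum.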
Code Example #8
File: RedHat.py Project: yingcuhk/SparkPractice
def log_loss(df):

	epsilon = 1e-12
	temp = df.select("label", when(df.outcome == 1.0, 1.0-epsilon).otherwise(df.outcome).alias("p"))
	temp = temp.select("label", when(temp.p == .0,epsilon).otherwise(temp.p).alias("p"))
	temp = temp.select("p","label", when(temp.label == 1, -log(temp.p)).otherwise(-log(1-temp.p)).alias("log_loss"))	
	
	return temp.selectExpr("mean(log_loss)").first()[0]
Code Example #9
    def evaluate_agg_prob(self):
        import pyspark
        from pyspark.sql.functions import col
        #terminal_outcome.show()

        from pyspark.sql.functions import udf, log, sum, exp
        from pyspark.ml.evaluation import BinaryClassificationEvaluator

        udf_prob = udf(lambda x: x.toArray().tolist()[1])
        cur_terminal_df = self.get_terminal_df()
        self.flatten_terminal_outcome()
        for cur_of in [self.target_disch_col]:
            self.logger.info(cur_of)
            try:
                cur_training_df = self.spark.read.parquet(
                    self.training_result_dest_template.format(cur_of)).select(
                        "ID", "TIME_SPAN",
                        udf_prob("Probability").cast("double").alias(
                            "probability"),
                        col("{0}_label".format(cur_of)).alias("label"))
                cur_testing_df = self.spark.read.parquet(
                    self.testing_result_dest_template.format(cur_of)).select(
                        "ID", "TIME_SPAN",
                        udf_prob("Probability").cast("double").alias(
                            "probability"),
                        col("{0}_label".format(cur_of)).alias("label"))
            except pyspark.sql.utils.AnalysisException as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                self.logger.info(message)
                self.logger.info("PROCESS")
                self.logger.debug("{0} Not exists".format(cur_of))
                continue
            cur_tr_agg = cur_training_df.groupBy("ID").agg(
                sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                    "ID",
                    (1.0 - exp("agg_prob")).alias("agg_prob").cast("double"))
            cur_te_agg = cur_testing_df.groupBy("ID").agg(
                sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                    "ID",
                    (1.0 - exp("agg_prob")).alias("agg_prob").cast("double"))

            # TODO terminal_df is flattened terminal DX for now. Need to merge with other DF with ALI,AKI,ALF,AHF column separately.

            cur_tr_agg = cur_tr_agg.join(self.target_terminal_outcome_table,
                                         "ID")
            cur_te_agg = cur_te_agg.join(self.target_terminal_outcome_table,
                                         "ID")

            #cur_tr_agg.show()
            #cur_te_agg.show()

            from pyspark.sql.functions import count
            #cur_te_agg.select(cur_of).groupBy(cur_of).agg(count("*")).show()

            return cur_tr_agg, cur_te_agg
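
The groupBy("ID") aggregation above turns the per-TIME_SPAN probabilities into a single per-ID probability by summing log-complements, which corresponds to the probability that at least one span is positive:

p_{\text{agg}} = 1 - \prod_i \left(1 - p_i\right) = 1 - \exp\!\left(\sum_i \ln\left(1 - p_i\right)\right)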
Code Example #10
File: metrics.py Project: KatyaKos/MLBD
def logloss(model, df, probabilities_col='probability'):
    df = model.transform(df)
    df = df.withColumn(
        'proba',
        F.udf(lambda v: float(v[1]), FloatType())(F.col(probabilities_col)))

    df = df.withColumn(
        'logloss', -F.col('label') * F.log(F.col('proba')) -
        (1. - F.col('label')) * F.log(1. - F.col('proba')))

    return df.agg(F.mean('logloss')).first()[0]
Code Example #11
File: metrics.py Project: niksaz/MLBD
def logloss(model,
            df,
            probabilities_col='probability',
            negative_downsampling_rate=1.0):
    df = df_with_proba_column(model, df, probabilities_col,
                              negative_downsampling_rate)

    df = df.withColumn(
        'logloss', -F.col('label') * F.log(F.col('proba')) -
        (1. - F.col('label')) * F.log(1. - F.col('proba')))

    return df.agg(F.mean('logloss')).first()[0]
Code Example #12
def preprocess(df, logger):
    # Check if database is empty
    if df.rdd.isEmpty():
        logger.error("Couldn't read data")
        sys.exit(0)

    # Select interesting columns (the only ones containing data)
    logger.info('Selecting interesting columns')
    df = df.select(df.td.cast('float'), df.sp.cast('int'), df.dp.cast('int'),
                   df.pr, df.flg, df.ipkt.cast('int'), df.ibyt.cast('int'))

    # Remove rows with NaN values
    df = df.dropna()

    # Apply a logarithmic transformation to the td, ipkt and ibyt columns
    logger.info('Applying logarithmic transform to columns')
    df = df.withColumn('td', F.log(df.td + 1)).withColumn(
        'ipkt', F.log(df.ipkt + 1)).withColumn('ibyt', F.log(df.ibyt + 1))

    proto_transform = F.udf(lambda z: transform_protocol(z),
                            ArrayType(IntegerType()))

    logger.info('Transforming flag into one-hot encoding')
    flag_transform = F.udf(lambda z: process_flag(z), ArrayType(IntegerType()))

    # Transform protocol column into one-hot encoding
    logger.info('Transforming protocol into one-hot encoding')
    df = df.withColumn('proto_onehot', proto_transform(df.pr))
    df = df.withColumn('proto_onehot0', df.proto_onehot[0])
    df = df.withColumn('proto_onehot1', df.proto_onehot[1])
    df = df.withColumn('proto_onehot2', df.proto_onehot[2])
    df = df.withColumn('proto_onehot3', df.proto_onehot[3])
    df = df.withColumn('proto_onehot4', df.proto_onehot[4])

    # Decode flag column and transform it into one-hot encoding
    logger.info('Transforming flag into one-hot encoding')
    df = df.withColumn('flag_onehot', flag_transform(df.flg))
    df = df.withColumn('flag_onehot0', df.flag_onehot[0])
    df = df.withColumn('flag_onehot1', df.flag_onehot[1])
    df = df.withColumn('flag_onehot2', df.flag_onehot[2])
    df = df.withColumn('flag_onehot3', df.flag_onehot[3])
    df = df.withColumn('flag_onehot4', df.flag_onehot[4])
    df = df.withColumn('flag_onehot5', df.flag_onehot[5])

    # Select final columns for training the algorithms
    df = df.select(df.td.cast('float'), df.flag_onehot0, df.flag_onehot1,
                   df.flag_onehot2, df.flag_onehot3, df.flag_onehot4,
                   df.flag_onehot5, df.proto_onehot0, df.proto_onehot1,
                   df.proto_onehot2, df.proto_onehot3, df.proto_onehot4,
                   df.ipkt.cast('float'), df.ibyt.cast('float'))

    return df
Code Example #13
def main(spark, train_file, val_file, test_file, ext_type):
    train_df = spark.read.parquet(train_file)
    print("Loaded train file")
    val_df = spark.read.parquet(val_file)
    print("Loaded val file")
    test_df = spark.read.parquet(test_file)
    print("Loaded test file")

    val_df.createOrReplaceTempView("val_df")
    users_val = spark.sql("SELECT DISTINCT userIndex FROM val_df")
    print("Created val users list")

    test_df.createOrReplaceTempView("test_df")
    users_test = spark.sql("SELECT DISTINCT userIndex FROM test_df")
    print("Created test users list")

    if ext_type == "log":
        train_df = train_df.withColumn("count", F.log(1 + train_df["count"]))
    elif ext_type == "square":
        train_df = train_df.withColumn("count",
                                       train_df["count"] * train_df["count"])
    elif ext_type == "cube":
        train_df = train_df.withColumn(
            "count", train_df["count"] * train_df["count"] * train_df["count"])
    elif ext_type == "log2":
        train_df = train_df.withColumn(
            "count",
            F.log(1 + train_df["count"]) / math.log(2))
    print("Transformed counts")

    params = {"regParam": 10, "rank": 100, "alpha": 40.0}
    reg = params["regParam"]
    rank = params["rank"]
    alpha = params["alpha"]

    als = ALS(maxIter=10,
              regParam=reg,
              rank=rank,
              alpha=alpha,
              implicitPrefs=True,
              userCol="userIndex",
              itemCol="trackIndex",
              ratingCol="count")
    model = als.fit(train_df)
    print("Fitted ALS model")

    map_val = compute_MAP(model, users_val, val_df)
    print('Validation: RegParam:{} | Rank:{} | Alpha:{} | MAP:{}'.format(
        reg, rank, alpha, map_val))
    map_test = compute_MAP(model, users_test, test_df)
    print('Test: RegParam:{} | Rank:{} | Alpha:{} | MAP:{}'.format(
        reg, rank, alpha, map_test))
Code Example #14
def log_loss_from_prediction(predictions):
    # predictions is the DataFrame returned by model.transform
    # it should have a column named probability (a vector of class probabilities):
    # we extract its second element and calculate the log loss with it
    epsilon = 1e-16
    split1_udf = udf(lambda value: value[1].item(), FloatType())
    predictions = predictions.select('*', split1_udf('probability').\
                                     alias('prob'))
    loss = predictions.select("*",
                           when(predictions.label == 1, 0. - log(predictions.prob + epsilon)).\
                           otherwise(0. - log(1. - predictions.prob + epsilon)).\
                           alias('log_loss')).\
                agg({'log_loss': 'avg'}).\
                take(1)
    return loss
Code Example #15
def socialBasedMetrics(ratings):
    fold_user_friend = user_friend.join(
        ratings.select("user_id").distinct(), "user_id", "right")
    fu_with_friendsize = fold_user_friend.join(fold_user_friend.select(col("user_id").alias("friend_id"),
                                                                       col("nf").alias("nf_friend")).distinct(),
                                               "friend_id") \
        .select("user_id", "nf", "friend_id", "nf_friend")

    ufJoin = fu_with_friendsize.join(
        ren(fu_with_friendsize, ["friend_id"]),
        "friend_id").filter(col("user_id") < col("user_id_2"))

    intersection = ufJoin.groupBy("user_id", "user_id_2", "nf", "nf_2").agg(
        count(lit(1)).alias("intersection"),
        sum_sql(1 / log("nf_friend")).cast("float").alias("adamic_adar_graph"))

    graph = intersection.withColumn(
        "jaccard_graph",
        (col("intersection") / (col("nf") + col("nf_2") - col("intersection"))
         ).cast("float")).withColumn(
             "cosine_graph",
             (col("intersection") /
              (sqrt(col("nf") * col("nf_2")))).cast("float")).withColumn(
                  "preferential_attachment",
                  col("nf") * col("nf_2")).select(
                      "user_id", "user_id_2", "adamic_adar_graph",
                      "jaccard_graph", "cosine_graph",
                      "preferential_attachment").filter(
                          (col("adamic_adar_graph") > 0)
                          | (col("jaccard_graph") > 0)
                          | (col("cosine_graph") > 0))

    return graph
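
With Γ(u) denoting a user's friend set (so nf = |Γ(u)|), the columns produced above appear to correspond to the standard link-prediction scores:

\text{AA}(u,v) = \sum_{z \in \Gamma(u) \cap \Gamma(v)} \frac{1}{\ln |\Gamma(z)|}, \qquad
\text{Jaccard}(u,v) = \frac{|\Gamma(u) \cap \Gamma(v)|}{|\Gamma(u)| + |\Gamma(v)| - |\Gamma(u) \cap \Gamma(v)|},

\text{cosine}(u,v) = \frac{|\Gamma(u) \cap \Gamma(v)|}{\sqrt{|\Gamma(u)|\,|\Gamma(v)|}}, \qquad
\text{PA}(u,v) = |\Gamma(u)|\cdot|\Gamma(v)|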
Code Example #16
File: ExLogger.py Project: BingZou/RecommenderSystem
def Trainer(spark, df_train, rank, regParam, alpha, K=500):
    df_train = df_train.withColumn('count', F.log('count'))  #takes log
    output_file = 'ALSModel_%s_%s__%s' % (str(rank), str(regParam), str(alpha))
    FileExistFlag = os.system('hadoop fs -test -e %s' % output_file)
    if not FileExistFlag == 0:
        beg = time()
        als = ALS(rank=rank,
                  maxIter=10,
                  regParam=regParam,
                  alpha=alpha,
                  implicitPrefs=True,
                  userCol="user_id_numeric",
                  itemCol="track_id_numeric",
                  ratingCol="count",
                  coldStartStrategy="drop")
        model = als.fit(df_train)
        print('Train Finished')
        model.write().overwrite().save(output_file)
        end = time()
        print('ALSModel_%s_%s__%s Saved. Took %f s' %
              (str(rank), str(regParam), str(alpha), end - beg))
    else:
        print('ALSModel_%s_%s__%s Already Exist.' %
              (str(rank), str(regParam), str(alpha)))
    return
Code Example #17
def log_transform(dataset, features):
    """
    This function is used to do log transformation on quantitative features to make them follow normal distribution
    """
    for feature in features:
        dataset = dataset.withColumn(feature, F.log(feature))
    return dataset
Code Example #18
    def __init__(self):
        super(FeaturePayloadSizeLogAverage, self).__init__()

        self.group_by_aggs = {
            'reply_length_log':
            F.avg(F.log(F.col('reply_length_bytes') + 1.)).cast('float')
        }
Code Example #19
File: test_GBM.py Project: yab/sparkling-water
def insuranceFrame(hc, spark):
    df = spark \
        .read.csv("file://" + unit_test_utils.locate("smalldata/insurance.csv"), header=True, inferSchema=True) \
        .withColumn("Offset", log(col("Holders")))
    frame = hc.asH2OFrame(df)
    frame["Group"] = frame["Group"].asfactor()
    frame["Age"] = frame["Age"].asfactor()
    return frame
Code Example #20
def insuranceFrame(hc, spark, insuranceDatasetPath):
    df = spark \
        .read.csv(insuranceDatasetPath, header=True, inferSchema=True) \
        .withColumn("Offset", log(col("Holders")))
    frame = hc.asH2OFrame(df)
    frame["Group"] = frame["Group"].asfactor()
    frame["Age"] = frame["Age"].asfactor()
    return frame
Code Example #21
    def get_log_of_grades(self, df):
        # type: (dataframe) -> dataframe
        for col in self.grade_cols:
            df = df.withColumn(
                col,
                F.coalesce(F.log(F.lit(1) - F.col(col)),
                           F.lit(self.log_value_for_ones)))
        return df
Code Example #22
        def cumprod(scol):
            @pandas_udf(returnType=self._kdf._internal.spark_type_for(self.name))
            def negative_check(s):
                assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), \
                    "values should be bigger than 0: %s" % s
                return s

            return F.sum(F.log(negative_check(scol)))
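
The sum-of-logs trick used here relies on the identity below; the exponential is presumably applied downstream, and negative_check guards the positivity requirement:

\prod_i x_i = \exp\!\left(\sum_i \ln x_i\right), \qquad x_i > 0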
Code Example #23
File: metrics.py Project: katerynak/hay_checker
def _mutual_info_todo(when, then, df):
    """
    Returns what (columns, as in spark columns) to compute to get the results requested by
    the parameters.

    :param when:
    :type when: str/int
    :param then:
    :type then: str/int
    :param df:
    :type df: DataFrame
    :return: Pyspark columns representing what to compute.
    """
    # group on the pair of columns, count occurrences
    pairs_table = df.groupBy([when,
                              then]).agg(count("*").alias("_pairs_count"))
    # ignore nulls
    pairs_table = pairs_table.filter((~col(when).isNull())
                                     & (~col(then).isNull()))
    pairs_table.cache()

    when_table = pairs_table.groupBy(col(when).alias("wt")).agg(
        sum("_pairs_count").alias("_when_count"))
    then_table = pairs_table.groupBy(col(then).alias("tt")).agg(
        sum("_pairs_count").alias("_then_count"))
    final_table = pairs_table.join(
        when_table, pairs_table[when].eqNullSafe(when_table["wt"]))
    final_table = final_table.join(
        then_table, final_table[then].eqNullSafe(then_table["tt"]))

    # prepare 4 subformulas of MI to later sum, plus the total
    todo = final_table.select(
        sum(col("_pairs_count") *
            log(col("_pairs_count"))).alias("_s1"),  # c_xy * logc_xy
        sum(col("_pairs_count")).alias("_s2"),  # c_xy
        sum(col("_pairs_count") *
            log(col("_when_count"))).alias("_s3"),  # c_xy * logc_x
        sum(col("_pairs_count") *
            log(col("_then_count"))).alias("_s4"),  # c_xy * logc_y
        sum(col("_pairs_count")).alias("_total")  # total
    )
    todo = todo.select(((col("_s1") / col("_total")) +
                        (log(col("_total")) * (col("_s2") / col("_total"))) -
                        (col("_s3") / col("_total")) -
                        (col("_s4") / col("_total"))).alias("mutual_info"))
    return todo
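
With pair counts c_xy, marginals c_x and c_y, and total N (here _total, which equals _s2), the final select appears to implement the count-based mutual information decomposition:

I(X;Y) = \sum_{x,y} \frac{c_{xy}}{N} \ln\frac{c_{xy}\,N}{c_x\,c_y}
       = \frac{1}{N}\sum_{x,y} c_{xy}\ln c_{xy} \;+\; \ln N \;-\; \frac{1}{N}\sum_{x,y} c_{xy}\ln c_x \;-\; \frac{1}{N}\sum_{x,y} c_{xy}\ln c_y

which maps term by term onto _s1/_total + ln(_total)·(_s2/_total) − _s3/_total − _s4/_total.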
Code Example #24
def compile_log(t, expr, scope, timecontext, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope, timecontext)
    # Spark log method only takes float
    return F.log(
        float(t.translate(op.base, scope, timecontext, raw=True)), src_column
    )
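
Spark's F.log has two forms: F.log(col) for the natural logarithm and F.log(base, col) for an arbitrary base, where the base must be a Python float, which is why the translated base is wrapped in float() above. A minimal sketch (the DataFrame and column name are made up for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (10.0,), (100.0,)], ['x'])

df.select(
    F.log('x').alias('ln_x'),           # natural logarithm
    F.log(10.0, 'x').alias('log10_x'),  # logarithm with explicit base 10
).show()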
Code Example #25
def main(spark):
    train_data = spark.read.parquet('anshul_project/train_index.parquet')
    val_data = spark.read.parquet('anshul_project/val_index.parquet')

    # train_data.createOrReplaceTempView('train_data')

    train_data_log = train_data.withColumn("logcount", log(train_data["count"]))
    val_data_log = val_data.withColumn("logcount", log(val_data["count"]))
    uid_indexer = StringIndexer(inputCol="user_id", outputCol="user_num", handleInvalid ="skip")
    tid_indexer = StringIndexer(inputCol="track_id", outputCol="track_num", handleInvalid ="skip")

    ranks =[4]
    regs = [1]
    alphas = [0.5]
    
    best_rmse = None
    best_rank = None
    best_alpha = None
    best_reg = None

    for rank in ranks :
        for alpha in alphas :
            for reg in regs :

                als = ALS(maxIter=3, regParam=reg, userCol="user_num", itemCol="track_num",
                          ratingCol="logcount", implicitPrefs=True, coldStartStrategy="drop",
                          alpha=alpha, rank=rank)

                pipeline = Pipeline(stages=[uid_indexer, tid_indexer, als])
                
                als_model = pipeline.fit(train_data_log)
                predictions = als_model.transform(val_data_log)
                
                # Compare predictions against the log-transformed counts the model was trained on
                evaluator = RegressionEvaluator(metricName="rmse", labelCol="logcount", predictionCol="prediction")
                
                rmse = evaluator.evaluate(predictions)

                if best_rmse is None or best_rmse > rmse :
                   best_rmse = rmse
                   best_rank = rank
                   best_alpha = alpha
                   best_reg = reg

    print('The best hyperparameters: Rank: {}, Reg: {}, Alpha: {}, RMSE: {}'.format(best_rank, best_reg, best_alpha, best_rmse))


    als_model.save('anshul_project/log_model')
    pass
Code Example #26
File: generic.py Project: penguinkang/koalas
        def cumprod(scol):
            @pandas_udf(returnType=self._kdf._sdf.schema[self.name].dataType)
            def negative_check(s):
                assert len(s) == 0 or ((s > 0) | (s.isnull())).all(), \
                    "values should be bigger than 0: %s" % s
                return s

            return F.sum(F.log(negative_check(scol)))
Code Example #27
def main(spark,
         data_file,
         model_file,
         user_file,
         track_file,
         model_formulation=None):
    df = spark.read.parquet(data_file)

    if model_formulation == 'log':
        #log compression on training
        df = df.withColumn('count', F.log(F.col('count')))
        print("log")

    elif model_formulation == 'ct1':
        #subsetting all train counts greater than 1
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 1')
        print("ct1")

    elif model_formulation == 'ct2':
        #subsetting all train counts greater than 2
        df.createOrReplaceTempView('df')
        df = spark.sql('SELECT * FROM df WHERE count > 2')
        print("ct2")

    else:
        #If no model formulation is specified, pass
        print("default")
        pass

    user_indexer = StringIndexer(inputCol="user_id",
                                 outputCol="user_idx",
                                 handleInvalid="keep")
    track_indexer = StringIndexer(inputCol="track_id",
                                  outputCol="track_idx",
                                  handleInvalid="keep")

    pipeline = Pipeline(stages=[user_indexer, track_indexer])
    mapping = pipeline.fit(df)
    df = mapping.transform(df)

    #create + fit an ALS model
    als = ALS(maxIter=5,
              regParam=0.01,
              implicitPrefs=True,
              ratingCol="count",
              userCol="user_idx",
              itemCol="track_idx")
    als_model = als.fit(df)

    #save trained ALS model
    als_model.write().overwrite().save(model_file)
    print("Model sucessfully saved to HFS")

    #save string indexers
    user_indexer.write().overwrite().save(user_file)
    track_indexer.write().overwrite().save(track_file)
    print("String Indexers sucessfully saved to HFS")
Code Example #28
def main(spark, df_test, model_file):

    # import model
    model = PipelineModel.load(model_file)
    print("imported model")

    # import test data
    test = spark.read.parquet(df_test)
    print("imported test data")
    # log transform test data
    test = test.withColumn("count", log(test["count"] + 1))
    print("log-transformed test data")

    # predict on test data
    testdf = model.transform(test)
    testdf = testdf.select([
        c for c in testdf.columns
        if c in ["user_index", "count", "track_index"]
    ])

    # make labels
    testdf.createOrReplaceTempView('testdf')
    Labels = spark.sql(
        'SELECT user_index, collect_list(track_index) AS label FROM testdf GROUP BY user_index'
    )
    Labels.createOrReplaceTempView('Labels')
    print("created ground truth labels")

    # generate top 500 track recommendations for each user in validation set
    user_subset = testdf.select("user_index").distinct()
    userRecs = model.stages[-1].recommendForUserSubset(user_subset, 500)
    userRecs.createOrReplaceTempView("userRecs")
    print("made user recommendations")

    # explode recommendations in long format
    Recs = (userRecs.select("user_index",
                            explode("recommendations").alias("pred")).select(
                                "user_index", "pred.*"))
    Recs.createOrReplaceTempView("Recs")

    # make predictions
    Preds = spark.sql(
        'SELECT user_index, collect_list(track_index) AS prediction FROM Recs GROUP BY user_index'
    )
    Preds.createOrReplaceTempView("Preds")

    # make label pairs
    Preds_labels = spark.sql(
        'SELECT Preds.prediction AS prediction, Labels.label as label FROM Preds INNER JOIN Labels ON Preds.user_index = Labels.user_index'
    )
    print("inner join preds & labels")

    # calculate MAP
    MAPrecommendationsAndTruth = Preds_labels.select("prediction", "label")
    metrics = RankingMetrics(MAPrecommendationsAndTruth.rdd)
    MAP = metrics.meanAveragePrecision
    print("MAP = %s" % MAP)
Code Example #29
File: helper.py Project: bps10/github-churn
def feature_scaling(df):
    '''Log transform all numeric cols.
    '''
    # scale remaining cols
    if isinstance(df, DataFrame):
        for col in count_columns:
            df = df.withColumn(col, F.log(df[col].cast(DoubleType()) + 1))
        for col in scale_columns:
            df = df.withColumn(col, F.log(df[col].cast(DoubleType()) + 1))
        # Scale recency
        df = df.withColumn('recency', F.log(df.recency.cast(DoubleType()) + 1))

    else:
        df[count_columns] = df[count_columns].apply(lambda x: np.log(x + 1))
        df[scale_columns] = df[scale_columns].apply(lambda x: np.log(x + 1))
        df['recency'] = df['recency'].apply(lambda x: np.log(x + 1))

    return df
Code Example #30
def logLoss(predDF):
    # Define a function clamp to restrict the values of probability to be greater than 0 and less than one
    def clamp(n):
        epsilon = .000000000000001
        minn = 0 + epsilon
        maxn = 1 - epsilon
        return max(min(maxn, n), minn)
    
    # Define a UDF to extract element [1] of the returned probability vector, which is the probability of class one
    firstelement=udf(lambda v:clamp(float(v[1])))   #,FloatType() after [] was inserted and removed for epsilon
    
    # Create a new dataframe that contains a probability of one column (true)
    predict_df = predDF.withColumn('prob_one', firstelement(predDF.probability))
    
    # Compute the log loss for the spark dataframe for each row
    row_logloss = (predict_df.withColumn(
        'logloss', -f.col('Label')*f.log(f.col('prob_one')) - (1.-f.col('Label'))*f.log(1.-f.col('prob_one'))))

    logloss = row_logloss.agg(f.mean('logloss').alias('ll')).collect()[0]['ll']
    return logloss
Code Example #31
File: retail.py Project: ZhiYinZhang/study
def get_vfr_index():
    try:
        print(f"{str(dt.now())} 零售户周边人流指数")
        # 有人流数据的零售户
        vfr = get_around_vfr(spark)

        vfr.cache()

        # retailers
        co_cust = get_co_cust(spark).select("cust_id")
        # retailers with longitude/latitude coordinates
        cust_lng_lat = get_cust_lng_lat(spark) \
            .select("city", "cust_id", "lng", "lat") \
            .join(co_cust, "cust_id")

        cust_lng_lat.cache()

        # retailers with no surrounding foot traffic
        not_df = cust_lng_lat.select("cust_id") \
            .exceptAll(vfr.select("cust_id")) \
            .join(cust_lng_lat, "cust_id")

        exist_df = vfr.join(cust_lng_lat, ["city", "cust_id"])

        if not_df.count()>0:
            # fill missing values with KNN
            fill_df = fillWithKNN(exist_df.toPandas(), not_df.toPandas(), "avg_vfr")

            all_df = spark.createDataFrame(fill_df) \
                .unionByName(exist_df)
        else:
            all_df=exist_df

        # threshold
        threshold = all_df.groupBy("city") \
            .agg((f.mean("avg_vfr") + 3 * f.stddev_pop("avg_vfr")).alias("threshold"))

        truncate_df = all_df.join(threshold, "city") \
            .withColumn("avg_vfr", f.when(col("avg_vfr") > col("threshold"), col("threshold"))
                        .otherwise(col("avg_vfr"))
                        )

        log_df = truncate_df.withColumn("log", f.log(col("avg_vfr") + 1))
        log_max = log_df.groupBy("city").agg(f.max("log").alias("log_max"))

        colName = "people_count"
        log_df.join(log_max, "city") \
            .withColumn(colName, col("log") / col("log_max") * 5) \
            .foreachPartition(lambda x: write_hbase1(x, [colName], hbase))

        vfr.unpersist()
        cust_lng_lat.unpersist()
    except Exception:
        tb.print_exc()
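
After truncating outliers at mean + 3·stddev within each city, the index written to HBase is a log-scaled score on a 0-to-5 scale:

\text{people\_count} = 5 \cdot \frac{\ln\left(1 + \text{avg\_vfr}\right)}{\max_{\text{city}} \ln\left(1 + \text{avg\_vfr}\right)}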
Code Example #32
# MAGIC Now, let's visualize the results from the last example.  We can use the built-in `display()` function to show a bar chart of the count for each response code.  After running this cell, select the bar graph option, and then use "Plot Options..." and drag `status` to the key entry field and drag `count` to the value entry field. See the diagram, below, for an example.
# MAGIC 
# MAGIC <img src="http://spark-mooc.github.io/web-assets/images/cs105x/plot_options_1.png" style="float: right; margin-right: 30px; border: 1px solid #999999"/>

# COMMAND ----------

display(status_to_count_df)

# COMMAND ----------

# MAGIC %md
# MAGIC You can see that this is not a very effective plot.  Due to the large number of '200' codes, it is very hard to see the relative number of the others.  We can alleviate this by taking the logarithm of the count, adding that as a column to our DataFrame and displaying the result.

# COMMAND ----------

log_status_to_count_df = status_to_count_df.withColumn('log(count)', sqlFunctions.log(status_to_count_df['count']))

display(log_status_to_count_df)

# COMMAND ----------

# MAGIC %md
# MAGIC While this graph is an improvement, we might want to make more adjustments.  The [`matplotlib` library](http://matplotlib.org/) can give us more control in our plot and is also useful outside the Databricks environment. In this case, we're essentially just reproducing the Databricks graph using `matplotlib`. However, `matplotlib` exposes far more controls than the Databricks graph, allowing you to change colors, label the axes, and more. We're using a set of helper functions from the [`spark_notebook_helpers`](https://pypi.python.org/pypi/spark_notebook_helpers/1.0.1) library.

# COMMAND ----------

# np is just an alias for numpy.
# cm and plt are aliases for matplotlib.cm (for "color map") and matplotlib.pyplot, respectively.
# prepareSubplot is a helper.
from spark_notebook_helpers import prepareSubplot, np, plt, cm