Example No. 1
from pyspark.sql import functions as fn


def compute_rho_transitions(vertices, edges, pub_decay_time, data_decay_time):
    """Compute the initial distribution rho and the transitions."""
    distribution = fn.when(fn.col('type') == 'data', fn.exp(-fn.col('age') / fn.lit(data_decay_time))). \
        otherwise(fn.exp(-fn.col('age') / fn.lit(pub_decay_time)))
    rho = vertices.select('i', distribution.alias('value'))

    transitions = edges.groupBy('i').count().join(edges, 'i'). \
        selectExpr('j as i', 'i as j', '1/count as value')
    return rho, transitions
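
# A hedged usage sketch with toy vertex/edge data; the schemas ('i', 'type',
# 'age' on vertices, 'i', 'j' on edges) are inferred from the function body and
# the decay constants are arbitrary.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rho-demo").getOrCreate()
vertices = spark.createDataFrame(
    [(0, 'pub', 10.0), (1, 'data', 3.0)], ['i', 'type', 'age'])
edges = spark.createDataFrame([(0, 1), (1, 0), (0, 0)], ['i', 'j'])
rho, transitions = compute_rho_transitions(
    vertices, edges, pub_decay_time=5.0, data_decay_time=2.0)
rho.show()
transitions.show()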
    def evaluate_agg_prob(self):
        import pyspark
        from pyspark.sql.functions import col
        #terminal_outcome.show()

        from pyspark.sql.functions import udf, log, sum, exp
        from pyspark.ml.evaluation import BinaryClassificationEvaluator

        udf_prob = udf(lambda x: x.toArray().tolist()[1])
        cur_terminal_df = self.get_terminal_df()
        self.flatten_terminal_outcome()
        for cur_of in [self.target_disch_col]:
            self.logger.info(cur_of)
            try:
                cur_training_df = self.spark.read.parquet(
                    self.training_result_dest_template.format(cur_of)).select(
                        "ID", "TIME_SPAN",
                        udf_prob("Probability").cast("double").alias(
                            "probability"),
                        col("{0}_label".format(cur_of)).alias("label"))
                cur_testing_df = self.spark.read.parquet(
                    self.testing_result_dest_template.format(cur_of)).select(
                        "ID", "TIME_SPAN",
                        udf_prob("Probability").cast("double").alias(
                            "probability"),
                        col("{0}_label".format(cur_of)).alias("label"))
            except pyspark.sql.utils.AnalysisException as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                self.logger.info(message)
                self.logger.info("PROCESS")
                self.logger.debug("{0} Not exists".format(cur_of))
                continue
            cur_tr_agg = cur_training_df.groupBy("ID").agg(
                sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                    "ID",
                    (1.0 - exp("agg_prob")).cast("double").alias("agg_prob"))
            cur_te_agg = cur_testing_df.groupBy("ID").agg(
                sum(log(1.0 - col("probability"))).alias("agg_prob")).select(
                    "ID",
                    (1.0 - exp("agg_prob")).cast("double").alias("agg_prob"))

            # TODO terminal_df is flattened terminal DX for now. Need to merge with other DF with ALI,AKI,ALF,AHF column separately.

            cur_tr_agg = cur_tr_agg.join(self.target_terminal_outcome_table,
                                         "ID")
            cur_te_agg = cur_te_agg.join(self.target_terminal_outcome_table,
                                         "ID")

            #cur_tr_agg.show()
            #cur_te_agg.show()

            from pyspark.sql.functions import count
            #cur_te_agg.select(cur_of).groupBy(cur_of).agg(count("*")).show()

            return cur_tr_agg, cur_te_agg
def test(
    keras_model,
    working_dir: FlyteDirectory,
    test_df: pyspark.sql.DataFrame,
    hp: Hyperparameters,
) -> FlyteDirectory:

    print("================")
    print("Final prediction")
    print("================")

    pred_df = keras_model.transform(test_df)
    pred_df.printSchema()
    pred_df.show(5)
    # convert from log domain to real Sales numbers
    pred_df = pred_df.withColumn("Sales_pred", F.exp(pred_df.Sales_output))

    submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                                   pred_df.Sales_pred).toPandas()
    submission_df.sort_values(by=["Id"]).to_csv(os.path.join(
        working_dir, hp.local_submission_csv),
                                                index=False)
    # predictions are saved to a CSV file.
    print("Saved predictions to %s" % hp.local_submission_csv)

    return working_dir
Example No. 4
    def output(self, scores, thresh=1.5, mode="best-guess"):
        """Standard output of the algorithm

        De-anonymisation has two modes: entropic (keeps the full distribution) or 
        best-guess (matching with threshold).
        """
        if mode == "best-guess":
            return self.matching_set(scores, thresh)
        elif mode == "entropic":
            # (custId_1, std)
            sigma = scores.groupBy('custId_1').agg(
                F.stddev(scores.value).alias('std'))
            # (custId_1, custId_2, probas_raw)
            probas_raw = scores\
                .join(sigma, ['custId_1'])\
                .withColumn("probas_raw", F.exp(F.col('value')/F.col('std')))\
                .select(['custId_1', 'custId_2', 'probas_raw', 'std'])
            # (custId_1, probas_z)
            probas_z = probas_raw.groupBy('custId_1').agg(
                F.sum(probas_raw.probas_raw).alias('probas_z'))
            # (custId_1, custId_2, probas)
            return scores\
                .join(probas_raw, ['custId_1', 'custId_2'])\
                .join(probas_z, ['custId_1'])\
                .withColumn("probas", F.col('probas_raw')/F.col('probas_z'))\
                .select(['custId_1', 'custId_2', 'probas', 'value', 'std'])
        else:
            raise "Mode '{}' is invalid.".format(mode)
from pyspark.sql.functions import col
from pyspark.sql import functions as func


def correct_thermal_factor(df, input_cols, T, replace=False):
    new_df = df
    kBT = 8.6173303e-5 * T * 1000  # Boltzmann constant * T, unit: meV
    for col_name in input_cols:
        new_df = new_df.withColumn(
            "corrected_" + col_name,
            col(col_name) * (func.exp(df.E / kBT) - 1.0))
    if replace:
        print("Replace", input_cols, "with corrected values...")
        new_df = new_df.drop(*input_cols)
        for col_name in input_cols:
            new_df = new_df.withColumnRenamed("corrected_" + col_name,
                                              col_name)
    else:
        output_cols = ["corrected_" + col_name for col_name in input_cols]
        print("Add corrected intensity data as new column(s): ", output_cols)
    return new_df
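
# Toy usage of the thermal-population correction above; the SparkSession, the
# energy column "E" (in meV) and the numbers are all assumptions made for this
# illustration.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("thermal-demo").getOrCreate()
demo_df = spark.createDataFrame([(2.0, 100.0), (5.0, 80.0)], ['E', 'I'])
correct_thermal_factor(demo_df, ['I'], T=300.0).show()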
Example No. 6
 def output(self, scores, thresh=1.5, mode="best-guess"):
     if mode == "best-guess":
         return self.matching_set(scores, thresh)
     elif mode == "entropic":
         # (custId_1, std)
         sigma = scores.groupBy('custId_1').agg(
             F.stddev(scores.value).alias('std'))
         # (custId_1, custId_2, probas_raw)
         probas_raw = scores\
             .join(sigma, ['custId_1'])\
             .withColumn("probas_raw", F.exp(F.col('value')/F.col('std')))\
             .select(['custId_1', 'custId_2', 'probas_raw', 'std'])
         # (custId_1, probas_z)
         probas_z = probas_raw.groupBy('custId_1').agg(
             F.sum(probas_raw.probas_raw).alias('probas_z'))
         # (custId_1, custId_2, probas)
         return scores\
             .join(probas_raw, ['custId_1', 'custId_2'])\
             .join(probas_z, ['custId_1'])\
             .withColumn("probas", F.col('probas_raw')/F.col('probas_z'))\
             .select(['custId_1', 'custId_2', 'probas', 'value', 'std'])
     else:
         raise "Mode '{}' is invalid.".format(mode)
Example No. 7
from pyspark.sql import SparkSession
from pyspark.sql import functions as func


def dataframe_operation():
    spark = SparkSession.builder.appName('dataframe-operation').getOrCreate()
    spark.sparkContext.setLogLevel('WARN')

    # Add rows.
    df1 = spark.range(3)
    df2 = spark.range(5)
    df3 = df1.union(df2)
    df3.show()

    # Add columns.
    df1 = spark.createDataFrame([(1, 'a', 23.0), (3, 'B', -23.0)],
                                ('x1', 'x2', 'x3'))
    df2 = df1.withColumn('x4', func.lit(0))
    df2.show()
    df3 = df2.withColumn('x5', func.exp('x3'))
    df3.show(truncate=False)

    df4 = spark.createDataFrame([(1, 'foo'), (2, 'bar')], ('k', 'v'))
    df5 = df3 \
     .join(df4, func.col('x1') == func.col('k'), 'leftouter') \
     .drop('k') \
     .withColumnRenamed('v', 'x6')
    df5.show(truncate=False)
Example No. 8
from pyspark.sql import DataFrame
from pyspark.sql.functions import exp


def dedup_records(data: DataFrame, key_columns) -> DataFrame:
    data = data.dropDuplicates(key_columns)

    # exp("10") exponentiates the column literally named "10"
    data_with_new_feature = data.withColumn("10_exp", exp("10"))
    return data_with_new_feature
Example No. 9
 def similarity(r):
     # Sum of three exponential-decay kernels; r0, d0 and avgr0 are scale
     # constants presumably defined in the enclosing scope.
     D_1 = F.exp(-(F.abs(r['rating_1'] - r['rating_2']) / r0))
     D_2 = F.exp(-(F.abs(r['days_1'] - r['days_2']) / d0))
     D_3 = F.exp(-(F.abs(r['avgMovieRating_1'] - r['avgMovieRating_2']) /
                   avgr0))
     return D_1 + D_2 + D_3
Example No. 10
	print('====> Parsing local arguments')
	parser = argparse.ArgumentParser()
	parser.add_argument('--query_month', type=str, help='The format should be YYYYmm')
	parser.add_argument('--mode', type=str, choices=['train', 'eval', 'test'], default='train')
	parser.add_argument('--save_model', action='store_true', default=False)
	args = parser.parse_args()

	print('====> Start computation')
	dataset = spark.read.csv('/user/ronghui_safe/hgy/nid/datasets/{}_{}'.format(args.query_month, args.mode), header=True, inferSchema=True)
	dataset = dataset.withColumn('source', F.when(F.col('source') == '__HIVE_DEFAULT_PARTITION__', 'null').otherwise(F.col('source')))
	dataset = dataset.withColumn('source', F.when(F.col('source') == 'cm_mail', 'null').otherwise(F.col('source')))
	if args.mode != 'test':
		dataset = dataset.withColumn('duration', F.when(F.col('duration') == 0, 1e-6).otherwise(F.col('duration')))
		dataset = dataset.withColumn('duration', F.log(F.lit(1e-6))/F.col('duration'))
		dataset = dataset.withColumn('duration', F.exp(F.col('duration')))
	stringIndex_model = None
	if args.mode == 'train':
		stringIndexer = StringIndexer(inputCol='source', outputCol='source_index')
		stringIndex_model = stringIndexer.fit(dataset)
		stringIndex_model.save('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	else:
		stringIndex_model = StringIndexerModel.load('/user/ronghui_safe/hgy/nid/edw/stringIndex_model_v2')
	dataset = stringIndex_model.transform(dataset)
	encoder_model = None
	if args.mode == 'train':
		encoder = OneHotEncoder(inputCol='source_index', outputCol='source_vec')
		encoder_model = encoder.fit(dataset)
		encoder_model.save('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
	else:
		encoder_model = OneHotEncoderModel.load('/user/ronghui_safe/hgy/nid/edw/oneHotEncoder_model_v2')
Example No. 11
def get_and_enrich_spark(raw_data: spark.DataFrame, column_name: str):
    raw_data.show()
    data_with_new_feature = raw_data.withColumn(column_name + "_exp",
                                                exp(column_name))
    return data_with_new_feature
    def run_RF(self, tr_inst, te_inst, model_of=[]):
        from pyspark.sql.functions import col
        if type(self) == data_run_experiment:
            raise NotImplementedError(
                "Method needs to be called in a sub-class but is currently called in the base class"
            )

        if model_of == []:
            model_of = self.target_disch_col
        if type(model_of) == str:
            model_of = [model_of]

        self.logger.info("TARGET_OF:")
        self.logger.info(model_of)

        from pyspark.ml.classification import GBTClassifier as cur_model_selection

        cur_classifier = cur_model_selection(featuresCol="features",
                                             checkpointInterval=5)

        from pyspark.ml import Pipeline
        from pyspark.ml.evaluation import BinaryClassificationEvaluator
        from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

        if self.eval_performance_criteria == "AUPRC":
            target_metric = "areaUnderPR"
        elif self.eval_performance_criteria == "AUROC":
            target_metric = "areaUnderROC"
        else:
            raise Exception("eval_metric should be either 'AUPRC' or 'AUROC'")

        evaluator = BinaryClassificationEvaluator(metricName=target_metric)
        paramGrid = self.get_param_grid(cur_model_selection)

        if self.eval_cv_or_tvt == "CV":
            pipeline = Pipeline(stages=[cur_classifier])
            orig_tr_inst = tr_inst
            orig_te_inst = te_inst
            self.logger.info("ORIGINAL_INSTANCES")
            from pyspark.sql.functions import count, datediff
            from pyspark.sql.functions import udf, log, sum, exp, max
            udf_prob = udf(lambda x: x.toArray().tolist()[1])
            from pyspark.sql.functions import corr, udf, isnan
            for cur_of in model_of:
                self.logger.debug(cur_of)
                # should move this to the end
                te_inst = orig_te_inst.withColumn(
                    "label",
                    col("{0}_label".format(cur_of)).cast("double"))

                self.logger.info("TE_POP")
                te_inst.groupBy("label").agg(count("*")).show()

                tr_inst.printSchema()
                tr_val_pts_dict = self.get_target_tr_val_id()
                orig_tr_inst = tr_inst

                tr_pts = tr_val_pts_dict["TR"]
                val_pts = tr_val_pts_dict["VAL"]
                self.logger.info(tr_pts)
                self.logger.info(val_pts)

                all_training_ids = tr_pts + val_pts
                from random import shuffle
                shuffle(all_training_ids)
                import numpy as np
                print(len(all_training_ids))
                cv_id_list_full = np.array(all_training_ids)
                perform_dict = dict()
                for cur_cv_stage in range(self.cur_cv_fold):
                    tr_pts = cv_id_list_full[
                        np.linspace(0, cv_id_list_full.shape[0] -
                                    1, cv_id_list_full.shape[0]) %
                        self.cur_cv_fold != cur_cv_stage].tolist()
                    val_pts = cv_id_list_full[
                        np.linspace(0, cv_id_list_full.shape[0] -
                                    1, cv_id_list_full.shape[0]) %
                        self.cur_cv_fold == cur_cv_stage].tolist()
                    print(
                        np.linspace(0, cv_id_list_full.shape[0] -
                                    1, cv_id_list_full.shape[0]) %
                        self.cur_cv_fold == cur_cv_stage)
                    print(cv_id_list_full[
                        np.linspace(0, cv_id_list_full.shape[0] -
                                    1, cv_id_list_full.shape[0]) %
                        self.cur_cv_fold == cur_cv_stage])
                    print("VAL_ROUND_{0}_TARGET IDS:{1}".format(
                        cur_cv_stage, val_pts))

                    tr_inst = orig_tr_inst.where(
                        col("ID").isin(tr_pts))  #.persist()
                    val_inst = orig_tr_inst.where(
                        col("ID").isin(val_pts))  #.persist()

                    self.logger.info(
                        "Excluded instances for training:{0}".format(
                            tr_inst.where(
                                col("{0}_excl".format(cur_of)) == 1).count()))

                    tr_inst = tr_inst.where(
                        col("{0}_excl".format(cur_of)) == 0).withColumn(
                            "label",
                            col("{0}_label".format(cur_of)).cast("double"))
                    val_inst = val_inst.withColumn(
                        "label",
                        col("{0}_label".format(cur_of)).cast("double"))

                    self.logger.info("TR_POP")
                    tr_inst.groupBy("label").agg(count("*")).show()

                    pipeline_models = pipeline.fit(tr_inst, params=paramGrid)

                    for cur_model in pipeline_models:
                        val_pred = cur_model.transform(val_inst)
                        agg_prob_val = val_pred.groupBy("ID").agg(max("label").alias("label"),
                                                                  sum(log(1.0 - udf_prob("Probability"))).alias(
                                                                      "inverse_log_sum")) \
                            .select("label", (1.0 - exp(col("inverse_log_sum"))).alias("rawPrediction"))
                        agg_prob_val.show(300, truncate=False)
                        cur_pr = BinaryClassificationEvaluator(
                            rawPredictionCol="rawPrediction",
                            labelCol="label",
                            metricName=target_metric).evaluate(agg_prob_val)
                        if str(cur_model.stages[-1].extractParamMap()
                               ) not in perform_dict:
                            perform_dict[str(cur_model.stages[-1].
                                             extractParamMap())] = dict()
                            perform_dict[str(
                                cur_model.stages[-1].extractParamMap(
                                ))]["PERF"] = list()
                            perform_dict[str(
                                cur_model.stages[-1].extractParamMap()
                            )]["PARAM"] = cur_model.stages[-1].extractParamMap(
                            )

                        perform_dict[str(cur_model.stages[-1].extractParamMap(
                        ))]["PERF"].append(cur_pr)

                best_pf_measure = -1
                best_pf_param = None

                for key in perform_dict:
                    import numpy as np
                    test_array = np.array(perform_dict[key]["PERF"])
                    print(key, test_array.mean(), test_array.std())
                    if best_pf_measure < test_array.mean():
                        best_pf_measure = test_array.mean()
                        best_pf_param = perform_dict[key]["PARAM"]

                print("retrain model based on best hp from CV")
                print("PERF:{0}".format(best_pf_measure))
                print("HP:{0}".format(best_pf_param))
                tr_inst = orig_tr_inst.where(
                    col("{0}_excl".format(cur_of)) == 0).withColumn(
                        "label",
                        col("{0}_label".format(cur_of)).cast("double"))

                bestModel = pipeline.fit(tr_inst.where(
                    col("ID").isin(cv_id_list_full.tolist())),
                                         params=[best_pf_param])[0]

                bestModel.save(
                    self.model_dir_template.format(cur_of, best_pf_measure))

                prediction = bestModel.transform(te_inst)
                prediction.show()
                prediction.write.save(
                    self.testing_result_dest_template.format(cur_of),
                    mode="overwrite")
                tr_result = bestModel.transform(tr_inst).withColumn(
                    "Prob", udf_prob("Probability"))
                tr_result.write.save(
                    self.training_result_dest_template.format(cur_of),
                    mode="overwrite")

        elif self.eval_cv_or_tvt == "TVT":
            pipeline = Pipeline(stages=[cur_classifier])
            orig_tr_inst = tr_inst
            orig_te_inst = te_inst
            self.logger.info("ORIGINAL_INSTANCES")
            #of pop_overview
            from pyspark.sql.functions import count, datediff
            from pyspark.sql.functions import udf, log, sum, exp, max
            udf_prob = udf(lambda x: x.toArray().tolist()[1])
            from pyspark.sql.functions import corr, udf, isnan
            for cur_of in model_of:
                self.logger.debug(cur_of)
                if "{0}_excl".format(cur_of) not in orig_tr_inst.columns:
                    self.logger.info("NO TARGET {0} is in pts".format(cur_of))
                    continue
                tr_inst = orig_tr_inst.where(
                    col("{0}_excl".format(cur_of)) == 0).withColumn(
                        "label",
                        col("{0}_label".format(cur_of)).cast(
                            "double")).repartition(500).checkpoint()
                self.logger.info("Excluded instances for training:{0}".format(
                    orig_tr_inst.where(
                        col("{0}_excl".format(cur_of)) == 1).count()))

                self.logger.info("TR_POP")
                tr_inst.groupBy("label").agg(count("*")).show()

                te_inst = orig_te_inst.withColumn(
                    "label",
                    col("{0}_label".format(cur_of)).cast("double"))

                self.logger.info("TE_POP")
                te_inst.groupBy("label").agg(count("*")).show()

                tr_inst.printSchema()
                tr_val_pts_dict = self.get_target_tr_val_id()

                tr_pts = tr_val_pts_dict["TR"]
                val_pts = tr_val_pts_dict["VAL"]
                self.logger.info(tr_pts)
                self.logger.info(val_pts)

                orig_tr_inst = tr_inst
                tr_inst = orig_tr_inst.where(
                    col("ID").isin(tr_pts))  #.persist()
                val_inst = orig_tr_inst.where(
                    col("ID").isin(val_pts))  #.persist()

                tr_inst.show()
                val_inst.show()

                self.logger.info("tr_inst_count:{0}//val_inst_count{1}".format(
                    tr_inst.count(), val_inst.count()))
                te_inst.printSchema()

                pipeline_models = pipeline.fit(tr_inst, params=paramGrid)

                max_pr = -1.0
                bestModel = None
                for cur_model in pipeline_models:
                    val_pred = cur_model.transform(val_inst)
                    agg_prob_val = val_pred.groupBy("ID").agg(max("label").alias("label"),sum(log(1.0-udf_prob("Probability"))).alias("inverse_log_sum"))\
                        .select("label",(1.0-exp(col("inverse_log_sum"))).alias("rawPrediction"))
                    agg_prob_val.show(300, truncate=False)
                    cur_pr = BinaryClassificationEvaluator(
                        rawPredictionCol="rawPrediction",
                        labelCol="label",
                        metricName=target_metric).evaluate(agg_prob_val)
                    self.logger.info(cur_pr)
                    if max_pr < cur_pr:
                        max_pr = cur_pr
                        bestModel = cur_model

                if not bestModel:
                    self.logger.info("NO MODEL")
                    return
                self.logger.debug(bestModel)
                self.logger.debug(max_pr)
                udf_prob = udf(lambda x: float(x.toArray().tolist()[1]))
                prediction = bestModel.transform(te_inst)
                prediction.show()
                prediction.write.save(
                    self.testing_result_dest_template.format(cur_of),
                    mode="overwrite")
                tr_result = bestModel.transform(tr_inst).withColumn(
                    "Prob", udf_prob("Probability"))
                tr_result.write.save(
                    self.training_result_dest_template.format(cur_of),
                    mode="overwrite")
                #tr_inst.show_corr_result(tr_result)
                from pyspark.mllib.evaluation import BinaryClassificationMetrics
                self.logger.info("MAX_PRC_VAL:{0}".format(max_pr))
                bestModel.save(self.model_dir_template.format(cur_of, max_pr))
                tr_inst.unpersist()
                val_inst.unpersist()
Example No. 13
forecastInDF.createOrReplaceTempView('forecast_df')

dfsql = "select period_end_date,rent_space_code,'PROP'||substr(rent_space_code,2,4) as property_code, iteration,quarter,forecast_period_end_date,qtr_number + quarter as cal_qtr_number,net_income,erv,capital_value,lead(net_income) over (partition by rent_space_code,iteration order by quarter) as net_income_1 from forecast_df"

df = sess.sql(dfsql).orderBy('period_end_date','rent_space_code','iteration','forecast_period_end_date')
df.createOrReplaceTempView('dash_df')

dxsql = " select period_end_date,property_code,iteration,forecast_period_end_date,cal_qtr_number,sum(net_income)   as net_income,sum(net_income_1)  as net_income1,sum(erv)   as erv,sum(capital_value)  as capital_value, (sum(net_income_1) - sum(net_income))  as net_income_diff,(sum(net_income_1) - sum(net_income))/sum(net_income)   as net_income_pq, ln(1 + ((sum(net_income_1) - sum(net_income))/sum(net_income)))    as ln_net_income_pq from dash_df  where property_code IN $modifier group by period_end_date,property_code,iteration,forecast_period_end_date,cal_qtr_number"

dx = sess.sql(dxsql).orderBy('period_end_date','property_code','forecast_period_end_date','iteration')

pWindow1y = W.partitionBy('period_end_date','property_code','iteration').orderBy('cal_qtr_number').rangeBetween(0,3)
pWindow3y = W.partitionBy('period_end_date','property_code','iteration').orderBy('cal_qtr_number').rangeBetween(0,11)
pWindow5y = W.partitionBy('period_end_date','property_code','iteration').orderBy('cal_qtr_number').rangeBetween(0,19)

# Compound the quarterly log growth over rolling 1/3/5-year windows; the 4/12
# and 4/20 factors annualise the 3- and 5-year figures.
prop = dx.withColumn('net_income_1y_pa',((F.exp(F.sum('ln_net_income_pq').over(pWindow1y)))-1)) \
         .withColumn('net_income_3y_pa',((F.exp(F.sum('ln_net_income_pq').over(pWindow3y) * 4/12))-1)) \
         .withColumn('net_income_5y_pa',((F.exp(F.sum('ln_net_income_pq').over(pWindow5y) * 4/20))-1)) \
         .orderBy('period_end_date','property_code','iteration','forecast_period_end_date')


tot = prop.groupBy('period_end_date','property_code','forecast_period_end_date') \
	.agg(F.mean('net_income_pq').alias('avg_net_income_pq'), F.stddev('net_income_pq').alias('std_net_income_pq')) \
	.orderBy('period_end_date','property_code','forecast_period_end_date')

#prop.show(10)
#tot.show(10)
#maprdd = tot.rdd.groupBy(lambda x:x[0]).map(lambda x:(x[0],{y[1]:y[2] for y in x[1]}))
#result_dict = dict(maprdd.collect())
data = tot.toPandas()
Example No. 14

# ## Generate Test Data

from pyspark.sql.functions import rand

n = 10000000
df1 = spark.range(n).withColumn("x", rand(seed=12345))


# ## Built-in Functions

from pyspark.sql.functions import col, log, exp, sum

df2 = df1.withColumn("y", log(col("x") / (1.0 - col("x"))))
df3 = df2.withColumn("z", 1.0 / (1.0 + exp(-col("y"))))

df3.show()

%time df3.select(sum("x"), sum("z")).show()


# ## Scalar Python UDFs

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def logit(x):
  from math import log
  return log(x / (1.0 - x))
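
# The cell is cut off here; a minimal sketch of how logit would typically be
# registered and applied to the data generated above (the output column name
# "y_udf" is just illustrative).
logit_udf = udf(logit, DoubleType())
df4 = df1.withColumn("y_udf", logit_udf(col("x")))
df4.show()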
Example No. 15
    # Add a new column "x4" initialized to a literal zero, using lit():
    df_with_x4 = df.withColumn("x4", lit(0))
    df_with_x4.show()
    # +---+---+----+---+
    # | x1| x2|  x3| x4|
    # +---+---+----+---+
    # |100|  a| 3.0|  0|
    # |300|  b| 5.0|  0|
    # +---+---+----+---+

    # Spark lets us transform an existing column (using its
    # values) into a new column: the following example
    # computes the exponential of column "x3" as a new
    # column "x5". That is, it creates a new column "x5"
    # initialized to exp("x3"):
    df_with_x5 = df_with_x4.withColumn("x5", exp("x3"))
    df_with_x5.show()
    # +---+---+---+---+------------------+
    # | x1| x2| x3| x4|                x5|
    # +---+---+---+---+------------------+
    # |100|  a|3.0|  0|20.085536923187668|
    # |300|  b|5.0|  0| 148.4131591025766|
    # +---+---+---+---+------------------+

    # You may perform a `join()` operation between two
    # DataFrames and thereby add new columns. The following
    # example joins two DataFrames (named `df_with_x5` and
    # `other_df`) and creates a new DataFrame, `df_with_x6`.
    other_data = [(100, "foo1"), (100, "foo2"), (200, "foo")]
    other_df = spark.createDataFrame(other_data, ("k", "v"))
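
    # A plausible continuation of the join described above (Example No. 17
    # below shows the same pattern): left-outer join on x1 == k, drop the key
    # column and rename the joined value column.
    df_with_x6 = df_with_x5.join(other_df, df_with_x5.x1 == other_df.k,
                                 'leftouter').drop('k').withColumnRenamed('v', 'x6')
    df_with_x6.show()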
Example No. 16
 def convert_grade_back_to_normal(self, df):
     # type: (DataFrame) -> DataFrame
     for col in self.grade_cols:
         df = df.withColumn(col, F.lit(1) - F.exp(F.col(col)))
     return df
Example No. 17
        StructType(
            [StructField('_1', StringType()),
             StructField('_2', DoubleType())])))

if __name__ == '__main__':
    spark = SparkSession.builder.appName('data_frame_creation').getOrCreate()
    data = [(100, "a", 3.0), (300, "b", 5.0)]
    col = ("x1", "x2", "x3")
    df = spark.createDataFrame(data, col)
    df.show()

    # ---------------------------
    # 1 - Add columns
    # ---------------------------
    ## 1.1 Simple addition
    df_with = df.withColumn("x4", lit(0)).withColumn("x5", exp("x3"))
    df_with.show()
    ## 1.2 left join + rename | rand()
    other_data = [(100, "foo1"), (100, "foo2"), (200, "foo")]
    other_df = spark.createDataFrame(other_data, ("k", "v"))
    df_with_x6 = df_with.join(other_df, df_with.x1 == other_df.k, 'leftouter')\
                  .drop('k').withColumnRenamed('v', 'x6')
    df_with_x6.show()
    df_with_x6.withColumn("x8", rand()).show()

    # ---------------------------
    # 2- aggregate_multiple_columns
    # may be used for association rules
    # ---------------------------
    df = spark.sparkContext.parallelize([("mary", "lemon", 2.00),
                                         ("adam", "grape", 1.22),
"""
There are many ways that you can use to create a column in a PySpark Dataframe.
    1. Using Spark Native Functions {withColumn}
    2. Using Spark UDFs 
    3. Using RDDs
    4. Using Pandas UDF
"""
"""
1. Using Spark Native Functions {withColumn} : 
"""

casesWithNewConfirmed = cases.withColumn("NewConfirmed",
                                         100 + func.col("confirmed"))
# casesWithNewConfirmed.show()

casesWithExpConfirmed = cases.withColumn("ExpConfirmed", func.exp("confirmed"))
# casesWithExpConfirmed.show()
"""
2. Using Spark UDFs : 
        Sometimes we want to do complicated things to a column or to multiple columns. This can be thought of as a map
        operation on a PySpark DataFrame applied to a single column or to multiple columns. While Spark SQL functions cover
        many column-creation use cases, I use a Spark UDF whenever I need more mature Python functionality.
"""


def casesHighLow(confirmed):
    if confirmed < 50:
        return "Low"
    else:
        return "High"
Example No. 19
    def run(self,
            rawCountsSparkDF,
            columnBatchSize,
            outFileGroupedByGene=None):
        '''
        Arguments:
            rawCountsSparkDF
                a dataframe with columns:

                    the Name column from the salmon quant.sf files

                    for each sample, a column containing the NumReads column of that
                    sample's salmon quant.sf file. The column name == the sample name

            columnBatchSize:
                an integer
                The GTEx training data set has 10409 numeric columns. This causes a
                java.lang.StackOverflowError because the DAG is too big; increasing Spark
                driver memory does not help. The workaround is to sum smaller batches of
                columns and cache the results of each batch

            outFileGroupedByGene:
                optional file path
                if defined, the groupedByGene dataframe will be saved. This is a workaround:
                we are having trouble calculating the row sums needed to compute the
                estimated scaling factors (OOM exceptions). Using the grouped-by-gene counts
                matrix you can try having DESeq calculate the scaling factors without
                having to process all the salmon quant.sf files

        returns:
            (scalingFactorsDF, countDF)
                scalingFactorsDF:
                    a Spark data frame with columns 'sampleName' and 'scalingFactor'

                countDF:
                    contains the integer counts of the transcripts grouped by geneId.
                    The first column name will be 'geneId'; the following column names
                    will be the sample names
        '''
        self.logger.warn("run BEGIN")

        self.logger.warn( "run rawCountsSparkDF numRows:{} numCols:{}"\
                         .format( rawCountsSparkDF.count(), len( rawCountsSparkDF.columns ) ) )

        # pass transients to enable unit testing
        rawCountsSparkDF.createOrReplaceTempView("rawCounts")

        countsSparkDF = self._groupByGeneAndSum(rawCountsSparkDF)
        retIntSparkDF = self._convertToLong(countsSparkDF)

        if outFileGroupedByGene:
            self.logger.warn("saving integer grouped by counts to :{}".format(
                outFileGroupedByGene))
            retIntSparkDF.coalesce(1).write.csv(outFileGroupedByGene,
                                                mode='overwrite',
                                                header=True)
            self.logger.warn(
                "finished writing integer grouped by counts to :{}".format(
                    outFileGroupedByGene))

        countsSparkDF = None

        # 6.a)
        # skip first column, i.e. gene_id
        columnNames = retIntSparkDF.columns[1:]
        logCountsSparkDF = self._calculateLogs(retIntSparkDF, columnNames)

        # 6.b) filter out genes with one or more nulls
        #    i) removes genes with a zero in one or more samples, i.e. genes that
        #       are tissue-type specific. We want to focus on the housekeeping
        #       genes: genes that are transcribed at similar levels regardless of tissue type

        filteredDF = logCountsSparkDF.na.drop()
        filteredDF.checkpoint()
        logCountsSparkDF = None

        # 6.c) calculate the mean of the row sum
        # skip gene_id column
        columns = filteredDF.columns[1:]
        rowSumsDF = self.rowSums(filteredDF, columns, columnBatchSize)
        rowSumsDF.checkpoint()

        n = len(rowSumsDF.columns) - 2  # do not count geneId or rowSum columns
        rowMeansDF = rowSumsDF.withColumn("rowMean", (rowSumsDF.rowSum / n))
        rowMeansDF.checkpoint()
        filteredDF = None

        # 6.d) subtract the average log values from the log(counts)
        #     i) this is equal to log( numRead_x / average numRead_x)

        # skip the first and last 2 columns, ie. geneId, rowSum, rowMean
        columnNames = rowMeansDF.columns[1:-2]
        ratioDF = self._subtractRowMeanFromLogCounts(rowMeansDF, columnNames)
        ratioDF.checkpoint()
        rowMeansDF = None

        # 6.e calculate the median of the ratio for each sample
        #     i) median is robust
        # skip geneId
        columnNames = ratioDF.columns[1:]
        logMedianDF = self.median(ratioDF, columnNames)
        logMedianDF.checkpoint()
        ratioDF = None

        newColNames = [self.getSampleNames(c) for c in logMedianDF.columns]
        logScalingFactorsDF = logMedianDF.toDF(*newColNames)
        logScalingFactorsDF.checkpoint()

        # 6.f) convert the medians back to linear scale
        scalingFactorsDF = logScalingFactorsDF.select(
            *(exp(c) for c in logScalingFactorsDF.columns))

        # fix the column names: change 'EXP(ctrl_1)' to 'ctrl_1'
        # transpose into 2 columns, 'sampleName' and 'scalingFactor'
        retScalingFactorsDF = self._fixScalingFactors(scalingFactorsDF)

        # fix the column names: change 'sum(kras)' to 'kras'
        retIntCountSparkDF = self._fixSumColNames(retIntSparkDF)

        self.logger.warn("run END\n")
        return (retScalingFactorsDF, retIntCountSparkDF)
Example No. 20
def expit_pandas_udf(x):
  from numpy import exp
  return 1.0 / (1.0 + exp(-x))
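
# Presumably wrapped as a pandas UDF along these lines (assumes pyarrow is
# available; df3 and column "y" come from the earlier cells, and the output
# column name is just illustrative).
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import DoubleType

expit = pandas_udf(expit_pandas_udf, DoubleType())
df3.withColumn("z_pandas", expit(col("y"))).show()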
Example No. 21
def expit_udf(x):
  from math import exp
  return 1.0 / (1.0 + exp(-x))
Example No. 22
rFormula = RFormula(formula="log_price ~ . - price",
                    featuresCol="features",
                    labelCol="log_price",
                    handleInvalid="skip")

lr = LinearRegression(labelCol="log_price", predictionCol="log_pred")
pipeline = Pipeline(stages=[rFormula, lr])
pipelineModel = pipeline.fit(logTrainDF)
predDF = pipelineModel.transform(logTestDF)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Exponentiate
# MAGIC
# MAGIC In order to interpret our RMSE, we need to convert our predictions back from logarithmic scale.

# COMMAND ----------

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col, exp

expDF = predDF.withColumn("prediction", exp(col("log_pred")))

regressionEvaluator = RegressionEvaluator(labelCol="price",
                                          predictionCol="prediction")
rmse = regressionEvaluator.setMetricName("rmse").evaluate(expDF)
r2 = regressionEvaluator.setMetricName("r2").evaluate(expDF)
print(f"RMSE is {rmse}")
print(f"R2 is {r2}")
Example No. 23
from pyspark.ml.regression import LinearRegression

linreg = LinearRegression(maxIter=500, regParam=0.0)

lm = linreg.fit(train_df)

print("Intercept ", lm.intercept)
print("Coefficients ", lm.coefficients)

y_pred = lm.transform(test_df)

y_pred.select('features', 'label', 'prediction').show(5)

from pyspark.sql.functions import exp

y_pred = y_pred.withColumn("y_pred", exp('prediction'))

y_pred.show(5)

from pyspark.ml.evaluation import RegressionEvaluator

rmse_evaluator = RegressionEvaluator(labelCol="price",
                                     predictionCol="y_pred",
                                     metricName="rmse")

lm_rmse = rmse_evaluator.evaluate(y_pred)

print("Root mean square ", lm_rmse)

rsquare_evaluator = RegressionEvaluator(labelCol="price",
                                        predictionCol="y_pred",
Example No. 24
def compile_exp(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.exp(src_column)
Example No. 25
                            parallelism=args.num_workers)

model = model_selection.fit(train_df).set_output_columns(['Sales'])

history = model.get_history()
best_val_rmspe = min(history['val_exp_rmspe'])
print('Best RMSPE: %f' % best_val_rmspe)

# Save the trained model.
model.keras().save(args.local_checkpoint_file)
print('Written checkpoint to %s' % args.local_checkpoint_file)

# =================== #
# 3. FINAL PREDICTION #
# =================== #

print('================')
print('Final prediction')
print('================')

pred_df = model.transform(test_df)
# Convert from log domain to real Sales numbers
pred_df = pred_df.withColumn('Sales', F.exp(pred_df.Sales))
submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                               pred_df.Sales).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv,
                                            index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()
Example No. 26
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
Example No. 27
    history = keras_model.getHistory()
    best_val_rmspe = min(history['val_exp_rmspe'])
    print('Best RMSPE: %f' % best_val_rmspe)

    # Save the trained model.
    keras_model.save(args.local_checkpoint_file)
    print('Written checkpoint to %s' % args.local_checkpoint_file)

    # ================ #
    # FINAL PREDICTION #
    # ================ #

    print('================')
    print('Final prediction')
    print('================')

    pred_df = keras_model.transform(test_df)
    pred_df.printSchema()
    pred_df.show(5)

    # Convert from log domain to real Sales numbers
    pred_df = pred_df.withColumn('Sales_pred', F.exp(pred_df.Sales_output))

    submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()),
                                   pred_df.Sales_pred).toPandas()
    submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv,
                                                index=False)
    print('Saved predictions to %s' % args.local_submission_csv)

    spark.stop()