def createTrans09(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)) 
		  .alias("LogUnknownIncomeDebtRatioPerLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberRealEstateLoansOrLines)) 
		  .alias("LogUnknownIncomeDebtRatioPerRealEstateLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimesPastDue)) 
		  .alias("LogUnknownIncomeDebtRatioPerDelinquency")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimes90DaysLate)) 
		  .alias("LogUnknownIncomeDebtRatioPer90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.NumberRealEstateLoansOrLines)) 
		  .alias("LogNumberRealEstateLoansOrLines")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberRealEstateLoansOrLines"
		, when(sparkDFTrans.LogNumberRealEstateLoansOrLines.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberRealEstateLoansOrLines)
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberRealEstateLoansOrLines)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimesPastDue)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimes90DaysLate)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.age < 18, 1).otherwise(0) 
		  .alias("LowAge")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.age - 17)) 
		  .alias("Logage")
	)	
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.age)	
	return sparkDFTrans
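The createTransNN helpers use log1p, log10 and when unqualified, so they assume those pyspark.sql.functions imports at module level. A minimal usage sketch, assuming all of the helpers live in one module, that earlier steps (createTrans01-04, not shown here) have already produced columns such as LogDebt, RevolvingLines and UnknownIncomeDebtRatio, and that the CSV path below is hypothetical:

from pyspark.sql import SparkSession
from pyspark.sql.functions import log1p, log10, when  # used unqualified inside the helpers

spark = SparkSession.builder.appName("credit-transforms").getOrCreate()
# hypothetical input path; column names follow the credit-scoring schema used above
credit = spark.read.csv("cs-training.csv", header=True, inferSchema=True)

# the helpers build on each other's columns, so they are applied in order
for step in (createTrans05, createTrans06, createTrans07, createTrans08, createTrans09):
    credit = step(credit)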
def createStationDataFrame(station, labelLinkFunction='none'):
    print("station_id =", station)
    bf_station = sampleStationData.filter(
        sampleStationData.station_id == station)

    # left join keeps every interval in the weather file in the output - supply and demand nulls are then filled with zeroes
    # a right join would only include intervals that have supply or demand
    bf_station = weatherFeatures.join(bf_station, ['datetime'], how="left")
    bf_station = bf_station.fillna({'totalDemand': 0, 'totalSupply': 0})

    #bf_station.show()
    print("rows in dataframe", bf_station.count())
    print(time.time() - t0)

    # year, month and hour are redundant with the metblue data fields
    bf_station = bf_station.withColumn(
        "year",
        year(bf_station.datetime).cast("integer"))
    bf_station = bf_station.withColumn(
        "month",
        month(bf_station.datetime).cast("integer"))

    @udf('boolean')
    def ifWeekday(dow):
        if dow > 5.0: return False
        else: return (True)

    @udf('boolean')
    def ifRain(precip):
        if precip > 0.0: return True
        else: return (False)

    bf_station = bf_station.withColumn(
        "hourOfDay",
        hour(bf_station.datetime).cast('integer'))
    bf_station = bf_station.withColumn(
        "dayOfWeek",
        dayofweek(bf_station.datetime).cast("double"))
    bf_station = bf_station.na.drop(how="any",
                                    subset=['dayOfWeek', 'hourOfDay'])
    bf_station = bf_station.withColumn("weekday",
                                       ifWeekday(bf_station.dayOfWeek))
    bf_station = bf_station.withColumn("raining",
                                       ifRain(bf_station.total_precip))

    #Label y
    #linkFunction = "log1p"
    if labelLinkFunction == "log1p":
        bf_station = bf_station.withColumn("label",
                                           log1p(bf_station.totalDemand))
    else:
        bf_station = bf_station.withColumn("label", bf_station.totalDemand)
    print('bf_station created')
    return (bf_station)
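The two boolean UDFs above can also be written as plain Column expressions, which avoids the Python UDF round trip; a sketch of that alternative (same column names, nulls propagate as null instead of passing through Python):

# equivalent to ifWeekday / ifRain without a Python UDF
bf_station = bf_station.withColumn("weekday", bf_station.dayOfWeek <= 5.0)
bf_station = bf_station.withColumn("raining", bf_station.total_precip > 0.0)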
def createTrans05(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, when((sparkDFTrans.MonthlyIncome % 1000) == 0, 1).otherwise(0) 
		  .alias("IncomeDivBy1000")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when((sparkDFTrans.MonthlyIncome % 5000) == 0, 1).otherwise(0) 
		  .alias("IncomeDivBy5000")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines==0.9999999, 1).otherwise(0) 
		  .alias("Weird0999Utilization")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines == 1, 1).otherwise(0) 
		  .alias("FullUtilization")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines > 1, 1).otherwise(0) 
		  .alias("ExcessUtilization")
	)
	sparkDFTrans = sparkDFTrans.withColumn("NumberOfTime3089DaysPastDueNotWorse"
		, sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse + sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse
	)	
	sparkDFTrans = sparkDFTrans.withColumn("Never3089DaysPastDueNotWorse"
		, sparkDFTrans.Never6089DaysPastDueNotWorse * sparkDFTrans.Never3059DaysPastDueNotWorse
	)
	sparkDFTrans = sparkDFTrans.withColumn("NumberOfTimesPastDue"
		, sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse+sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse+sparkDFTrans.NumberOfTimes90DaysLate
	)	
	sparkDFTrans = sparkDFTrans.withColumn("NeverPastDue"
		, sparkDFTrans.Never90DaysLate + sparkDFTrans.Never6089DaysPastDueNotWorse * sparkDFTrans.Never3059DaysPastDueNotWorse
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log1p(sparkDFTrans.RevolvingLines * sparkDFTrans.RevolvingUtilizationOfUnsecuredLines))
		  .alias("LogRevolvingUtilizationTimesLines")
	)
	return sparkDFTrans
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 /
         4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(
            get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice *
                 ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR)) +
                (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise((SENIOR_CUTOFF * DAYS_IN_YEAR) +
                        (DAYS_IN_YEAR *
                         (-F.log1p(-customerMetaRaw.choice) *
                          EXPONENTIAL_DIST_SCALE)))).cast("int"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))

    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
Example #5
def min_max(df, column_names):
    min_funcs = [f.min(x) for x in column_names]
    max_funcs = [f.max(x) for x in column_names]
    column_funcs = min_funcs + max_funcs
    #min_value, max_value = df.select(f.min(column_name), f.max(column_name)).first()
    min_max_value = df.select(*column_funcs).first()
    print("min_max_value:", min_max_value)
    for i, column_name in enumerate(column_names):
        min_value = min_max_value[i]
        max_value = min_max_value[i + len(column_names)]
        if min_value == max_value:
            print("__error__: column_name:{} min_value == max_value".format(
                column_name))
            continue
        df = df.withColumn(column_name, f.log1p(column_name))
        df = df.withColumn(
            column_name,
            f.round((f.col(column_name) - min_value) / (max_value - min_value),
                    6))
    return df
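A small usage example for min_max; the column names are illustrative. Note that the min/max are taken on the raw columns while the scaling is applied after log1p, so the outputs are not strictly confined to [0, 1]:

# illustrative call: log1p then rescale two hypothetical count columns
scaled = min_max(df, ["clicks", "impressions"])
scaled.select("clicks", "impressions").show(5)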
Example #6
def impute(df, building_id, meter):

    time_series = spark.sql(
        "SELECT explode(sequence(to_timestamp('2016-01-01'), to_timestamp('2017-01-02'), interval 1 hour))"
    ).withColumnRenamed("col", "timestamp_seq")
    joined = time_series.join(df, [time_series.timestamp_seq == df.timestamp],
                              "left_outer")
    median = df.approxQuantile("meter_reading", [0.5], 0.001)[0]
    imputed = joined.fillna(median, ["meter_reading"])
    imputed = imputed.drop("timestamp")
    imputed = imputed.withColumnRenamed("timestamp_seq", "timestamp")
    imputed = imputed.withColumn("building_id", F.lit(building_id))
    imputed = imputed.withColumn("meter", F.lit(meter))
    imputed = imputed.withColumn("month", F.month(imputed.timestamp))
    imputed = imputed.withColumn("day", F.dayofmonth(imputed.timestamp))
    imputed = imputed.withColumn("hour", F.hour(imputed.timestamp))
    imputed = imputed.withColumn(
        "meter_reading",
        F.when(imputed.meter_reading == 0,
               median).otherwise(imputed.meter_reading))
    imputed = imputed.withColumn("meter_reading",
                                 F.log1p(imputed.meter_reading))

    return imputed
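A usage sketch for impute, assuming an active SparkSession named spark and a meter-readings DataFrame with timestamp, meter_reading, building_id and meter columns (the df_meters name is an assumption):

from pyspark.sql import functions as F

# illustrative call: fill the hourly gaps for one building/meter pair
one_meter = df_meters.filter((F.col("building_id") == 1) & (F.col("meter") == 0))
filled = impute(one_meter.select("timestamp", "meter_reading"), building_id=1, meter=0)
filled.orderBy("timestamp").show(5)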
Example #7
 def _transform(self, df):
     self.check_input_type(df.schema)
     return df.withColumn(self.outputCol, F.log1p(F.col(self.inputCol)))
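This _transform comes from a custom spark.ml Transformer; a minimal sketch of the kind of wrapper class it could belong to (the plain inputCol/outputCol attributes and the check_input_type helper shown here are assumptions, not the original class). An instance can then be dropped into a Pipeline alongside other stages.

from pyspark.ml import Transformer
from pyspark.sql import functions as F
from pyspark.sql.types import NumericType


class Log1pTransformer(Transformer):
    """Hypothetical wrapper: adds F.log1p(inputCol) as outputCol."""

    def __init__(self, inputCol, outputCol):
        super().__init__()
        self.inputCol = inputCol      # plain attributes, matching how _transform reads them
        self.outputCol = outputCol

    def check_input_type(self, schema):
        # assumed helper: refuse non-numeric input columns
        field = schema[self.inputCol]
        if not isinstance(field.dataType, NumericType):
            raise TypeError(f"{self.inputCol} must be numeric, got {field.dataType}")

    def _transform(self, df):
        self.check_input_type(df.schema)
        return df.withColumn(self.outputCol, F.log1p(F.col(self.inputCol)))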
Example #8
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''
    Train and evaluate implicit-feedback ALS models over a small parameter grid.

    Parameters
    ----------
    spark : SparkSession object

    log_comp : bool, if True apply log-compression to the counts (count -> log1p(count))

    drop_low : bool, if True drop interactions whose count is at or below drop_thr

    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered +
                                           list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .groupBy('user_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions with counts at or below the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
            alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation test
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')

        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision at 500: ', mpa)

    pass
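main expects an existing SparkSession plus the three flags; a minimal driver (app name and flag values here are assumptions):

from pyspark.sql import SparkSession

if __name__ == "__main__":
    # hypothetical entry point: run the ALS grid search with log-compression
    # and low-count dropping enabled
    spark = SparkSession.builder.appName("als-recommender").getOrCreate()
    main(spark, log_comp=True, drop_low=True, drop_thr=2)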
Example #9
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
Example #10

def algorithm(target):
    rf = RandomForestRegressor(featuresCol='Features', labelCol=target)
    gbt = GBTRegressor(featuresCol='Features', labelCol=target)
    dt = DecisionTreeRegressor(featuresCol='Features', labelCol=target)
    lr = LinearRegression(featuresCol='Features', labelCol=target)
    glr = GeneralizedLinearRegression(family="gaussian",
                                      link="identity",
                                      featuresCol='Features',
                                      labelCol=target)
    model = [gbt, dt, lr, glr, rf]
    return rf, gbt, dt, lr, glr, model


X_train = X_train.withColumn(target, F.log1p(F.col(target)))

X_test = X_test.withColumn(target, F.log1p(F.col(target)))

rf, gbt, dt, lr, glr, model = algorithm(target)

fitted = gbt.fit(X_train)

yhat = (fitted.transform(X_test).withColumn(
    "prediction", F.expm1(F.col("prediction"))).withColumn(
        target, F.expm1(F.col(target))).withColumn(
            'fiability', 1 - F.abs(F.col(target) - F.col("prediction")) /
            F.col(target)).withColumn(
                'fiability',
                F.when(F.col("fiability") < 0,
                       0).otherwise(F.col("fiability"))))
Example #11
    bucket = "[s3|gs]://[your input graph data]"  # loc of your input graph data
    # refer to constants.NODE2VEC_PARAMS for default values of n2v_params
    n2v_params = {
        "num_walks": 30,
        "walk_length": 10,
        "return_param": 1.0,
        "inout_param": 1.0,
    }
    # refer to constants.WORD2VEC_PARAMS for default values of w2v_params
    w2v_params = {}

    if len(sys.argv) <= 1 or sys.argv[1] == "index":
        df = spark.read.parquet(f"{bucket}/input_graph.parquet").repartition(
            1000)
        df = df.select("src", "dst",
                       "weight").withColumn("weight", ssf.log1p(df["weight"]))
        fugue_df = SparkDataFrame(df.distinct())
        # assume the input graph is not indexed, and is directed
        df_index, name_id = trim_index(
            fugue_spark,
            fugue_df,
            indexed=False,
            directed=True,
            max_out_deg=10000,
        )
        name_id.native.write.parquet(f"{bucket}/graph_name2id.parquet",
                                     "overwrite")
        df_index.native.write.parquet(f"{bucket}/graph_indexed.parquet",
                                      "overwrite")

    elif len(sys.argv) <= 1 or sys.argv[1] == "walk":
def createTrans08(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
		  .alias("LogRatio90to6089DaysLate")
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans > 0, 1).otherwise(0) 
		  .alias("AnyOpenCreditLinesOrLoans")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
		  .alias("LogNumberOfOpenCreditLinesAndLoans")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfOpenCreditLinesAndLoans"
		, when(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogNumberOfOpenCreditLinesAndLoansPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, when(sparkDFTrans.NumberOfDependents > 0, 1).otherwise(0) 
		  .alias("HasDependents")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log1p(sparkDFTrans.NumberOfDependents) 
		  .alias("LogHouseholdSize")
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfDependents)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.DebtRatio) 
		  .alias("LogDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogDebtRatio"
		, when(sparkDFTrans.LogDebtRatio.isNull(), 0)
		  .otherwise(sparkDFTrans.LogDebtRatio)
	)
	sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.DebtRatio)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimesPastDue)) 
		  .alias("LogDebtPerDelinquency")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimes90DaysLate)) 
		  .alias("LogDebtPer90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.UnknownIncomeDebtRatio)) 
		  .alias("LogUnknownIncomeDebtRatio")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogUnknownIncomeDebtRatio"
		, when(sparkDFTrans.LogUnknownIncomeDebtRatio.isNull(), 0)
		  .otherwise(sparkDFTrans.LogUnknownIncomeDebtRatio)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogUnknownIncomeDebtRatio - sparkDFTrans.LogHouseholdSize) 
		  .alias("LogUnknownIncomeDebtRatioPerPerson")
	)
	return sparkDFTrans
def createTrans06(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (log10(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines))
		  .alias("LogRevolvingUtilizationOfUnsecuredLines")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogRevolvingUtilizationOfUnsecuredLines"
		, when(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines.isNull(), 0)
		  .otherwise(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines)
	)
	sparkDFTrans = sparkDFTrans.drop("RevolvingUtilizationOfUnsecuredLines")
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("DelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("DelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.DelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("MajorDelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MajorDelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.MajorDelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
		  .alias("MinorDelinquenciesPerLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MinorDelinquenciesPerLine"
		, when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
		  .otherwise(sparkDFTrans.MinorDelinquenciesPerLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.RevolvingLines)
		  .alias("DelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("DelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.DelinquenciesPerRevolvingLine)
	)	
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.RevolvingLines)
		  .alias("MajorDelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MajorDelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.MajorDelinquenciesPerRevolvingLine)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.RevolvingLines)
		  .alias("MinorDelinquenciesPerRevolvingLine")
	)	
	sparkDFTrans = sparkDFTrans.withColumn("MinorDelinquenciesPerRevolvingLine"
		, when(sparkDFTrans.RevolvingLines == 0, 0)
		  .otherwise(sparkDFTrans.MinorDelinquenciesPerRevolvingLine)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
		  .alias("LogDebtPerLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberRealEstateLoansOrLines))
		  .alias("LogDebtPerRealEstateLine")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogDebtPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.RevolvingLines /(1+sparkDFTrans.NumberOfDependents))
		  .alias("RevolvingLinesPerPerson")
	)	
	return sparkDFTrans
Example #14
    [assembler]+ \
    [pca])

model = pipeline.fit(df)
final_dataset = model.transform(df)

target = 'QTY'

gbt = GBTRegressor(featuresCol='Features', labelCol=target)
dt = DecisionTreeRegressor(featuresCol='Features', labelCol=target)
lr = LinearRegression(featuresCol='Features', labelCol=target)

X_train = (final_dataset.filter(
    F.col('DATE').between("2017-01-02",
                          "2018-06-01")).withColumn(target,
                                                    F.log1p(F.col(target))))

X_test = (final_dataset.filter(F.col('DATE') > "2018-06-01").withColumn(
    target, F.log1p(F.col(target))))

fitted = gbt.fit(X_train)

yhat = (fitted.transform(X_test).withColumn(
    "prediction",
    F.expm1(F.col("prediction"))).withColumn(target, F.expm1(F.col(target))))

eval_ = RegressionEvaluator(labelCol=target,
                            predictionCol="prediction",
                            metricName="rmse")

rmse = eval_.evaluate(yhat)
Example #15
data = data.withColumn('INTERVAL',udf_assign_interval('TIME'))

# calculate mean price of each time interval according to symbol, date, interval
data = data.groupby(['SYMBOL','DATE','INTERVAL']).agg({'PRICE':'mean'})
data = data.withColumnRenamed('avg(PRICE)', 'AVG_PRICE')

data = data.select(data.SYMBOL,data.DATE,data.INTERVAL.cast('double'),data.AVG_PRICE)

data = data.orderBy(["SYMBOL","DATE","INTERVAL"], ascending=[1, 1])

# apply window function to get previous time interval avg_price
w = Window().partitionBy(col('SYMBOL')).orderBy([col('SYMBOL'),col('DATE'),col('INTERVAL')])
data = data.select("*", lag('AVG_PRICE').over(w).alias('PRE_AVG_PRICE'))

# compute log return
data = data.withColumn('U_SEQUENCE',log1p(data.AVG_PRICE/data.PRE_AVG_PRICE-1.))

data = data.withColumn('SQUARE_U_SEQUENCE', data.U_SEQUENCE * data.U_SEQUENCE)

window = Window().partitionBy("SYMBOL").rowsBetween(-(len(endPoints)-1), 0).orderBy([col('SYMBOL'),col('DATE'),col('INTERVAL')])
new_data = data.select(data.SYMBOL,data.DATE,data.INTERVAL,data.U_SEQUENCE,data.SQUARE_U_SEQUENCE,f.sum('U_SEQUENCE').over(window).alias('SUM_U'),f.sum('SQUARE_U_SEQUENCE').over(window).alias('SQUARE_U_SUM'))

N = float(len(endPoints))
# compute section volatility
new_data = new_data.withColumn("SECTION_VOLATILITY",sqrt(col('SQUARE_U_SUM')/(N-1.) - col('SUM_U')**2/(N*(N-1.))))
SRData = new_data.select(new_data.SYMBOL,new_data.DATE,new_data.SECTION_VOLATILITY,new_data.SQUARE_U_SUM)

SRData = SRData.withColumnRenamed('SQUARE_U_SUM', 'REALIZE_VOLATILITY')

meanSR = SRData.groupby(['SYMBOL','DATE']).agg({'SECTION_VOLATILITY':'mean','REALIZE_VOLATILITY':'mean'})
Example #16
rank_ = [5, 10, 20]
regParam_ = [0.1, 1, 10]
alpha_ = [1, 5, 10]
param_grid = it.product(rank_, regParam_, alpha_)
user_id = val.select('user_id_indexed').distinct()
true_label = val.select('user_id_indexed', 'track_id_indexed')\
                .groupBy('user_id_indexed')\
                .agg(expr('collect_list(track_id_indexed) as true_item'))

## Define log-compression/ Drop_low counts
log_comp = True
drop_low = True
drop_thr = 2

if log_comp == True:
    train = train.select('*', F.log1p('count').alias('count_log1p'))
    val = val.select('*', F.log1p('count').alias('count_log1p'))
    rateCol = "count_log1p"
else:
    rateCol = "count"

if drop_low == True:
    train = train.filter(train['count'] > drop_thr)
    val = val.filter(val['count'] > drop_thr)

for i in param_grid:
    print('Start Training for {}'.format(i))
    als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
        alpha=i[2], nonnegative=True, coldStartStrategy="drop")
    model = als.fit(train)
    print('Finish Training for {}'.format(i))
from pyspark.sql.types import DateType

udf1 = udf(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:], StringType())
train_df = (train_df.withColumn("date",
                                train_df["date"].cast("string"))).withColumn(
                                    'date', udf1('date'))
train_df = train_df.withColumn("date", train_df['date'].cast(DateType()))

dev_df = train_df.filter(train_df["date"] <= lit('2017-03-01'))
val_df = train_df.filter(train_df["date"] > lit('2017-03-01'))

print('dev_df dtypes:', dev_df.dtypes)
print('val_df dtypes:', val_df.dtypes)

dev_y = dev_df.withColumn("totalstransactionRevenuelog1p",
                          log1p('totalstransactionRevenue')).select(
                              ['totalstransactionRevenuelog1p'])
val_y = val_df.withColumn("totalstransactionRevenuelog1p",
                          log1p('totalstransactionRevenue')).select(
                              ['totalstransactionRevenuelog1p'])

dev_df = dev_df.toPandas()
val_df = val_df.toPandas()
dev_y = dev_y.toPandas()
val_y = val_y.toPandas()
test_df = test_df.toPandas()

dev_X = dev_df[cats + nums]
val_X = val_df[cats + nums]
test_X = test_df[cats + nums]
def createTrans07(sparkDF):
	# ===========================
	# douglas fletcher
	# purpose: create data 
	# transformations (10 at a time) 
	# input: 
	# 	sparkDF type sparkDF
	# output: 
	# 	sparkDFTrans type sparkDF
	# ===========================
	sparkDFTrans = sparkDF
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.NumberRealEstateLoansOrLines /(1+sparkDFTrans.NumberOfDependents))
		  .alias("RealEstateLoansPerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.age /(1+sparkDFTrans.NumberOfDependents))
		  .alias("YearsOfAgePerDependent")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.MonthlyIncome)
		  .alias("LogMonthlyIncome")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogMonthlyIncome"
		, when((sparkDFTrans.LogMonthlyIncome.isNull()) | (sparkDFTrans.LogMonthlyIncome.isNull()), 0)
		  .otherwise(sparkDFTrans.LogMonthlyIncome)
	)
	sparkDFTrans = sparkDFTrans.drop("MonthlyIncome")
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.NumberOfDependents))
		  .alias("LogIncomePerPerson")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.age))
		  .alias("LogIncomeAge")
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTimesPastDue)
		  .alias("LogNumberOfTimesPastDue")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTimesPastDue"
		, when(sparkDFTrans.LogNumberOfTimesPastDue.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTimesPastDue)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTimes90DaysLate)
		  .alias("LogNumberOfTimes90DaysLate")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTimes90DaysLate"
		, when(sparkDFTrans.LogNumberOfTimes90DaysLate.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTimes90DaysLate)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
		  .alias("LogNumberOfTime3059DaysPastDueNotWorse")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTime3059DaysPastDueNotWorse"
		, when(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, log10(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
		  .alias("LogNumberOfTime6089DaysPastDueNotWorse")
	)
	sparkDFTrans = sparkDFTrans.withColumn("LogNumberOfTime6089DaysPastDueNotWorse"
		, when(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse.isNull(), 0)
		  .otherwise(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
	)
	sparkDFTrans = sparkDFTrans.select("*"
		, (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
		  .alias("LogRatio90to3059DaysLate")
	)	
	return sparkDFTrans
def run_pipeline(name: str, data: str, save: str) -> None:

    spark = SparkSession.builder.appName(name).getOrCreate()

    # Dataset Creation #

    # read bike ride history csv's
    df = spark.read.csv(f'{data}/rides/*', header=True)
    df = df.select(['Duration', 'Start date', 'Start station number', 'Member type'])
    df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType()))
    print(f'The rides dataset has [{df.count()}] rows!')

    # read station information csv
    stations = spark.read.csv(f'{data}/stations/*', header=True)
    print(f'The stations dataset has {stations.count()} rows!')
    stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat')
    stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long')
    stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType()))
    stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number'])

    # remove rides longer than 1.5 hours
    one_and_a_half_hours = 60 * 60 * 1.5
    df = df.filter(df['Duration'] <= one_and_a_half_hours)

    # remove rides shorter than 3 minutes
    three_minutes = 60 * 3
    df = df.filter(df['Duration'] >= three_minutes)

    # remove unknown 'Member type's
    df = df.filter(df['Member type'] != 'Unknown')

    # remove non-existent stations
    df = df.filter(~(df['Start station number'] == 31008) & ~(
            df['Start station number'] == 32051) & ~(df['Start station number'] == 32034))

    # make target feature
    df = df.withColumn('label', F.log1p(df.Duration))

    # join on 'Start station number'
    print('Merging rides and stations dataframes!')
    df = df.join(stations, on='Start station number')
    df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType()))
    df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType()))

    print(f'Complete rides and stations dataset has {df.count()} rows!')

    # Feature Transformations #
    print('Doing Feature Transformations!')

    # convert to datetime type
    df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('day_of_week', F.dayofweek('Start date'))
    df = df.withColumn('week_of_year', F.weekofyear('Start date'))
    df = df.withColumn('month', F.month('Start date'))
    df = df.withColumn('minute', F.minute('Start date'))
    df = df.withColumn('hour', F.hour('Start date'))

    # make time features cyclical
    pi = 3.141592653589793

    df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60))
    df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24))

    df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60))
    df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24))

    df = df.withColumn('hour_and_day_of_week',
                       F.concat_ws('_', df['hour'].cast(StringType()), df['day_of_week'].cast(StringType())))
    df = df.withColumn('member_type_and_day_of_week',
                       F.concat_ws('_', df['Member type'], df['day_of_week'].cast(StringType())))

    # drop unused columns
    drop_columns = [
        'Start date',
        'Start station number',
        'Duration',
        'day_of_week',
        'week_of_year',
        'month',
        'minute',
        'hour'
    ]
    df = df.drop(*drop_columns)

    # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show()

    # Model and Pipeline #

    # split training and test
    train, test = df.randomSplit([.7, .3])

    # encode categorical column 'Member type'
    member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx')
    member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc')

    # create vector of features named 'features'
    vector = VectorAssembler(
        inputCols=[
            'start_station_lat',
            'start_station_long',
            'sin_day_of_week',
            'cos_day_of_week',
            'sin_week_of_year',
            'cos_week_of_year',
            'sin_month',
            'cos_month',
            'sin_minute',
            'cos_minute',
            'sin_hour',
            'cos_hour',
            'member_enc'
        ],
        outputCol='features'
    )

    # scale features
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )

    # define model
    model = GeneralizedLinearRegression(
        featuresCol='scaled_features'
    )

    # create pipeline and fill in stages
    pipeline = Pipeline(
        stages=[
            member_indexer,
            member_encoder,
            vector,
            scaler,
            model
        ]
    )

    # evaluation method
    evaluation = RegressionEvaluator()

    # best parameter search
    grid = ParamGridBuilder()
    # grid = grid.addGrid(model.maxDepth, [5, 7])
    # grid = grid.addGrid(model.numTrees, [200, 500])
    grid = grid.addGrid(model.maxIter, [40, 50])
    grid = grid.addGrid(model.family, ['gaussian', 'gamma'])
    grid = grid.addGrid(model.regParam, [0.0, 0.1])
    grid = grid.build()

    # run cross validation
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluation,
        numFolds=7
    )

    print('Doing Cross Validation!')

    cv_models = cv.fit(train)
    print(f'CV results: {cv_models.avgMetrics} (RMSE)')

    best_model = cv_models.bestModel
    best_params = extract_best_params(best_model.stages[-1].extractParamMap())
    print(f'Best params:\n{best_params}')

    results = cv_models.transform(test)
    print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)')

    print('Re-fitting pipeline on entire dataset!')
    cv_models = cv.fit(df)

    print('Saving to pipeline into S3!')
    entire_dataset_best_model = cv_models.bestModel
    entire_dataset_best_model.save(f'{save}/{name}')
    print('Done!')

    return
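run_pipeline only needs an app name, the data root and a save location; a hedged command-line wrapper (argument names and defaults are assumptions):

import argparse

if __name__ == "__main__":
    # hypothetical CLI wrapper around run_pipeline
    parser = argparse.ArgumentParser(description="Train the bike-ride duration model")
    parser.add_argument("--name", default="bike-duration")
    parser.add_argument("--data", required=True, help="folder containing rides/ and stations/ csv's")
    parser.add_argument("--save", required=True, help="output location (e.g. an S3 prefix) for the fitted pipeline")
    args = parser.parse_args()
    run_pipeline(args.name, args.data, args.save)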
Example #20

df5=df4.withColumn("mean_TOTAL_PRICE",col("sum(TOTAL_PRICE)")/col("sum(SIZE)"))
df5=df5.withColumn("mean_SIZE",col("sum(SIZE)")/col("count(KEY)"))
df5 = df5.withColumn("Interval_int", df5["Interval"].cast("double"))

df6=df5.select('SYMBOL','DATE','Interval_int','avg(PRICE)','mean_TOTAL_PRICE','mean_SIZE')


df6= df6.orderBy(["SYMBOL","DATE","Interval_int"], ascending=[1, 1])
w=Window().partitionBy([col("SYMBOL"),col("DATE")]).orderBy([col("SYMBOL"),col("DATE"),col("Interval_int")])
df7=df6.select("*", lag("avg(PRICE)").over(w).alias("avg(PRICE)_previous"))


    
df8=df7.withColumn("U_sequence",log1p(col("avg(PRICE)")/col("avg(PRICE)_previous")-1.))
df_Usequence=df8.select("U_sequence")
#df_Usequence.write.csv('/Users/yuhan/Dropbox/big_data_analytics/Final/data_save_30',header=True)



df8=df8.withColumn("U_sequence_square",col("U_sequence")*col("U_sequence"))




f_section={"U_sequence_square":'sum', "U_sequence":'sum',"Interval_int":'count',"avg(PRICE)":'mean'}

df_section=df8.groupby(["SYMBOL","DATE"]).agg(f_section)

df_section=df_section.withColumnRenamed('avg(avg(PRICE))', 'daily_average_price')
Example #21
        "inout_param": 1.0,
    }
    # refer to constants.WORD2VEC_PARAMS for default values of w2v_params
    w2v_params: Dict[str, Any] = {}
    g2v = Node2VecSpark(
        spark,
        n2v_params,
        w2v_params=w2v_params,
        max_out_degree=10000,
    )

    if len(sys.argv) <= 1 or sys.argv[1] == "index":
        # the input graph must have 3 cols: src, dst, weight
        df = spark.read.parquet(f"{bucket}/input_graph.parquet").repartition(1000)
        df = df.select("src", "dst", "weight").withColumn(
            "weight", ssf.log1p(df["weight"])
        )
        # assume the input graph is not indexed, and is directed
        g2v.preprocess_input_graph(df, indexed=False, directed=True)
        g2v.name_id.write.parquet(f"{bucket}/graph_name2id.parquet", "overwrite")
        g2v.df.write.parquet(f"{bucket}/graph_indexed.parquet", "overwrite")

    elif sys.argv[1] == "walk":
        g2v.name_id = spark.read.parquet(f"{bucket}/graph_name2id.parquet").cache()
        g2v.df = spark.read.parquet(f"{bucket}/graph_indexed.parquet").cache()
        walks = g2v.random_walk()
        walks.write.parquet(f"{bucket}/graph_walks.parquet", "overwrite")

    else:
        df_walks = spark.read.parquet(f"{bucket}/graph_walks.parquet")
        model = g2v.fit(df_walks)
Example #22
def log_transformation(x):
    return when(col(x) < 0, col(x)).otherwise(log1p(col(x)))
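log_transformation returns a Column expression, so it drops straight into withColumn; for example (column name is illustrative):

# leave negative values untouched, log1p everything else
df = df.withColumn("amount_log1p", log_transformation("amount"))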
Example #23
 def _process_df(self, col):
     return F.when(col.isNull(), F.lit(self.default_value)).otherwise(
         F.log1p(col.cast('float')))