def createTrans09(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: create data
    # transformations (10 at a time)
    # input:
    #   sparkDF type sparkDF
    # output:
    #   sparkDFTrans type sparkDF
    # ===========================
    sparkDFTrans = sparkDF
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
        .alias("LogUnknownIncomeDebtRatioPerLine")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberRealEstateLoansOrLines))
        .alias("LogUnknownIncomeDebtRatioPerRealEstateLine")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimesPastDue))
        .alias("LogUnknownIncomeDebtRatioPerDelinquency")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogUnknownIncomeDebtRatio - log1p(sparkDFTrans.NumberOfTimes90DaysLate))
        .alias("LogUnknownIncomeDebtRatioPer90DaysLate")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberRealEstateLoansOrLines)
        .alias("LogNumberRealEstateLoansOrLines")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberRealEstateLoansOrLines",
        when(sparkDFTrans.LogNumberRealEstateLoansOrLines.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberRealEstateLoansOrLines)
    )
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberRealEstateLoansOrLines)
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimesPastDue)
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTimes90DaysLate)
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.age < 18, 1).otherwise(0)
        .alias("LowAge")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.age - 17)
        .alias("Logage")
    )
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.age)
    return sparkDFTrans
def createStationDataFrame(station, labelLinkFunction='none'):
    print("station_id =", station)
    bf_station = sampleStationData.filter(
        sampleStationData.station_id == station)
    # left join keeps all intervals in the weather file in the output - then fill supply and demand nulls with zeroes
    # a right join would only include intervals with supply or demand
    bf_station = weatherFeatures.join(bf_station, ['datetime'], how="left")
    bf_station = bf_station.fillna({'totalDemand': '0', 'totalSupply': '0'})
    # bf_station.show()
    print("rows in dataframe", bf_station.count())
    print(time.time() - t0)

    # year, month and hour are redundant with metblue data fields
    bf_station = bf_station.withColumn(
        "year", year(bf_station.datetime).cast("integer"))
    bf_station = bf_station.withColumn(
        "month", month(bf_station.datetime).cast("integer"))

    @udf('boolean')
    def ifWeekday(dow):
        if dow > 5.0:
            return False
        else:
            return True

    @udf('boolean')
    def ifRain(precip):
        if precip > 0.0:
            return True
        else:
            return False

    bf_station = bf_station.withColumn(
        "hourOfDay", hour(bf_station.datetime).cast('integer'))
    bf_station = bf_station.withColumn(
        "dayOfWeek", dayofweek(bf_station.datetime).cast("double"))
    bf_station = bf_station.na.drop(how="any", subset=['dayOfWeek', 'hourOfDay'])
    bf_station = bf_station.withColumn("weekday", ifWeekday(bf_station.dayOfWeek))
    bf_station = bf_station.withColumn("raining", ifRain(bf_station.total_precip))

    # Label y
    # linkFunction = "log1p"
    if labelLinkFunction == "log1p":
        bf_station = bf_station.withColumn("label", log1p(bf_station.totalDemand))
    else:
        bf_station = bf_station.withColumn("label", bf_station.totalDemand)
    print('bf_station created')
    return bf_station
def createTrans05(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: create data
    # transformations (10 at a time)
    # input:
    #   sparkDF type sparkDF
    # output:
    #   sparkDFTrans type sparkDF
    # ===========================
    sparkDFTrans = sparkDF
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when((sparkDFTrans.MonthlyIncome % 1000) == 0, 1).otherwise(0)
        .alias("IncomeDivBy1000")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when((sparkDFTrans.MonthlyIncome % 5000) == 0, 1).otherwise(0)
        .alias("IncomeDivBy5000")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines == 0.9999999, 1).otherwise(0)
        .alias("Weird0999Utilization")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines == 1, 1).otherwise(0)
        .alias("FullUtilization")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines > 1, 1).otherwise(0)
        .alias("ExcessUtilization")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "NumberOfTime3089DaysPastDueNotWorse",
        sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse + sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "Never3089DaysPastDueNotWorse",
        sparkDFTrans.Never6089DaysPastDueNotWorse * sparkDFTrans.Never3059DaysPastDueNotWorse
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "NumberOfTimesPastDue",
        sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse
        + sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse
        + sparkDFTrans.NumberOfTimes90DaysLate
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "NeverPastDue",
        sparkDFTrans.Never90DaysLate
        + sparkDFTrans.Never6089DaysPastDueNotWorse * sparkDFTrans.Never3059DaysPastDueNotWorse
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log1p(sparkDFTrans.RevolvingLines * sparkDFTrans.RevolvingUtilizationOfUnsecuredLines)
        .alias("LogRevolvingUtilizationTimesLines")
    )
    return sparkDFTrans
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 / 4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice * ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR))
                + (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise(
                (SENIOR_CUTOFF * DAYS_IN_YEAR)
                + (DAYS_IN_YEAR * (-F.log1p(-customerMetaRaw.choice) * EXPONENTIAL_DIST_SCALE))
            )
        ).cast("int"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))

    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
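
# A minimal sketch (not from the source) of the sampling idea used in
# customer_meta above: for senior customers, the years beyond the cutoff are
# drawn from an exponential distribution via inverse-transform sampling on the
# uniform "choice" value, u -> -log(1 - u) * scale. The numbers below are
# purely illustrative.
import math

u = 0.73                               # a uniform draw in [0, 1)
extra_years = -math.log1p(-u) * 6.3    # exponential draw with mean 6.3 years
age_in_days = int((65 + extra_years) * 365.25)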
def min_max(df, column_names):
    min_funcs = [f.min(x) for x in column_names]
    max_funcs = [f.max(x) for x in column_names]
    column_funcs = min_funcs + max_funcs
    # min_value, max_value = df.select(f.min(column_name), f.max(column_name)).first()
    min_max_value = df.select(*column_funcs).first()
    print("min_max_value:", min_max_value)
    for i, column_name in enumerate(column_names):
        min_value = min_max_value[i]
        max_value = min_max_value[i + len(column_names)]
        if min_value == max_value:
            print("__error__: column_name:{} min_value == max_value".format(column_name))
            continue
        # note: min/max are computed on the raw column but applied after the log1p transform
        df = df.withColumn(column_name, f.log1p(column_name))
        df = df.withColumn(
            column_name,
            f.round((f.col(column_name) - min_value) / (max_value - min_value), 6))
    return df
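
# A hedged usage sketch for min_max (the SparkSession "spark" and the column
# names are assumptions): each listed column is log1p-transformed and then
# rescaled using the min/max observed on the raw column.
from pyspark.sql import functions as f

demo = spark.createDataFrame([(1.0, 10.0), (5.0, 100.0), (9.0, 1000.0)], ["a", "b"])
scaled = min_max(demo, ["a", "b"])
scaled.show()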
def impute(df, building_id, meter):
    time_series = spark.sql(
        "SELECT explode(sequence(to_timestamp('2016-01-01'), to_timestamp('2017-01-02'), interval 1 hour))"
    ).withColumnRenamed("col", "timestamp_seq")
    joined = time_series.join(df, [time_series.timestamp_seq == df.timestamp], "left_outer")
    median = df.approxQuantile("meter_reading", [0.5], 0.001)[0]
    imputed = joined.fillna(median, ["meter_reading"])
    imputed = imputed.drop("timestamp")
    imputed = imputed.withColumnRenamed("timestamp_seq", "timestamp")
    imputed = imputed.withColumn("building_id", F.lit(building_id))
    imputed = imputed.withColumn("meter", F.lit(meter))
    imputed = imputed.withColumn("month", F.month(imputed.timestamp))
    imputed = imputed.withColumn("day", F.dayofmonth(imputed.timestamp))
    imputed = imputed.withColumn("hour", F.hour(imputed.timestamp))
    imputed = imputed.withColumn(
        "meter_reading",
        F.when(imputed.meter_reading == 0, median).otherwise(imputed.meter_reading))
    imputed = imputed.withColumn("meter_reading", F.log1p(imputed.meter_reading))
    return imputed
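
# A hedged usage sketch for impute (the "readings" DataFrame and its filter
# columns are hypothetical): snap one building/meter series onto an hourly
# grid, replace gaps and zeros with the median, and log1p-transform the result.
one_series = readings.filter((F.col("building_id") == 1) & (F.col("meter") == 0))
hourly = impute(one_series, building_id=1, meter=0)
hourly.select("timestamp", "meter_reading").show(5)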
    def _transform(self, df):
        self.check_input_type(df.schema)
        return df.withColumn(self.outputCol, F.log1p(F.col(self.inputCol)))
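
# _transform above is a method of a custom pyspark.ml Transformer whose class
# is not shown in the source. Below is a minimal sketch of what such a wrapper
# could look like; the class name, the check_input_type helper, and the
# plain-attribute handling of inputCol/outputCol are assumptions, not the
# original implementation.
from pyspark.ml import Transformer
from pyspark.sql import functions as F
from pyspark.sql.types import NumericType


class Log1pTransformer(Transformer):
    def __init__(self, inputCol, outputCol):
        super().__init__()
        self.inputCol = inputCol
        self.outputCol = outputCol

    def check_input_type(self, schema):
        # fail fast if the input column is missing or not numeric
        if not isinstance(schema[self.inputCol].dataType, NumericType):
            raise TypeError("column {} must be numeric".format(self.inputCol))

    def _transform(self, df):
        self.check_input_type(df.schema)
        return df.withColumn(self.outputCol, F.log1p(F.col(self.inputCol)))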
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''
    Parameters
    ----------
    spark : SparkSession object
    log_comp : bool, whether to log-compress the interaction counts
    drop_low : bool, whether to drop interactions with low counts
    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id'] for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id'] for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered + list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id", outputCol="user_id_indexed", handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id", outputCol="track_id_indexed", handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)
    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)
    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model hyperparameter grid
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed') \
        .groupBy('user_id_indexed') \
        .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-compression: count -> log(1 + count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions that have counts lower than the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank=i[0], maxIter=10, regParam=i[1],
                  userCol="user_id_indexed", itemCol="track_id_indexed",
                  ratingCol=rateCol, implicitPrefs=True,
                  alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in the validation set
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed', 'recommendations.track_id_indexed')
        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
            .rdd \
            .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision at 500: ', mpa)
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(expr.fcn)   # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1], histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]), expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1], histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]), expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)   # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]), tocolumns(df, expr.args[1])).otherwise(tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
def algorithm(target):
    rf = RandomForestRegressor(featuresCol='Features', labelCol=target)
    gbt = GBTRegressor(featuresCol='Features', labelCol=target)
    dt = DecisionTreeRegressor(featuresCol='Features', labelCol=target)
    lr = LinearRegression(featuresCol='Features', labelCol=target)
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      featuresCol='Features', labelCol=target)
    model = [gbt, dt, lr, glr, rf]
    return rf, gbt, dt, lr, glr, model


X_train = X_train.withColumn(target, F.log1p(F.col(target)))
X_test = X_test.withColumn(target, F.log1p(F.col(target)))

rf, gbt, dt, lr, glr, model = algorithm(target)
fitted = gbt.fit(X_train)
yhat = (fitted.transform(X_test)
        .withColumn("prediction", F.expm1(F.col("prediction")))
        .withColumn(target, F.expm1(F.col(target)))
        .withColumn('fiability',
                    1 - F.abs(F.col(target) - F.col("prediction")) / F.col(target))
        .withColumn('fiability',
                    F.when(F.col("fiability") < 0, 0).otherwise(F.col("fiability"))))
bucket = "[s3|gs]://[your input graph data]" # loc of your input graph data # refer to constants.NODE2VEC_PARAMS for default values of n2v_params n2v_params = { "num_walks": 30, "walk_length": 10, "return_param": 1.0, "inout_param": 1.0, } # refer to constants.WORD2VEC_PARAMS for default values of w2v_params w2v_params = {} if len(sys.argv) <= 1 or sys.argv[1] == "index": df = spark.read.parquet(f"{bucket}/input_graph.parquet").repartition( 1000) df = df.select("src", "dst", "weight").withColumn("weight", ssf.log1p(df["weight"])) fugue_df = SparkDataFrame(df.distinct()) # assume the input graph is not indexed, and is directed df_index, name_id = trim_index( fugue_spark, fugue_df, indexed=False, directed=True, max_out_deg=10000, ) name_id.native.write.parquet(f"{bucket}/graph_name2id.parquet", "overwrite") df_index.native.write.parquet(f"{bucket}/graph_indexed.parquet", "overwrite") elif len(sys.argv) <= 1 or sys.argv[1] == "walk":
def createTrans08(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: create data
    # transformations (10 at a time)
    # input:
    #   sparkDF type sparkDF
    # output:
    #   sparkDFTrans type sparkDF
    # ===========================
    sparkDFTrans = sparkDF
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
        .alias("LogRatio90to6089DaysLate")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans > 0, 1).otherwise(0)
        .alias("AnyOpenCreditLinesOrLoans")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
        .alias("LogNumberOfOpenCreditLinesAndLoans")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberOfOpenCreditLinesAndLoans",
        when(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogNumberOfOpenCreditLinesAndLoans - log1p(sparkDFTrans.NumberOfDependents))
        .alias("LogNumberOfOpenCreditLinesAndLoansPerPerson")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        when(sparkDFTrans.NumberOfDependents > 0, 1).otherwise(0)
        .alias("HasDependents")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log1p(sparkDFTrans.NumberOfDependents)
        .alias("LogHouseholdSize")
    )
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.NumberOfDependents)
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.DebtRatio)
        .alias("LogDebtRatio")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogDebtRatio",
        when(sparkDFTrans.LogDebtRatio.isNull(), 0)
        .otherwise(sparkDFTrans.LogDebtRatio)
    )
    sparkDFTrans = sparkDFTrans.drop(sparkDFTrans.DebtRatio)
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimesPastDue))
        .alias("LogDebtPerDelinquency")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfTimes90DaysLate))
        .alias("LogDebtPer90DaysLate")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.UnknownIncomeDebtRatio)
        .alias("LogUnknownIncomeDebtRatio")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogUnknownIncomeDebtRatio",
        when(sparkDFTrans.LogUnknownIncomeDebtRatio.isNull(), 0)
        .otherwise(sparkDFTrans.LogUnknownIncomeDebtRatio)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogUnknownIncomeDebtRatio - sparkDFTrans.LogHouseholdSize)
        .alias("LogUnknownIncomeDebtRatioPerPerson")
    )
    return sparkDFTrans
def createTrans06(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: create data
    # transformations (10 at a time)
    # input:
    #   sparkDF type sparkDF
    # output:
    #   sparkDFTrans type sparkDF
    # ===========================
    sparkDFTrans = sparkDF
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.RevolvingUtilizationOfUnsecuredLines)
        .alias("LogRevolvingUtilizationOfUnsecuredLines")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogRevolvingUtilizationOfUnsecuredLines",
        when(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines.isNull(), 0)
        .otherwise(sparkDFTrans.LogRevolvingUtilizationOfUnsecuredLines)
    )
    sparkDFTrans = sparkDFTrans.drop("RevolvingUtilizationOfUnsecuredLines")
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
        .alias("DelinquenciesPerLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "DelinquenciesPerLine",
        when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
        .otherwise(sparkDFTrans.DelinquenciesPerLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
        .alias("MajorDelinquenciesPerLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "MajorDelinquenciesPerLine",
        when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
        .otherwise(sparkDFTrans.MajorDelinquenciesPerLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.NumberOfOpenCreditLinesAndLoans)
        .alias("MinorDelinquenciesPerLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "MinorDelinquenciesPerLine",
        when(sparkDFTrans.NumberOfOpenCreditLinesAndLoans == 0, 0)
        .otherwise(sparkDFTrans.MinorDelinquenciesPerLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTimesPastDue / sparkDFTrans.RevolvingLines)
        .alias("DelinquenciesPerRevolvingLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "DelinquenciesPerRevolvingLine",
        when(sparkDFTrans.RevolvingLines == 0, 0)
        .otherwise(sparkDFTrans.DelinquenciesPerRevolvingLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTimes90DaysLate / sparkDFTrans.RevolvingLines)
        .alias("MajorDelinquenciesPerRevolvingLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "MajorDelinquenciesPerRevolvingLine",
        when(sparkDFTrans.RevolvingLines == 0, 0)
        .otherwise(sparkDFTrans.MajorDelinquenciesPerRevolvingLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberOfTime3089DaysPastDueNotWorse / sparkDFTrans.RevolvingLines)
        .alias("MinorDelinquenciesPerRevolvingLine")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "MinorDelinquenciesPerRevolvingLine",
        when(sparkDFTrans.RevolvingLines == 0, 0)
        .otherwise(sparkDFTrans.MinorDelinquenciesPerRevolvingLine)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfOpenCreditLinesAndLoans))
        .alias("LogDebtPerLine")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberRealEstateLoansOrLines))
        .alias("LogDebtPerRealEstateLine")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogDebt - log1p(sparkDFTrans.NumberOfDependents))
        .alias("LogDebtPerPerson")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.RevolvingLines / (1 + sparkDFTrans.NumberOfDependents))
        .alias("RevolvingLinesPerPerson")
    )
    return sparkDFTrans
    [assembler] +
    [pca])
model = pipeline.fit(df)
final_dataset = model.transform(df)

target = 'QTY'
gbt = GBTRegressor(featuresCol='Features', labelCol=target)
dt = DecisionTreeRegressor(featuresCol='Features', labelCol=target)
lr = LinearRegression(featuresCol='Features', labelCol=target)

X_train = (final_dataset.filter(F.col('DATE').between("2017-01-02", "2018-06-01"))
           .withColumn(target, F.log1p(F.col(target))))
X_test = (final_dataset.filter(F.col('DATE') > "2018-06-01")
          .withColumn(target, F.log1p(F.col(target))))

fitted = gbt.fit(X_train)
yhat = (fitted.transform(X_test)
        .withColumn("prediction", F.expm1(F.col("prediction")))
        .withColumn(target, F.expm1(F.col(target))))

eval_ = RegressionEvaluator(labelCol=target, predictionCol="prediction", metricName="rmse")
rmse = eval_.evaluate(yhat)
data = data.withColumn('INTERVAL', udf_assign_interval('TIME'))

# calculate mean price of each time interval according to symbol, date, interval
data = data.groupby(['SYMBOL', 'DATE', 'INTERVAL']).agg({'PRICE': 'mean'})
data = data.withColumnRenamed('avg(PRICE)', 'AVG_PRICE')
data = data.select(data.SYMBOL, data.DATE, data.INTERVAL.cast('double'), data.AVG_PRICE)
data = data.orderBy(["SYMBOL", "DATE", "INTERVAL"], ascending=[1, 1])

# apply window function to get previous time interval avg_price
w = Window().partitionBy(col('SYMBOL')).orderBy([col('SYMBOL'), col('DATE'), col('INTERVAL')])
data = data.select("*", lag('AVG_PRICE').over(w).alias('PRE_AVG_PRICE'))

# compute log return
data = data.withColumn('U_SEQUENCE', log1p(data.AVG_PRICE / data.PRE_AVG_PRICE - 1.))
data = data.withColumn('SQUARE_U_SEQUENCE', data.U_SEQUENCE * data.U_SEQUENCE)

window = Window().partitionBy("SYMBOL").rowsBetween(-(len(endPoints) - 1), 0).orderBy([col('SYMBOL'), col('DATE'), col('INTERVAL')])
new_data = data.select(data.SYMBOL, data.DATE, data.INTERVAL, data.U_SEQUENCE, data.SQUARE_U_SEQUENCE,
                       f.sum('U_SEQUENCE').over(window).alias('SUM_U'),
                       f.sum('SQUARE_U_SEQUENCE').over(window).alias('SQUARE_U_SUM'))

N = float(len(endPoints))
# compute section volatility
new_data = new_data.withColumn("SECTION_VOLATILITY",
                               sqrt(col('SQUARE_U_SUM') / (N - 1.) - col('SUM_U') ** 2 / (N * (N - 1.))))
SRData = new_data.select(new_data.SYMBOL, new_data.DATE, new_data.SECTION_VOLATILITY, new_data.SQUARE_U_SUM)
SRData = SRData.withColumnRenamed('SQUARE_U_SUM', 'REALIZE_VOLATILITY')
meanSR = SRData.groupby(['SYMBOL', 'DATE']).agg({'SECTION_VOLATILITY': 'mean', 'REALIZE_VOLATILITY': 'mean'})
rank_ = [5, 10, 20]
regParam_ = [0.1, 1, 10]
alpha_ = [1, 5, 10]
param_grid = it.product(rank_, regParam_, alpha_)

user_id = val.select('user_id_indexed').distinct()
true_label = val.select('user_id_indexed', 'track_id_indexed') \
    .groupBy('user_id_indexed') \
    .agg(expr('collect_list(track_id_indexed) as true_item'))

## Define log-compression / drop-low-counts settings
log_comp = True
drop_low = True
drop_thr = 2

if log_comp == True:
    train = train.select('*', F.log1p('count').alias('count_log1p'))
    val = val.select('*', F.log1p('count').alias('count_log1p'))
    rateCol = "count_log1p"
else:
    rateCol = "count"

if drop_low == True:
    train = train.filter(train['count'] > drop_thr)
    val = val.filter(val['count'] > drop_thr)

for i in param_grid:
    print('Start Training for {}'.format(i))
    als = ALS(rank=i[0], maxIter=10, regParam=i[1],
              userCol="user_id_indexed", itemCol="track_id_indexed",
              ratingCol=rateCol, implicitPrefs=True,
              alpha=i[2], nonnegative=True, coldStartStrategy="drop")
    model = als.fit(train)
    print('Finish Training for {}'.format(i))
from pyspark.sql.types import DateType

udf1 = udf(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:], StringType())
train_df = (train_df.withColumn("date", train_df["date"].cast("string"))).withColumn('date', udf1('date'))
train_df = train_df.withColumn("date", train_df['date'].cast(DateType()))

dev_df = train_df.filter(train_df["date"] <= lit('2017-03-01'))
val_df = train_df.filter(train_df["date"] > lit('2017-03-01'))
print('dev_df dtypes', dev_df.dtypes)
print('val_df dtypes', val_df.dtypes)

dev_y = dev_df.withColumn("totalstransactionRevenuelog1p",
                          log1p('totalstransactionRevenue')).select(['totalstransactionRevenuelog1p'])
val_y = val_df.withColumn("totalstransactionRevenuelog1p",
                          log1p('totalstransactionRevenue')).select(['totalstransactionRevenuelog1p'])

dev_df = dev_df.toPandas()
val_df = val_df.toPandas()
dev_y = dev_y.toPandas()
val_y = val_y.toPandas()
test_df = test_df.toPandas()

dev_X = dev_df[cats + nums]
val_X = val_df[cats + nums]
test_X = test_df[cats + nums]
def createTrans07(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: create data
    # transformations (10 at a time)
    # input:
    #   sparkDF type sparkDF
    # output:
    #   sparkDFTrans type sparkDF
    # ===========================
    sparkDFTrans = sparkDF
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.NumberRealEstateLoansOrLines / (1 + sparkDFTrans.NumberOfDependents))
        .alias("RealEstateLoansPerPerson")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.age / (1 + sparkDFTrans.NumberOfDependents))
        .alias("YearsOfAgePerDependent")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.MonthlyIncome)
        .alias("LogMonthlyIncome")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogMonthlyIncome",
        when(sparkDFTrans.LogMonthlyIncome.isNull(), 0)
        .otherwise(sparkDFTrans.LogMonthlyIncome)
    )
    sparkDFTrans = sparkDFTrans.drop("MonthlyIncome")
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.NumberOfDependents))
        .alias("LogIncomePerPerson")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogMonthlyIncome - log1p(sparkDFTrans.age))
        .alias("LogIncomeAge")
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberOfTimesPastDue)
        .alias("LogNumberOfTimesPastDue")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberOfTimesPastDue",
        when(sparkDFTrans.LogNumberOfTimesPastDue.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberOfTimesPastDue)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberOfTimes90DaysLate)
        .alias("LogNumberOfTimes90DaysLate")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberOfTimes90DaysLate",
        when(sparkDFTrans.LogNumberOfTimes90DaysLate.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberOfTimes90DaysLate)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberOfTime3059DaysPastDueNotWorse)
        .alias("LogNumberOfTime3059DaysPastDueNotWorse")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberOfTime3059DaysPastDueNotWorse",
        when(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        log10(sparkDFTrans.NumberOfTime6089DaysPastDueNotWorse)
        .alias("LogNumberOfTime6089DaysPastDueNotWorse")
    )
    sparkDFTrans = sparkDFTrans.withColumn(
        "LogNumberOfTime6089DaysPastDueNotWorse",
        when(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse.isNull(), 0)
        .otherwise(sparkDFTrans.LogNumberOfTime6089DaysPastDueNotWorse)
    )
    sparkDFTrans = sparkDFTrans.select(
        "*",
        (sparkDFTrans.LogNumberOfTimes90DaysLate - sparkDFTrans.LogNumberOfTime3059DaysPastDueNotWorse)
        .alias("LogRatio90to3059DaysLate")
    )
    return sparkDFTrans
def run_pipeline(name: str, data: str, save: str) -> None:
    spark = SparkSession.builder.appName(name).getOrCreate()

    # Dataset Creation #
    # read bike ride history csv's
    df = spark.read.csv(f'{data}/rides/*', header=True)
    df = df.select(['Duration', 'Start date', 'Start station number', 'Member type'])
    df = df.withColumn('Start station number', df['Start station number'].cast(IntegerType()))
    print(f'The rides dataset has [{df.count()}] rows!')

    # read station information csv
    stations = spark.read.csv(f'{data}/stations/*', header=True)
    print(f'The stations dataset has {stations.count()} rows!')
    stations = stations.withColumnRenamed('LATITUDE', 'start_station_lat')
    stations = stations.withColumnRenamed('LONGITUDE', 'start_station_long')
    stations = stations.withColumn('Start station number', stations['TERMINAL_NUMBER'].cast(IntegerType()))
    stations = stations.select(['start_station_lat', 'start_station_long', 'Start station number'])

    # remove rides longer than 1.5 hours
    one_and_a_half_hours = 60 * 60 * 1.5
    df = df.filter(df['Duration'] <= one_and_a_half_hours)

    # remove rides shorter than 3 minutes
    three_minutes = 60 * 3
    df = df.filter(df['Duration'] >= three_minutes)

    # remove unknown 'Member type's
    df = df.filter(df['Member type'] != 'Unknown')

    # remove non-existent stations
    df = df.filter(~(df['Start station number'] == 31008)
                   & ~(df['Start station number'] == 32051)
                   & ~(df['Start station number'] == 32034))

    # make target feature
    df = df.withColumn('label', F.log1p(df.Duration))

    # join on 'Start station number'
    print('Merging rides and stations dataframes!')
    df = df.join(stations, on='Start station number')
    df = df.withColumn('start_station_long', df['start_station_long'].cast(DoubleType()))
    df = df.withColumn('start_station_lat', df['start_station_lat'].cast(DoubleType()))
    print(f'Complete rides and stations dataset has {df.count()} rows!')

    # Feature Transformations #
    print('Doing Feature Transformations!')
    # convert to datetime type
    df = df.withColumn('Start date', F.to_timestamp('Start date', 'yyyy-MM-dd HH:mm:ss'))
    df = df.withColumn('day_of_week', F.dayofweek('Start date'))
    df = df.withColumn('week_of_year', F.weekofyear('Start date'))
    df = df.withColumn('month', F.month('Start date'))
    df = df.withColumn('minute', F.minute('Start date'))
    df = df.withColumn('hour', F.hour('Start date'))

    # make time features cyclical
    pi = 3.141592653589793
    df = df.withColumn('sin_day_of_week', F.sin(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('sin_week_of_year', F.sin(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('sin_month', F.sin(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('sin_minute', F.sin(2 * pi * df['minute'] / 60))
    df = df.withColumn('sin_hour', F.sin(2 * pi * df['hour'] / 24))
    df = df.withColumn('cos_day_of_week', F.cos(2 * pi * df['day_of_week'] / 7))
    df = df.withColumn('cos_week_of_year', F.cos(2 * pi * df['week_of_year'] / 53))
    df = df.withColumn('cos_month', F.cos(2 * pi * (df['month'] - 1) / 12))
    df = df.withColumn('cos_minute', F.cos(2 * pi * df['minute'] / 60))
    df = df.withColumn('cos_hour', F.cos(2 * pi * df['hour'] / 24))

    df = df.withColumn('hour_and_day_of_week',
                       df['hour'].cast(StringType()) + '_' + df['day_of_week'].cast(StringType()))
    df = df.withColumn('member_type_and_day_of_week',
                       df['Member type'] + '_' + df['day_of_week'].cast(StringType()))

    # drop unused columns
    drop_columns = [
        'Start date', 'Start station number', 'Duration',
        'day_of_week', 'week_of_year', 'month', 'minute', 'hour'
    ]
    df = df.drop(*drop_columns)
    # df.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df.columns]).show()

    # Model and Pipeline #
    # split training and test
    train, test = df.randomSplit([.7, .3])

    # encode categorical column 'Member type'
    member_indexer = StringIndexer(inputCol='Member type', outputCol='member_idx')
    member_encoder = OneHotEncoder(inputCol='member_idx', outputCol='member_enc')

    # create vector of features named 'features'
    vector = VectorAssembler(
        inputCols=[
            'start_station_lat', 'start_station_long',
            'sin_day_of_week', 'cos_day_of_week',
            'sin_week_of_year', 'cos_week_of_year',
            'sin_month', 'cos_month',
            'sin_minute', 'cos_minute',
            'sin_hour', 'cos_hour',
            'member_enc'
        ],
        outputCol='features'
    )

    # scale features
    scaler = StandardScaler(
        inputCol='features',
        outputCol='scaled_features'
    )

    # define model
    model = GeneralizedLinearRegression(
        featuresCol='scaled_features'
    )

    # create pipeline and fill in stages
    pipeline = Pipeline(
        stages=[
            member_indexer, member_encoder, vector, scaler, model
        ]
    )

    # evaluation method
    evaluation = RegressionEvaluator()

    # best parameter search
    grid = ParamGridBuilder()
    # grid = grid.addGrid(model.maxDepth, [5, 7])
    # grid = grid.addGrid(model.numTrees, [200, 500])
    grid = grid.addGrid(model.maxIter, [40, 50])
    grid = grid.addGrid(model.family, ['gaussian', 'gamma'])
    grid = grid.addGrid(model.regParam, [0.0, 0.1])
    grid = grid.build()

    # run cross validation
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=grid,
        evaluator=evaluation,
        numFolds=7
    )
    print('Doing Cross Validation!')
    cv_models = cv.fit(train)
    print(f'CV results: {cv_models.avgMetrics} (RMSE)')

    best_model = cv_models.bestModel
    best_params = extract_best_params(best_model.stages[-1].extractParamMap())
    print(f'Best params:\n{best_params}')

    results = cv_models.transform(test)
    print(f'CV results on holdout dataset: {evaluation.evaluate(results)} (RMSE)')

    print('Re-fitting pipeline on entire dataset!')
    cv_models = cv.fit(df)

    print('Saving the pipeline into S3!')
    entire_dataset_best_model = cv_models.bestModel
    entire_dataset_best_model.save(f'{save}/{name}')
    print('Done!')
    return
df5 = df4.withColumn("mean_TOTAL_PRICE", col("sum(TOTAL_PRICE)") / col("sum(SIZE)"))
df5 = df5.withColumn("mean_SIZE", col("sum(SIZE)") / col("count(KEY)"))
df5 = df5.withColumn("Interval_int", df5["Interval"].cast("double"))
df6 = df5.select('SYMBOL', 'DATE', 'Interval_int', 'avg(PRICE)', 'mean_TOTAL_PRICE', 'mean_SIZE')
df6 = df6.orderBy(["SYMBOL", "DATE", "Interval_int"], ascending=[1, 1])

w = Window().partitionBy([col("SYMBOL"), col("DATE")]).orderBy([col("SYMBOL"), col("DATE"), col("Interval_int")])
df7 = df6.select("*", lag("avg(PRICE)").over(w).alias("avg(PRICE)_previous"))
df8 = df7.withColumn("U_sequence", log1p(col("avg(PRICE)") / col("avg(PRICE)_previous")))
df_Usequence = df8.select("U_sequence")
# df_Usequence.write.csv('/Users/yuhan/Dropbox/big_data_analytics/Final/data_save_30', header=True)

df8 = df8.withColumn("U_sequence_square", col("U_sequence") * col("U_sequence"))
f_section = {"U_sequence_square": 'sum', "U_sequence": 'sum', "Interval_int": 'count', "avg(PRICE)": 'mean'}
df_section = df8.groupby(["SYMBOL", "DATE"]).agg(f_section)
df_section = df_section.withColumnRenamed('avg(avg(PRICE))', 'daily_average_price')
"inout_param": 1.0, } # refer to constants.WORD2VEC_PARAMS for default values of w2v_params w2v_params: Dict[str, Any] = {} g2v = Node2VecSpark( spark, n2v_params, w2v_params=w2v_params, max_out_degree=10000, ) if len(sys.argv) <= 1 or sys.argv[1] == "index": # the input graph must have 3 cols: src, dst, weight df = spark.read.parquet(f"{bucket}/input_graph.parquet").repartition(1000) df = df.select("src", "dst", "weight").withColumn( "weight", ssf.log1p(df["weight"]) ) # assume the input graph is not indexed, and is directed g2v.preprocess_input_graph(df, indexed=False, directed=True) g2v.name_id.write.parquet(f"{bucket}/graph_name2id.parquet", "overwrite") g2v.df.write.parquet(f"{bucket}/graph_indexed.parquet", "overwrite") elif sys.argv[1] == "walk": g2v.name_id = spark.read.parquet(f"{bucket}/graph_name2id.parquet").cache() g2v.df = spark.read.parquet(f"{bucket}/graph_indexed.parquet").cache() walks = g2v.random_walk() walks.write.parquet(f"{bucket}/graph_walks.parquet", "overwrite") else: df_walks = spark.read.parquet(f"{bucket}/graph_walks.parquet") model = g2v.fit(df_walks)
def log_transformation(x):
    return when(col(x) < 0, col(x)).otherwise(log1p(col(x)))
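
# A hedged usage sketch for log_transformation (the SparkSession "spark" and
# the "amount" column are assumptions): negative values pass through
# unchanged, everything else becomes log(1 + x).
from pyspark.sql.functions import col, log1p, when

demo = spark.createDataFrame([(-3.0,), (0.0,), (10.0,)], ["amount"])
demo = demo.withColumn("amount_log1p", log_transformation("amount"))
demo.show()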
    def _process_df(self, col):
        return F.when(col.isNull(), F.lit(self.default_value)).otherwise(
            F.log1p(col.cast('float')))
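
# A hedged usage sketch for _process_df (the enclosing class is not shown in
# the source; "encoder", "df", and the "clicks" column are hypothetical):
# nulls are replaced with the configured default value, all other values are
# cast to float and log1p-transformed.
from pyspark.sql import functions as F

processed = df.withColumn("clicks_log1p", encoder._process_df(F.col("clicks")))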