def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    tRatio = self.getOrDefault(self.trainRatio)
    seed = self.getOrDefault(self.seed)
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    condition = (df[randCol] >= tRatio)
    validation = df.filter(condition).cache()
    train = df.filter(~condition).cache()

    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [None for i in range(numModels)]

    tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    metrics = [None] * numModels
    for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
        metrics[j] = metric
        if collectSubModelsParam:
            subModels[j] = subModel

    train.unpersist()
    validation.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(TrainValidationSplitModel(bestModel, metrics, subModels))
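For context, a minimal sketch of the public API that drives a _fit like the one above (Spark 2.3+, where parallelism is available); the estimator, grid, and evaluator are standard pyspark.ml classes, while the train_df DataFrame is a hypothetical input:

# Minimal sketch of the public TrainValidationSplit API behind the _fit above.
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

lr = LinearRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.75, parallelism=2)
# tvsModel = tvs.fit(train_df)  # train_df: hypothetical DataFrame with "features"/"label"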
def fit(self, dataset, params={}):
    paramMap = self.extractParamMap(params)
    est = paramMap[self.estimator]
    epm = paramMap[self.estimatorParamMaps]
    numModels = len(epm)
    eva = paramMap[self.evaluator]
    nFolds = paramMap[self.numFolds]
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(0).alias(randCol))
    metrics = np.zeros(numModels)
    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition)
        train = df.filter(~condition)
        for j in range(numModels):
            model = est.fit(train, epm[j])
            # TODO: duplicate evaluator to take extra params from input
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric
    bestIndex = np.argmax(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return CrossValidatorModel(bestModel)
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    tRatio = self.getOrDefault(self.trainRatio)
    seed = self.getOrDefault(self.seed)
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    condition = (df[randCol] >= tRatio)
    validation = df.filter(condition).cache()
    train = df.filter(~condition).cache()

    def singleTrain(paramMap):
        model = est.fit(train, paramMap)
        metric = eva.evaluate(model.transform(validation, paramMap))
        return metric

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    metrics = pool.map(singleTrain, epm)
    train.unpersist()
    validation.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(TrainValidationSplitModel(bestModel, metrics))
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels
    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition)
        train = df.filter(~condition)
        for j in range(numModels):
            model = est.fit(train, epm[j])
            # TODO: duplicate evaluator to take extra params from input
            metric = eva.evaluate(model.transform(validation, epm[j]))
            metrics[j] += metric / nFolds
    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))
def test_rand_functions(self):
    df = self.df
    from pyspark.sql import functions
    rnd = df.select('key', functions.rand()).collect()
    for row in rnd:
        assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
    rndn = df.select('key', functions.randn(5)).collect()
    for row in rndn:
        assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]
def test_rand_functions(self):
    df = self.df
    from pyspark.sql import functions
    rnd = df.select('key', functions.rand()).collect()
    for row in rnd:
        assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
    rndn = df.select('key', functions.randn(5)).collect()
    for row in rndn:
        assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]

    # If the specified seed is 0, we should use it.
    # https://issues.apache.org/jira/browse/SPARK-9691
    rnd1 = df.select('key', functions.rand(0)).collect()
    rnd2 = df.select('key', functions.rand(0)).collect()
    self.assertEqual(sorted(rnd1), sorted(rnd2))

    rndn1 = df.select('key', functions.randn(0)).collect()
    rndn2 = df.select('key', functions.randn(0)).collect()
    self.assertEqual(sorted(rndn1), sorted(rndn2))
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))
    subModels = None
    collectSubModelsParam = self.getCollectSubModels()
    if collectSubModelsParam:
        subModels = [[None for j in range(numModels)] for i in range(nFolds)]

    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] += (metric / nFolds)
            if collectSubModelsParam:
                subModels[i][j] = subModel

        validation.unpersist()
        train.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))
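For context, a minimal sketch of the public API that exercises this parallel cross-validation path (all names are standard pyspark.ml classes; the train_df DataFrame is a hypothetical input):

# Minimal sketch of the public CrossValidator API behind the _fit above.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression()
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3, parallelism=2)
# cvModel = cv.fit(train_df)  # train_df: hypothetical DataFrame with "features"/"label"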
def split_data(frame, num_folds, tc=TkContext.implicit):
    """
    Randomly split data based on num_folds specified. Implementation logic borrowed from pyspark.
    :param frame: The frame to be split into train and validation frames
    :param num_folds: Number of folds to be split into
    :param tc: spark-tk context passed implicitly
    :return: train frame and test frame for each fold
    """
    from pyspark.sql.functions import rand
    df = frame.dataframe
    h = 1.0 / num_folds
    rand_col = "rand_1"
    df_indexed = df.select("*", rand(0).alias(rand_col))
    for i in xrange(num_folds):
        test_lower_bound = i * h
        test_upper_bound = (i + 1) * h
        condition = (df_indexed[rand_col] >= test_lower_bound) & (df_indexed[rand_col] < test_upper_bound)
        test_df = df_indexed.filter(condition)
        train_df = df_indexed.filter(~condition)
        train_frame = tc.frame.create(train_df)
        test_frame = tc.frame.create(test_df)
        yield train_frame, test_frame
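A minimal usage sketch for the generator above (Python 2, matching the xrange in the source), assuming an existing spark-tk frame named frame and an active implicit context:

# Hypothetical usage sketch: iterate over the k folds yielded by split_data.
# `frame` is assumed to be an existing spark-tk frame; the created frames
# expose the underlying Spark DataFrame via `.dataframe`, as used above.
for fold, (train_frame, test_frame) in enumerate(split_data(frame, 5)):
    print 'fold {}: train={} test={}'.format(
        fold, train_frame.dataframe.count(), test_frame.dataframe.count())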
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    nFolds = self.getOrDefault(self.numFolds)
    seed = self.getOrDefault(self.seed)
    h = 1.0 / nFolds
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels

    pool = ThreadPool(processes=min(self.getParallelism(), numModels))

    for i in range(nFolds):
        validateLB = i * h
        validateUB = (i + 1) * h
        condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        def singleTrain(paramMap):
            model = est.fit(train, paramMap)
            # TODO: duplicate evaluator to take extra params from input
            metric = eva.evaluate(model.transform(validation, paramMap))
            return metric

        currentFoldMetrics = pool.map(singleTrain, epm)
        for j in range(numModels):
            metrics[j] += (currentFoldMetrics[j] / nFolds)
        validation.unpersist()
        train.unpersist()

    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(CrossValidatorModel(bestModel, metrics))
def _fit(self, dataset):
    est = self.getOrDefault(self.estimator)
    epm = self.getOrDefault(self.estimatorParamMaps)
    numModels = len(epm)
    eva = self.getOrDefault(self.evaluator)
    tRatio = self.getOrDefault(self.trainRatio)
    seed = self.getOrDefault(self.seed)
    randCol = self.uid + "_rand"
    df = dataset.select("*", rand(seed).alias(randCol))
    metrics = [0.0] * numModels
    condition = (df[randCol] >= tRatio)
    validation = df.filter(condition)
    train = df.filter(~condition)
    for j in range(numModels):
        model = est.fit(train, epm[j])
        metric = eva.evaluate(model.transform(validation, epm[j]))
        metrics[j] += metric
    if eva.isLargerBetter():
        bestIndex = np.argmax(metrics)
    else:
        bestIndex = np.argmin(metrics)
    bestModel = est.fit(dataset, epm[bestIndex])
    return self._copyValues(TrainValidationSplitModel(bestModel, metrics))
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss


if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]').config('spark.driver.memory', '4g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False) \
        .transform(va).select(['features', 'labels'])

    # Demonstration of options. Not all are required.
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
num_users = ui_mat_rdd.map(_func).distinct().count()

def _func(i):
    usrId, docId, value = i
    return docId

num_movies = ui_mat_rdd.map(_func).distinct().count()
print('users:', num_users, 'products:', num_movies)

# Create Spark dataframe
df = spark.createDataFrame(ui_mat_rdd, ['userId', 'movieId', 'value'])
ui_mat_rdd.unpersist()

print('Splitting data set...')
df = df.orderBy(F.rand())
train_df, test_df = df.randomSplit([0.9, 0.1], seed=45)
train_df, val_df = train_df.randomSplit([0.95, 0.05], seed=45)

train_df = train_df.withColumn('flag', F.lit(0))
val_df = val_df.withColumn('flag', F.lit(1))
val_df = val_df.union(train_df)
test_df = test_df.withColumn('flag', F.lit(2))
test_df = test_df.union(train_df)
test_df = test_df.union(val_df)

train_size = train_df.count()
val_size = val_df.count()
test_size = test_df.count()
def ROEM_cv(ratings_df, userCol="userId", itemCol="songId", ratingCol="num_plays",
            ranks=[10, 50, 100, 150, 200], maxIters=[10, 25, 50, 100, 200, 400],
            regParams=[.05, .1, .15], alphas=[10, 40, 80, 100]):

    # Originally run on a subset of the Echo Nest Taste Profile dataset found here:
    # https://labrosa.ee.columbia.edu/millionsong/tasteprofile
    from pyspark.sql.functions import rand
    from pyspark.ml.recommendation import ALS

    ratings_df = ratings_df.orderBy(rand())  # Shuffling to ensure randomness

    # Building train and validation test sets
    train, validate = ratings_df.randomSplit([0.8, 0.2], seed=0)

    # Building 5 folds within the training set.
    test1, test2, test3, test4, test5 = train.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed=1)
    train1 = test2.union(test3).union(test4).union(test5)
    train2 = test3.union(test4).union(test5).union(test1)
    train3 = test4.union(test5).union(test1).union(test2)
    train4 = test5.union(test1).union(test2).union(test3)
    train5 = test1.union(test2).union(test3).union(test4)

    # Creating variables that will be replaced by the best model's hyperparameters
    # for subsequent printing
    best_validation_performance = 9999999999999
    best_rank = 0
    best_maxIter = 0
    best_regParam = 0
    best_alpha = 0
    best_model = 0
    best_predictions = 0

    # Looping through each combination of hyperparameters to ensure all combinations are tested.
    for r in ranks:
        for mi in maxIters:
            for rp in regParams:
                for a in alphas:
                    # Create ALS model
                    als = ALS(rank=r, maxIter=mi, regParam=rp, alpha=a,
                              userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                              coldStartStrategy="drop", nonnegative=True, implicitPrefs=True)

                    # Fit model to each fold in the training set
                    model1 = als.fit(train1)
                    model2 = als.fit(train2)
                    model3 = als.fit(train3)
                    model4 = als.fit(train4)
                    model5 = als.fit(train5)

                    # Generating model's predictions for each fold in the test set
                    predictions1 = model1.transform(test1)
                    predictions2 = model2.transform(test2)
                    predictions3 = model3.transform(test3)
                    predictions4 = model4.transform(test4)
                    predictions5 = model5.transform(test5)

                    # Expected percentile rank error metric function
                    def ROEM(predictions, userCol="userId", itemCol="songId", ratingCol="num_plays"):
                        # Creates table that can be queried
                        predictions.createOrReplaceTempView("predictions")

                        # Sum of total number of plays of all songs
                        denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]

                        # Calculating rankings of song predictions by user
                        spark.sql("SELECT " + userCol + ", " + ratingCol +
                                  ", PERCENT_RANK() OVER (PARTITION BY " + userCol +
                                  " ORDER BY prediction DESC) AS rank "
                                  "FROM predictions").createOrReplaceTempView("rankings")

                        # Multiplies the rank of each song by the number of plays and adds the products together
                        numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]

                        performance = numerator / denominator
                        return performance

                    # Calculating expected percentile rank error metric for the model on each fold's prediction set
                    performance1 = ROEM(predictions1)
                    performance2 = ROEM(predictions2)
                    performance3 = ROEM(predictions3)
                    performance4 = ROEM(predictions4)
                    performance5 = ROEM(predictions5)

                    # Printing the model's performance on each fold
                    print("Model Parameters: Rank:", r, " MaxIter:", mi, " RegParam:", rp, " Alpha:", a)
                    print("Test Percent Rank Errors:", performance1, performance2,
                          performance3, performance4, performance5)

                    # Validating the model's performance on the validation set
                    validation_model = als.fit(train)
                    validation_predictions = validation_model.transform(validate)
                    validation_performance = ROEM(validation_predictions)

                    # Printing model's final expected percentile ranking error metric
                    print("Validation Percent Rank Error:", validation_performance)
                    print(" ")

                    # Filling in final hyperparameters with those of the best-performing model
                    if validation_performance < best_validation_performance:
                        best_validation_performance = validation_performance
                        best_rank = r
                        best_maxIter = mi
                        best_regParam = rp
                        best_alpha = a
                        best_model = validation_model
                        best_predictions = validation_predictions

    # Printing best model's expected percentile rank and hyperparameters
    print("**Best Model**")
    print("  Percent Rank Error:", best_validation_performance)
    print("  Rank:", best_rank)
    print("  MaxIter:", best_maxIter)
    print("  RegParam:", best_regParam)
    print("  Alpha:", best_alpha)

    return best_model, best_predictions
def spark_stratified_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    seed=42,
):
    """Spark stratified splitter.

    For each user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two halves and the ratio argument indicates the ratio
            of the training data set; if it is a list of float numbers, the splitter
            splits data into several portions corresponding to the split ratios. If a
            list is provided and the ratios are not summed to 1, they will be
            normalized. Earlier indexed splits will have earlier times (e.g. the latest
            time per user or item in split[0] <= the earliest time per user or item in
            split[1]).
        seed (int): Seed.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(rand(seed=seed))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_rating: "count"})
        .withColumnRenamed("count(" + col_rating + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )
        splits.append(rating_split)

    return splits
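A minimal usage sketch for the splitter above, assuming a ratings DataFrame with the default user/item/rating column names; the ratings name is hypothetical:

# Hypothetical usage sketch: per-user stratified 75/25 split.
# `ratings` is assumed to carry the DEFAULT_USER_COL / DEFAULT_ITEM_COL /
# DEFAULT_RATING_COL columns expected by spark_stratified_split.
train, test = spark_stratified_split(ratings, ratio=0.75, seed=42)
print(train.count(), test.count())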
# COMMAND ----------

from pyspark.sql.types import StringType

manualSplitPythonUDF = spark.udf.register("manualSplitSQLUDF", manual_split, StringType())

# COMMAND ----------

# MAGIC %md
# MAGIC Create a dataframe of 100k values with a string to index. Do this by using a hash function.

# COMMAND ----------

from pyspark.sql.functions import sha1, rand

randomDF = (spark.range(1, 10000 * 10 * 10 * 10)
            .withColumn("random_value", rand(seed=10).cast("string"))
            .withColumn("hash", sha1("random_value"))
            .drop("random_value")
           )

display(randomDF)

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the UDF by using it just like any other Spark function.

# COMMAND ----------

randomAugmentedDF = randomDF.select("*", manualSplitPythonUDF("hash").alias("augmented_col"))
#cuminv = np.loadtxt('scripts/cuminv_bdt.txt')

# we know the binnings that were used
dz = 0.01
du = 1 / 1000.

# find indices and return the table values
@pandas_udf('float', PandasUDFType.SCALAR)
def z_PZ(zr, u):
    iz = np.array(zr / dz, dtype='int')
    iu = np.array(u / du, dtype='int')
    return pd.Series(cuminv[iz, iu])

# add column of uniform random numbers
gal = gal.withColumn("u", F.rand())

# transform with the inverse-cumulative table
gal = gal.withColumn("zrec", z_PZ("z", "u") + dz / 2) \
         .drop("u")  # do not need u anymore

gal.show(5)
ddt.append(timer.step())
timer.print(ana)

####
ana = "3: cache (count)"
gal = gal.cache()
print("N={}".format(gal.count()))
ddt.append(timer.step())
timer.print(ana)
#   Class   count    ratio
# 0 1       16382    0.049476
# 1 0       314728   0.950524
# test
# 82183
#   Class   count    ratio
# 0 1       4184     0.050911
# 1 0       77999    0.949089

# way 2-2: Exact stratification using Window === multi-class variant in comments
temp = (
    JMM_binary_Vfeature
    .withColumn("id", F.monotonically_increasing_id())
    .withColumn("Random", F.rand(seed=1000))
    .withColumn(
        "Row",
        F.row_number()
        .over(
            Window
            .partitionBy("Class")
            .orderBy("Random")
        )
    )
)

# top 20899 rows are class 1
num_P = 20566
num_N = 392727
training_stratification = temp.where(
schema = StructType() \
    .add("order_id", StringType()) \
    .add("customer_id", StringType()) \
    .add("order_status", StringType()) \
    .add("order_purchase_timestamp", StringType()) \
    .add("order_approved_at", StringType()) \
    .add("order_delivered_carrier_date", StringType()) \
    .add("order_delivered_customer_date", StringType()) \
    .add("order_estimated_delivery_date", StringType())

parsed_orders = raw_orders \
    .select(F.from_json(F.col("value").cast("String"), schema).alias("value"), "offset") \
    .select("value.*", "offset")

extended_orders = parsed_orders \
    .withColumn("my_extra_column", F.round(F.rand() * 100)) \
    .withColumn("my_current_time", F.current_timestamp())


# FOREACH BATCH SINK
def foreach_batch_sink(df, freq):
    return df \
        .writeStream \
        .foreachBatch(foreach_batch_function) \
        .trigger(processingTime='%s seconds' % freq) \
        .start()


def foreach_batch_function(df, epoch_id):
    print("starting epoch " + str(epoch_id))
    df.persist()
result = result.join(data_pid_profile_emb, result.row_num == data_pid_profile_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_click_emb, result.row_num == data_click_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_click_cross_emb, result.row_num == data_click_cross_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_order_cross_emb, result.row_num == data_order_cross_emb.row_num,
                     'inner').drop(result['row_num'])

data_stage4_1 = result.withColumn(
    'features',
    concat_ws('|', result.cid, result.pid, result.click, result.click_cross,
              result.order_cross, *[getattr(result, str(col)) for col in item_col]))
data_stage4_2 = data_stage4_1.withColumn(
    'merge', concat_ws('@', data_stage4_1.label, data_stage4_1.features))

trainDF, testDF = data_stage4_2.orderBy(rand()).randomSplit([rate, 1.0 - rate], 777)

trainDF.select('merge').rdd.map(lambda x: '|'.join([
    item[1:-1] if idx in [2, 3, 4] else item
    for idx, item in enumerate(x[0].split('|'))
])).saveAsTextFile(train_out_path)

testDF.select('merge').rdd.map(lambda x: '|'.join([
    item[1:-1] if idx in [2, 3, 4] else item
    for idx, item in enumerate(x[0].split('|'))
])).saveAsTextFile(eval_out_path)
                         axis=0)
    labPt = LabeledPoint(line.tip_amount, features)
    return labPt


## SPLIT DATA ##
sample_size = 0.25  # test with a sample of the data
train_ = 0.75
test_ = (1 - train_)
#seed = 5767;

encoded_sample = encoded.sample(False, sample_size, seed=seed)
temp_rand = encoded_sample.select("*", rand(0).alias("rand"))
train_data, test_data = temp_rand.randomSplit([train_, test_], seed=seed)

indexed_train_bin = train_data.map(parseRowIndexingBinary)
indexed_test_bin = test_data.map(parseRowIndexingBinary)
oneHot_train_bin = train_data.map(parseRowOneHotBinary)
oneHot_test_bin = test_data.map(parseRowOneHotBinary)
indexed_train_reg = train_data.map(parseRowIndexingRegression)
indexed_test_reg = test_data.map(parseRowIndexingRegression)
oneHot_train_reg = train_data.map(parseRowOneHotRegression)
oneHot_test_reg = test_data.map(parseRowOneHotRegression)

## FEATURE SCALING ##
label = oneHot_train_reg.map(lambda x: x.label)
features = oneHot_train_reg.map(lambda x: x.features)
def fallback_prediction(x):
    """Make a random guess if the model made no prediction."""
    return when(isnan(x), rand()).otherwise(col(x))
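A minimal usage sketch for the helper above; the scored DataFrame name is hypothetical, and the imports mirror what the function itself relies on:

# Hypothetical usage sketch: replace NaN predictions with a uniform random
# fallback score. Assumes `scored` is a DataFrame with a "prediction" column.
from pyspark.sql.functions import when, isnan, rand, col

patched = scored.withColumn("prediction", fallback_prediction("prediction"))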
import pyspark.sql.functions as f

rate = 10000000
item_nums = 10000000
ratePartitions = 40

dfInput = (spark
           .readStream
           .format("rate")
           .option("rowsPerSecond", rate)
           .option("numPartitions", ratePartitions)
           .load())

dfSales = (dfInput
           .withColumn("item_id", f.col("value") % item_nums)
           .withColumn("sales", (f.lit(1) + 10 * f.rand(seed=42)).cast("int"))
           .select("timestamp", "item_id", "sales")
          )

# Define table name and checkpoint location of the streaming table.
# (checkpoint_location for the database has been defined in the setup notebook)
table_name = "sw_db.bronze_compact"
checkpointTable = checkpoint_location + table_name

# Write to Delta
(dfSales
 .writeStream
 .option("checkpointLocation", checkpointTable)
 .format("delta")
 .outputMode("append")
 .table(table_name)
)
schema = StructType(
    [get_structfield(colname) for colname in header.split(',')])

#inputs = 'gs://{}/flights/tzcorr/all_flights-00000-*'.format(BUCKET)  # 1/30th
inputs = 'gs://{}/flights/tzcorr/all_flights-*'.format(BUCKET)  # FULL
flights = spark.read \
    .schema(schema) \
    .csv(inputs)
flights.createOrReplaceTempView('flights')

# separate training and validation data
from pyspark.sql.functions import rand
SEED = 13
traindays = traindays.withColumn("holdout", rand(SEED) > 0.8)  # 80% of data is for training
traindays.createOrReplaceTempView('traindays')

# logistic regression
trainquery = """
SELECT
    DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE, DEP_TIME, DEP_AIRPORT_TZOFFSET
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
    t.is_train_day == 'True' AND
    t.holdout == False AND
    f.CANCELLED == '0.00' AND
    f.DIVERTED == '0.00'
"""
import pyspark.sql.functions as F
import time

# COMMAND ----------

days_back = 14
values_per_second = 337
nowTimestamp = time.time()

# COMMAND ----------

dfTimeSeries = sqlContext.range(0, days_back * 24 * 60 * 60 * values_per_second) \
    .withColumn("Timestamp", (nowTimestamp - (F.col("id") / values_per_second)).cast("Timestamp")) \
    .drop("id") \
    .withColumn("Sensor", F.concat_ws('-',
                                      1 + (F.rand() * 10).cast("Int"),
                                      1 + (F.rand() * 100).cast("Int"),
                                      1 + (F.rand() * 350).cast("Int"))) \
    .withColumn("Value", F.round(F.rand() * 100, 3)) \
    .withColumn("year", F.year("Timestamp")) \
    .withColumn("month", F.month("Timestamp")) \
    .withColumn("day", F.dayofmonth("Timestamp"))

display(dfTimeSeries)

# COMMAND ----------

spark.conf.set("fs.azure.account.key.<StorageAccountName>.blob.core.windows.net", \
               "<StorageAccountKey>")

dfTimeSeries.write \
import geopandas as gpd

# Read csv
uber_df = spark.read.csv("uber14.csv", inferSchema=True, header=True)
nyc = gpd.read_file('NYC_map/nyc.shp')

# Change lat/long to float
uber_df = uber_df.withColumn("Lat", uber_df["Lat"].cast(FloatType()))
uber_df = uber_df.withColumn("Lon", uber_df["Lon"].cast(FloatType()))

# Add columns: Burrow, Month
uber_df = uber_df.withColumn('Burrow', lit(None))
uber_df = uber_df.withColumn('Month', lit(None))

# Take sample
sample_uber_df = uber_df.select("*").orderBy(rand()).limit(100000)

def burrow_column(X, Y):
    point = Point(Y, X)
    if nyc['geometry'][0].contains(point):
        return 'Bronx'
    if nyc['geometry'][1].contains(point):
        return 'Staten Island'
    if nyc['geometry'][2].contains(point):
        return 'Brooklyn'
    if nyc['geometry'][3].contains(point):
        return 'Queens'
    if nyc['geometry'][4].contains(point):
        return 'Manhattan'
def sample_from_healthy_population(df, frac, withreplacement=True):
    return df.sample(withreplacement, frac) \
        .select(
            col("id"),
            (rand() * 365).alias("timestamp").cast("int")
        )
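A minimal usage sketch, assuming a DataFrame with an id column; the patients name is hypothetical:

# Hypothetical usage sketch: draw a 10% with-replacement sample and attach a
# random day-of-year "timestamp" to each sampled id.
sampled = sample_from_healthy_population(patients, frac=0.1)
sampled.show(5)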
def _transform(self, dataset):
    return dataset.withColumn(
        "prediction", dataset.feature + (rand(0) * self.getInducedError()))
non_escalation_case_status_history = (case_status_history.join(
    escalation_starts, on=['reference_id'], how='left_anti'))

non_escalation_case_status_history.count()  # 783586

non_escalation_decision_times = (
    non_escalation_case_status_history.groupby('reference_id')
    .agg(F.max('seconds_since_case_start').alias('case_end'))
    .crossJoin(escalation_points_distribution)
    .withColumn(
        'time_cut',
        F.col('case_end') * F.col("average_percentile_escalation_point"))
    .withColumn(
        'random_row_rank_for_sampling',
        F.row_number().over(
            Window.partitionBy('reference_id').orderBy(F.rand())))
    .filter(F.col('random_row_rank_for_sampling') == 1))

non_escalation_case_status_history.count()  # 783586
non_escalation_decision_times.count()  # 52989

non_escalation_training_targets = (
    non_escalation_decision_times.join(
        non_escalation_case_status_history, on=['reference_id'], how='inner')
    .filter(F.col('seconds_since_case_start') < F.col('time_cut'))
    .groupBy('reference_id')
    .agg(F.max('seconds_since_case_start').alias('decision_time'))
    .withColumn('target', F.lit(0)))

non_escalation_training_targets.show()
non_escalation_training_targets.count()  # 51443 (we lose 52989 - 51443 = 1546)
def _sample_using_random(df, p: float = 0.1, seed: int = 1):
    """Take a random sample of rows, retaining proportion p."""
    res = (
        df.withColumn("rand", F.rand(seed=seed))
        .filter(F.col("rand") < p)
        .drop("rand")
    )
    return res
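A minimal usage sketch for the sampler above; df stands for any Spark DataFrame:

# Hypothetical usage sketch: keep roughly 10% of rows, reproducibly
# (the same seed yields the same sample on the same data).
sample_df = _sample_using_random(df, p=0.1, seed=1)
print(df.count(), sample_df.count())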
def shuffle(dataset):
    dataset = dataset.orderBy(rand())
    return dataset
regexTokenizer = RegexTokenizer(inputCol="comment", outputCol="text", pattern="\\W")
df_clean = regexTokenizer.transform(df_clean)
df_clean.show(10)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Alert: First try is to use 1,000,000 rows for testing

# COMMAND ----------

from pyspark.sql.functions import rand

df_clean.orderBy(rand(seed=0)).createOrReplaceTempView("table1")
df_clean = spark.sql("select * from table1 limit 1000000")

# COMMAND ----------

# Use word2vec to get a text vector feature.
from pyspark.ml.feature import Word2Vec

# Learn a mapping from words to Vectors. (choose a higher vectorSize here)
#word2Vec = Word2Vec(vectorSize=20, minCount=1, inputCol="text", outputCol="wordVector")
word2Vec = Word2Vec(vectorSize=50, minCount=1, inputCol="text", outputCol="wordVector")
model = word2Vec.fit(df_clean)
df_model = model.transform(df_clean)
output_path = "s3://emr-rwes-pa-spark-dev-datastore/BI_IPF_2016/02_results/"
start_time = time.time()
st = datetime.datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S')
table_name = "hive_test_" + st
datafactz_table_name = "hive_test_datafactz_" + st

pos = sqlContext.read.load((data_path + pos_file),
                           format='com.databricks.spark.csv',
                           header='true', inferSchema='true')
neg = sqlContext.read.load((data_path + neg_file),
                           format='com.databricks.spark.csv',
                           header='true', inferSchema='true')

dataColumns = pos.columns
data = pos.select(dataColumns).unionAll(neg.select(dataColumns))

# for IMS
data.write.save(path=output_path + table_name, format='orc')

# for datafactz
df = sqlContext.range(0, numRowsReq)
datafactz_df = df.select(rand().alias("Col1"), rand().alias("Col2"), rand().alias("Col3"))
datafactz_df.write.save(path=output_path + datafactz_table_name, format='orc')
def array_choice(col):
    index = (F.rand() * F.size(col)).cast("int")
    return col[index]
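A minimal usage sketch for array_choice; the two-row DataFrame below is constructed purely for illustration:

# Hypothetical usage sketch: pick one random element per row from an array column.
import pyspark.sql.functions as F

df = spark.createDataFrame([(["a", "b", "c"],), (["x", "y"],)], ["letters"])
df.withColumn("pick", array_choice(F.col("letters"))).show()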
def main():
    # Setup Spark
    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # Nice way to write a tmp file onto the system
    temp_csv_file = tempfile.mktemp()
    with open(temp_csv_file, mode="wb") as f:
        data_https = requests.get("https://teaching.mrsharky.com/data/iris.data")
        f.write(data_https.content)

    fisher_df_orig = spark.read.csv(temp_csv_file, inferSchema="true", header="false")
    fisher_df_orig.persist(StorageLevel.MEMORY_ONLY)
    fisher_df_orig.show()

    # Change column names
    column_names = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "class",
    ]
    fisher_df_orig = fisher_df_orig.toDF(*column_names)

    # Randomize order of rows
    fisher_df_orig = fisher_df_orig.withColumn("random", rand()).orderBy("random")

    # Make a copy of the DataFrame (so we can start over)
    fisher_df = fisher_df_orig
    fisher_df.createOrReplaceTempView("fisher")
    print_heading("Original Dataset")
    fisher_df.show()

    # Get the average of all continuous fields
    print_heading("Population Average")
    fisher_avg_df = spark.sql(
        """
        SELECT
            AVG(sepal_length) AS avg_sepal_length
            , AVG(sepal_width) AS avg_sepal_width
            , AVG(petal_length) AS avg_petal_length
            , AVG(petal_width) AS avg_petal_width
        FROM fisher
        """
    )
    fisher_avg_df.show()

    # Get the average of all continuous fields by class
    print_heading("Average by class")
    fisher_avg_df = spark.sql(
        """
        SELECT
            class
            , AVG(sepal_length) AS avg_sepal_length
            , AVG(sepal_width) AS avg_sepal_width
            , AVG(petal_length) AS avg_petal_length
            , AVG(petal_width) AS avg_petal_width
        FROM fisher
        GROUP BY class
        ORDER BY class
        """
    )
    fisher_avg_df.show()

    # Build a features vector
    print_heading("VectorAssembler")
    vector_assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="features",
    )
    fisher_df = vector_assembler.transform(fisher_df)
    fisher_df.show()

    # Label String Indexer
    print_heading("StringIndexer")
    label_indexer = StringIndexer(inputCol="class", outputCol="class_idx")
    label_indexer_model = label_indexer.fit(fisher_df)
    fisher_df = label_indexer_model.transform(fisher_df)
    fisher_df.show()

    # Random forest
    print_heading("RandomForestClassifier")
    random_forest = RandomForestClassifier(
        labelCol="class_idx",
        featuresCol="features",
    )
    random_forest_model = random_forest.fit(fisher_df)
    fisher_df_predicted = random_forest_model.transform(fisher_df)
    fisher_df_predicted.createOrReplaceTempView("predicted")
    fisher_df_predicted.show()

    print_heading("Accuracy")
    fisher_df_accuracy = spark.sql(
        """
        SELECT
            SUM(correct) / COUNT(*) AS accuracy
        FROM
            (SELECT CASE WHEN prediction == class_idx THEN 1 ELSE 0 END AS correct
                FROM predicted) AS TMP
        """
    )
    fisher_df_accuracy.show()

    # Pipeline
    print_heading("Pipeline")
    fisher_df = fisher_df_orig
    fisher_df.createOrReplaceTempView("fisher")
    pipeline = Pipeline(stages=[vector_assembler, label_indexer, random_forest])
    model = pipeline.fit(fisher_df)
    fisher_df_predicted = model.transform(fisher_df)
    fisher_df_predicted.show()
    return
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark.context import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
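The heading of step 3 above also names correlation; a short sketch of the corresponding df.stat.corr calls, following the same pattern on the same df:

# Correlation rescales covariance to [-1, 1]: two independently seeded
# rand() columns should come out near 0, and a column with itself is exactly 1.
print(df.stat.corr('rand1', 'rand2'))
print(df.stat.corr('id', 'id'))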
from pyspark.shell import sqlContext
from pyspark.sql import *
from pyspark.sql.functions import rand, randn, mean, min, max

df = sqlContext.range(0, 7)
df.show()

dfNew = df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
dfNew.show()
dfNew.describe("uniform", "normal").show()
dfNew.select([mean("uniform"), min("uniform"), max("uniform")]).show()
        'prediction_vote',
        f.when(df.confidence_vote > 0.5, 1.0).otherwise(0.0))
    df = df.withColumn(
        'confidence_vote',
        f.when(df.prediction_vote == 0.0,
               1 - df.prediction_vote).otherwise(df.prediction_vote))
    return df


# Import the data
training_large = [dir + '/Data/stemmed_amazon_500k_train.txt']
test_large = ['./Data/stemmed_amazon_500k_test.txt']
test_imbd = [dir + '/Data/imdb_yelp.txt']

X_train_large = loadData(training_large, minDF=1, TFIDF_b=True)
X_train_large = X_train_large.orderBy(rand())
X_test_large = loadData(test_large, train_cv=0, TFIDF_b=True)
X_test_imbd = loadData(test_imbd, train_cv=0, TFIDF_b=True)

X_train_large.groupby('score').count().show()
X_test_large.groupby('score').count().show()
X_test_imbd.groupby('score').count().show()

# Model paths
NB_model_path = './Model/NB_model_500k'
LR_model_path = './Model/LR_model_500k'
RF_model_path = './Model/RF_model_500k'

# Naive Bayes
NB = NaiveBayes(modelType="multinomial", labelCol="score", featuresCol="X")
NB_model = NB.fit(X_train_large)
def _do_stratification_spark(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_partitioned=True,
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Helper function to perform stratified splits.

    This function splits data in a stratified manner. That is, the same values for the
    filter_by column are retained in each split, but the corresponding set of entries
    are divided according to the ratio provided.

    Args:
        data (pyspark.sql.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            the training data set; if it is a list of float numbers, the splitter
            splits data into several portions corresponding to the split ratios. If a
            list is provided and the ratios are not summed to 1, they will be
            normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to
            filter with min_rating.
        is_partitioned (bool): flag to partition data by filter_by column.
        is_random (bool): flag to make split randomly or use timestamp column.
        seed (int): Seed.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pyspark.sql.DataFrame.
    """
    # A few preliminary checks.
    if filter_by not in ["user", "item"]:
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random:
        if col_timestamp not in data.columns:
            raise ValueError("Schema of data not valid. Missing Timestamp Col")

    if min_rating > 1:
        data = min_rating_filter_spark(
            data=data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    split_by = col_user if filter_by == "user" else col_item
    partition_by = split_by if is_partitioned else []

    col_random = "_random"
    if is_random:
        data = data.withColumn(col_random, F.rand(seed=seed))
        order_by = F.col(col_random)
    else:
        order_by = F.col(col_timestamp)

    window_count = Window.partitionBy(partition_by)
    window_spec = Window.partitionBy(partition_by).orderBy(order_by)

    data = (
        data.withColumn("_count", F.count(split_by).over(window_count))
        .withColumn("_rank", F.row_number().over(window_spec) / F.col("_count"))
        .drop("_count", col_random)
    )
    # Persist to avoid duplicate rows in splits caused by lazy evaluation
    data.persist(StorageLevel.MEMORY_AND_DISK_2).count()

    multi_split, ratio = process_split_ratio(ratio)
    ratio = ratio if multi_split else [ratio, 1 - ratio]

    splits = []
    prev_split = None
    for split in np.cumsum(ratio):
        condition = F.col("_rank") <= split
        if prev_split is not None:
            condition &= F.col("_rank") > prev_split
        splits.append(data.filter(condition).drop("_rank"))
        prev_split = split

    return splits
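A minimal usage sketch highlighting the non-random path of the helper above; the ratings name is hypothetical and must carry the DEFAULT_TIMESTAMP_COL column for is_random=False:

# Hypothetical usage sketch: chronological per-user split instead of random.
# With is_random=False the helper orders by the timestamp column and skips
# the F.rand() ordering entirely.
train, test = _do_stratification_spark(ratings, ratio=0.75, is_random=False)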
counts2 = (
    inventory
    # Select only the columns that are needed
    .select(['id', 'element'])
    # Group by source and count destinations
    .groupBy('id')
    .agg({'element': 'count'})
    .orderBy('count(element)', ascending=False)
    .select(
        F.col('id'),
        F.col('count(element)').alias('element_count')
    )
)

# By adding 'element' to the groupBy we can determine that there are no
# duplicates for each element of each inventory

inventory2 = inventory.withColumn("core_flag", F.rand())
inventory2 = inventory2.withColumn("prcp_flag", F.rand())
inventory2.show()

inventory2 = inventory2.withColumn(
    "core_flag",
    F.when((F.col('element') == "TMAX") |
           (F.col('element') == "TMIN") |
           (F.col('element') == "PRCP") |
           (F.col('element') == "SNOW") |
           (F.col('element') == "SNWD"), 1).otherwise(0))

inventory2 = inventory2.withColumn(
    "prcp_flag",
    F.when((F.col('element') == "PRCP"), 1).otherwise(0))
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

# Inspect the result
scaled_df.take(2)

from pyspark.sql.functions import rand

df = df.orderBy(rand())
train_data, test_data = df.randomSplit([0.8, 0.2], seed=1234)

# Fitting the LogisticRegression: change the code below for all the types of
# algorithms that we need for the project
from pyspark.ml.classification import LogisticRegression

mlr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Fit the model
mlrModel = mlr.fit(train_data)

# Predict the values for test_data
predicted = mlrModel.transform(test_data)
predicted.head(5)
# COMMAND ----------

from pyspark.sql.functions import rand, randn

# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# COMMAND ----------

display(df)

# COMMAND ----------

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

# COMMAND ----------

display(df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")))

# COMMAND ----------

# MAGIC %md ### Summary and Descriptive Statistics
# MAGIC
# MAGIC The first operation to perform after importing data is to get some sense of what it looks like. For numerical columns, knowing the descriptive summary statistics can help a lot in understanding the distribution of your data. The function `describe` returns a DataFrame containing information such as number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.

# COMMAND ----------

from pyspark.sql.functions import rand, randn
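The markdown cell above introduces describe; a short sketch of the call it leads up to, applied to the same two generated columns:

# Sketch of the describe() call the markdown cell introduces: summary
# statistics (count, mean, stddev, min, max) for the two generated columns.
df.select(rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).describe().show()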
def combine_matrix(X, y, top=4):
    """Create the data matrix for predictive modeling

    Notes:
        The default top n number is 4

    Args:
        X(SparkSQL DataFrame):
        y(SparkSQL DataFrame):

    Return:
        matrixAll(SparkSQL DataFrame):
    """
    # logging.info('Creating the big matrix X:y...')
    # y = hc.createDataFrame(y)

    ### Change y's column name 'serial_number' to 'SN'
    y = y.withColumnRenamed('serial_number', 'SN')

    ### Join X and y on serial_number, SN
    ### Add a new column 'y' specifying return (1) or pass (0)
    matrixAll = (X.join(y, X.serial_number == y.SN, how='left_outer')
                 .withColumn('y', y['SN'].isNotNull().cast('int')))
    # matrixAll.cache()

    ### Drop rows that have null values
    matrixAllDropNa = matrixAll.dropna(how='any')
    # matrixAllDropNa.cache()

    print 'to pandas()'
    symptomLocationPdf = matrixAllDropNa[['check_in_code', 'fail_location']].toPandas()
    print 'complete toPandas()'
    # locationPdf = matrixAllDropNa[['fail_location']].toPandas()
    #return symptomPdf
    #return matrixAllDropNa, matrixAll

    codeSeries = symptomLocationPdf['check_in_code'].value_counts()
    #print codeSeries
    locationSeries = symptomLocationPdf['fail_location'].value_counts()

    ### Top N symptoms
    codeDict = {}
    locationDict = {}
    for i in range(top):
        # top n check-in codes
        code = codeSeries.index[i]
        #codeLabel = 'code_{}'.format(i)
        codeLabel = '{}'.format(code)
        codeDict[code] = codeSeries[i]
        print 'top {} symptom: {}, count: {}'.format(i + 1, code, codeSeries[i])
        matrixAll = (matrixAll.withColumn(
            codeLabel,
            (matrixAll['check_in_code'].like('%{}'.format(code))).cast('int'))
            .fillna({codeLabel: 0}))

        # top n fail locations
        location = locationSeries.index[i]
        #locationLabel = 'location_{}'.format(i)
        locationLabel = '{}'.format(location)
        locationDict[location] = locationSeries[i]
        #print location
        print 'top {} fail location: {}, count: {}'.format(i + 1, location, locationSeries[i])
        matrixAll = (matrixAll.withColumn(
            locationLabel,
            (matrixAll['fail_location'].like('%{}'.format(location))).cast('int'))
            .fillna({locationLabel: 0}))

    # add a random integer column from 1 to 100 for later sampling of training samples
    matrixAllRandDf = matrixAll.withColumn('random', rand())
    # transform the float random number to an integer between 1 and 100
    matrixAllIntDf = matrixAllRandDf.withColumn('randInt',
                                                (matrixAllRandDf.random * 100).cast('int'))
    # cache the whole matrix table
    matrixAllIntDf.cache()

    return matrixAllIntDf
    'RNTP', 'SMP', 'VALP', 'WATP', 'GRNTP', 'GRPIP', 'GASP', 'NOC', 'NPF',
    'NRC', 'OCPIP', 'SMOCP', 'AGEP', 'INTP', 'JWMNP', 'OIP', 'PAP', 'RETP',
    'SEMP', 'SSIP', 'SSP', 'WKHP', 'POVPIP'
]

ordinals = [
    'AGS', 'YBL', 'MV', 'TAXP', 'CITWP', 'DRAT', 'JWRIP', 'MARHT', 'MARHYP',
    'SCHG', 'SCHL', 'WKW', 'YOEP', 'DECADE', 'JWAP', 'JWDP', 'SFN'
]

################################################################

# fill all null numerical values with 0
df = df.fillna(0, numericals)

# SPLIT DATASET
from pyspark.sql.functions import rand
(train_set, test_set) = df.orderBy(rand()).randomSplit([0.7, 0.3])

###############################################################
# INDEXING AND SCALING
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import StandardScaler

utils.printNowToFile("starting pipeline")

ordinals_input = [col + "_index" for col in ordinals]
stdFeatures = ['numericals_std', 'ordinals_std']

# stages for index and scaling pipeline
stages = [