def test_element_wise_product(self):
    data = self.spark.createDataFrame(
        [(Vectors.dense([2.0, 1.0, 3.0]),)], ["features"])
    model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                               inputCol="features", outputCol="eprod")
    feature_count = data.first()[0].size
    model_onnx = convert_sparkml(
        model, 'Sparkml ElementwiseProduct',
        [('features', FloatTensorType([1, feature_count]))])
    self.assertTrue(model_onnx is not None)
    # run the model
    predicted = model.transform(data)
    expected = [
        predicted.toPandas().eprod.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    ]
    data_np = data.toPandas().features.apply(
        lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
    paths = save_data_models(data_np, expected, model, model_onnx,
                             basename="SparkmlElementwiseProduct")
    onnx_model_path = paths[3]
    output, output_shapes = run_onnx_model(['eprod'], data_np, onnx_model_path)
    compare_results(expected, output, decimal=5)
def fit(self, sdf):
    """
    Fit the weighting pipeline on a Spark DataFrame.

    :param sdf: Spark DataFrame containing the input feature columns.
    :return: self, with the fitted PipelineModel stored in ``self.model``.
    """
    if self.weighter is None:
        raise NotImplementedError(
            "The weighter parameter has not been defined.")
    weights_arr = self.weighter.get_feature_importances(sdf)
    pipeline_lst = [
        VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
        StandardScaler(inputCol="vec", outputCol="standard_vec"),
        ElementwiseProduct(scalingVec=weights_arr,
                           inputCol='standard_vec',
                           outputCol='scaled_vec'),
    ]
    _model = Pipeline(stages=pipeline_lst)
    model = _model.fit(sdf)
    self.model = model
    return self
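# A minimal, hypothetical sketch of the interface fit() above relies on: a
# "weighter" object exposing get_feature_importances(sdf) and returning a
# vector usable as ElementwiseProduct's scalingVec. The class and column
# names below (RandomForestWeighter, "features", "label") are illustrative
# assumptions, not part of the original code.
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors


class RandomForestWeighter(object):
    def __init__(self, input_cols, label_col="label"):
        self.input_cols = input_cols
        self.label_col = label_col

    def get_feature_importances(self, sdf):
        # Assemble the raw columns, fit a random forest, and return its
        # feature importances as a dense vector (one weight per input column).
        assembled = VectorAssembler(inputCols=self.input_cols,
                                    outputCol="features").transform(sdf)
        rf = RandomForestClassifier(featuresCol="features",
                                    labelCol=self.label_col)
        model = rf.fit(assembled)
        return Vectors.dense(model.featureImportances.toArray())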
def test_vector(self):
    ewp = ElementwiseProduct(scalingVec=[1, 3])
    self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
    ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
    self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
    self.assertRaises(TypeError,
                      lambda: ElementwiseProduct(scalingVec=["a", "b"]))
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ElementwiseProductExample")\
        .getOrCreate()

    # $example on$
    # Create some vector data; also works for sparse vectors
    data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
    df = spark.createDataFrame(data, ["vector"])
    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                     inputCol="vector",
                                     outputCol="transformedVector")
    # Batch transform the vectors to create new column:
    transformer.transform(df).show()
    # $example off$

    spark.stop()
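# For reference: ElementwiseProduct computes a component-wise (Hadamard)
# product, so the example above should produce [0.0, 2.0, 6.0] and
# [0.0, 5.0, 12.0] in the transformedVector column. A minimal sketch of the
# same arithmetic in plain NumPy (the NumPy check is an illustration, not
# part of the original Spark example):
import numpy as np

scaling = np.array([0.0, 1.0, 2.0])
for row in ([1.0, 2.0, 3.0], [4.0, 5.0, 6.0]):
    print(np.array(row) * scaling)  # same element-wise multiply the transformer applies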
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

# COMMAND ----------

### ElementwiseProduct multiplies each input vector, component-wise, by the scaling vector
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Create some vector data; also works for sparse vectors
data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector",
                                 outputCol="transformedVector")
# Batch transform the vectors to create new column:
transformer.transform(df).show()

# COMMAND ----------

### SQLTransformer transforms the given DataFrame with a SQL statement
### (useful where direct SQL support is not available)
from pyspark.ml.feature import SQLTransformer

df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
sqlTrans = SQLTransformer(
    statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
sqlTrans.transform(df).show()

# COMMAND ----------
from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features").setOutputCol(
    "features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer

manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
def main(sc):
    sqlContext = SQLContext(sc)

    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Build rows with the unique ID (second-to-last column), the label (last
    # column) and a dense feature vector of all preceding columns
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5],
                                                              seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500, impurity="gini",
                                     maxDepth=8, maxBins=32)

    rf_init_data.persist()  # Cache the data set
    rf_init_model = rf_init.fit(rf_init_data)  # Run the Random Forest Algorithm
    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!
    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    # Create a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most important features
    col_names = sorted_colnames[:6]

    # Pull data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Build rows with the unique ID (first column), the label (second column)
    # and a dense feature vector of the remaining columns
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation.
    df_master_new = scalerModel.transform(df_master_new)

    # The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unscaled features
    df_master_new = df_master_new.drop('features')

    # Randomly split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5],
                                                                  seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train,
                                        "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all = m.transform(train_all)
    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to the KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partition the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()

    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Convert features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k), [(i, j) for i, j in enumerate(k) if j != 0]))

    # Convert to numpy arrays
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)

    # Broadcast the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()

    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # saving the model info - varimp, recall, NN, col_names to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
def get_output_col(self):
    return self.getOrDefault(self.output_col)

def _transform(self, df: DataFrame):
    input_col = self.get_input_col()
    output_col = self.get_output_col()
    # The custom action: concatenate the integer form of the doubles from the Vector
    transform_udf = F.udf(lambda x: '/'.join([str(int(y)) for y in x]),
                          StringType())
    return df.withColumn(output_col, transform_udf(input_col))


if __name__ == "__main__":
    spark = sparknlp.start()
    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]),),
                                (Vectors.dense([0.4, 0.9, 7.0]),)],
                               ["numbers"])
    elementwise_product = ElementwiseProduct(
        scalingVec=Vectors.dense([2.0, 3.0, 5.0]),
        inputCol="numbers", outputCol="product")
    custom_transformer = CustomTransformer(input_col="product",
                                           output_col="results")

    pipeline = Pipeline(stages=[elementwise_product, custom_transformer])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.show()
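# The snippet above shows only two methods of CustomTransformer; the class
# definition itself is not included. Below is a minimal, hypothetical sketch
# of the scaffolding such a transformer typically needs (Param declarations
# and a keyword-only constructor), following the standard pyspark.ml custom
# transformer pattern; it is an assumption about the omitted parts, and
# get_output_col/_transform from above would be the remaining methods.
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable


class CustomTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    # Params named to match the get_input_col/get_output_col accessors above
    input_col = Param(Params._dummy(), "input_col", "input column name",
                      typeConverter=TypeConverters.toString)
    output_col = Param(Params._dummy(), "output_col", "output column name",
                       typeConverter=TypeConverters.toString)

    @keyword_only
    def __init__(self, input_col=None, output_col=None):
        super(CustomTransformer, self).__init__()
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def get_input_col(self):
        return self.getOrDefault(self.input_col)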
# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
    .setScalingVec(scaleUpVec)\
    .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer

manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
def main(sc):
    sqlContext = SQLContext(sc)

    input_path = ''
    output_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the client data
    client_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Load the models and train data from Training Interface paths
    model_info = sc.pickleFile(model_info_path).flatMap(
        lambda x: x.items()).collectAsMap()
    scalerModel = StandardScalerModel.load(model_scaler_path)
    df_master_new = sc.pickleFile(model_train_set_path).toDF()

    col_names = model_info['col_names']
    sorted_top_varimp = model_info['varimp']

    # Pull data for the 6 most important features
    client_data = client_data.select(
        client_data.u_msisdn.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Build rows with the unique ID (first column) and a dense feature vector
    # of the remaining columns
    client_master = client_data.rdd.map(
        lambda r: Row(cust_id=r[0], features=Vectors.dense(r[1:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    client_scaler = StandardScaler(inputCol="features",
                                   outputCol="scaled_features",
                                   withStd=True, withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = client_scaler.fit(client_master)

    # Normalize each feature to have unit standard deviation.
    client_master = scalerModel.transform(client_master)

    # The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unscaled features
    client_master = client_master.drop('features')

    sqlContext.registerDataFrameAsTable(df_master_new, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all_client = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all_client = m.transform(train_all_client)
    client_master = m.transform(client_master)

    sqlContext.dropTempTable("df_master_train_table")

    nn = 1000
    popshared = 0.30
    num_indices = int(popshared * client_master.count())
    tree_type = "kd_tree"

    train_pd = train_all_client.toPandas()
    test_pd = client_master.toPandas()

    freq_table = findNearestNeighbour_client(train_pd, test_pd, nn,
                                             num_indices, tree_type)

    sqlContext.createDataFrame(freq_table[['cust_id', 'freq']]).repartition(
        1).write.format("com.databricks.spark.csv").save(output_path)