Example #1
    def test_element_wise_product(self):
        data = self.spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]), )],
                                          ["features"])
        model = ElementwiseProduct(scalingVec=Vectors.dense([1.0, 2.0, 3.0]),
                                   inputCol="features",
                                   outputCol="eprod")
        feature_count = data.first()[0].size
        model_onnx = convert_sparkml(
            model, 'Sparkml ElementwiseProduct',
            [('features', FloatTensorType([1, feature_count]))])
        self.assertTrue(model_onnx is not None)

        # run the model
        predicted = model.transform(data)
        expected = [
            predicted.toPandas().eprod.apply(
                lambda x: pandas.Series(x.toArray())).values.astype(
                    numpy.float32)
        ]
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlElementwiseProduct")
        onnx_model_path = paths[3]
        output, output_shapes = run_onnx_model(['eprod'], data_np,
                                               onnx_model_path)
        compare_results(expected, output, decimal=5)
Example #2
    def fit(self, sdf):
        """

        :param sdf:
        :return:
        """

        if self.weighter is None:
            raise NotImplementedError(
                "The weighter parameter has not been defined.")

        weights_arr = self.weighter.get_feature_importances(sdf)

        pipeline_lst = [
            VectorAssembler(inputCols=self.input_cols, outputCol="vec"),
            StandardScaler(inputCol="vec", outputCol="standard_vec"),
            ElementwiseProduct(scalingVec=weights_arr,
                               inputCol='standard_vec',
                               outputCol='scaled_vec')
        ]

        _model = Pipeline(stages=pipeline_lst)
        model = _model.fit(sdf)

        self.model = model

        return self
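
Below is a minimal standalone sketch of the three-stage pipeline that fit() assembles, assuming two numeric input columns named f1 and f2; the call to weighter.get_feature_importances is replaced by a hard-coded Vectors.dense purely for illustration.

from pyspark.ml import Pipeline
from pyspark.ml.feature import ElementwiseProduct, StandardScaler, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("WeightedScalingSketch").getOrCreate()
sdf = spark.createDataFrame([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)], ["f1", "f2"])

# Stand-in for self.weighter.get_feature_importances(sdf); values are illustrative
weights_arr = Vectors.dense([0.7, 0.3])

pipeline = Pipeline(stages=[
    # Assemble the raw columns into a single vector column
    VectorAssembler(inputCols=["f1", "f2"], outputCol="vec"),
    # Scale each feature to unit standard deviation
    StandardScaler(inputCol="vec", outputCol="standard_vec"),
    # Multiply each standardized feature by its importance weight
    ElementwiseProduct(scalingVec=weights_arr,
                       inputCol="standard_vec", outputCol="scaled_vec"),
])
model = pipeline.fit(sdf)
model.transform(sdf).select("scaled_vec").show(truncate=False)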
Example #3
    def test_vector(self):
        ewp = ElementwiseProduct(scalingVec=[1, 3])
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.0, 3.0]))
        ewp = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
        self.assertEqual(ewp.getScalingVec(), DenseVector([1.2, 3.4]))
        self.assertRaises(TypeError,
                          lambda: ElementwiseProduct(scalingVec=["a", "b"]))
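Example #4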

from __future__ import print_function

# $example on$
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("ElementwiseProductExample")\
        .getOrCreate()

    # $example on$
    # Create some vector data; also works for sparse vectors
    data = [(Vectors.dense([1.0, 2.0, 3.0]),), (Vectors.dense([4.0, 5.0, 6.0]),)]
    df = spark.createDataFrame(data, ["vector"])
    transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                     inputCol="vector", outputCol="transformedVector")
    # Batch transform the vectors to create a new column:
    transformer.transform(df).show()
    # $example off$

    spark.stop()
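Example #5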
bucketedData = bucketizer.transform(dataFrame)

print("Bucketizer output with %d buckets" % (len(bucketizer.getSplits()) - 1))
bucketedData.show()

# COMMAND ----------

### Element-wise product multiplies each input vector element-wise by the scaling vector
from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

# Create some vector data; also works for sparse vectors
data = [(Vectors.dense([1.0, 2.0, 3.0]), ), (Vectors.dense([4.0, 5.0, 6.0]), )]
df = spark.createDataFrame(data, ["vector"])
transformer = ElementwiseProduct(scalingVec=Vectors.dense([0.0, 1.0, 2.0]),
                                 inputCol="vector",
                                 outputCol="transformedVector")
# Batch transform the vectors to create a new column:
transformer.transform(df).show()

# COMMAND ----------

### SQLTransformer rewrites the given DataFrame with a SQL statement, where __THIS__ refers to the input DataFrame
from pyspark.ml.feature import SQLTransformer

df = spark.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
sqlTrans = SQLTransformer(
    statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
sqlTrans.transform(df).show()

# COMMAND ----------
Example #6
from pyspark.ml.feature import MaxAbsScaler

maScaler = MaxAbsScaler().setInputCol("features").setOutputCol(
    "features_MaxAbs_scaled")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors

scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import Normalizer

manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
Example #8
def main(sc):
    sqlContext = SQLContext(sc)
    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # The second-to-last column is the unique ID, the last one contains the label, and all preceding columns are the features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5],
                                                              seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configure the built-in Random Forest classifier with 500 trees,
    # max depth 8, and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500,
                                     impurity="gini",
                                     maxDepth=8,
                                     maxBins=32)

    rf_init_data.persist()  # Cache the data set
    rf_init_model = rf_init.fit(
        rf_init_data)  # Run the Random Forest Algorithm

    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!

    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    # Create a dictionary mapping feature names to their importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most imp features
    col_names = sorted_colnames[:6]

    # Pull the data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # The first column is the unique ID, the second contains the label, and the remaining columns are the features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared,
    # writing the result to a new column
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True,
                            withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Standardize each feature to zero mean and unit standard deviation.
    df_master_new = scalerModel.transform(df_master_new)

    # The old features have been replaced with their scaled versions, so the
    # original, unscaled features are no longer needed
    df_master_new = df_master_new.drop('features')

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5],
                                                                  seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train,
                                        "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all = m.transform(train_all)

    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()

    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    # Convert to numpy arrays
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)
    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()

    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # Save the model info (varimp, recall, NN, col_names) to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
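Example #9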
    def get_output_col(self):
        return self.getOrDefault(self.output_col)

    def _transform(self, df: DataFrame):
        input_col = self.get_input_col()
        output_col = self.get_output_col()
        # The custom action: join the integer parts of the vector's values with '/'
        transform_udf = F.udf(lambda x: '/'.join([str(int(y)) for y in x]),
                              StringType())
        return df.withColumn(output_col, transform_udf(input_col))
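
The class header and parameter wiring for CustomTransformer are not shown in this snippet; a minimal sketch of what that scaffolding could look like, assuming PySpark's usual Param pattern (the parameter names simply mirror the getters above, everything else is an assumption):

from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param import Param, Params


class CustomTransformer(Transformer):
    # Assumed Param definitions matching get_input_col()/get_output_col()
    input_col = Param(Params._dummy(), "input_col", "input column name")
    output_col = Param(Params._dummy(), "output_col", "output column name")

    @keyword_only
    def __init__(self, input_col=None, output_col=None):
        super().__init__()
        # keyword_only stores the constructor kwargs in self._input_kwargs
        kwargs = self._input_kwargs
        self._set(**kwargs)

    def get_input_col(self):
        return self.getOrDefault(self.input_col)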


if __name__ == "__main__":

    spark = sparknlp.start()

    df = spark.createDataFrame([(Vectors.dense([2.0, 1.0, 3.0]), ),
                                (Vectors.dense([0.4, 0.9, 7.0]), )],
                               ["numbers"])

    elementwise_product = ElementwiseProduct(scalingVec=Vectors.dense(
        [2.0, 3.0, 5.0]),
                                             inputCol="numbers",
                                             outputCol="product")
    custom_transformer = CustomTransformer(input_col="product",
                                           output_col="results")
    pipeline = Pipeline(stages=[elementwise_product, custom_transformer])
    model = pipeline.fit(df)
    results = model.transform(df)
    results.show()
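Example #10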
# COMMAND ----------

from pyspark.ml.feature import MaxAbsScaler
maScaler = MaxAbsScaler().setInputCol("features")
fittedmaScaler = maScaler.fit(scaleDF)
fittedmaScaler.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import ElementwiseProduct
from pyspark.ml.linalg import Vectors
scaleUpVec = Vectors.dense(10.0, 15.0, 20.0)
scalingUp = ElementwiseProduct()\
  .setScalingVec(scaleUpVec)\
  .setInputCol("features")
scalingUp.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import Normalizer
manhattanDistance = Normalizer().setP(1).setInputCol("features")
manhattanDistance.transform(scaleDF).show()


# COMMAND ----------

from pyspark.ml.feature import StringIndexer
lblIndxr = StringIndexer().setInputCol("lab").setOutputCol("labelInd")
Example #11
def main(sc):
    sqlContext = SQLContext(sc)
    input_path = ''
    output_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the client data
    client_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Load the models and train data from Training Interface paths
    model_info = sc.pickleFile(model_info_path).flatMap(
        lambda x: x.items()).collectAsMap()
    scalerModel = StandardScalerModel.load(model_scaler_path)
    df_master_new = sc.pickleFile(model_train_set_path).toDF()

    col_names = model_info['col_names']
    sorted_top_varimp = model_info['varimp']

    # Pull the data for the 6 most important features
    client_data = client_data.select(
        client_data.u_msisdn.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # The first column is the unique ID and the remaining columns are the features
    client_master = client_data.rdd.map(
        lambda r: Row(cust_id=r[0], features=Vectors.dense(r[1:]))).toDF()

    # Scale and normalize the features so that all features can be compared,
    # writing the result to a new column
    client_scaler = StandardScaler(inputCol="features",
                                   outputCol="scaled_features",
                                   withStd=True,
                                   withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = client_scaler.fit(client_master)

    # Standardize each feature to zero mean and unit standard deviation.
    client_master = scalerModel.transform(client_master)

    # The old features have been replaced with their scaled versions, so the
    # original, unscaled features are no longer needed
    client_master = client_master.drop('features')

    sqlContext.registerDataFrameAsTable(df_master_new, "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all_client = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all_client = m.transform(train_all_client)

    client_master = m.transform(client_master)

    sqlContext.dropTempTable("df_master_train_table")

    nn = 1000
    popshared = 0.30
    num_indices = int(popshared * client_master.count())
    tree_type = "kd_tree"

    train_pd = train_all_client.toPandas()
    test_pd = client_master.toPandas()

    freq_table = findNearestNeighbour_client(train_pd, test_pd, nn,
                                             num_indices, tree_type)

    sqlContext.createDataFrame(freq_table[['cust_id', 'freq']]).repartition(
        1).write.format("com.databricks.spark.csv").save(output_path)