Python VectorSlicer.transform Exemples, pyspark.ml.feature.VectorSlicer.transform Python Exemples

Exemple #1

0

Afficher le fichier

    def test_vector_slicer(self):
        """Convert a Spark ML ``VectorSlicer`` to ONNX and compare both outputs.

        A three-row DataFrame of dense 5-element vectors is sliced at
        indices 1 and 4.  The slicer is converted with ``convert_sparkml``,
        the data and models are saved with ``save_data_models``, and the
        ``sliced`` output of the ONNX model is compared with the Spark
        output via ``compare_results(..., decimal=5)``.
        """
        data = self.spark.createDataFrame(
            [(Vectors.dense([-2.0, 2.3, 0.0, 0.0, 1.0]), ),
             (Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0]), ),
             (Vectors.dense([0.6, -1.1, -3.0, 4.5, 3.3]), )], ["features"])
        model = VectorSlicer(inputCol="features",
                             outputCol="sliced",
                             indices=[1, 4])

        # The ONNX input width is taken from the first row's vector length.
        feature_count = data.first()[0].array.size
        model_onnx = convert_sparkml(
            model, 'Sparkml VectorSlicer',
            [('features', FloatTensorType([None, feature_count]))])
        # assertIsNotNone reports the offending value on failure, unlike
        # assertTrue(x is not None).
        self.assertIsNotNone(model_onnx)

        # run the model
        predicted = model.transform(data)
        expected = predicted.toPandas().sliced.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        data_np = data.toPandas().features.apply(
            lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
        paths = save_data_models(data_np,
                                 expected,
                                 model,
                                 model_onnx,
                                 basename="SparkmlVectorSlicer")
        # The ONNX model path is the last entry returned by save_data_models.
        onnx_model_path = paths[-1]
        # Only the output values are compared; the returned shapes are unused.
        output, _ = run_onnx_model(['sliced'], data_np, onnx_model_path)
        compare_results(expected, output, decimal=5)

Exemple #2

0

Afficher le fichier

Fichier : find_null.rows.py Projet : INKWWW/Spark

def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` with a saved model and show the rows that contain nulls.

    Loads a ``LogisticRegressionModel`` from ``lrModelPath``, transforms
    ``test_data``, keeps element 1 of the ``probability`` vector as
    ``prob_1``, casts it to the string column ``prob_1_str`` and finally
    shows up to 100 rows in which ``_c0``, ``_c1``, ``_c2`` or
    ``prob_1_str`` is null.  Intermediate frames are printed on the way.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score.  The code selects ``_c0``, ``_c1``,
            ``_c2`` and ``probability`` from the transformed frame.

    Returns:
        None.  Everything is written to standard output.
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    # show() prints the rows itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    result.show(5)
    # result.write.csv('file:///opt/int_group/result123')

    # Keep only index 1 of the probability vector.
    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)
    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(10)
    # Printing the DataFrame object itself shows its column names and types.
    print(new_result_prob1)

    # find null rows
    has_null = (
        new_result_prob1._c0.isNull()
        | new_result_prob1._c1.isNull()
        | new_result_prob1._c2.isNull()
        | new_result_prob1.prob_1_str.isNull()
    )
    final_null_rows = new_result_prob1.filter(has_null)
    print('########### find null rows #############')
    final_null_rows.show(100)

Exemple #3

0

Afficher le fichier

def slice_win_source_to(source, destination):
    """Slice eight vector columns of a parquet dataset and reassemble them.

    Reads ``datasets/<source>``, replaces each of the columns ``f0`` .. ``f7``
    with ``f_sl0`` .. ``f_sl7`` holding vector positions 50 to 75, assembles
    those eight columns into a single column ``f``, overwrites
    ``datasets/<destination>`` with the result and prints its schema.

    Args:
        source: File name under ``datasets`` to read.
        destination: File name under ``datasets`` to overwrite.
    """
    frame = spark.read.parquet(os.path.join("datasets", source))

    sliced_names = []
    for idx in range(8):
        in_name = "f" + str(idx)
        out_name = "f_sl" + str(idx)
        window_slicer = VectorSlicer(inputCol=in_name,
                                     outputCol=out_name,
                                     indices=list(range(50, 76)))
        # The unsliced column is dropped as soon as its slice exists.
        frame = window_slicer.transform(frame).drop(in_name)
        sliced_names.append(out_name)

    merged = VectorAssembler(inputCols=sliced_names, outputCol="f").transform(frame)
    merged.write.mode("overwrite").parquet(os.path.join("datasets", destination))
    merged.printSchema()

Exemple #4

0

Afficher le fichier

Fichier : predict_prob.py Projet : INKWWW/Spark

def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` with a saved model and write 1000 rows as CSV.

    Loads a ``LogisticRegressionModel`` from ``lrModelPath``, transforms
    ``test_data``, keeps element 1 of the ``probability`` vector as
    ``prob_1``, casts it to the string column ``prob_1_str``, then takes
    ``head(1000)`` of that frame, rebuilds a DataFrame from it and writes
    it as CSV to ``file:///opt/int_group/head_1kw_test``.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score.  The code selects ``_c0``, ``_c1``,
            ``_c2`` and ``probability`` from the transformed frame.

    NOTE(review): this snippet is cut off further down in the listing, so
    whatever follows the visible part is not documented here.
    NOTE(review): ``DataFrame.show()`` returns None, so each
    ``print(x.show(n))`` below also prints a line reading "None".
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    print(result.show(5))
    # result.write.csv('file:///opt/int_group/result123')

    # Keep only index 1 of the probability vector.
    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    print(prob_1.show(5))
    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    print(result_prob1.show(5))
    # for i in range(800, 802):
    #     g = i / 1000
    #     h = g + 0.001
    #     sqlTrans = SQLTransformer(statement="SELECT _c0, _c1, _c2, prob_1[0] AS prob FROM __THIS__ WHERE prob_1[0] < h  AND prob_1[0] >= g")
    #     dd = sqlTrans.transform(result_prob1)
    #     dd.write.csv('file:///opt/int_group/sql_test')

    # Cast the sliced vector column to a string column named prob_1_str.
    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    print(new_result_prob1.show(5))
    print(new_result_prob1)

    # Take the first 1000 rows, turn them back into a DataFrame and write
    # that frame out as CSV.
    dd = new_result_prob1.head(1000)
    dd_df = spark.createDataFrame(dd)
    dd_df.write.csv('file:///opt/int_group/head_1kw_test')
    # for i in [1,2,3,4,5]:
    #     dd = new_result_prob1.head(i)
    #     dd_df = spark.createDataFrame(dd)
    #     dd_df.write.csv('file:///opt/int_group/head_test', mode='append')

    # DataFrame[_c0: string, _c1: string, _c2: string, prob_1_str: string]

    ###
    # The bare string below is a no-op expression recording an error message
    # (driver/worker Python version mismatch); presumably it was seen with
    # the toPandas() attempts commented out underneath - TODO confirm.
    '''
    Error:
    Exception: Python in worker has different version 2.7 than that in driver 3.6, 
    PySpark cannot run with different minor versions.Please check environment variables PYSPARK_PYTHON and 
    PYSPARK_DRIVER_PYTHON are correctly set.
    '''
    # new_result_prob1.toPandas().to_csv('file:///opt/int_group/result.csv')

    # new_result_prob1.toPandas().to_csv('hdfs://bcg/opt/int_group/result/result.csv')

    ###
    '''

Exemple #5

0

Afficher le fichier

    def _evaluate(self, dataset, metric="AP"):
        """Rank ``dataset`` by its ``1_prob`` slice and compute one score.

        Elements 0 and 1 of the ``probability`` column are sliced into the
        columns ``0_prob`` and ``1_prob``.  The rows are sorted by ``1_prob``
        in descending order, and the ``label`` and ``prediction`` columns are
        collected in that order.

        Args:
            dataset: DataFrame providing the ``probability``, ``label`` and
                ``prediction`` columns read below.
            metric: ``"AP"`` scores with ``average_precision_score``;
                ``"P100"`` scores with the nested ``precision`` helper at
                ``k=100``.

        Returns:
            The selected score, or ``0`` for any other ``metric`` value.
        """
        def precision(y_true, y_scores, k):
            # Values shared by y_true and the first k entries of y_scores,
            # counted as distinct values and divided by k.
            act_set = set(y_true)
            pred_set = set(y_scores[:k])
            return len(act_set & pred_set) / float(k)

        def recall(y_true, y_scores, k):
            # Same overlap, divided by the number of distinct y_true values.
            # Not called anywhere in this method.
            act_set = set(y_true)
            pred_set = set(y_scores[:k])
            return len(act_set & pred_set) / float(len(act_set))

        neg_slicer = VectorSlicer(inputCol="probability",
                                  outputCol="0_prob",
                                  indices=[0])
        pos_slicer = VectorSlicer(inputCol="probability",
                                  outputCol="1_prob",
                                  indices=[1])
        with_both_probs = pos_slicer.transform(neg_slicer.transform(dataset))
        ranked = with_both_probs.sort(col("1_prob").desc())

        def column_values(name):
            # Flatten the single-column rows into a plain Python list.
            return ranked.select(name).rdd.flatMap(lambda row: row).collect()

        y_true = column_values("label")
        y_scores = column_values("prediction")

        if metric == "AP":
            return average_precision_score(y_true, y_scores)
        if metric == "P100":
            return precision(y_true, y_scores, 100)
        return 0

Exemple #6

0

Afficher le fichier

Fichier : write_all_prob.py Projet : INKWWW/Spark

def predict_prob(lrModelPath, test_data):
    """Score ``test_data`` with a saved model and append the result as CSV.

    Loads a ``LogisticRegressionModel`` from ``lrModelPath``, transforms
    ``test_data``, keeps element 1 of the ``probability`` vector as
    ``prob_1``, casts it to the string column ``prob_1_str``, drops rows
    with nulls via ``na.drop()`` and appends the remaining rows as CSV to
    ``/opt/int_group/hanmo.wang/3key_all_v3``.

    Args:
        lrModelPath: Path handed to ``LogisticRegressionModel.load``.
        test_data: DataFrame to score.  The code selects ``_c0``, ``_c1``,
            ``_c2`` and ``probability`` from the transformed frame.

    Returns:
        None.
    """
    lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['_c0', '_c1', '_c2', 'probability'])
    print('*************** result **************')
    # show() prints the rows itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    result.show(5)
    # result.write.csv('file:///opt/int_group/result123')

    # Keep only index 1 of the probability vector.
    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)
    result_prob1 = prob_1.select(['_c0', '_c1', '_c2', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        '_c0', '_c1', '_c2',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(10)
    # Printing the DataFrame object itself shows its column names and types.
    print(new_result_prob1)
    new_result_prob1 = new_result_prob1.na.drop()
    # print('######################### printing num - {} #########################'.format(new_path))
    new_result_prob1.write.csv('/opt/int_group/hanmo.wang/3key_all_v3', nullValue=None, mode='append')

Exemple #7

0

Afficher le fichier

Fichier : predict_prob_V2.py Projet : INKWWW/Spark

def predict_prob(lrModel, test_data):
    """Score ``test_data`` and return the second probability element as a string.

    Transforms ``test_data`` with ``lrModel``, keeps element 1 of the
    ``probability`` vector as ``prob_1`` and casts it to the string column
    ``prob_1_str``.  Intermediate frames are printed on the way.

    Args:
        lrModel: An already loaded model exposing ``transform``.
        test_data: DataFrame to score.  The code selects ``id``, ``name``,
            ``cell`` and ``probability`` from the transformed frame.

    Returns:
        A DataFrame with the columns ``id``, ``name``, ``cell`` and
        ``prob_1_str``.
    """
    # lrModel = LogisticRegressionModel.load(lrModelPath)
    predictions = lrModel.transform(test_data)
    result = predictions.select(['id', 'name', 'cell', 'probability'])
    print('*************** result **************')
    # show() prints the rows itself and returns None; wrapping it in print()
    # only appended a stray "None" line to the output.
    result.show(5)

    # Keep only index 1 of the probability vector.
    vs = VectorSlicer(inputCol="probability", outputCol="prob_1", indices=[1])
    prob_1 = vs.transform(result)
    print('*************** prob_1 **************')
    prob_1.show(5)

    result_prob1 = prob_1.select(['id', 'name', 'cell', 'prob_1'])
    print('*************** result_prob1 **************')
    result_prob1.show(5)

    new_result_prob1 = result_prob1.select([
        'id', 'name', 'cell',
        result_prob1['prob_1'].cast('string').alias('prob_1_str')
    ])
    print('*************** new_result_prob1 **************')
    new_result_prob1.show(5)
    # Printing the DataFrame object itself shows its column names and types.
    print(new_result_prob1)
    return new_result_prob1

Exemple #8

0

Afficher le fichier

Fichier : Data engineering pyspark.py Projet : hmk88/Pyspark_ML_databricks_ApacheSpark

from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

# Two rows holding the same three values: one as a sparse vector of size 3
# (entries at positions 0 and 1) and one as a dense vector.
df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {
        0: -2.0,
        1: 2.3
    })),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))
])

# Keep only position 1 of "userFeatures" and expose it as "features".
slicer = VectorSlicer(inputCol="userFeatures",
                      outputCol="features",
                      indices=[1])

output = slicer.transform(df)

# Show the original vector next to its slice.
output.select("userFeatures", "features").show()

# COMMAND ----------

# RFormula selects columns specified by an R model formula
from pyspark.ml.feature import RFormula

# Small click dataset: id, country, hour and the clicked flag.
dataset = spark.createDataFrame([(7, "US", 18, 1.0), (8, "CA", 12, 0.0),
                                 (9, "NZ", 15, 0.0)],
                                ["id", "country", "hour", "clicked"])

# "clicked" is the response; "country" and "hour" are the terms.  The
# output columns are named "features" and "label".
formula = RFormula(formula="clicked ~ country + hour",
                   featuresCol="features",
                   labelCol="label")

Exemple #9

0

Afficher le fichier

Fichier : vector_slicer_example.py Projet : 15652101501/spark

#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.mllib.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Script entry point: create (or reuse) a session, run the VectorSlicer
    # demo, and stop the session even if the demo raises.
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    try:
        # $example on$
        # VectorSlicer comes from pyspark.ml, so its input column is built
        # from pyspark.ml.linalg vectors.  This local import takes precedence
        # over the module-level pyspark.mllib.linalg.Vectors for the demo.
        from pyspark.ml.linalg import Vectors

        df = spark.createDataFrame([
            Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),),
            Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)])

        # Keep only position 1 of "userFeatures" and expose it as "features".
        slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

        output = slicer.transform(df)

        output.select("userFeatures", "features").show()
        # $example off$
    finally:
        spark.stop()

Exemple #10

0

Afficher le fichier

# Fit the selector on scaled_df and apply it to the same frame.
# chisq_selector and scaled_df are defined outside this excerpt.
result_df = chisq_selector.fit(scaled_df).transform(scaled_df)

result_df.select("selected_features").display()

# COMMAND ----------

# MAGIC %md Feature Selection using VectorSlicer

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer

# Keep only position 1 of "scaled_features" as "selected_features".
# This rebinds result_df, replacing the selector output from the cell above.
vec_slicer = VectorSlicer(inputCol="scaled_features",
                          outputCol="selected_features",
                          indices=[1])
result_df = vec_slicer.transform(scaled_df)
result_df.select("scaled_features", "selected_features").display()

# COMMAND ----------

# MAGIC %md ###Delta Lake as Feature Store

# COMMAND ----------

spark.sql("CREATE DATABASE IF NOT EXISTS feature_store ")

# Overwrite the table feature_store.retail_features in Delta format, passing
# the path below through the "location" option.
(result_df.write.format("delta").mode("overwrite").option(
    "location",
    "/FileStore/shared_uploads/delta/retail_features.delta").saveAsTable(
        "feature_store.retail_features"))

Exemple #11

0

Afficher le fichier

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, GRU, CuDNNGRU, RNN, ConvLSTM2D, Conv1D, Reshape, MaxPooling1D, SimpleRNNCell, Flatten
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint   
import matplotlib.pyplot as plt
import config
from sklearn.metrics import mean_absolute_error
import pandas
import csv
from pyspark.ml.feature import VectorSlicer

spark = config.get_config()

# Load the feature vectors, keep every second position from 0 to 24 as
# "fsl", then drop the original "f" column and "_c0".
df_f = spark.read.parquet(os.path.join("datasets", "train.vector.fbin.2.parquet"))
slicer = VectorSlicer(inputCol='f', outputCol="fsl", indices=list(range(0, 26, 2)))
df_f = slicer.transform(df_f).drop('f').selectExpr("*").drop("_c0")

# Load the targets with "seg" exposed as "seg2" and "y" as "label".
df_y = spark.read.parquet(os.path.join("datasets", "train.target.parquet"))
df_y = df_y.selectExpr("seg AS seg2", "y as label").drop("seg")

# Join features and targets on the integer-cast segment columns; the helper
# key "seg2" is dropped afterwards.
df_train = df_f.join(
    df_y,
    df_f.seg.cast(IntegerType()) == df_y.seg2.cast(IntegerType())
).drop("seg2")

df_train.printSchema()

n_dim = 13 #26 99 119 99+14 20+26 14+26

# vect_cols = ["f"]
# vectorAssembler = VectorAssembler(inputCols=vect_cols, outputCol="features")
# df_train = vectorAssembler.transform(df_train)

Exemple #12

0

Afficher le fichier

# 30% / 70% split of Data with a fixed seed.
testset,trainset = Data.randomSplit([0.3,0.7], seed=25)
print("Training Dataset Count: " + str(trainset.count()))
print("Test Dataset Count: " + str(testset.count()))

### GENERALIZED LINEAR REGRESSION FOR FEATURE SELECTION
from pyspark.ml.regression import GeneralizedLinearRegression
glr = GeneralizedLinearRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="features",family="binomial", link="logit", maxIter=10,regParam=0.01)
# NOTE(review): the GLR is fitted on the full Data, test rows included, so
# the p-values used for selection have seen the test set - confirm intent.
model = glr.fit(Data)
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("P Values: " + str(summary.pValues))

#Removing all the columns that had a p-value above 0.05
vs = VectorSlicer(inputCol="features", outputCol="selected_features", indices=[0,2,9,18,21,23,24,26,27,28,31,32,37,41])
Training_set= vs.transform(trainset)
Test_set = vs.transform(testset)

#### LOGISTIC REGRESSION
# Train on the sliced column.  With featuresCol="features" the model used
# the full vector and the selection above had no effect.
logReg = LogisticRegression(predictionCol="Predicted_median", labelCol="label", featuresCol="selected_features", maxIter=20,regParam=0.01, elasticNetParam=0.8, family="binomial")
logReg_model = logReg.fit(Training_set)
trainingSummary = logReg_model.summary
roc = trainingSummary.roc.toPandas()
print('Training set ROC: ' + str(trainingSummary.areaUnderROC))
predictions = logReg_model.transform(Test_set)
predictions.select('features', 'label', 'rawPrediction', 'Predicted_median', 'probability').show(10)
evaluator = BinaryClassificationEvaluator()
print("Test_SET (Area Under ROC): " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))


### GRADIENT BOOSTING

Exemple #13

0

Afficher le fichier

def slice_source_to(source, destination):
    """Replace the ``f`` vector of a parquet dataset with positions 50 to 75.

    Reads ``datasets/<source>``, drops ``_c0``, slices positions 50..75 of
    the ``f`` column and overwrites ``datasets/<destination>`` with the
    result, in which ``f`` holds the sliced vector.

    Args:
        source: File name under ``datasets`` to read.
        destination: File name under ``datasets`` to overwrite.
    """
    raw = spark.read.parquet(os.path.join("datasets", source))
    # Copy "f" to the temporary "f_t1" and drop the original so the slicer
    # can write its output under the name "f".
    renamed = raw.selectExpr("*", "f AS f_t1").drop("_c0").drop("f")
    feature_slicer = VectorSlicer(inputCol="f_t1",
                                  outputCol="f",
                                  indices=list(range(50, 76)))
    sliced = feature_slicer.transform(renamed).drop("f_t1")
    sliced.write.mode("overwrite").parquet(os.path.join("datasets", destination))

Exemple #14

0

Afficher le fichier

Fichier : gfbank_feature_filter.py Projet : zhilangtaosha/sparkdata

# Turn every \001-delimited record into "<field0> <field1>" and store the
# lines so they can be loaded through the libsvm reader.
rdd = sc.textFile("/hive/warehouse/wlcredit.db/t_credit_feature_merge/ds=" + today + "_cms1234_anf")
rdd1 = rdd.map(lambda x:x.split("\001")[0] + " " + x.split("\001")[1])
rdd1.saveAsTextFile("/user/wrt/credit/allexample.libsvm")

data_svm_sql = sqlContext.read.format("libsvm").load("/user/wrt/credit/allexample.libsvm")
data_svm = data_svm_sql.map(lambda row:LabeledPoint(int(row.label),row.features))
features = data_svm.map(lambda x: x.features)

# Per-feature statistics: coverage is the share of rows with a non-zero
# value, std the square root of the column variance.
stat = Statistics.colStats(features)
coverage = (stat.numNonzeros()/stat.count()).tolist()
std = numpy.sqrt(stat.variance()).tolist()
features_nums = data_svm.map(lambda x: x.features.size).take(1)[0]
features_arr = range(0, features_nums)
re = zip(zip(coverage,std),features_arr)

# Keep the indices whose coverage is at least 0.005.  The result is built as
# a real list because it is consumed twice (by VectorSlicer and by the
# naming loop at the bottom); a lazy Python 3 map/filter object would be
# exhausted after the first use.
filteredIndexes = [index for (cov, _std), index in re if cov >= 0.005]
slicer = VectorSlicer(inputCol="features", outputCol="featuresFiltered", indices=filteredIndexes)
output_df = slicer.transform(data_svm_sql)
data_svm_filtered = output_df.select("label","featuresFiltered")
data_svm_labelpoint = data_svm_filtered.map(lambda row:LabeledPoint(int(row.label),row.featuresFiltered))
MLUtils.saveAsLibSVMFile(data_svm_labelpoint,"/user/wrt/credit/allexample_filter.libsvm")

# Re-read the filtered file: the first token is cut at its first "." and
# joined to the remaining tokens with \001.
rdd_r = sc.textFile("/user/wrt/credit/allexample_filter.libsvm")\
    .map(lambda x :x.split()[0].split('.')[0] + '\001' + ' '.join(x.split()[1:]))
rdd_r.saveAsTextFile("/user/wrt/credit/allexample_filter_telindex_features")

# First tab-separated field of every feature-name record, cleaned with
# valid_jsontxt and collected to the driver.
feature_raw = sc.textFile("/hive/warehouse/wlcredit.db/t_wrt_credit_all_features_name/ds=" + today + "_cms1234_anf")\
    .map(lambda x:valid_jsontxt(x.split("\t")[0])).collect()

# Pair every kept feature name with its new 1-based position.
fea_all_index = []
j = 1
for i in filteredIndexes:
    fea_all_index.append(feature_raw[i] + "\t" + str(j))
    j += 1
sc.parallelize(fea_all_index).saveAsTextFile('/user/wrt/temp/filter_feature_name')

Exemple #15

0

Afficher le fichier

Fichier : titanic-analysis.py Projet : milovanovicm/titanic-survivor-analysis

# MAGIC After comparing the results of four tested models, we can see that we are getting quite satisfying results with Random Forest, even for the small number of trees. Random Forest is performing much better than the other tested algorithms. In that manner, we will try to select only the most relevant features, and try to lower the dimensionality of the problem. <br>
# MAGIC After evaluating the results of Random Forest, we have identified the importance of the used features:
# MAGIC SparseVector(17, {0: 0.0589, 1: 0.0454, 2: 0.028, 3: 0.0834, 4: 0.1145, 5: 0.0469, 6: 0.0257, 7: 0.019, 8: 0.0159, 9: 0.003, 10: 0.0059, 11: 0.0009, 12: 0.0524, 13: 0.0211, 14: 0.0021, 15: 0.2103, 16: 0.2664}).<br>
# MAGIC Here we are going to lower the dimensionality of the problem based on feature importance, using Vector Slicer feature selection. Let's test how our datasets behave with fewer features, and try to build a more robust model while using a smaller number of features.

# COMMAND ----------

from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

# Keep only vector positions 3, 4, 15 and 16 of "features" in "selectedFeatures".
slicer = VectorSlicer(inputCol="features",
                      outputCol="selectedFeatures",
                      indices=[3, 4, 15, 16])

# Slice the same datasets that were used for the other algorithms
output, otestData, otrainData = [
    slicer.transform(frame) for frame in (transformed, testData, trainingData)
]

# Random forest with 10 trees that reads the sliced column
rf = RandomForestClassifier(numTrees=10,
                            featuresCol="selectedFeatures",
                            labelCol="label")

# The label indexer comes first in the Pipeline, followed by the forest
pipeline = Pipeline(stages=[labelIndexer, rf])

# Fitting the pipeline trains the model and also runs the indexers.
model = pipeline.fit(otrainData)

# Make predictions.