Example #1
 def combine(pair):
     # list of np array
     if isinstance(pair[1], list):
         row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                     [[Vectors.dense(elem) for elem in pair[1]]]))
         return row, ArrayType(VectorUDT())
     # scalar
     elif len(pair[1].shape) == 0:
         row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                     [float(pair[1].item(0))]))
         return row, FloatType()
     # np ndarray
     else:
         dim = len(pair[1].shape)
         if dim == 1:
             # np 1-D array
             row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                         [Vectors.dense(pair[1])]))
             return row, VectorUDT()
         else:
             # multi-dimensional array
             structType = FloatType()
             for _ in range(dim):
                 structType = ArrayType(structType)
             row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                         [pair[1].tolist()]))
             return row, structType
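A minimal usage sketch (not from the source; it assumes combine and its pyspark.ml imports are in scope): the DataType returned alongside the Row describes the extra column appended to pair[0], so it can extend a schema when building a DataFrame. The names base and prediction are illustrative.

import numpy as np
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.getOrCreate()
base = Row(id='a')                                         # stands in for pair[0]
row, extra_type = combine((base, np.array([1.0, 2.0])))    # 1-D array -> VectorUDT()
schema = StructType([StructField('id', StringType()),
                     StructField('prediction', extra_type)])
spark.createDataFrame([row], schema).show()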
    def transform(self, X_rdd, y_rdd=None):
        '''
        given X RDD (and optionally y RDD), output dataframe with term frequency feature vector and labels
        '''
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")

        #compute term frequency feature vectors
        X = X_rdd.map(self._term_frequency).cache()

        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
            data = X.join(y).map(lambda r: r[1])
            schema = StructType([
                StructField('features', VectorUDT(), True),
                StructField('label', StringType(), True)
            ])
            data = data.toDF(schema)
            data = data.withColumn('label', data.label.cast(DoubleType()))

        else:
            X = X.map(lambda row: [row])
            schema = StructType([StructField("features", VectorUDT(), True)])
            data = X.toDF(schema)

        return data
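The zipWithIndex/join pattern above is what pairs the feature RDD with the label RDD by position; a standalone sketch with toy data and illustrative names:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
features = sc.parallelize([[1.0], [2.0], [3.0]])
labels = sc.parallelize(['a', 'b', 'c'])
indexed_x = features.zipWithIndex().map(lambda r: (r[1], r[0]))   # (idx, features)
indexed_y = labels.zipWithIndex().map(lambda r: (r[1], r[0]))     # (idx, label)
paired = indexed_x.join(indexed_y).map(lambda r: r[1])            # (features, label)
print(sorted(paired.collect()))   # [([1.0], 'a'), ([2.0], 'b'), ([3.0], 'c')]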
def predict(index, s):
    items = [i for i in s]
    feature = VectorUDT().deserialize(pickle.loads(items[0]))
    print(pickle.loads(items[1])[0])
    model = pickle.load(open(pickle.loads(items[1])[0] + "/model.pkl", "rb"))
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]
def predict(index, s):
    items = [i for i in s]
    modelPath = pickle.loads(items[1])[0] + "/model.h5"
    if not hasattr(os, "mlsql_models"):
        setattr(os, "mlsql_models", {})
    if modelPath not in os.mlsql_models:
        # import tensorflow as tf
        # from keras import backend as K
        # gpu_options = tf.GPUOptions(allow_growth=True)
        # config = tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
        # session = tf.Session(config=config)
        # K.set_session(session)
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        print("Load Keras model %s, CUDA_VISIBLE_DEVICES:%s " %
              (modelPath, os.environ["CUDA_VISIBLE_DEVICES"]))
        from keras.models import load_model
        os.mlsql_models[modelPath] = load_model(modelPath)
    # here we can get train params
    trainParams = pickle.loads(items[2])[0]
    width = int(trainParams["fitParam.0.width"])
    height = int(trainParams["fitParam.0.height"])

    model = os.mlsql_models[modelPath]
    rawVector = pickle.loads(items[0])
    feature = VectorUDT().deserialize(rawVector).toArray()
    feature_final = np.reshape(feature, [1, width, height, 3])
    # y is a numpy array holding the predictions; since predict supports batch prediction, it is a 2-D array.
    y = model.predict(feature_final)
    return [VectorUDT().serialize(Vectors.dense(y.tolist()[0]))]
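The per-worker model cache used above (a dict hung off the os module, keyed by model path) can be isolated into a small helper; a sketch with a hypothetical loader callable:

import os

def get_cached_model(model_path, loader):
    # keep one loaded model per Python worker process
    if not hasattr(os, "mlsql_models"):
        setattr(os, "mlsql_models", {})
    if model_path not in os.mlsql_models:
        os.mlsql_models[model_path] = loader(model_path)
    return os.mlsql_models[model_path]

# e.g. model = get_cached_model(modelPath, lambda p: pickle.load(open(p, "rb")))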
Example #5
    def test_get_metadata(self):
        expected_metadata = \
            {
                'float': {
                    'spark_data_type': FloatType,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.NOCHANGE,
                    'max_size': 1,
                    'shape': 1
                },
                'dense': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
                'sparse': {
                    'spark_data_type': SparseVector,
                    'is_sparse_vector_only': True,
                    'intermediate_format': constants.CUSTOM_SPARSE,
                    'max_size': 1,
                    'shape': 2
                },
                'mixed': {
                    'spark_data_type': DenseVector,
                    'is_sparse_vector_only': False,
                    'intermediate_format': constants.ARRAY,
                    'max_size': 2,
                    'shape': 2
                },
            }

        with spark_session('test_get_metadata') as spark:
            data = [[
                1.0,
                DenseVector([1.0, 1.0]),
                SparseVector(2, {0: 1.0}),
                DenseVector([1.0, 1.0])
            ],
                    [
                        1.0,
                        DenseVector([1.0, 1.0]),
                        SparseVector(2, {1: 1.0}),
                        SparseVector(2, {1: 1.0})
                    ]]
            schema = StructType([
                StructField('float', FloatType()),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])
            df = create_test_data_from_schema(spark, data, schema)

            metadata = util._get_metadata(df)
            self.assertDictEqual(metadata, expected_metadata)
Example #6
def create_mnist_data(spark):
    features = DenseVector([1.0] * 64)
    label_vec = DenseVector([0.0, 0.0, 1.0] + [0.0] * 7)
    label = 2.0
    data = [[features, label_vec, label]] * 10
    schema = StructType([StructField('features', VectorUDT()),
                         StructField('label_vec', VectorUDT()),
                         StructField('label', FloatType())])
    df = create_test_data_from_schema(spark, data, schema)
    return df
Example #7
 def test_one_hot_encoder():
     actual_df = fe.one_hot_encoder(source_df, input_cols=['id'])
     expected_df = op.create.df([
         ('id', LongType(), True), ('x', LongType(), True),
         ('y', LongType(), True), ('features', VectorUDT(), True),
         ('id***ONE_HOT_ENCODER', VectorUDT(), True)
     ], [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), SparseVector(2, {0: 1.0})),
         (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), SparseVector(2, {1: 1.0})),
         (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), SparseVector(2, {}))])
     assert (expected_df.collect() == actual_df.collect())
Example #8
 def test_vector_assembler():
     actual_df = fe.vector_assembler(source_df, input_cols=['id', 'x', 'y'])
     expected_df = op.create.df(
         [('id', LongType(), True), ('x', LongType(), True),
          ('y', LongType(), True), ('features', VectorUDT(), True),
          ('id_x_y******VECTOR_ASSEMBLER', VectorUDT(), True)],
         [(0, 1, 2, DenseVector([1.0, 0.5, -1.0]), DenseVector([0.0, 1.0, 2.0])),
          (1, 2, 3, DenseVector([2.0, 1.0, 1.0]), DenseVector([1.0, 2.0, 3.0])),
          (2, 3, 4, DenseVector([4.0, 10.0, 2.0]), DenseVector([2.0, 3.0, 4.0]))])
     assert (expected_df.collect() == actual_df.collect())
Example #9
 def combine(pair):
     # list of np array
     if isinstance(pair[1], list):
         row = Row(*([pair[0][col] for col in pair[0].__fields__] +
                     [[Vectors.dense(elem) for elem in pair[1]]]))
         return row, ArrayType(VectorUDT())
     # scalar
     elif len(pair[1].shape) == 0:
         row = Row(*([pair[0][col] for col in pair[0].__fields__] + [float(pair[1].item(0))]))
         return row, FloatType()
     # np array
     else:
         row = Row(*([pair[0][col] for col in pair[0].__fields__] + [Vectors.dense(pair[1])]))
         return row, VectorUDT()
Example #10
    def test_check_shape_compatibility(self):
        feature_columns = ['x1', 'x2', 'features']
        label_columns = ['y1', 'y_embedding']

        schema = StructType([
            StructField('x1', DoubleType()),
            StructField('x2', IntegerType()),
            StructField('features', VectorUDT()),
            StructField('y1', FloatType()),
            StructField('y_embedding', VectorUDT())
        ])
        data = [[
            1.0, 1,
            DenseVector([1.0] * 12), 1.0,
            DenseVector([1.0] * 12)
        ]] * 10

        with spark_session('test_df_cache') as spark:
            df = create_test_data_from_schema(spark, data, schema)
            metadata = util._get_metadata(df)

            input_shapes = [[1], [1], [-1, 3, 4]]
            output_shapes = [[1], [-1, 3, 4]]
            util.check_shape_compatibility(metadata, feature_columns,
                                           label_columns, input_shapes,
                                           output_shapes)

            input_shapes = [[1], [1], [3, 2, 2]]
            output_shapes = [[1, 1], [-1, 2, 3, 2]]
            util.check_shape_compatibility(metadata, feature_columns,
                                           label_columns, input_shapes,
                                           output_shapes)

            bad_input_shapes = [[1], [1], [-1, 3, 5]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, bad_input_shapes,
                                               output_shapes)

            bad_input_shapes = [[2], [1], [-1, 3, 4]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, bad_input_shapes,
                                               output_shapes)

            bad_output_shapes = [[7], [-1, 3, 4]]
            with pytest.raises(ValueError):
                util.check_shape_compatibility(metadata, feature_columns,
                                               label_columns, input_shapes,
                                               bad_output_shapes)
Example #11
def predict(index, s):
    items = [i for i in s]
    modelPath = pickle.loads(items[1])[0] + "/model.pkl"

    if not hasattr(os, "mlsql_models"):
        setattr(os, "mlsql_models", {})
    if modelPath not in os.mlsql_models:
        print("Load sklearn model %s" % modelPath)
        os.mlsql_models[modelPath] = pickle.load(open(modelPath, "rb"))

    model = os.mlsql_models[modelPath]
    rawVector = pickle.loads(items[0])
    feature = VectorUDT().deserialize(rawVector)
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]
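These predict functions rely on the VectorUDT serialize/deserialize round trip; a quick self-contained check of that contract (the same assertion made in the serialization test of Example #30 below):

from pyspark.ml.linalg import Vectors, VectorUDT

udt = VectorUDT()
v = Vectors.dense([0.1, 0.2])
assert udt.deserialize(udt.serialize(v)) == v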
    def transform(self, metadata, hashes_and_labels=None, train=True):
        '''
        extract features from .asm files
        '''
        #Check input type
        if type(metadata) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if hashes_and_labels and type(hashes_and_labels) != RDD and type(
                hashes_and_labels) != PipelinedRDD:
            raise TypeError("Arguments must be pySpark RDDs")

        #word tokenization
        X = metadata.map(self._tokenize).cache()

        #create dictionary of words
        if train:
            self.dictionary = X.map(lambda row: row[1]).flatMap(
                lambda word: word).map(lambda word: (word, 1)).reduceByKey(
                    lambda acc, w: acc + w).filter(
                        lambda x: x[1] >= self.min_df).collectAsMap()
            self.dictionary = dict(
                zip(self.dictionary, range(len(self.dictionary))))

        #create word vectors
        X = X.map(self._term_frequency)

        #check if labels exist
        if hashes_and_labels:
            #combine X and y into single dataframe
            #join yields (hash, (label, features)); reorder to (hash, features, label)
            data = hashes_and_labels.join(X).map(
                lambda r: (r[0], r[1][1], r[1][0]))
            schema = StructType([
                StructField('hash', StringType(), True),
                StructField('features', VectorUDT(), True),
                StructField('label', StringType(), True)
            ])
            data = data.toDF(schema)
            data = data.withColumn('label', data.label.cast(DoubleType()))

        else:
            #if no labels, just use X
            schema = StructType([
                StructField('hash', StringType(), True),
                StructField("features", VectorUDT(), True)
            ])
            data = X.toDF(schema)

        return data
Example #13
    def test_get_col_info(self):
        with spark_session('test_get_col_info') as spark:
            data = [[
                0,
                0.0,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                DenseVector([1.0, 1.0])
            ], [
                1,
                None,
                None,
                [1, 1],
                DenseVector([1.0, 1.0]),
                SparseVector(2, {1: 1.0}),
                SparseVector(2, {1: 1.0})
            ]]

            schema = StructType([
                StructField('int', IntegerType()),
                StructField('float', FloatType()),
                StructField('null', NullType()),
                StructField('array', ArrayType(IntegerType())),
                StructField('dense', VectorUDT()),
                StructField('sparse', VectorUDT()),
                StructField('mixed', VectorUDT())
            ])

            df = create_test_data_from_schema(spark, data, schema)
            all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

            expected = [
                ('int', {int}, 1, 1),
                ('float', {float, NullType}, 1, 1),
                ('null', {NullType}, 1, 1),
                ('array', {list}, 2, 2),
                ('dense', {DenseVector}, 2, 2),
                ('sparse', {SparseVector}, 2, 1),
                ('mixed', {DenseVector, SparseVector}, 2, 2)
            ]

            for expected_col_info in expected:
                col_name, col_types, col_shape, col_size = expected_col_info
                assert all_col_types[col_name] == col_types, col_name
                assert col_shapes[col_name] == col_shape, col_name
                assert col_max_sizes[col_name] == col_size, col_name
    def transform(self, X_rdd, y_rdd=None, train=True):
        '''
        given X RDD (and optionally y RDD), output dataframe with term frequency feature vector and labels
        '''
        #check input type
        if type(X_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")
        if y_rdd and type(y_rdd) != RDD:
            raise TypeError("Arguments must be pySpark RDDs")

        #word tokenization
        X = X_rdd.map(self._tokenize).cache()

        #create dictionary of words
        if train:
            self.dictionary = X.map(lambda row: row[1]).flatMap(
                lambda word: word).map(lambda word: (word, 1)).reduceByKey(
                    lambda acc, w: acc + w).filter(
                        lambda x: x[1] >= self.min_df).collectAsMap()
            self.dictionary = dict(
                zip(self.dictionary, range(len(self.dictionary))))

        #create word vectors
        X = X.map(self._term_frequency)

        #check if labels exist
        if y_rdd:
            #combine X and y into single dataframe
            X = X.zipWithIndex().map(lambda r: (r[1], r[0]))
            y = y_rdd.zipWithIndex().map(lambda r: (r[1], r[0]))
            #join yields (idx, ((hash, features), label)); map to (hash, features, label)
            data = X.join(y).map(lambda r: (r[1][0][0], r[1][0][1], r[1][1]))
            schema = StructType([
                StructField('hash', StringType(), True),
                StructField('features', VectorUDT(), True),
                StructField('label', StringType(), True)
            ])
            data = data.toDF(schema)
            data = data.withColumn('label', data.label.cast(DoubleType()))

        else:
            schema = StructType([
                StructField('hash', StringType(), True),
                StructField("features", VectorUDT(), True)
            ])
            data = X.toDF(schema)

        return data
def createInputUtc(utc):
    spark = SparkSession.builder.getOrCreate()
    int_utc = (utcToInt(utc))
    print "timestamp: ", utc, "num_time: ", int_utc

    schema = T.StructType([T.StructField('features', VectorUDT())])
    return spark.createDataFrame([Row(features=DenseVector([int_utc]))], schema = schema)
def testSimpleOnDataFrame():
    spark = test_helpers.getOrCreateSparkSession(test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcDataSchema = pool_test_helpers.createSchema(
        [
            ("features", VectorUDT()),
            ("label", DoubleType())
        ],
        featureNames,
        addFeatureNamesMetadata=True
    )

    srcData = [
      Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
      Row(Vectors.dense(0.97, 0.82, 0.33), 1.1),
      Row(Vectors.dense(0.13, 0.22, 0.23), 2.1),
      Row(Vectors.dense(0.14, 0.18, 0.1), 0.0),
      Row(Vectors.dense(0.9, 0.67, 0.17), -1.0),
      Row(Vectors.dense(0.66, 0.1, 0.31), 0.62)
    ]

    df = spark.createDataFrame(spark.sparkContext.parallelize(srcData), StructType(srcDataSchema))

    regressor = (catboost_spark.CatBoostRegressor()
      .setIterations(20)
      .setTrainDir(tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))
    model = regressor.fit(df)
    predictions = model.transform(df)

    print ("predictions")
    predictions.show(truncate=False)
Example #17
    def trainALS(self, ranks, iterations):
        for rank in ranks:
            als = ALS(rank=rank, maxIter=iterations, regParam=0.1, userCol="UserID", itemCol="MovieID",ratingCol="label")
            paramGrid = ParamGridBuilder().addGrid(als.rank,[rank]).build()
            crossval = CrossValidator(estimator=als,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=Remove_nan(metricName="rmse", labelCol="label",
                                      predictionCol="prediction"),
                                      numFolds=5)
            self.trainDf.show()
            cvModel = crossval.fit(self.trainDf)
            predictions = cvModel.transform(self.testDf)
            rmse = Remove_nan(metricName="rmse", labelCol="label",
                                        predictionCol="prediction").evaluate(predictions)
            print "****RMSE VALUE IS :*****", rmse
            movieFactors = cvModel.bestModel.itemFactors.orderBy('id').cache()
            movieFactors.show(truncate=False)
            convertToVectors = udf(lambda features: Vectors.dense(features), VectorUDT())
            movieFactors = movieFactors.withColumn("features", convertToVectors(movieFactors.features))
            kmeans = KMeans(k=50, seed=1)
            kModel = kmeans.fit(movieFactors)
            kmeansDF = kModel.transform(movieFactors)
            clusters = [1, 2]
            kmeansDF = kmeansDF.join(self.movieDf, kmeansDF.id == self.movieDf.MovieID).drop('MovieID')
            for cluster in clusters:
                movieNamesDf = kmeansDF.where(col("prediction") == cluster).select("MovieName")
                movieNamesDf.rdd.map(lambda row: row[0]).saveAsTextFile(outputDir + \
                                                                        "Rank" + str(rank) + "Cluster" + str(cluster))

        if __name__ == "__main__":
            mr = movieRecALS(inputDir + "/MovieLens100K_train.txt", inputDir + "/MovieLens100K_test.txt",
                             inputDir + "/u.item")
            ranks = [2, 4, 8, 16, 32, 64, 128, 256]
            iterations = 20
            mr.trainALS(ranks, iterations)
Example #18
    def test_cast_vector():
        source_df = op.create.df(
            rows=[
                ("happy", [1, 2, 3]),
                ("excited", [4, 5, 6])
            ],
            cols=[
                ("emotion", StringType(), True),
                ("num", ArrayType(IntegerType()), True)
            ]
        )

        actual_df = source_df.cols.cast("num", Vectors)

        expected_df = op.create.df(
            rows=[
                ("happy", DenseVector([1, 2, 3])),
                ("excited", DenseVector([4, 5, 6]))],
            cols=[
                ("emotion", StringType(), True),
                ("num", VectorUDT(), True)
            ]
        )

        assert (actual_df.collect() == expected_df.collect())
Example #19
def zero_features(df, *feature_names):
    """Zero out features in the feature vector.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    feature_names : list of str

    Returns
    -------
    pyspark.sql.DataFrame
    """
    features = df.schema['features'].metadata['features']
    idxs = [features.index(name) for name in feature_names]

    def zero_features(feat):
        raw = feat.toArray()
        for idx in idxs:
            raw[idx] = 0.
        return Vectors.dense(raw)

    zero_features_udf = F.udf(zero_features, VectorUDT())
    return df.withColumn(
        'features',
        mjolnir.spark.add_meta(df._sc, zero_features_udf('features'),
                               {'features': features}))
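A self-contained sketch of the core pattern in zero_features, without the mjolnir metadata handling: zero out selected indices of a vector column via a udf that returns VectorUDT() (the column name and indices here are illustrative):

from pyspark.sql import SparkSession, functions as F
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 3.0]),)], ['features'])
idxs = [1]                        # positions to zero out

def zero(feat):
    raw = feat.toArray()
    for i in idxs:
        raw[i] = 0.0
    return Vectors.dense(raw)

df.withColumn('features', F.udf(zero, VectorUDT())('features')).show()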
def ratingFeatures(ratingSamples):
    ratingSamples.printSchema()
    ratingSamples.show()

    # calculate average movie rating score and rating count
    # group by movieId: count(1) as ratingCount, avg(rating) as avgRating,
    # variance(rating) as ratingVar (the rating variance)
    movieFeatures = ratingSamples.groupBy('movieId').agg(F.count(F.lit(1)).alias('ratingCount'),
                                                         F.avg("rating").alias("avgRating"),
                                                         F.variance('rating').alias('ratingVar')) \
        .withColumn('avgRatingVec', udf(lambda x: Vectors.dense(x), VectorUDT())('avgRating'))  # store avgRating as a single-element vector, as required by the scaler below
    movieFeatures.show(10)

    ######## feature processing with a Pipeline ########
    # bucketing
    # bucketize the continuous ratingCount into 100 roughly equal-sized buckets based on its distribution
    ratingCountDiscretizer = QuantileDiscretizer(numBuckets=100,
                                                 inputCol="ratingCount",
                                                 outputCol="ratingCountBucket")
    # Normalization
    # scale the average-rating vector with min-max normalization
    ratingScaler = MinMaxScaler(inputCol="avgRatingVec",
                                outputCol="scaleAvgRating")

    # build the pipeline
    pipelineStage = [ratingCountDiscretizer, ratingScaler]
    featurePipeline = Pipeline(stages=pipelineStage)
    movieProcessedFeatures = featurePipeline.fit(movieFeatures).transform(
        movieFeatures)

    # cast the bucket index to an integer and unpack the scaled rating from its vector
    movieProcessedFeatures = movieProcessedFeatures.withColumn('ratingCountBucket', F.col('ratingCountBucket').cast(IntegerType()))\
        .withColumn('scaleAvgRating', udf(lambda v: float(v[0]), FloatType())(F.col('scaleAvgRating'))).drop(F.col('avgRatingVec'))
    movieProcessedFeatures.show(10)
def testBinaryClassificationWithClassWeightsMap():
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()), ("label", IntegerType())]

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0),
        Row(Vectors.dense(0.97, 0.82, 0.33), 1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 1),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        pool_test_helpers.createSchema(srcSchemaData,
                                       featureNames,
                                       addFeatureNamesMetadata=True), srcData,
        {})

    classWeightsMap = collections.OrderedDict([("0", 1.0), ("1", 2.0)])

    classifier = (catboost_spark.CatBoostClassifier().setIterations(
        20).setClassWeightsMap(classWeightsMap).setLoggingLevel(
            catboost_spark.ELoggingLevel.Debug).setTrainDir(
                tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
    def _transform(self, dataset):
        inp = self.getOrDefault(self.inputCol)
        out = self.getOrDefault(self.predictionCol)
        mod_str = self.getOrDefault(self.modStr)
        use_vector_out = self.getOrDefault(self.useVectorOut)

        model = dill.loads(codecs.decode(mod_str.encode(), "base64"))
        model_broadcast = dataset._sc.broadcast(model)

        def predict_vec(data):
            features = data.toArray().reshape((1, len(data)))
            x_data = torch.from_numpy(features).float()
            model = model_broadcast.value
            model.eval()
            return Vectors.dense(model(x_data).detach().numpy().flatten())

        def predict_float(data):
            features = data.toArray().reshape((1, len(data)))
            x_data = torch.from_numpy(features).float()
            model = model_broadcast.value
            model.eval()
            raw_prediction = model(x_data).detach().numpy().flatten()
            if len(raw_prediction) > 1:
                return float(np.argmax(raw_prediction))
            return float(raw_prediction[0])

        if use_vector_out:
            udfGenerateCode = F.udf(predict_vec, VectorUDT())
        else:
            udfGenerateCode = F.udf(predict_float, DoubleType())

        return dataset.withColumn(out, udfGenerateCode(inp))
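The broadcast in _transform above ships the deserialized model to executors once rather than per row; a stripped-down sketch of the same broadcast-then-score pattern, with a toy weight dict standing in for the torch model (all names illustrative):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import DoubleType

spark = SparkSession.builder.getOrCreate()
weights = {"w": [0.5, 0.25]}                      # stand-in for a trained model
bc = spark.sparkContext.broadcast(weights)
score = F.udf(lambda v: float(sum(x * w for x, w in zip(v, bc.value["w"]))),
              DoubleType())
df = spark.createDataFrame([([1.0, 2.0],)], ["features"])
df.withColumn("prediction", score("features")).show()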
Example #23
    def _average_feature_vectors(self, data, outputCol):
        '''Average the feature vectors

        Parameters
        ----------
            data (DataFrame): input dataframe
            outputCol (str): name of the output column
        '''

        session = SparkSession.builder.getOrCreate()

        def _averager(v1, v2, v3):
            f1 = v1.toArray()
            f2 = v2.toArray()
            f3 = v3.toArray()

            length = min(len(f1), len(f2), len(f3))
            average = []

            for i in range(length):
                average.append((f1[i] + f2[i] + f3[i]) / 3.0)

            return Vectors.dense(average)

        session.udf.register("averager", _averager, VectorUDT())

        data.createOrReplaceTempView("table")

        sql = f"SELECT *, averager(feature0, feature1, feature2) AS {self.outputCol} from table"

        data = session.sql(sql)

        return data
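A minimal standalone version of the SQL-registered vector UDF idea from Example #23, using toy column names that are not from the source:

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()
spark.udf.register(
    "pair_averager",
    lambda a, b: Vectors.dense([(x + y) / 2.0 for x, y in zip(a.toArray(), b.toArray())]),
    VectorUDT())
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]), Vectors.dense([3.0, 4.0]))],
                           ["f0", "f1"])
df.createOrReplaceTempView("pairs")
spark.sql("SELECT *, pair_averager(f0, f1) AS avg_vec FROM pairs").show(truncate=False)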
 def test_model_logistic_regression_binary_class(self):
     import inspect
     import os
     this_script_dir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
     input_path = os.path.join(this_script_dir, "data", "sample_libsvm_data.txt")
     original_data = self.spark.read.format("libsvm").load(input_path)
     #
     # truncate the features
     #
     self.spark.udf.register("truncateFeatures", lambda x: SparseVector(5, range(0,5), x.toArray()[125:130]),
                             VectorUDT())
     data = original_data.selectExpr("label", "truncateFeatures(features) as features")
     lr = LogisticRegression(maxIter=100, tol=0.0001)
     model = lr.fit(data)
     # the name of the input for Logistic Regression is 'features'
     model_onnx = convert_sparkml(model, 'sparkml logistic regression', [('features', FloatTensorType([1, model.numFeatures]))])
     self.assertTrue(model_onnx is not None)
     self.assertTrue(model_onnx.graph.node is not None)
     # run the model
     import pandas
     predicted = model.transform(data)
     data_np = data.toPandas().features.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     expected = [
         predicted.toPandas().prediction.values.astype(numpy.float32),
         predicted.toPandas().probability.apply(lambda x: pandas.Series(x.toArray())).values.astype(numpy.float32)
     ]
     dump_data_and_sparkml_model(data_np, expected, model, model_onnx,
                                 basename="SparkmlLogisticRegression")
Example #25
 def compute_word2vec(self,
                      input_df,
                      output_vec_len,
                      window_size=5,
                      sub_test=False):
     """
     Compute the word2vec for a given dataframe
     @param input_df       : the dataframe to perform the action upon
     @param output_vec_len : the length (int) of the output vector
     @param input_col      : the name (string) of the input column
     @param output_col     : the name (string) of the output column
     @return output dataframe with output column
     """
     # ensure that the input column is of type StringType()
     toArray = udf(lambda vs: vs, ArrayType(StringType()))
     toArray1 = udf(lambda vs: vs.toArray())
     df = input_df.withColumn(self.input_col,
                              toArray(input_df[self.input_col]))
     # initialize word2vec
     word2Vec = Word2Vec(vectorSize=output_vec_len,
                         windowSize=window_size,
                         minCount=5,
                         inputCol=self.input_col,
                         outputCol=self.output_col)
     # train word2vec model
     model = word2Vec.fit(df)
     # compute transformation
     result = model.transform(df)
     # convert result to a vector
     if not sub_test:
         conv = udf(lambda vs: Vectors.dense(vs), VectorUDT())
          out = result.withColumn(self.output_col, conv(result[self.output_col]))
         return out
     else:
         return result
Example #26
def SentimentFeatureEngineer(selectreviewDF):
    RemovePunct_udf = udf(RemovePunct, StringType())
    countTokens_udf = udf(lambda words: len(words), IntegerType())
    RemoveEmptyEntry_udf = udf(RemoveEmpty, ArrayType(StringType()))
    GetCharacter_List_udf = udf(GetCharacter_List, ArrayType(IntegerType()))
    GetSentimentScore_udf = udf(GetSentimentScore, ArrayType(IntegerType()))
    list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())

    selectreviewDF = selectreviewDF.withColumn(
        'remove_punc', RemovePunct_udf(selectreviewDF['review_text']))

    tokenizer = Tokenizer(inputCol="remove_punc", outputCol="tokens_word")
    selectreviewDF = tokenizer.transform(selectreviewDF)

    # Reminder: do not combine these withColumn calls into a single chained
    # expression, otherwise some columns will not be found.

    selectreviewDF = selectreviewDF.withColumn(
        'num', countTokens_udf(selectreviewDF['tokens_word']))
    selectreviewDF = selectreviewDF.withColumn(
        'filtered_review_text_new',
        RemoveEmptyEntry_udf(selectreviewDF['tokens_word']))

    selectreviewDF = selectreviewDF.withColumn('Character_adj',
                                               GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[0]) \
        .withColumn('Character_noun', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[1]) \
        .withColumn('Character_verb', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[2]) \
        .withColumn('Character_adv', GetCharacter_List_udf(selectreviewDF['filtered_review_text_new'])[3]) \
        .withColumn('sentiment_neg', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[0]) \
        .withColumn('sentiment_neu', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[1]) \
        .withColumn('sentiment_pos', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[2]) \
        .withColumn('sentiment_compound', GetSentimentScore_udf(selectreviewDF['filtered_review_text_new'])[3])

    return selectreviewDF
Example #27
    def data_format(data):

        indexers = [
            StringIndexer(inputCol=col, outputCol=col + "_index").fit(data)
            for col in categoricalColumns
        ]
        pipeline = Pipeline(stages=indexers)
        data_features = pipeline.fit(data).transform(data)

        features_withlabel = (['label'] + [c + "_index" for c in categoricalColumns]
                              + numericCols)
        data_split = data_features.select(features_withlabel)
        features = ([f.col(c + "_index") for c in categoricalColumns]
                    + [f.col(col) for col in numericCols])
        data_label_features = data_split.withColumn("features",
                                                    f.array(features)).select(
                                                        'label', 'features')

        list_to_vector_udf = udf(lambda l: Vectors.dense(l), VectorUDT())
        df_with_vectors = data_label_features.select(
            data_label_features["label"],
            list_to_vector_udf(
                data_label_features["features"]).alias('features'))
        return df_with_vectors
def testBinaryClassificationWithTargetBorder():
    spark = test_helpers.getOrCreateSparkSession(
        test_helpers.getCurrentMethodName())
    import catboost_spark

    featureNames = ["f1", "f2", "f3"]

    srcSchemaData = [("features", VectorUDT()), ("label", DoubleType())]

    srcData = [
        Row(Vectors.dense(0.1, 0.2, 0.11), 0.12),
        Row(Vectors.dense(0.97, 0.82, 0.33), 0.1),
        Row(Vectors.dense(0.13, 0.22, 0.23), 0.7),
        Row(Vectors.dense(0.14, 0.18, 0.1), 0.33),
        Row(Vectors.dense(0.9, 0.67, 0.17), 0.82),
        Row(Vectors.dense(0.66, 0.1, 0.31), 0.93)
    ]
    pool = pool_test_helpers.createRawPool(
        test_helpers.getCurrentMethodName,
        pool_test_helpers.createSchema(srcSchemaData,
                                       featureNames,
                                       addFeatureNamesMetadata=True), srcData,
        {})

    classifier = (catboost_spark.CatBoostClassifier().setIterations(
        20).setTargetBorder(0.5).setTrainDir(
            tempfile.mkdtemp(prefix=test_helpers.getCurrentMethodName())))

    model = classifier.fit(pool)
    predictions = model.transform(pool.data)
    predictions.show(truncate=False)
Example #29
        def cast_factory(cls):

            # Parse to Vector
            if is_type(cls, Vectors):
                func_type = "udf"

                def cast_to_vectors(val, attr):
                    return Vectors.dense(val)

                func_return_type = VectorUDT()
            # Parse standard data types
            elif get_spark_dtypes_object(cls):

                func_type = "column_exp"

                def cast_to_vectors(col_name, attr):
                    return F.col(col_name).cast(get_spark_dtypes_object(cls))

                func_return_type = None

            # Add here any other parse you want
            else:
                RaiseIt.value_error(cls)

            return func_return_type, cast_to_vectors, func_type
Example #30
class VectorUDTTests(MLlibTestCase):

    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)

    def test_serialization(self):
        for v in [self.dv0, self.dv1, self.sv0, self.sv1]:
            self.assertEqual(v, self.udt.deserialize(self.udt.serialize(v)))

    def test_infer_schema(self):
        rdd = self.sc.parallelize([
            Row(label=1.0, features=self.dv1),
            Row(label=0.0, features=self.sv1)
        ])
        df = rdd.toDF()
        schema = df.schema
        field = [f for f in schema.fields if f.name == "features"][0]
        self.assertEqual(field.dataType, self.udt)
        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            if isinstance(v, SparseVector):
                self.assertEqual(v, self.sv1)
            elif isinstance(v, DenseVector):
                self.assertEqual(v, self.dv1)
            else:
                raise TypeError("expecting a vector but got %r of type %r" %
                                (v, type(v)))
Example #31
 def test_json_schema(self):
     self.assertEqual(VectorUDT.fromJson(self.udt.jsonValue()), self.udt)
Example #32
def predict(index, s):
    items = [i for i in s]
    feature = VectorUDT().deserialize(pickle.loads(items[0]))
    model = pickle.loads(pickle.loads(items[1])[0])
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]