Beispiel #1
        `DenseVector` behaves similarly to a `numpy.ndarray` and they can be used interchangeably
        within this function.  For example, they both implement the `dot` method.

        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

        DenseVector: An array of values the same length as `weights`.  The gradient summand.
    return ( - lp.label) * lp.features

# Example 1: unit weights against an observation with label 2.0.
exampleW = DenseVector([1, 1, 1])
exampleLP = LabeledPoint(2.0, [3, 1, 4])
# gradientSummand = (dot([1 1 1], [3 1 4]) - 2) * [3 1 4] = (8 - 2) * [3 1 4] = [18 6 24]
summandOne = gradientSummand(exampleW, exampleLP)
print summandOne

# Example 2: assuming gradientSummand follows the formula worked through above,
# (dot(w, x) - 3) * x = (1.764 - 3) * [-1.4 4.2 2.1] = [1.7304 -5.1912 -2.5956]
exampleW = DenseVector([.24, 1.2, -1.4])
exampleLP = LabeledPoint(3.0, [-1.4, 4.2, 2.1])
summandTwo = gradientSummand(exampleW, exampleLP)
print summandTwo

# In[72]:

# TEST Gradient summand (3a)
# Compare the first example with the hand-computed value [18 6 24].
Test.assertTrue(np.allclose(summandOne, [18., 6., 24.]),
                'incorrect value for summandOne')
Beispiel #2
 def test_vector(self):
     """Check that `scalingVec` accepts a list or an `np.array` and raises TypeError for strings."""
     from_list = ElementwiseProduct(scalingVec=[1, 3])
     self.assertEqual(from_list.getScalingVec(), DenseVector([1.0, 3.0]))

     from_ndarray = ElementwiseProduct(scalingVec=np.array([1.2, 3.4]))
     self.assertEqual(from_ndarray.getScalingVec(), DenseVector([1.2, 3.4]))

     # Non-numeric entries are expected to be refused when the transformer is built.
     with self.assertRaises(TypeError):
         ElementwiseProduct(scalingVec=["a", "b"])
Beispiel #3
# MAGIC Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed.  `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs.
# MAGIC For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

from pyspark.mllib.linalg import DenseVector

# TODO: Replace <FILL IN> with appropriate code
numpyVector = np.array([-3, -4, 5])
print '\nnumpyVector:\n{0}'.format(numpyVector)

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3, 4, 5])
# Calculate the dot product between the two vectors.
denseDotProduct =

print 'myDenseVector:\n{0}'.format(myDenseVector)
print '\ndenseDotProduct:\n{0}'.format(denseDotProduct)

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector),
                'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0),
                'incorrect value for denseDotProduct')
Beispiel #4
pipeline = Pipeline(stages=indexers)
df_indexed =

#S=>0 C=>1 Q=>2'Embarked', 'Embarked_indexed').show(3)

# Convert features to vectors.
enumVarsIndexed = [i + '_indexed' for i in enumVars]
# NOTE(review): `catVarsIndexed` is not defined in the visible code, while
# `enumVarsIndexed` above is otherwise unused -- confirm which name is intended.
featuresCol = numVars + catVarsIndexed
# Spelled `labelCol` because that is the name the column selection below reads;
# the previous spelling `lableCol` left that lookup undefined.
labelCol = ['Mark', 'Survived']
# Row factory with the three output fields: mark, label, features.
row = Row('mark', 'label', 'features')

#0 1 2 map
df_indexed = df_indexed[labelCol + featuresCol]
lf = r: (row(r[0], r[1], DenseVector(r[2:])))).toDF()

lf = StringIndexer(inputCol='label', output='index').fit(lf).transform(lf)

#seperate train/test data
train = lf.where(lf.mark == 'train')
test = lf.where(lf.mark == 'test')

train, validation = train.randomSplit([0.8, 0.2], seed=110)

print 'Training Data Number: ' + str(train.count())
print 'Validation Date Number ' + str(validation.count())
print 'Test Data Number ' + str(test.count())
Beispiel #6
  bin/spark-submit examples/src/main/python/ml/

if __name__ == "__main__":
    if len(sys.argv) > 1:
        print("Usage: simple_params_example", file=sys.stderr)
    sc = SparkContext(appName="PythonSimpleParamsExample")
    sqlContext = SQLContext(sc)

    # prepare training data.
    # We create an RDD of LabeledPoints and convert them into a DataFrame.
    # Spark DataFrames can automatically infer the schema from named tuples
    # and LabeledPoint implements __reduce__ to behave like a named tuple.
    training = sc.parallelize([
        LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
        LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
        LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
        LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))

    # Create a LogisticRegression instance with maxIter = 10.
    # This instance is an Estimator.
    lr = LogisticRegression(maxIter=10)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # We may also set parameters using setter methods.

    # Learn a LogisticRegression model.  This uses the parameters stored in lr.
Beispiel #7
 def test_model_transform(self):
     """Fit a default StandardScaler on `data` and check the transform of its first row."""
     data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
     # The fit() call was left unclosed and `data` was never used.
     # NOTE(review): assumes the enclosing test case exposes a SparkContext as
     # `self.sc` and that fit() takes an RDD of rows -- confirm against the class.
     model = StandardScaler().fit(self.sc.parallelize(data))
     self.assertEqual(model.transform([1.0, 2.0, 3.0]),
                      DenseVector([1.0, 2.0, 3.0]))
Beispiel #9
 def test_serialize(self):
     """Run `_test_serialize` on two DenseVectors (from `array` and `pyarray.array`) and a SparseVector."""
     dense_from_array = DenseVector(array([1., 2., 3., 4.]))
     self._test_serialize(dense_from_array)

     dense_from_pyarray = DenseVector(pyarray.array('d', range(10)))
     self._test_serialize(dense_from_pyarray)

     sparse = SparseVector(4, {1: 1, 3: 2})
     self._test_serialize(sparse)
Beispiel #10
if __name__ == '__main__':

    data, out = sys.argv[1:]
    conf = SparkConf().setAppName('ResNETBroadCast')
    # Turn on Arrow-based columnar transfers.  The setting ends in `.enabled`;
    # the previous spelling `.enable` is not a recognised key.
    # NOTE(review): newer Spark releases prefer
    # `spark.sql.execution.arrow.pyspark.enabled` -- confirm the target version.
    conf.set('spark.sql.execution.arrow.enabled', 'true')
    # smaller batches for nodes with small memory
    conf.set('spark.sql.execution.arrow.maxRecordsPerBatch', '1024')
    # allow overwrite s3 files
    conf.set('spark.hadoop.orc.overwrite.output.file', 'true')

    sc = SparkContext.getOrCreate(conf)
    spark = SQLContext(sc)
    data =, inferSchema=True, header=True)
    # filter documents longer than MAX_REVIEW_LENGTH words
    data = data.withColumn('review_length', F.size(F.split(F.col('Text'),
                                                           ' ')))

    data = data.where(F.col('review_length') <= MAX_REVIEW_LENGTH)
    # repartition and embed
    res = data.repartition(PARTITIONS)\
            .map(lambda x: (, DenseVector(x.value)))

    frame = spark.createDataFrame(res, schema=schema)
    # out is an s3 bucket
Beispiel #11
 def transform(self, df):
     transformed = super(ALSBinaryModel, self).transform(df)
     as_vector = udf(lambda x: DenseVector([1 - x, x]), VectorUDT())
     return transformed.withColumn("rawPrediction",