def test_dot(self):
    """Dot product of a dense vector with a SciPy sparse column matrix."""
    from scipy.sparse import lil_matrix

    dense = DenseVector(array([1., 2., 3., 4.]))
    column = lil_matrix((4, 1))
    # Only rows 1 and 3 are populated: 2 * 1 + 4 * 2 == 10.
    column[1, 0] = 1
    column[3, 0] = 2
    self.assertEqual(10.0, dense.dot(column))
def get_ratings(self, res_id, ratings, top):
    """Recommend up to ``top`` product ids based on already rated products.

    The feature vectors of the products listed in ``ratings`` are summed into
    one profile vector. Every other product of the selected model is scored
    by its dot product with that profile, and the ids with the highest scores
    are returned.

    :param res_id: key into ``self.models`` selecting the model to use.
    :param ratings: container of already rated product ids; only membership
        tests are performed on it.
    :param top: maximum number of product ids to return.
    :return: list of product ids ordered by descending score; an empty list
        when ``res_id`` is unknown or no rated product has features.
    """
    if res_id not in self.models:
        known_keys = list(self.models.keys())
        logger.info("Keys: " + str(self.models.keys()))
        # Guard the diagnostic: indexing the first key used to raise when no
        # model was loaded at all (and ``keys()[0]`` is Python 2 only).
        if known_keys:
            logger.info("Key Type: " + str(type(known_keys[0])))
        logger.info("res_id: " + str(res_id))
        logger.info("res_id type:" + str(type(res_id)))
        logger.info("res_id not known")
        return []

    product_features = self.models[res_id].productFeatures()
    # Collect once and reuse the result; each collect() re-runs the Spark job.
    rated = product_features.filter(lambda x: x[0] in ratings).collect()
    if not rated:
        logger.info("No product matches")
        return []

    # Sum the feature vectors of all rated products into a single profile.
    profile = DenseVector(rated[0][1])
    for entry in rated[1:]:
        profile = profile + entry[1]

    scored = (product_features
              .map(lambda x: (x[0], profile.dot(DenseVector(x[1]))))
              .filter(lambda x: x[0] not in ratings)
              # Negate the score so takeOrdered yields the highest first.
              .takeOrdered(top, lambda kv: -kv[1]))
    return [entry[0] for entry in scored]
def test_squared_distance(self):
    """Squared distance from dense and sparse vectors to a SciPy column."""
    from scipy.sparse import lil_matrix

    column = lil_matrix((4, 1))
    column[1, 0] = 3
    column[3, 0] = 2
    dense = DenseVector(array([1., 2., 3., 4.]))
    sparse = SparseVector(4, {0: 1, 1: 2, 2: 3, 3: 4})
    # Both vectors hold [1, 2, 3, 4] and the column is [0, 3, 0, 2]:
    # 1 + 1 + 9 + 4 == 15.
    for vector in (dense, sparse):
        self.assertEqual(15.0, vector.squared_distance(column))
def test_norms(self):
    """Check the 1-, 2- and infinity-norms of dense and sparse vectors."""
    a = DenseVector([0, 2, 3, -1])
    self.assertAlmostEqual(a.norm(2), 3.742, 3)
    # assertTrue(x, y) only checks that x is truthy and uses y as the failure
    # message, so the expected values must be compared with assertEqual.
    self.assertEqual(a.norm(1), 6)
    self.assertEqual(a.norm(inf), 3)

    a = SparseVector(4, [0, 2], [3, -4])
    self.assertAlmostEqual(a.norm(2), 5)
    self.assertEqual(a.norm(1), 7)
    self.assertEqual(a.norm(inf), 4)

    # An explicitly stored zero must not be counted as a non-zero entry.
    tmp = SparseVector(4, [0, 2], [3, 0])
    self.assertEqual(tmp.numNonzeros(), 1)
class VectorUDTTests(MLlibTestCase):
    """Serialization, schema and DataFrame interoperability of ``VectorUDT``."""

    # Shared fixtures: an empty and a populated vector of each kind.
    dv0 = DenseVector([])
    dv1 = DenseVector([1.0, 2.0])
    sv0 = SparseVector(2, [], [])
    sv1 = SparseVector(2, [1], [2.0])
    udt = VectorUDT()

    def test_json_schema(self):
        """The UDT is equal to itself after a JSON round trip."""
        restored = VectorUDT.fromJson(self.udt.jsonValue())
        self.assertEqual(restored, self.udt)

    def test_serialization(self):
        """Every fixture vector survives serialize followed by deserialize."""
        for vector in (self.dv0, self.dv1, self.sv0, self.sv1):
            round_tripped = self.udt.deserialize(self.udt.serialize(vector))
            self.assertEqual(vector, round_tripped)

    def test_infer_schema(self):
        """A DataFrame of LabeledPoints gets a VectorUDT "features" column."""
        points = [LabeledPoint(1.0, self.dv1), LabeledPoint(0.0, self.sv1)]
        df = self.sc.parallelize(points).toDF()
        feature_fields = [f for f in df.schema.fields if f.name == "features"]
        self.assertEqual(feature_fields[0].dataType, self.udt)

        vectors = df.rdd.map(lambda p: p.features).collect()
        self.assertEqual(len(vectors), 2)
        for v in vectors:
            # Each collected vector must come back as the kind that went in.
            if isinstance(v, SparseVector):
                expected = self.sv1
            elif isinstance(v, DenseVector):
                expected = self.dv1
            else:
                raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
            self.assertEqual(v, expected)

    def test_row_matrix_from_dataframe(self):
        """RowMatrix accepts a vector column and rejects a string column."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(Vectors.dense(1))])
        matrix = RowMatrix(df)
        self.assertEqual(matrix.numRows(), 1)
        self.assertEqual(matrix.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            RowMatrix(df.selectExpr("'monkey'"))

    def test_indexed_row_matrix_from_dataframe(self):
        """IndexedRowMatrix needs both the index and the vector column."""
        from pyspark.sql.utils import IllegalArgumentException

        df = self.spark.createDataFrame([Row(int(0), Vectors.dense(1))])
        indexed = IndexedRowMatrix(df)
        self.assertEqual(indexed.numRows(), 1)
        self.assertEqual(indexed.numCols(), 1)
        with self.assertRaises(IllegalArgumentException):
            IndexedRowMatrix(df.drop("_1"))
def DGEMV(alpha, A, x, beta, y, jsc):
    """Scale ``y`` by ``beta`` and add the per-row products of ``A`` and ``x``.

    ``y`` is first replaced by ``beta * y`` (a zero vector when ``beta`` is
    0). Unless ``alpha`` is 0, ``L2.MultiplyRows`` is then applied to every
    row of ``A.rows`` with ``alpha`` and ``x`` broadcast through ``jsc``; the
    results are ordered by key, collected into a ``DenseVector`` and added.

    NOTE(review): presumably this is the BLAS operation
    ``y := alpha * A * x + beta * y`` -- confirm against ``L2.MultiplyRows``.
    """
    # First form y := beta * y.
    if beta == 0.0:
        y = Vectors.zeros(y.size)
    elif beta != 1.0:
        y = beta * y

    if alpha == 0.0:
        return y

    x_bc = jsc.broadcast(x)
    alpha_bc = jsc.broadcast(alpha)

    def multiply(current_row):
        return L2.MultiplyRows(current_row.index, alpha_bc.value,
                               current_row.vector, x_bc.value)

    row_products = A.rows.map(multiply).sortByKey().values().collect()
    return y + DenseVector(row_products)
def test_list(self):
    """``TypeConverters.toList`` turns every list-like input into a list."""
    expected = [0, 1]
    list_likes = [
        expected,
        np.array(expected),
        DenseVector(expected),
        SparseVector(len(expected), range(len(expected)), expected),
        array.array('l', expected),
        xrange(2),
        tuple(expected),
    ]
    for candidate in list_likes:
        result = TypeConverters.toList(candidate)
        self.assertEqual(type(result), list)
        self.assertListEqual(result, expected)
def train_transform_func(vector):
    """Fit an ARIMA(1, 0, 0) model to ``vector`` and forecast 5 steps ahead.

    NaN entries are replaced with 0 before the model is fitted.

    NOTE(review): this is still debugging code -- it prints the type of the
    forecast and terminates the process with ``exit(0)`` instead of
    returning a value.
    """
    # Remap NaN to 0 for ease of calculation. The values must come from the
    # ``vector`` argument: the previous code called ``toArray()`` on the
    # DenseVector class itself and then ``.map`` on the array, neither of
    # which works.
    # NOTE(review): assumes ARIMA.fit_model / forecast accept a DenseVector
    # -- confirm.
    new_vec = DenseVector(
        [0 if math.isnan(x) else x for x in vector.toArray()])
    arimaModel = ARIMA.fit_model(1, 0, 0, new_vec)
    forecasted = arimaModel.forecast(new_vec, 5)  # 5 days for predict
    print(type(forecasted))
    exit(0)
def _transform(self, row):
    """Store a dense copy of the input column's vector in the output column.

    The value under ``self.input_column`` is converted with ``toArray()`` and
    wrapped in a ``DenseVector``; the row produced by ``new_dataframe_row``
    for ``self.output_column`` is returned.
    """
    dense = DenseVector(row[self.input_column].toArray())
    return new_dataframe_row(row, self.output_column, dense)
def cat2Num(self, df, indices):
    """Build a vector ``features`` column with two categorical entries encoded.

    The entries of ``rawFeatures`` at ``indices[0]`` and ``indices[1]`` are
    one-hot encoded, joined back to ``df`` on ``id`` and combined with the
    raw features by ``replaceCat2Num``; the combined string is then parsed
    into a ``DenseVector``.

    Returns a cached DataFrame with columns ``id``, ``rawFeatures`` and
    ``features``.
    """
    # Pull each categorical entry into its own frame so that one-hot encoding
    # can be run on it.
    first_cat = df.select(df.id, df.rawFeatures[indices[0]].alias("features0")).cache()
    second_cat = df.select(df.id, df.rawFeatures[indices[1]].alias("features1")).cache()
    first_enc = self.oneHotEncoding(first_cat, "features0").cache()
    second_enc = self.oneHotEncoding(second_cat, "features1").cache()

    # Add the encoded feature columns to the original frame.
    with_first = df.join(first_enc, first_enc.id == df.id, 'inner') \
        .drop(first_enc.id).cache()
    with_both = with_first.join(second_enc, with_first.id == second_enc.id, 'inner') \
        .drop(with_first.id).cache()

    # Create the combined "features" column and drop the remaining columns.
    combine = udf(replaceCat2Num, StringType())
    combined = with_both.select(
        with_both.id,
        with_both.rawFeatures,
        combine(with_both.rawFeatures,
                with_both.num_features0,
                with_both.num_features1).alias("features")).cache()

    # Convert the numerical features to a DenseVector so they can be used in
    # KMeans.
    to_dense = udf(lambda line: DenseVector.parse(line), VectorUDT())
    return combined.select(
        combined.id,
        combined.rawFeatures,
        to_dense(combined.features).alias("features")).cache()
def test_load_vectors(self):
    """Vectors saved as text can be read back with ``MLUtils.loadVectors``."""
    import shutil

    data = [[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]]
    temp_dir = tempfile.mkdtemp()
    load_vectors_path = os.path.join(temp_dir, "test_load_vectors")
    try:
        self.sc.parallelize(data).saveAsTextFile(load_vectors_path)
        ret = MLUtils.loadVectors(self.sc, load_vectors_path).collect()
        self.assertEqual(len(ret), 2)
        self.assertEqual(ret[0], DenseVector([1.0, 2.0, 3.0]))
        self.assertEqual(ret[1], DenseVector([1.0, 2.0, 3.0]))
    finally:
        # No blanket ``except: self.fail()``: it replaced the real error or
        # assertion message with an empty failure. Remove the whole temporary
        # directory (not only the sub-path, which leaked its parent) and
        # never let cleanup mask the test outcome.
        shutil.rmtree(temp_dir, ignore_errors=True)
def gradientSummand(weights, lp):
    """Return the gradient summand for one observation.

    The summand is ``(weights . x - label) * x`` where ``x`` is the feature
    vector of ``lp``.

    Args:
        weights (DenseVector): Model weights; anything offering a compatible
            ``dot`` method works.
        lp (LabeledPoint): The observation supplying ``features`` and
            ``label``.

    Returns:
        DenseVector: The gradient summand.
    """
    features = DenseVector(lp.features)
    residual = weights.dot(features) - lp.label
    return residual * features
def test_model_setters(self):
    """The model setters return a value and change the transform result."""
    rows = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
    scaler_model = StandardScaler().fit(self.sc.parallelize(rows))
    self.assertIsNotNone(scaler_model.setWithMean(True))
    self.assertIsNotNone(scaler_model.setWithStd(True))
    transformed = scaler_model.transform([1.0, 2.0, 3.0])
    self.assertEqual(transformed, DenseVector([-1.0, -1.0, -1.0]))
def cosineSimilarity(candidateTfIdf):
    """Cosine similarity of every stored vector with the first candidate.

    Only the first vector of ``candidateTfIdf`` is used as the reference.

    :param candidateTfIdf: RDD of vectors offering ``toArray()``.
    :return: RDD with one similarity value per element of the module-level
        ``frequencyDenseVectors_0`` RDD.
    :raises IndexError: if ``candidateTfIdf`` is empty.
    """
    # Fetch just the one vector that is needed instead of collecting the whole
    # RDD (which was also shipped to every task inside the closure).
    reference = candidateTfIdf.map(
        lambda vector: DenseVector(vector.toArray())).take(1)[0]
    # The reference norm is constant, so compute it once on the driver.
    reference_norm = reference.norm(2)
    return frequencyDenseVectors_0.map(
        lambda x: (x.dot(reference)) / (x.norm(2) * reference_norm))
def mapFn(row):
    """Encode one row as ``(row, features)`` or ``(row, LabeledPoint)``.

    Reads ``predictors``, ``lookup``, ``dm``, ``setToFlag``, ``target`` and
    ``target_index`` from the enclosing scope. For each predictor:

    * if ``dm[predictor]`` is a list, the row value is replaced by its
      position in that list (``None`` when absent) or, when ``setToFlag`` is
      given, by a flag vector with ``setToFlag`` at that position (all zeros
      when absent);
    * otherwise the row value is used unchanged.

    When ``target_index`` is -1 the feature vector is returned alone;
    otherwise it is wrapped in a ``LabeledPoint`` whose label is the target
    value (index-encoded when the target is categorical).
    """
    pvals = []
    for predictor in predictors:
        value = row[lookup[predictor]]
        categories = dm[predictor]
        if not isinstance(categories, list):
            pvals.append(value)
            continue

        try:
            encoded_val = categories.index(value)
        except ValueError:
            # Value not among the known categories.
            encoded_val = None

        if setToFlag is None:
            pvals.append(encoded_val)
        else:
            flags = [0.0] * len(categories)
            if encoded_val is not None:
                flags[encoded_val] = setToFlag
            pvals += flags

    dv = DenseVector(pvals)
    if target_index == -1:
        return (row, dv)

    tval = row[target_index]
    if isinstance(dm[target], list):  # target is categorical
        try:
            tval = dm[target].index(tval)
        except ValueError:
            tval = None
    return (row, LabeledPoint(tval, dv))
def parseLine(line):
    """Build a ``LabeledPoint`` from a record's ``severity`` and ``result``.

    The label is ``get_label_point(line.severity)``; the features are
    ``line.result`` wrapped in a ``DenseVector``.
    """
    severity = line.severity
    feature_vector = DenseVector(line.result)
    return LabeledPoint(get_label_point(severity), feature_vector)
def test_model_transform(self):
    """A scaler fitted with default settings leaves the first row unchanged."""
    training_rows = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]
    scaler_model = StandardScaler().fit(self.sc.parallelize(training_rows))
    transformed = scaler_model.transform([1.0, 2.0, 3.0])
    self.assertEqual(transformed, DenseVector([1.0, 2.0, 3.0]))
def test_model_transform(self):
    """``ElementwiseProduct`` scales dense and sparse vectors per element."""
    scaling = Vectors.dense([3, 2, 1])
    transformer = ElementwiseProduct(scaling)

    dense_input = Vectors.dense([4, 5, 6])
    self.assertEqual(transformer.transform(dense_input), DenseVector([12, 10, 6]))

    sparse_input = Vectors.sparse(3, [0], [1])
    self.assertEqual(transformer.transform(sparse_input), SparseVector(3, [0], [3]))
def _predict(self, iterator):
    """Yield every row extended with the model's prediction for it.

    The Keras model is deserialized from ``self.model`` once, each row's
    ``self.features_column`` value is predicted as a batch of one, and the
    first prediction is stored as a ``DenseVector`` under
    ``self.output_column``.
    """
    keras_model = deserialize_keras_model(self.model)
    for row in iterator:
        batch = np.asarray([row[self.features_column]])
        prediction = keras_model.predict(batch)[0]
        yield new_dataframe_row(row, self.output_column, DenseVector(prediction))
def test_list_int(self):
    """``VectorSlicer`` stores list-like index inputs as a list of ints."""
    index_inputs = [
        [1.0, 2.0],
        np.array([1.0, 2.0]),
        DenseVector([1.0, 2.0]),
        SparseVector(2, {0: 1.0, 1: 2.0}),
        xrange(1, 3),
        (1.0, 2.0),
        array.array('d', [1.0, 2.0]),
    ]
    for indices in index_inputs:
        slicer = VectorSlicer(indices=indices)
        self.assertListEqual(slicer.getIndices(), [1, 2])
        # Exact type check on purpose: the stored values must be plain ints.
        self.assertTrue(all([type(v) == int for v in slicer.getIndices()]))
    # Non-numeric indices are rejected.
    self.assertRaises(TypeError, lambda: VectorSlicer(indices=["a", "b"]))
def parse_imr_w2v_vector(v_str):
    """Create a Spark ``DenseVector`` from one ``;``-separated text line.

    The first field of the line is skipped; all remaining fields are parsed
    as floats.

    :param v_str: line of the form ``<first field>;<float>;<float>;...``
    :return: ``DenseVector`` holding the parsed values
    :raises ValueError: if one of the remaining fields is not a number
    """
    from pyspark.mllib.linalg import DenseVector

    # Build a real list: on Python 3 ``map`` returns a lazy iterator, which
    # DenseVector cannot convert into an array.
    num_vec = [float(field) for field in v_str.split(';')[1:]]  # TODO:? discard labels
    return DenseVector(num_vec)
def test_eq(self):
    """Equality across dense/sparse vectors and dense/sparse matrices."""
    dense = DenseVector([0.0, 1.0, 0.0, 5.5])
    sparse_same = SparseVector(4, [(1, 1.0), (3, 5.5)])
    dense_same = DenseVector([0.0, 1.0, 0.0, 5.5])
    sparse_longer = SparseVector(6, [(1, 1.0), (3, 5.5)])
    dense_other = DenseVector([0.0, 1.0, 0.0, 2.5])
    sparse_other = SparseVector(4, [(1, 1.0), (3, 2.5)])

    self.assertEqual(dense, sparse_same)
    self.assertEqual(dense, dense_same)
    self.assertFalse(sparse_same == sparse_longer)
    self.assertFalse(dense == dense_other)
    self.assertFalse(dense == sparse_other)

    # Dense and sparse matrices can be semantically equal while implementing
    # different __eq__ methods, so compare in both directions.
    dense_matrix = DenseMatrix(2, 2, [2, 0, 0, 0])
    sparse_matrix = SparseMatrix(2, 2, [0, 2, 3], [0], [2])
    self.assertEqual(dense_matrix, sparse_matrix)
    self.assertEqual(sparse_matrix, dense_matrix)
def test_parse_vector(self):
    """Vectors render to the expected text and parse back to equal vectors."""
    # assertTrue(x, y) only checks that x is truthy and uses y as the failure
    # message; assertEqual is required to actually compare the values.
    a = DenseVector([3, 4, 6, 7])
    self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
    self.assertEqual(Vectors.parse(str(a)), a)

    a = SparseVector(4, [0, 2], [3, 4])
    self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
    self.assertEqual(Vectors.parse(str(a)), a)

    # Extra whitespace around the tokens is accepted by the parser.
    a = SparseVector(10, [0, 1], [4, 5])
    self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
def test_squared_distance(self):
    """``_squared_distance`` for every supported pairing of vector types."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([4, 3, 2, 1])
    lst1 = [4, 3, 2, 1]
    arr = pyarray.array('d', [0, 2, 1, 3])
    narr = array([0, 2, 1, 3])

    # (expected, left operand, right operand)
    cases = [
        (15.0, sv, dv),
        (25.0, sv, lst),
        (20.0, dv, lst),
        (15.0, dv, sv),
        (25.0, lst, sv),
        (20.0, lst, dv),
        (0.0, sv, sv),
        (0.0, dv, dv),
        (0.0, lst, lst),
        (25.0, sv, lst1),
        (3.0, sv, arr),
        (3.0, sv, narr),
    ]
    for expected, left, right in cases:
        self.assertEqual(expected, _squared_distance(left, right))
def computeCost(featuresAndPrediction, model):
    """Sum the Euclidean distances of all points to their assigned centers.

    Each row of ``featuresAndPrediction`` is converted with ``array``; its
    first element is treated as the point and its second as the index of the
    predicted cluster in ``model.clusterCenters()``.
    """
    centers = [DenseVector(c) for c in model.clusterCenters()]
    rows = featuresAndPrediction.rdd.map(array)

    def distance_to_center(row):
        offset = row[0] - centers[row[1]]
        return sqrt((offset * offset).sum())

    return rows.map(distance_to_center).reduce(lambda x, y: x + y)
def test_dot(self):
    """Dot products of sparse and dense vectors with vectors and a matrix."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.],
                 [1., 2., 3., 4.]])
    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3., 6., 9., 12.]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10., 20., 30., 40.]), lst.dot(mat)))
def processDf(df):
    """Index, encode and scale ``df`` and return the re-joined result.

    Steps: fill nulls in the "Lag" and index columns with 0, drop rows
    without ``ServerIP`` or ``SessionStartHourTime``, apply the module-level
    ``indexModel`` and ``ohPipelineModel``, assemble and scale the "Lag"
    columns with the module-level ``scaler``, spread the scaled vector over
    one ``scaled<name>`` column per feature, and outer-join those columns
    back to the unscaled ones on ``key``.

    Schemas and one sample row are printed along the way.
    """
    df.printSchema()
    lag_columns = [name for name in df.columns if 'Lag' in name]
    source = df.fillna(0, subset=lag_columns)
    source = source.na.drop(subset=["ServerIP", "SessionStartHourTime"])

    columnsForIndex = [
        'dayofweek', 'ServerIP', 'year', 'month', 'weekofyear', 'dayofmonth',
        'hourofday', 'Holiday', 'BusinessHour', 'Morning'
    ]
    source = source.fillna(0, subset=list(columnsForIndex))

    # Indexing, then encoding.
    indexed = indexModel.transform(source)
    encoded = ohPipelineModel.transform(indexed)

    # Feature scaling.
    featuresForScale = [name for name in encoded.columns if 'Lag' in name]
    assembler = VectorAssembler(inputCols=featuresForScale,
                                outputCol="features")
    assembled = assembler.transform(encoded).select('key', 'features')
    scaledData = scaler.transform(assembled).select('key', 'scaledFeatures')

    def flatten(row):
        # One tuple per row: the key followed by every scaled value.
        return (row.key, ) + tuple(float(x) for x in row.scaledFeatures.values)

    from pyspark.sql.types import Row
    from pyspark.mllib.linalg import DenseVector
    dense_rows = scaledData.rdd.map(
        lambda x: Row(key=x[0], scaledFeatures=DenseVector(x[1].toArray())))
    scaledDf = dense_rows.map(flatten).toDF(["key"])

    # Rename columns.
    oldColumns = scaledDf.columns
    scaledColumns = ['scaledKey'] + ['scaled' + str(name) for name in featuresForScale]
    scaledOutcome = scaledDf.select([
        col(old_name).alias(scaledColumns[position])
        for position, old_name in enumerate(oldColumns)
    ])
    scaledOutcome.show(1)
    scaledOutcome.cache()

    unscaled = encoded.select([
        name for name in encoded.columns if name not in featuresForScale
    ])
    unscaled.cache()
    unscaled.printSchema()
    scaledOutcome.printSchema()

    return unscaled.join(
        scaledOutcome, (unscaled.key == scaledOutcome.scaledKey), 'outer')
def to_dense_vector(value, n_dim=2):
    """Converts the value to a one-hot encoded vector.

    # Arguments
        value: float or int. Position of the single "hot" entry; it is
            truncated to an integer before being used as an index.
        n_dim: int. Dimension of the output vector.

    # Returns
        DenseVector of length `n_dim` that is 1.0 at the given position and
        0.0 everywhere else.
    """
    vector = np.zeros(n_dim)
    # NumPy rejects float indices, so convert explicitly: labels frequently
    # arrive as floats (e.g. 1.0).
    vector[int(value)] = 1.0
    return DenseVector(vector)
def toVector(value):
    """
    Convert a value to a MLlib Vector, if possible.

    Vectors are returned unchanged; list-convertible values whose elements
    are all numeric become a DenseVector. Anything else raises TypeError.
    """
    if isinstance(value, Vector):
        return value
    if TypeConverters._can_convert_to_list(value):
        value = TypeConverters.toList(value)
        if all(TypeConverters._is_numeric(v) for v in value):
            return DenseVector(value)
    raise TypeError("Could not convert %s to vector" % value)
def _transform(self, iterator):
    """Return an iterator over the rows with an encoded label column added.

    For every row the value under ``self.input_column`` is passed to
    ``to_vector`` with ``self.output_dim`` and the result is stored as a
    ``DenseVector`` under ``self.output_column``.

    NOTE(review): a ``TypeError`` raised while iterating silently ends the
    processing; the rows converted up to that point are still returned.
    """
    converted = []
    try:
        for row in iterator:
            encoded = to_vector(row[self.input_column], self.output_dim).tolist()
            converted.append(
                new_dataframe_row(row, self.output_column, DenseVector(encoded)))
    except TypeError:
        pass
    return iter(converted)
def getLabeledPrediction(weights, observation):
    """Return the observation's label paired with the model's prediction.

    The prediction is the dot product of the weights and the observation's
    features; the label is passed through untouched so it can be used to
    compute the prediction error later.

    Args:
        weights (np.ndarray): One weight per feature.
        observation (LabeledPoint): Supplies ``label`` and ``features``.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    feature_vector = DenseVector(observation.features)
    weight_vector = DenseVector(weights)
    return (label, weight_vector.dot(feature_vector))
def linregGradientDescent(trainData, numIters):
    """Train a linear regression model with gradient descent.

    The weights start at zero. In every iteration the training RMSE for the
    current weights is recorded, then the weights are moved against the
    summed gradient with a step size of ``1 / (n * sqrt(i + 1))``.

    Args:
        trainData (RDD of LabeledPoint): The labeled training data.
        numIters (int): Number of gradient descent iterations.

    Returns:
        (DenseVector, np.ndarray): The final weights (one per feature) and
            the training RMSE recorded at each iteration.
    """
    num_points = trainData.count()
    num_features = len(trainData.take(1)[0].features)
    w = DenseVector(np.zeros(num_features))
    base_step = 1.0
    # Training error recorded before each weight update.
    errorTrain = np.zeros(numIters)

    for iteration in range(numIters):
        # (label, prediction) pairs for the current weights; with all-zero
        # weights in the first iteration the error starts out large.
        labels_and_predictions = trainData.map(
            lambda lp: getLabeledPrediction(w, lp))
        errorTrain[iteration] = calcRMSE(labels_and_predictions)

        # Sum the per-observation gradient summands into the gradient.
        gradient = DenseVector(
            trainData.map(lambda lp: gradientSummand(w, lp))
                     .reduce(lambda a, b: a + b))

        # Update the weights with a step that shrinks over the iterations.
        step = base_step / (num_points * np.sqrt(iteration + 1))
        w -= step * gradient
    return w, errorTrain
def fonction(line):
    """Parse one Titanic CSV line into model features.

    Reads ``is_train``, ``mean_age``, ``mean_fare`` and
    ``most_frequent_embarked`` from the enclosing scope. Training lines carry
    an extra ``Survived`` column and yield a ``LabeledPoint``; other lines
    yield only the ``DenseVector`` of features.

    Missing or unparsable ages and fares fall back to the respective mean, a
    missing port of embarkation falls back to the most frequent one.
    """
    # Parsing csv line
    values = list(csv.reader(StringIO(line)))[0]
    if is_train:
        (PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket,
         Fare, Cabin, Embarked) = values
    else:
        (PassengerId, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare,
         Cabin, Embarked) = values

    # Sex
    sex_code = 0 if Sex == "male" else 1

    # Age: parse as a float so that fractional ages (e.g. "28.5") are kept;
    # ``str.isdigit`` rejected them and substituted the mean.
    try:
        safe_age = float(Age)
    except ValueError:
        safe_age = mean_age

    # Fare
    try:
        safe_fare = float(Fare)
    except ValueError:
        safe_fare = mean_fare

    # Embarked
    safe_embarked = Embarked if Embarked else most_frequent_embarked
    code_embarked = 0 if safe_embarked == 'S' else 1 if safe_embarked == 'C' else 2  # == 'Q'

    # Title: names look like "Braund, Mr. Owen Harris". Strip the space left
    # after the comma, otherwise no title ever matches and the code is 0.
    title = Name.split(",")[1].split(".")[0].strip()
    code_title = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4}.get(title, 0)

    # Pclass code
    code_pclass = int(Pclass) - 1

    # child
    child_flag = 1 if safe_age <= 6 else 0

    # NOTE(review): ``Parch`` appears twice (4th and last entry); kept as is
    # because removing it would change the feature vector length.
    features_vector = DenseVector([
        sex_code, safe_age, code_pclass,
        int(Parch),
        int(SibSp), safe_fare, code_embarked, code_title, child_flag,
        int(Parch)
    ])
    if is_train:
        return LabeledPoint(float(Survived), features_vector)
    else:
        return features_vector
def _transform(self, row):
    """Appends the desired binary label column.

    The new column holds a two-element ``DenseVector``: ``[1, 0]`` when the
    value under ``self.input_column`` equals ``self.label``, ``[0, 1]``
    otherwise.
    """
    matches = row[self.input_column] == self.label
    one_hot = np.zeros(2)
    # Position 0 marks a match with the label, position 1 a mismatch.
    one_hot[0 if matches else 1] = 1.0
    return new_dataframe_row(row, self.output_column, DenseVector(one_hot))
def gradientSummand(weights, lp):
    """Return the gradient summand for one observation.

    The summand is ``(x . weights - label) * x`` where ``x`` is
    ``lp.features``.

    Args:
        weights (DenseVector): An array of model weights (betas).
        lp (LabeledPoint): The `LabeledPoint` for a single observation.

    Returns:
        DenseVector: The gradient summand.
    """
    features = lp.features
    residual = DenseVector.dot(features, weights) - lp.label
    return residual * features
def getLabeledPrediction(weights, observation):
    """Return the observation's label paired with the model's prediction.

    The label is passed through unchanged so that it can be used to compute
    the prediction error later; the prediction is the dot product of the
    observation's features and the weights.

    Args:
        weights (np.ndarray): One weight per feature.
        observation (LabeledPoint): Supplies ``label`` and ``features``.

    Returns:
        tuple: A (label, prediction) tuple.
    """
    label = observation.label
    prediction = DenseVector.dot(observation.features, weights)
    return (label, prediction)
def test_dot(self):
    """Dot products of sparse and dense vectors with vectors and a matrix."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1.0, 2.0, 3.0, 4.0]))
    lst = DenseVector([1, 2, 3, 4])
    mat = array([[1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0],
                 [1.0, 2.0, 3.0, 4.0]])
    # assertEqual replaces the deprecated alias assertEquals, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(10.0, sv.dot(dv))
    self.assertTrue(array_equal(array([3.0, 6.0, 9.0, 12.0]), sv.dot(mat)))
    self.assertEqual(30.0, dv.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), dv.dot(mat)))
    self.assertEqual(30.0, lst.dot(dv))
    self.assertTrue(array_equal(array([10.0, 20.0, 30.0, 40.0]), lst.dot(mat)))
def test_dot(self):
    """Dot products with vectors, a matrix and a plain ``array.array``."""
    sv = SparseVector(4, {1: 1, 3: 2})
    dv = DenseVector(array([1., 2., 3., 4.]))
    lst = DenseVector([1, 2, 3, 4])
    row = [1., 2., 3., 4.]
    mat = array([row, row, row, row])
    arr = pyarray.array('d', [0, 1, 2, 3])

    # Vector . vector yields a scalar: (expected, left, right).
    scalar_cases = [
        (10.0, sv, dv),
        (30.0, dv, dv),
        (30.0, lst, dv),
        (7.0, sv, arr),
    ]
    for expected, left, right in scalar_cases:
        self.assertEqual(expected, left.dot(right))

    # Vector . matrix yields one value per matrix column.
    matrix_cases = [
        ([3., 6., 9., 12.], sv),
        ([10., 20., 30., 40.], dv),
        ([10., 20., 30., 40.], lst),
    ]
    for expected, vector in matrix_cases:
        self.assertTrue(array_equal(array(expected), vector.dot(mat)))
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed. `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs. You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[28]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
# print(...) with a single argument behaves the same on Python 2 and 3.
numpyVector = np.array([-3, -4, 5])
print("\nnumpyVector:\n{0}".format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])  # <FILL IN>
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(DenseVector(numpyVector))  # <FILL IN>

print("myDenseVector:\n{0}".format(myDenseVector))
print("\ndenseDotProduct:\n{0}".format(denseDotProduct))


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), "myDenseVector is not a DenseVector")
Test.assertTrue(np.allclose(myDenseVector, np.array([3.0, 4.0, 5.0])),
                "incorrect value for myDenseVector")
Test.assertTrue(np.allclose(denseDotProduct, 0.0), "incorrect value for denseDotProduct")
# #### Note that `DenseVector` stores all values as `np.float64`, so even if you pass in a NumPy array of integers, the resulting `DenseVector` will contain floating-point numbers. Also, `DenseVector` objects exist locally and are not inherently distributed. `DenseVector` objects can be used in the distributed setting by either passing functions that contain them to resilient distributed dataset (RDD) transformations or by distributing them directly as RDDs. You'll learn more about RDDs in the Spark tutorial.
# #### For this exercise, create a `DenseVector` consisting of the values `[3.0, 4.0, 5.0]` and compute the dot product of this vector with `numpyVector`.

# In[22]:

from pyspark.mllib.linalg import DenseVector


# In[25]:

# TODO: Replace <FILL IN> with appropriate code
# print(...) with a single argument behaves the same on Python 2 and 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = myDenseVector.dot(numpyVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))


# In[26]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')
# In[97]:

from pyspark.mllib.linalg import DenseVector


# In[98]:

# TODO: Replace <FILL IN> with appropriate code
# print(...) with a single argument behaves the same on Python 2 and 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0,4.0,5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(DenseVector(numpyVector),myDenseVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))


# In[99]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
expectedError = [79.72013547, 30.27835699, 9.27842641, 9.20967856, 9.19446483]
Test.assertTrue(np.allclose(exampleErrorTrain, expectedError),
                'value of exampleErrorTrain is incorrect')


# #### ** (3d) Train the model **
# #### Now let's train a linear regression model on all of our training data and evaluate its accuracy on the validation set. Note that the test set will not be used here. If we evaluated the model on the test set, we would bias our final results.
# #### We've already done much of the required work: we computed the number of features in Part (1b); we created the training and validation datasets and computed their sizes in Part (1e); and, we wrote a function to compute RMSE in Part (2b).

# In[44]:

# TODO: Replace <FILL IN> with appropriate code
numIters = 50
weightsLR0, errorTrainLR0 = linregGradientDescent(parsedTrainData, numIters)

# Predict each validation label as the dot product of weights and features.
labelsAndPreds = parsedValData.map(lambda lp: (lp.label,DenseVector.dot(weightsLR0,lp.features)))
rmseValLR0 = calcRMSE(labelsAndPreds)

# print(...) with a single argument behaves the same on Python 2 and 3.
print('Validation RMSE:\n\tBaseline = {0:.3f}\n\tLR0 = {1:.3f}'.format(rmseValBase,
                                                                       rmseValLR0))


# In[45]:

# TEST Train the model (3d)
expectedOutput = [22.64535883, 20.064699, -0.05341901, 8.2931319, 5.79155768, -4.51008084,
                  15.23075467, 3.8465554, 9.91992022, 5.97465933, 11.36849033, 3.86452361]
Test.assertTrue(np.allclose(weightsLR0, expectedOutput), 'incorrect value for weightsLR0')


# #### ** Visualization 4: Training error **
# In[29]:

from pyspark.mllib.linalg import DenseVector


# In[31]:

# TODO: Replace <FILL IN> with appropriate code
# print(...) with a single argument behaves the same on Python 2 and 3.
numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector(np.array([3.0,4.0,5.0]))
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))


# In[32]:

# TEST PySpark's DenseVector (3c)
Test.assertTrue(isinstance(myDenseVector, DenseVector), 'myDenseVector is not a DenseVector')
Test.assertTrue(np.allclose(myDenseVector, np.array([3., 4., 5.])),
                'incorrect value for myDenseVector')
Test.assertTrue(np.allclose(denseDotProduct, 0.0), 'incorrect value for denseDotProduct')


# ### ** Part 4: Python lambda expressions **
zeros = np.zeros(8)  # returns an array of 8 0s [ 0. 0. 0. 0. 0. 0. 0. 0.]
ones = np.ones(8)  # returns an array of 8 1s [ 1. 1. 1. 1. 1. 1. 1. 1.]
# print(...) with a single argument behaves the same on Python 2 and 3.
print('zeros:\n{0}'.format(zeros))
print('\nones:\n{0}'.format(ones))

zerosThenOnes = np.hstack((zeros,ones))  # notice the "(("
# hstack will return [ 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
zerosAboveOnes = np.vstack((zeros,ones))  # A 2 by 8 array
# vstack in the above example will return [[ 0. 0. 0. 0. 0. 0. 0. 0.]
#                                          [ 1. 1. 1. 1. 1. 1. 1. 1.]]

print('\nzerosThenOnes:\n{0}'.format(zerosThenOnes))
print('\nzerosAboveOnes:\n{0}'.format(zerosAboveOnes))

# When using PySpark, we use DenseVector instead of numpy vector. Example below:
from pyspark.mllib.linalg import DenseVector

numpyVector = np.array([-3, -4, 5])
print('\nnumpyVector:\n{0}'.format(numpyVector))

# Create a DenseVector consisting of the values [3.0, 4.0, 5.0]
myDenseVector = DenseVector([3.0, 4.0, 5.0])
# Calculate the dot product between the two vectors.
denseDotProduct = DenseVector.dot(myDenseVector, numpyVector)  # DenseVector.dot() does the dot product

print('myDenseVector:\n{0}'.format(myDenseVector))
print('\ndenseDotProduct:\n{0}'.format(denseDotProduct))