def test_save_load(self):
    """TrainValidationSplit and TrainValidationSplitModel survive save/load.

    Checks that the estimator, evaluator and param grid keep their uids,
    and that the fitted model keeps its best sub-model.
    """
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    tvsModel = tvs.fit(dataset)
    # Round-trip the (unfitted) validator and compare identifying state.
    tvsPath = temp_path + "/tvs"
    tvs.save(tvsPath)
    loadedTvs = TrainValidationSplit.load(tvsPath)
    self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
    self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
    self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
    # Round-trip the fitted model; the chosen best model must be preserved.
    tvsModelPath = temp_path + "/tvsModel"
    tvsModel.save(tvsModelPath)
    loadedModel = TrainValidationSplitModel.load(tvsModelPath)
    self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
def test_save_load(self):
    """CrossValidator and CrossValidatorModel survive save/load.

    Checks that the estimator, evaluator and param grid keep their uids,
    and that the fitted model keeps its best sub-model.
    """
    temp_path = tempfile.mkdtemp()
    sqlContext = SQLContext(self.sc)
    dataset = sqlContext.createDataFrame(
        [(Vectors.dense([0.0]), 0.0),
         (Vectors.dense([0.4]), 1.0),
         (Vectors.dense([0.5]), 0.0),
         (Vectors.dense([0.6]), 1.0),
         (Vectors.dense([1.0]), 1.0)] * 10,
        ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(dataset)
    # Round-trip the (unfitted) cross-validator and compare identifying state.
    cvPath = temp_path + "/cv"
    cv.save(cvPath)
    loadedCV = CrossValidator.load(cvPath)
    self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
    self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
    self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
    # Round-trip the fitted model; the chosen best model must be preserved.
    cvModelPath = temp_path + "/cvModel"
    cvModel.save(cvModelPath)
    loadedModel = CrossValidatorModel.load(cvModelPath)
    self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
def test_nnclassifier_in_pipeline(self):
    """An NNClassifier composes with an ML Pipeline.

    Guarded to Spark 1.x, where pyspark.mllib.linalg vectors are the
    DataFrame vector type this classifier expects.
    """
    if self.sc.version.startswith("1"):
        from pyspark.mllib.linalg import Vectors
        df = self.sqlContext.createDataFrame(
            [(Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             (Vectors.dense([2.0, 1.0]), 1.0),
             (Vectors.dense([1.0, 2.0]), 2.0),
             ], ["features", "label"])
        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        # The classifier reads the scaler's output column, not raw features.
        classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))\
            .setBatchSize(4) \
            .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")
        pipeline = Pipeline(stages=[scaler, classifier])

        pipelineModel = pipeline.fit(df)
        res = pipelineModel.transform(df)
        assert type(res).__name__ == 'DataFrame'
def _get_train_data(self):
    """Return a tiny (id, features, label) DataFrame for training tests."""
    ctx = SQLContext(self.sc)
    rows = [(row_id, Vectors.dense([1, 2, 3]), target)
            for row_id, target in [(1, 1.0), (2, 0.0), (3, 1.0), (4, 0.0)]]
    return ctx.createDataFrame(rows, ['id', 'features', 'label'])
def test_model_transform(self):
    """ElementwiseProduct scales dense and sparse inputs componentwise."""
    scaling = Vectors.dense([3, 2, 1])
    eprod = ElementwiseProduct(scaling)
    dense_in = Vectors.dense([4, 5, 6])
    self.assertEqual(eprod.transform(dense_in), DenseVector([12, 10, 6]))
    sparse_in = Vectors.sparse(3, [0], [1])
    # Sparse input stays sparse: only index 0 is populated.
    self.assertEqual(eprod.transform(sparse_in), SparseVector(3, [0], [3]))
def test_idf_model(self):
    """IDF fit on 11-term count vectors yields an 11-dimensional idf vector."""
    term_counts = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
    ]
    fitted = IDF().fit(self.sc.parallelize(term_counts, 2))
    self.assertEqual(len(fitted.idf()), 11)
def test_output_columns(self):
    """OneVsRest.transform appends exactly one 'prediction' column."""
    train_df = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base = LogisticRegression(maxIter=5, regParam=0.01)
    fitted = OneVsRest(classifier=base).fit(train_df)
    transformed = fitted.transform(train_df)
    self.assertEqual(transformed.columns, ["label", "features", "prediction"])
def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load a CSV into an RDD of (dense features, label string) pairs.

    Training rows carry their real class with the 'Class_' prefix stripped;
    test rows (which have no trailing label column) get the dummy label "1".
    """
    if shuffle:
        shuffle_csv(csv_file)
    rows = sc.textFile(data_path + csv_file)
    # Drop the header row (first field 'id') and split each line.
    rows = (rows.filter(lambda x: x.split(',')[0] != 'id')
                .map(lambda line: line.split(',')))
    if train:
        rows = rows.map(lambda line: (
            Vectors.dense(np.asarray(line[1:-1]).astype(np.float32)),
            str(line[-1]).replace('Class_', '')))
    else:
        rows = rows.map(lambda line: (
            Vectors.dense(np.asarray(line[1:]).astype(np.float32)), "1"))
    return rows
def remove_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply inverse operations to obtain the original
    series of underlying errors.

    Parameters
    ----------
    ts:
        Time series of observations with this model's characteristics as a
        Numpy array

    returns the time series with removed time-dependent effects as a Numpy array
    """
    # Pre-allocated destination vector the JVM-side implementation fills in.
    destts = Vectors.dense(np.array([0] * len(ts)))
    result = self._jmodel.removeTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)),
                                                     _py2java(self._ctx, destts))
    # Convert the returned JVM vector back into a numpy array.
    return _java2py(self._ctx, result.toArray())
def test_copy(self):
    """copy() with extra params affects the copy but never the source object."""
    train_df = self.spark.createDataFrame(
        [(0.0, Vectors.dense(1.0, 0.8)),
         (1.0, Vectors.sparse(2, [], [])),
         (2.0, Vectors.dense(0.5, 0.5))],
        ["label", "features"])
    base = LogisticRegression(maxIter=5, regParam=0.01)
    ovr = OneVsRest(classifier=base)
    ovr_copy = ovr.copy({base.maxIter: 10})
    # Source keeps its value; the copy picks up the override.
    self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
    self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)
    fitted = ovr.fit(train_df)
    fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
    self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
def create_rows_for_rdd(x):
    """Build a Row from a (meta_data, values) pair.

    :param x: tuple of (meta_data, feature values with a trailing label)
    :return: Row with float label, dense features, and dense meta_data
    """
    values = list(x[1])
    # The last element of the value list is the label; the rest are features.
    target = float(values.pop())
    return Row(label=target,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))
def load_data_frame(csv_file, shuffle=True, train=True):
    """Read a label-first CSV into a DataFrame of (features, category, label).

    Train and test files are mapped identically (in the original the test
    branch simply reused dummy labels of the same shape).
    """
    if shuffle:
        shuffle_csv(csv_file)
    # RDD of split CSV rows with the header line ('label', ...) removed.
    rows = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    rows = (rows.filter(lambda x: x.split(',')[0] != 'label')
                .map(lambda line: line.split(',')))
    # Both branches of the original produced exactly this triple.
    rows = rows.map(lambda line: (
        Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
        'class_' + str(line[0]),
        int(line[0])))
    return sqlcontext.createDataFrame(rows, ['features', 'category', 'label'])
def add_time_dependent_effects(self, ts):
    """
    Given a timeseries, apply a model to it.

    Parameters
    ----------
    ts:
        Time series of i.i.d. observations as a Numpy array

    returns the time series with added time-dependent effects as a Numpy array.
    """
    # Pre-allocated destination vector the JVM-side implementation fills in.
    destts = Vectors.dense([0] * len(ts))
    result = self._jmodel.addTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)),
                                                  _py2java(self._ctx, destts))
    # Convert the returned JVM vector back into a numpy array.
    return _java2py(self._ctx, result.toArray())
def test_glr_summary(self):
    """GeneralizedLinearRegression training summary exposes the expected API."""
    from pyspark.mllib.linalg import Vectors
    df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    glr = GeneralizedLinearRegression(family="gaussian", link="identity",
                                      weightCol="weight", fitIntercept=False)
    model = glr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.predictionCol, "prediction")
    self.assertTrue(isinstance(s.residuals(), DataFrame))
    self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
    coefStdErr = s.coefficientStandardErrors
    self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
    tValues = s.tValues
    self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
    pValues = s.pValues
    self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
    self.assertEqual(s.degreesOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedom, 1)
    self.assertEqual(s.residualDegreeOfFreedomNull, 2)
    self.assertEqual(s.rank, 1)
    self.assertTrue(isinstance(s.solver, basestring))
    self.assertTrue(isinstance(s.aic, float))
    self.assertTrue(isinstance(s.deviance, float))
    self.assertTrue(isinstance(s.nullDeviance, float))
    self.assertTrue(isinstance(s.dispersion, float))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.deviance, s.deviance)
def forecast(self, ts, nfuture):
    """
    Provided fitted values for timeseries ts as 1-step ahead forecasts, based
    on current model parameters, and then provide `nfuture` periods of
    forecast. We assume AR terms prior to the start of the series are equal
    to the model's intercept term (or 0.0, if fit without an intercept term).
    Meanwhile, MA terms prior to the start are assumed to be 0.0. If there is
    differencing, the first d terms come from the original series.

    Parameters
    ----------
    ts:
        Timeseries to use as gold-standard. Each value (i) in the returning
        series is a 1-step ahead forecast of ts(i). We use the difference
        between ts(i) - estimate(i) to calculate the error at time i, which
        is used for the moving average terms. Numpy array.
    nfuture:
        Periods in the future to forecast (beyond length of ts)

    Returns a series consisting of fitted 1-step ahead forecasts for
    historicals and then `nfuture` periods of forecasts. Note that in the
    future values error terms become zero and prior predictions are used for
    any AR terms.
    """
    # Marshal the series across the Py4J gateway and delegate to the JVM model.
    jts = _py2java(self._ctx, Vectors.dense(ts))
    jfore = self._jmodel.forecast(jts, nfuture)
    return _java2py(self._ctx, jfore)
def to_vector(np_array):
    """Convert a one-dimensional numpy array to an MLlib dense Vector.

    Raises ValueError for arrays of any other rank. (The original raised a
    bare Exception; ValueError is more specific and is still caught by any
    existing `except Exception` callers.)
    """
    if len(np_array.shape) == 1:
        return Vectors.dense(np_array)
    else:
        raise ValueError("""An MLLib Vector can only be created from a one-dimensional numpy array""")
def test_persistence(self):
    # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([
        [1, Vectors.dense([0.0, 1.0])],
        [2, Vectors.sparse(2, {0: 1.0})],
    ], ["id", "features"])
    # Fit model
    lda = LDA(k=2, seed=1, optimizer="em")
    distributedModel = lda.fit(df)
    self.assertTrue(distributedModel.isDistributed())
    localModel = distributedModel.toLocal()
    self.assertFalse(localModel.isDistributed())
    # Define paths
    path = tempfile.mkdtemp()
    lda_path = path + "/lda"
    dist_model_path = path + "/distLDAModel"
    local_model_path = path + "/localLDAModel"
    # Test LDA
    lda.save(lda_path)
    lda2 = LDA.load(lda_path)
    self._compare(lda, lda2)
    # Test DistributedLDAModel
    distributedModel.save(dist_model_path)
    distributedModel2 = DistributedLDAModel.load(dist_model_path)
    self._compare(distributedModel, distributedModel2)
    # Test LocalLDAModel
    localModel.save(local_model_path)
    localModel2 = LocalLDAModel.load(local_model_path)
    self._compare(localModel, localModel2)
    # Clean up the temp dir; best-effort, failures here should not fail the test.
    try:
        rmtree(path)
    except OSError:
        pass
def save_pca_parameters(pca_model, data_dim):
    """Dump the PCA projection to PCA_OUT_PATH as a text matrix.

    There is no direct accessor for the loadings in this API, so we feed an
    identity matrix through the model: the projection of basis vector e_i
    recovers row i of the transform.
    """
    # since there's no good way of doing it in python, simply use an I matrix to retrieve
    features = [(Vectors.dense(x),) for x in np.eye(data_dim).tolist()]
    params = pca_embed(sqlContext.createDataFrame(features, ('features',)), pca_model)
    np.savetxt(PCA_OUT_PATH,
               np.matrix(params.select('pca').rdd.map(lambda r: r[0]).collect()),
               fmt='%.6f')
def transformToNumeric(inputStr):
    """Select columns 28, 4, 5, 6, 7 of a CSV row as a dense feature vector."""
    cols = inputStr.split(",")
    # Fixed column order matters: index 28 first, then 4 through 7.
    selected = (cols[28], cols[4], cols[5], cols[6], cols[7])
    return Vectors.dense([float(v) for v in selected])
def buildLabeledPoint(s, classification):
    """LabeledPoint from record s: all *_1 attributes, then all *_2 attributes."""
    feats = [getattr(s, attr + suffix)
             for suffix in ('_1', '_2')
             for attr in attributes]
    return LabeledPoint(classification, Vectors.dense(feats))
def test_logistic_regression_summary(self):
    """LogisticRegression training summary exposes the expected API."""
    from pyspark.mllib.linalg import Vectors
    sqlContext = SQLContext(self.sc)
    df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                     (0.0, 2.0, Vectors.sparse(1, [], []))],
                                    ["label", "weight", "features"])
    lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight",
                            fitIntercept=False)
    model = lr.fit(df)
    self.assertTrue(model.hasSummary)
    s = model.summary
    # test that api is callable and returns expected types
    self.assertTrue(isinstance(s.predictions, DataFrame))
    self.assertEqual(s.probabilityCol, "probability")
    self.assertEqual(s.labelCol, "label")
    self.assertEqual(s.featuresCol, "features")
    objHist = s.objectiveHistory
    self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
    self.assertGreater(s.totalIterations, 0)
    self.assertTrue(isinstance(s.roc, DataFrame))
    self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
    self.assertTrue(isinstance(s.pr, DataFrame))
    self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
    self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
    self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
    # test evaluation (with training dataset) produces a summary with same values
    # one check is enough to verify a summary is returned, Scala version runs full test
    sameSummary = model.evaluate(df)
    self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
def iniKM():
    """Load space-separated numeric rows into a DataFrame of dense feature vectors."""
    conf = SparkConf().set("spark.driver.maxResultSize", "5g")
    sc = SparkContext(conf=conf)
    # SQLContext must exist for toDF() to work on the RDD below.
    sqlContext = SQLContext(sc)
    raw = sc.textFile("/mnt/yi-ad-proj/reduced_data/reduced_data") \
            .map(lambda x: x.split(" ")).cache()
    numeric = raw.map(lambda x: [float(y) for y in x])
    return numeric.map(lambda x: Row(features=Vectors.dense(x))).toDF()
def test_int_to_float(self):
    """Integer-valued float params (elasticNetParam=0) fit without error."""
    from pyspark.mllib.linalg import Vectors
    train_df = self.sc.parallelize([
        Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
    # Set via constructor, then again via setter; both paths must coerce.
    estimator = LogisticRegression(elasticNetParam=0)
    estimator.fit(train_df)
    estimator.setElasticNetParam(0)
    estimator.fit(train_df)
def log_likelihood(self, ts):
    """
    Returns the log likelihood of the parameters on the given time series.

    Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
    """
    # Delegate to the JVM model; ts crosses the gateway as a dense vector.
    likelihood = self._jmodel.logLikelihood(_py2java(self._ctx, Vectors.dense(ts)))
    return _java2py(self._ctx, likelihood)
def iniPCA():
    """Load (label, features...) rows into a DataFrame of labeled dense vectors."""
    conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    data = sc.textFile("/mnt/yi-ad-proj/compressed_data/compressed_data").map(lambda x: x.split(" ")).cache()
    # Each record becomes (label, [float features...]).
    data = data.map(lambda x: (x[0], [float(y) for y in x[1:]]))
    # BUG FIX: build the feature vector from the numeric part x[1] only;
    # the original passed the whole (label, features) tuple to Vectors.dense
    # (compare the correct pattern in iniKM).
    df = data.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()
    return df
def count_words(record, vocabulary):
    """Bag-of-words featurize a record against a fixed vocabulary order."""
    counts = Counter(record['words'])
    # Iterating the shared vocabulary gives every record the same layout;
    # Counter returns 0 for absent words.
    vec = [counts[word] for word in vocabulary]
    return LabeledPoint(record['label'], Vectors.dense(vec))
def parseLine(line):
    """Parse a 'label,f1,f2,...' CSV line into a LabeledPoint.

    The feature values are materialized in a list: on Python 3 the original
    `map(float, ...)` returned a lazy map object, which Vectors.dense does
    not handle correctly.
    """
    parts = line.split(',')
    label = float(parts[0])
    feature_vector = [float(feat) for feat in parts[1:]]
    features = Vectors.dense(feature_vector)
    return LabeledPoint(label, features)
def parse_line(line, perc_keep=1.0, invalid_feat=None):
    """Parse a comma-separated line into a 1-tuple of a dense vector.

    Each non-blank line is kept with probability perc_keep (dropped lines
    return None implicitly). Blank or unparseable lines yield invalid_feat.
    """
    if len(line.strip()) == 0:
        return invalid_feat
    if np.random.rand() < perc_keep:
        try:
            feature = (Vectors.dense([float(x) for x in line.split(',')]),)
            return feature
        except Exception:
            # BUG FIX: was `except Exception, e` -- Python-2-only syntax
            # (SyntaxError on Python 3); the bound exception was unused.
            return invalid_feat
def pre_process(datetime, rdd):
    """Flatten a batch of serialized point lists into dense vectors and log timing."""
    # print (str(type(time)) + " " + str(type(rdd)))
    start = time.time()
    # p[1] holds a string representation of a list of points; eval() parses it.
    # NOTE(review): eval on stream payloads is unsafe for untrusted input --
    # consider ast.literal_eval instead.
    points = rdd.map(lambda p: p[1]).flatMap(lambda a: eval(a)).map(lambda a: Vectors.dense(a))
    end_preproc = time.time()
    # count() forces evaluation so the timing above is meaningful.
    count = points.count()
    output_file.write("KMeans PreProcess, %d, %d, %s, %.5f\n" % (spark_cores, count, NUMBER_PARTITIONS, end_preproc - start))
    output_file.flush()
    return points
def streamingKMeansDataGenerator(self, batches, numPoints, k, d, r, seed,
                                 centers=None):
    """Generate `batches` batches of `numPoints` d-dimensional points
    scattered (radius r) around k centers.

    BUG FIX: honor a caller-supplied `centers` argument; the original
    accepted the parameter but unconditionally overwrote it with freshly
    sampled centers. Callers that pass nothing see identical behavior.
    """
    rng = random.RandomState(seed)
    if centers is None:
        centers = [rng.randn(d) for i in range(k)]
    return centers, [[Vectors.dense(centers[j % k] + r * rng.randn(d))
                      for j in range(numPoints)]
                     for i in range(batches)]
# 将若干文本文件读取为TF向量 sc = SparkContext('local') rdd = sc.wholeTextFiles('P51FeatureExtraction.py').map( lambda text: text[1].split()) tfVectors = tf.transform(rdd) # 对整个RDD进行转化操作 for v in tfVectors.collect(): print(v) # 在 Python 中使用 TF-IDF idf = IDF() idfModel = idf.fit(tfVectors) tfIdVectors = idfModel.transform(tfVectors) print(tfIdVectors) for v in tfIdVectors.collect(): print(v) # 在 Python 中缩放向量 print('--在 Python 中缩放向量--') from pyspark.mllib.linalg import Vectors from pyspark.mllib.feature import StandardScaler vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])] dataset = sc.parallelize(vectors) print(dataset) scaler = StandardScaler(withMean=True, withStd=True) model = scaler.fit(dataset) result = model.transform(dataset) print(result) for v in result.collect(): print(v)
# Minimal NaiveBayes example: train on two labeled dense vectors, predict one.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext('local')
denseVec1 = LabeledPoint(1.0, Vectors.dense([3.0, 5.0, 1.0]))
denseVec2 = LabeledPoint(0.0, Vectors.dense([2.0, 0.0, 1.0]))
vectors = [denseVec1, denseVec2]
dataset = sc.parallelize(vectors)
print(dataset)
model = NaiveBayes.train(dataset)
denseVec = Vectors.dense([1.5, 2.0, 3.0])
# theta is the matrix of per-class log feature probabilities.
print("predict: %s, theta: %s\n" % (model.predict(denseVec), model.theta))
# predict: 0.0, theta: [[-0.69314718 -1.79175947 -1.09861229]
#                       [-1.09861229 -0.69314718 -1.79175947]]
# See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function # $example on$ from pyspark.ml.feature import PCA from pyspark.mllib.linalg import Vectors # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("PCAExample")\ .getOrCreate() # $example on$ data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ), (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ), (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )] df = spark.createDataFrame(data, ["features"]) pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures") model = pca.fit(df) result = model.transform(df).select("pcaFeatures") result.show(truncate=False) # $example off$ spark.stop()
def build_vectors(data_user_info):
    """Map every user record in the RDD to an MLlib dense vector."""
    return data_user_info.map(Vectors.dense)
# Map each incident row to its nearest station, then count incidents
# per station/hour via SQL.
withstations = tidy.rdd.map(lambda row: Row(station=map_yx_to_station(row.yx),
                                            datehour=row.datehour)).toDF()
withstations.registerTempTable('stationincidents')
incidentcount = sqlc.sql(
    "select station, datehour, count(1) as incidents from stationincidents group by station, datehour"
)
print("we now have incidents by station/hour in incidentcount")
incidentcount.show(10)

# now join the two tables
joined = cleanedaverages.join(incidentcount, ['station', 'datehour'], 'outer')

# if incident data doesn't exist for that station/datehour, then it is 0
zeroed = joined.rdd.map(lambda row: Row(station=row.station,
                                        datehour=row.datehour,
                                        temp=row.temp,
                                        wind=row.wind,
                                        incidents=row.incidents if row.incidents else 0)).toDF()

# if temp/wind data doesn't exist for that station/datehour, then we can't use that row
final = zeroed.filter(zeroed.temp.isNotNull()).filter(
    zeroed.wind.isNotNull()).filter(zeroed.temp != 0)

# finally apply correlation test over (temp, wind, incidents) triples
vecs = final.rdd.map(
    lambda row: Vectors.dense([row.temp, row.wind, row.incidents]))
print(Statistics.corr(vecs))
def extract(x):
    """Return the second element of the pair as an MLlib dense vector."""
    payload = x[1]
    return Vectors.dense(payload)
join = beatXyear.leftOuterJoin(crime_MR) # replace None to 0 join = join.map(lambda x: (x[0],0) if x[1][1] is None else (x[0],x[1][1])) ## join.collect() # ((beat, year), # of crimes) # change format -> (year, (beat, crime)) join = join.map(lambda x: (x[0][1], (x[0][0],x[1]))) # groupby year crimeYear = join.groupByKey() # sort by beat within value # crimeYear = crimeYear.map(lambda x: (x[0], sorted(x[1], key=lambda y: y[0]))) # drop beat column -> (year, (crime #1, #2, #3 ,...)) crimeYear = crimeYear.map(lambda x: (x[0], [y[1] for y in x[1]])) # vectors -> year = row, beat = col crimeVector = crimeYear.map(lambda x: Vectors.dense(x[1])) # compute correlation matrix corr = Statistics.corr(crimeVector) ## corr.shape # (303,303) # correlation dictionary function of the lower trinagle in the corr matrix def corrDictFunc(corr): # output = ((row, col): corr value) # corr.shape[0] = corr.shape[1]= 303 return dict(((i,j), corr[i][j]) for i in range(corr.shape[0]) for j in range(corr.shape[0]) if i<j) corrDict = corrDictFunc(corr) # find beats with highest correlations def topK(beat, index, k):
from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.mllib.linalg import Vectors
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="ChiSqSelectorExample")
    sqlContext = SQLContext(sc)

    # $example on$
    # Three rows of four features plus a binary 'clicked' label.
    df = sqlContext.createDataFrame([(
        7,
        Vectors.dense([0.0, 0.0, 18.0, 1.0]),
        1.0,
    ), (
        8,
        Vectors.dense([0.0, 1.0, 12.0, 0.0]),
        0.0,
    ), (
        9,
        Vectors.dense([1.0, 0.0, 15.0, 0.1]),
        0.0,
    )], ["id", "features", "clicked"])

    # Keep the single feature most associated (chi-squared) with 'clicked'.
    selector = ChiSqSelector(numTopFeatures=1, featuresCol="features",
                             outputCol="selectedFeatures", labelCol="clicked")
from pyspark.mllib.linalg import Vectors


def mapper(x):
    """Map a residual string to (0, |value|) so all rows share one key."""
    return 0, abs(float(x))


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: spark-submit lr.py <input residuals file> <output file>",
              file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Chi Squared residuals")
    lines = sc.textFile(sys.argv[1], 1)
    collected = lines.map(mapper).collect()
    # Strip the dummy key, keeping only the absolute residuals.
    residuals = [pair[1] for pair in collected]
    vec = Vectors.dense(residuals)
    gft = Statistics.chiSqTest(vec)
    print("%s\n" % gft)
    sc.stop()
from pyspark.mllib.linalg import Vectors, Matrices
# BUG FIX: LabeledPoint lives in pyspark.mllib.regression;
# `pyspark.feature` does not exist, so the original import failed.
from pyspark.mllib.regression import LabeledPoint

# A plain Python list, its MLlib dense-vector form, and a labeled example.
vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
spark_vector = Vectors.dense(vector)
label = 45.0
labeled_point = LabeledPoint(label, vector)
# Column-major 3x2 dense matrix built from the same six values.
spark_matrix = Matrices.dense(3, 2, vector)
# Getting the data structure and scaling spark_df = sc.parallelize( spark.read.json("Data/yelp_academic_dataset_business.json").select( "stars", "review_count", "is_open").take(1700)) scaler = StandardScaler(inputCol="_1",\ outputCol="scaled_1") trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map( lambda x: (x, )).toDF() scalerModel = scaler.fit(trial_df) vec_df = spark.createDataFrame( scalerModel.transform(trial_df).select("scaled_1").rdd.map( lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2])))) # Create RowMatrix from the transpose of spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect()) mat = RowMatrix(vector_df) bun = mat.rows.collect() num_clusters = 3 pre = sc.parallelize(mat.columnSimilarities().entries.map( lambda e: (e.i, e.j, e.value)).collect()) model = PowerIterationClustering.train(pre, 3, 20, "random") err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][ x.id], bun[2][x.id]), x.cluster)).collect() # Silhoutte value ag = 0 agi = 1700 for er in err: avg = [0] * num_clusters
# Peek at one record of the dataset.
x = DataSetRDD.take(1)
x

# COMMAND ----------

# The last element of each record is the target value.
x[0][-1]

# COMMAND ----------

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors

# Last column is the label; everything before it is the feature vector.
labeledPoints = DataSetRDD.map(
    lambda x: LabeledPoint(x[-1], Vectors.dense(x[:-1])))
(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3])

# COMMAND ----------

labeledPoints.collect()

# COMMAND ----------

# Train a small regression forest (call continues beyond this chunk).
model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    numTrees=3,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=4,
def textToVector(x):
    """Parse "(id,DenseVector(v1,v2,...))"-style text back into (id, vector)."""
    cleaned = str(x)
    # 'DenseVector' contains no parentheses, so replacement order is irrelevant.
    for token in ('(', ')', 'DenseVector'):
        cleaned = cleaned.replace(token, '')
    fields = cleaned.split(',')
    return int(fields[0]), Vectors.dense(fields[1:])
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .getOrCreate()

    # The same values expressed as a Python list, a NumPy array and an
    # MLlib Vector, plus a sparse representation.
    dense_vector1 = [1.0, 0.0, 3.5, 0.0, 5.1]
    dense_vector2 = np.array([1.0, 0.0, 3.5, 0.0, 5.1])
    dense_vector3 = Vectors.dense([1.0, 0.0, 3.5, 0.0, 5.1])
    sparse_vector = Vectors.sparse(5, [0, 2, 4], [1.0, 3.5, 5.1])

    print("Vector 1 (Python list) : " + str(dense_vector1))
    print("Vector 2 (NumPy Array) : " + str(dense_vector2))
    print("Vector 3 (Vectors) : " + str(dense_vector3))
    # BUG FIX: this label previously read "Vector 1 (Vectors)" although the
    # line prints the sparse vector.
    print("Sparse vector (Vectors): " + str(sparse_vector))

    labeled_point = LabeledPoint(1.0, dense_vector1)
    labeled_point2 = LabeledPoint(0.0, Vectors.sparse(5, [2, 4], [5.2, 6.2]))
    print("Labeled point (Python list): " + str(labeled_point))
    print("Labeled point (Sparse vector): " + str(labeled_point2))

    sparkSession.stop()
                                       method='pearson')))

## sampling
# sampling methods can be performed on RDD's of key-value pairs
data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'),
                       (3, 'f')])
# Per-key sampling fractions: key -> probability of keeping each element.
fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)

## hypothesis testing
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)

# compute goodness of fit. either compare two vectors to each other or
# compare one vector to a uniform distribution
goodnessOfFitTestResults = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoint/vector
# pairs. The resulting test returns a Chi-squared test results for every
# feature against the label (list literal continues beyond this chunk)
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.clustering import StreamingKMeans
from pyspark.streaming import StreamingContext
from pyspark.mllib.clustering import KMeans
# BUG FIX: Vectors is used below but was never imported.
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

# we make an input stream of vectors for training,
# as well as a stream of vectors for testing
sc = SparkContext()
ssc = StreamingContext(sc, 5)

# Columns 2..-2 of each CSV row are the numeric features.
trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))
# Batch KMeans provides the initial centers for the streaming model.
centers = KMeans.train(trainingData, 2).centers

trainingQueue = [trainingData]
trainingStream = ssc.queueStream(trainingQueue)

# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)  # .setRandomCenters(5, 1.0, 0)
model.setInitialCenters(centers, [1.0, 1.0, 1.0, 1.0, 1.0])

# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
def build(user):
    """LabeledPoint with user[0] as the label and user[1] as dense features."""
    target = float(user[0])
    return LabeledPoint(target, Vectors.dense(user[1]))
def GetParts(line):
    """CSV row -> LabeledPoint(label=column 4, features=columns 1-3)."""
    cols = line.split(',')
    feats = Vectors.dense(float(cols[1]), float(cols[2]), float(cols[3]))
    return LabeledPoint(float(cols[4]), feats)
# Column summary statistics of the dataset.
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

# correlation between two random series
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

# Chi-square
# For Vector: observed vs expected frequencies
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices: independence test on a contingency matrix
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
def parse(lp):
    """Parse "(label,[v1,v2,...])"-style text into a LabeledPoint."""
    target = float(lp[lp.find('(') + 1:lp.find(',')])
    coords = lp[lp.find('[') + 1:lp.find(']')].split(',')
    return LabeledPoint(target, Vectors.dense(coords))
if __name__ == "__main__": sc = SparkContext(appName="StandardScalerExample") # SparkContext # $example on$ data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") label = data.map(lambda x: x.label) features = data.map(lambda x: x.features) scaler1 = StandardScaler().fit(features) scaler2 = StandardScaler(withMean=True, withStd=True).fit(features) # data1 will be unit variance. data1 = label.zip(scaler1.transform(features)) # Without converting the features into dense vectors, transformation with zero mean will raise # exception on sparse vector. # data2 will be unit variance and zero mean. data2 = label.zip( scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray())))) # $example off$ print("data1:") for each in data1.collect(): print(each) print("data2:") for each in data2.collect(): print(each) sc.stop()
def f1(line):
    """Space-separated coordinates -> dense vector; empty tokens are skipped."""
    tokens = line.split(" ")
    return Vectors.dense([float(tok) for tok in tokens if len(tok) > 0])
def parseLine(line):
    """CSV row whose last column is the label -> LabeledPoint."""
    cols = line.split(',')
    target = float(cols[-1])
    feats = Vectors.dense([float(v) for v in cols[:-1]])
    return LabeledPoint(target, feats)
if __name__ == "__main__": sc = SparkContext(appName="StreamingKMeansExample") # SparkContext ssc = StreamingContext(sc, 1) # $example on$ # we make an input stream of vectors for training, # as well as a stream of vectors for testing def parse(lp): label = float(lp[lp.find('(') + 1:lp.find(')')]) vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(',')) return LabeledPoint(label, vec) trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\ .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')])) testingData = sc.textFile( "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt" ).map(parse) trainingQueue = [trainingData] testingQueue = [testingData] trainingStream = ssc.queueStream(trainingQueue) testingStream = ssc.queueStream(testingQueue) # We create a model with random clusters and specify the number of clusters to find model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0) # Now register the streams for training and testing and start the job,
# distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function # $example on$ from pyspark.ml.feature import PolynomialExpansion from pyspark.mllib.linalg import Vectors # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate() # $example on$ df = spark\ .createDataFrame([(Vectors.dense([-2.0, 2.3]),), (Vectors.dense([0.0, 0.0]),), (Vectors.dense([0.6, -1.1]),)], ["features"]) px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures") polyDF = px.transform(df) for expanded in polyDF.select("polyFeatures").take(3): print(expanded) # $example off$ spark.stop()
# Print out the cluster of each data point
print(model.predict(array([0.0, 0.0])))
print(model.predict(array([1.0, 1.0])))
print(model.predict(array([9.0, 8.0])))
print(model.predict(array([8.0, 0.0])))

# Standardizes features by removing the mean and scaling to unit variance
# using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()
vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]
dataset = sc.parallelize(vs)

# all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)  # BUG FIX: was `print r`, Python-2-only statement syntax
print("\n")

# deducts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
def test_append_bias_with_vector(self):
    """appendBias adds a trailing 1.0 and preserves the DenseVector type."""
    vec = Vectors.dense([2.0, 2.0, 2.0])
    biased = MLUtils.appendBias(vec)
    self.assertEqual(biased[3], 1.0)
    self.assertEqual(type(biased), DenseVector)
# Sample raw trips for clustering pickup locations.
registerTable(sqlCtx, Table.RAW_DATA_SAMPLE)
df = spark.sql("SELECT pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, id FROM trips_sample")

print("Done, now starting K-means with K={}".format(int(CLUSTERING_SIZE / K_MEANS_FACTOR)))

# Now we compute the clusters. This might take a while.
#km = KMeans(int(CLUSTERING_SIZE / K_MEANS_FACTOR))
clusters = KMeans.train(df.select("pickup_longitude", "pickup_latitude")
                        .rdd.sample(False, (CLUSTERING_SIZE*100) / SIZE_ESTIMATE)
                        .map(lambda row: Vectors.dense(row["pickup_longitude"],
                                                       row["pickup_latitude"])),
                        int(CLUSTERING_SIZE / K_MEANS_FACTOR), 1000)

print("K-means is done, clearing any existing data...")

# Clean the database before proceeding
subprocess.call(["hadoop", "fs", "-rm", "-r", "-f", "/user/csit7/ride_clusters_sample"])
subprocess.call(["hadoop", "fs", "-rm", "-r", "-f", "/user/csit7/cluster_data_sample"])

print("Done, initiating refill")

# Now we first refill cluster_data, since we will need its ids later
#for c in zip(set(kmData.tolist()), centroids):
#    cur.execute("INSERT INTO taxi.cluster_data(cluster_id, centroid_long, centroid_lat) VALUES (%s, %s, %s)",
# See the License for the specific language governing permissions and # limitations under the License. # from __future__ import print_function # $example on$ from pyspark.ml.feature import DCT from pyspark.mllib.linalg import Vectors # $example off$ from pyspark.sql import SparkSession if __name__ == "__main__": spark = SparkSession.builder.appName("DCTExample").getOrCreate() # $example on$ df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ), (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ), (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )], ["features"]) dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT") dctDf = dct.transform(df) for dcts in dctDf.select("featuresDCT").take(3): print(dcts) # $example off$ spark.stop()
# $example off$ from pyspark.sql import SparkSession """ A simple example demonstrating a bisecting k-means clustering. """ if __name__ == "__main__": spark = SparkSession\ .builder\ .appName("PythonBisectingKMeansExample")\ .getOrCreate() # $example on$ data = spark.read.text("data/mllib/kmeans_data.txt").rdd parsed = data\ .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')]))) training = spark.createDataFrame(parsed) kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features") model = kmeans.fit(training) # Evaluate clustering cost = model.computeCost(training) print("Bisecting K-means Cost = " + str(cost)) centers = model.clusterCenters() print("Cluster Centers: ") for center in centers: print(center) # $example off$