Example #1
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvsModel = tvs.fit(dataset)
     tvsPath = temp_path + "/tvs"
     tvs.save(tvsPath)
     loadedTvs = TrainValidationSplit.load(tvsPath)
     self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
     self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
     self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())
     tvsModelPath = temp_path + "/tvsModel"
     tvsModel.save(tvsModelPath)
     loadedModel = TrainValidationSplitModel.load(tvsModelPath)
     self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example #2
 def test_save_load(self):
     temp_path = tempfile.mkdtemp()
     sqlContext = SQLContext(self.sc)
     dataset = sqlContext.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
     evaluator = BinaryClassificationEvaluator()
     cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     cvModel = cv.fit(dataset)
     cvPath = temp_path + "/cv"
     cv.save(cvPath)
     loadedCV = CrossValidator.load(cvPath)
     self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
     self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
     self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())
     cvModelPath = temp_path + "/cvModel"
     cvModel.save(cvModelPath)
     loadedModel = CrossValidatorModel.load(cvModelPath)
     self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
    def test_nnclassifier_in_pipeline(self):

        if self.sc.version.startswith("1"):
            from pyspark.mllib.linalg import Vectors

            df = self.sqlContext.createDataFrame(
                [(Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 (Vectors.dense([2.0, 1.0]), 1.0),
                 (Vectors.dense([1.0, 2.0]), 2.0),
                 ], ["features", "label"])

            scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
            model = Sequential().add(Linear(2, 2))
            criterion = ClassNLLCriterion()
            classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))\
                .setBatchSize(4) \
                .setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

            pipeline = Pipeline(stages=[scaler, classifier])

            pipelineModel = pipeline.fit(df)

            res = pipelineModel.transform(df)
            assert type(res).__name__ == 'DataFrame'
 def _get_train_data(self):
     sql_context = SQLContext(self.sc)
     l = [
         (1, Vectors.dense([1, 2, 3]), 1.0),
         (2, Vectors.dense([1, 2, 3]), 0.0),
         (3, Vectors.dense([1, 2, 3]), 1.0),
         (4, Vectors.dense([1, 2, 3]), 0.0),
     ]
     return sql_context.createDataFrame(l, ['id', 'features', 'label'])
Example #5
    def test_model_transform(self):
        weight = Vectors.dense([3, 2, 1])

        densevec = Vectors.dense([4, 5, 6])
        sparsevec = Vectors.sparse(3, [0], [1])
        eprod = ElementwiseProduct(weight)
        self.assertEqual(eprod.transform(densevec), DenseVector([12, 10, 6]))
        self.assertEqual(
            eprod.transform(sparsevec), SparseVector(3, [0], [3]))
Example #6
 def test_idf_model(self):
     data = [
         Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
         Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
         Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
         Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9])
     ]
     model = IDF().fit(self.sc.parallelize(data, 2))
     idf = model.idf()
     self.assertEqual(len(idf), 11)
Example #7
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "prediction"])
def load_data_rdd(csv_file, shuffle=True, train=True):
    if shuffle:
        shuffle_csv(csv_file)
    data = sc.textFile(data_path + csv_file)
    data = data.filter(lambda x:x.split(',')[0] != 'id').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:-1]).astype(np.float32)),
            str(line[-1]).replace('Class_', '')) )
    else:
        data = data.map(lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)), "1") )
    return data
 def remove_time_dependent_effects(self, ts):
     """
     Given a time series, apply inverse operations to obtain the original series of
     underlying errors.

     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics, as a NumPy array

     Returns the time series with time-dependent effects removed, as a NumPy array.
     """
     destts = Vectors.dense(np.array([0] * len(ts)))
     result =  self._jmodel.removeTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
Example #10
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #11
def create_rows_for_rdd(x):
    """

    :param x:
    :return:
    """
    features = list(x[1])
    l = len(features) - 1
    label = float(features.pop(l))
    meta_data = x[0]
    return Row(label=label,
               features=Vectors.dense(features),
               meta_data=Vectors.dense(meta_data))
Example #12
def load_data_frame(csv_file, shuffle=True, train=True):
    if shuffle:
        shuffle_csv(csv_file)
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file) # This is an RDD, which will later be transformed to a data frame
    data = data.filter(lambda x:x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_'+str(line[0]),int(line[0])) )
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = data.map( lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),'class_'+str(line[0]),int(line[0])) ) 
    return sqlcontext.createDataFrame(data, ['features', 'category','label'])
Example #13
 def add_time_dependent_effects(self, ts):
     """
     Given a time series, apply the model to it.

     Parameters
     ----------
     ts:
         Time series of i.i.d. observations, as a NumPy array

     Returns the time series with time-dependent effects added, as a NumPy array.
     """
     destts = Vectors.dense([0] * len(ts))
     result =  self._jmodel.addTimeDependentEffects(_py2java(self._ctx, Vectors.dense(ts)), _py2java(self._ctx, destts))
     return _java2py(self._ctx, result.toArray())
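A minimal usage sketch (not from the original snippet): it assumes `model` is a fitted spark-ts time-series model exposing the two methods above, and uses made-up data to show that the two calls are inverses of each other.

import numpy as np

# `model` is a hypothetical fitted spark-ts model object
errors = np.random.normal(size=100)                       # i.i.d. noise
series = model.add_time_dependent_effects(errors)         # impose the model's structure
recovered = model.remove_time_dependent_effects(series)   # invert it again
# `recovered` should be close to the original `errors`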
Example #14
 def test_glr_summary(self):
     from pyspark.mllib.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertTrue(isinstance(s.solver, basestring))
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example #15
 def forecast(self, ts, nfuture):
     """
     Provides fitted values for time series ts as 1-step-ahead forecasts, based on the
     current model parameters, and then provides `nFuture` periods of forecast. We assume
     AR terms prior to the start of the series are equal to the model's intercept term
     (or 0.0, if fit without an intercept term). Meanwhile, MA terms prior to the start
     are assumed to be 0.0. If there is differencing, the first d terms come from the
     original series.
    
     Parameters
     ----------
     ts:
         Time series to use as the gold standard. Each value (i) in the returned series
         is a 1-step-ahead forecast of ts(i). We use the difference ts(i) - estimate(i)
         to calculate the error at time i, which is used for the moving average terms.
         NumPy array.
     nFuture:
         Periods in the future to forecast (beyond length of ts)
         
     Returns a series consisting of fitted 1-step-ahead forecasts for the historical points,
     followed by `nFuture` periods of forecasts. Note that for the future values the error
     terms become zero and prior predictions are used for any AR terms.
     
     """
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jfore = self._jmodel.forecast(jts, nfuture)
     return _java2py(self._ctx, jfore)
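A hedged usage sketch of `forecast` (the values are illustrative and `model` is assumed to be a fitted spark-ts model):

import numpy as np

history = np.array([112.0, 118.0, 132.0, 129.0, 121.0, 135.0])  # made-up observed series
result = model.forecast(history, 3)
# `result` holds one fitted 1-step-ahead value per historical point,
# followed by 3 forecast periods, i.e. len(history) + 3 values in total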
Example #16
def to_vector(np_array):
    ''' Convert numpy array to MLlib Vector '''
    if len(np_array.shape) == 1:
        return Vectors.dense(np_array)
    else:
        raise Exception("""An MLLib Vector can only be created
                        from a one-dimensional numpy array""")
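A quick usage sketch of the helper above (only NumPy is assumed in addition to the code shown):

import numpy as np

v = to_vector(np.array([1.0, 2.0, 3.0]))   # DenseVector([1.0, 2.0, 3.0])
try:
    to_vector(np.zeros((2, 2)))            # anything that is not 1-D is rejected
except Exception as exc:
    print(exc)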
Example #17
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #18
def save_pca_parameters(pca_model, data_dim):
    # since there's no good way of doing it in Python, simply push an identity matrix
    # through the PCA model to retrieve its parameters
    features = [(Vectors.dense(x),) for x in np.eye(data_dim).tolist()]
    params = pca_embed(sqlContext.createDataFrame(features, ('features',)), pca_model)
    np.savetxt(PCA_OUT_PATH,
               np.matrix(params.select('pca').rdd.map(lambda r: r[0]).collect()),
               fmt='%.6f')
Example #19
def transformToNumeric(inputStr):
    attList = inputStr.split(",")
    values = Vectors.dense([float(attList[28]),
                            float(attList[4]), float(attList[5]),
                            float(attList[6]), float(attList[7])])
    return values
def buildLabeledPoint(s, classification):
    features=[]
    for attr in attributes:
        features.append(getattr(s, attr + '_1'))
    for attr in attributes:
        features.append(getattr(s, attr + '_2'))
    return LabeledPoint(classification,Vectors.dense(features))
Example #21
 def test_logistic_regression_summary(self):
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #22
def iniKM():
	conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)
	data = sc.textFile("/mnt/yi-ad-proj/reduced_data/reduced_data").map(lambda x:x.split(" ")).cache()
	data = data.map(lambda x:[float(y) for y in x])
	df = data.map(lambda x: Row(features=Vectors.dense(x))).toDF()
	return df
Example #23
 def test_int_to_float(self):
     from pyspark.mllib.linalg import Vectors
     df = self.sc.parallelize([
         Row(label=1.0, weight=2.0, features=Vectors.dense(1.0))]).toDF()
     lr = LogisticRegression(elasticNetParam=0)
     lr.fit(df)
     lr.setElasticNetParam(0)
     lr.fit(df)
Example #24
 def log_likelihood(self, ts):
     """
     Returns the log likelihood of the parameters on the given time series.
     
     Based on http://www.unc.edu/~jbhill/Bollerslev_GARCH_1986.pdf
     """
     likelihood = self._jmodel.logLikelihood(_py2java(self._ctx, Vectors.dense(ts)))
     return _java2py(self._ctx, likelihood)
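A minimal usage sketch (values are illustrative; `model` is assumed to be a fitted spark-ts GARCH model exposing the method above):

import numpy as np

returns = np.random.normal(scale=0.01, size=250)  # made-up return series
ll = model.log_likelihood(returns)
print("log likelihood: %s" % ll)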
Example #25
def iniPCA():
	conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
	sc = SparkContext(conf=conf)
	sqlContext = SQLContext(sc)
	data = sc.textFile("/mnt/yi-ad-proj/compressed_data/compressed_data").map(lambda x:x.split(" ")).cache()
	data = data.map(lambda x:(x[0],[float(y) for y in x[1:]]))
	df = data.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()
	return df
 def count_words(record, vocabulary):
     word_counts = Counter(record['words'])
     word_vector = []
     for word in vocabulary:
         word_vector.append(word_counts[word])
     label = record['label']
     features = Vectors.dense(word_vector)
     return LabeledPoint(label, features)
def parseLine(line):
    parts = line.split(',')
    label = float(parts[0])
    feature_vector = [float(x) for x in parts[1:]]
    # for feat in parts[1:]:
    #     feature_vector.append(float(feat))
    features = Vectors.dense(feature_vector)
    # features = Vectors.dense([float(x) for x in parts[1].split(' ')])
    return LabeledPoint(label, features)
Example #28
def parse_line(line, perc_keep=1.0, invalid_feat=None):
    if len(line.strip()) == 0:
      return invalid_feat
    if np.random.rand() < perc_keep:
        try:
            feature = (Vectors.dense([float(x) for x in line.split(',')]),)
            return feature
        except Exception:
            return invalid_feat
Example #29
def pre_process(datetime, rdd):  
    #print (str(type(time)) + " " + str(type(rdd)))    
    start = time.time()    
    points=rdd.map(lambda p: p[1]).flatMap(lambda a: eval(a)).map(lambda a: Vectors.dense(a))
    end_preproc=time.time()
    count = points.count()
    output_file.write("KMeans PreProcess, %d, %d, %s, %.5f\n"%(spark_cores, count, NUMBER_PARTITIONS, end_preproc-start))
    output_file.flush()
    return points
Example #30
    def streamingKMeansDataGenerator(self, batches, numPoints,
                                     k, d, r, seed, centers=None):
        rng = random.RandomState(seed)

        # Generate centers.
        centers = [rng.randn(d) for i in range(k)]

        return centers, [[Vectors.dense(centers[j % k] + r * rng.randn(d))
                          for j in range(numPoints)]
                         for i in range(batches)]
# Read a set of text files in as TF vectors
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

sc = SparkContext('local')
tf = HashingTF()
rdd = sc.wholeTextFiles('P51FeatureExtraction.py').map(
    lambda text: text[1].split())
tfVectors = tf.transform(rdd)  # transform the whole RDD at once
for v in tfVectors.collect():
    print(v)

# Using TF-IDF in Python
idf = IDF()
idfModel = idf.fit(tfVectors)
tfIdVectors = idfModel.transform(tfVectors)
print(tfIdVectors)
for v in tfIdVectors.collect():
    print(v)

# Scaling vectors in Python
print('--Scaling vectors in Python--')
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.feature import StandardScaler

vectors = [Vectors.dense([-2.0, 5.0, 1.0]), Vectors.dense([2.0, 0.0, 1.0])]
dataset = sc.parallelize(vectors)
print(dataset)
scaler = StandardScaler(withMean=True, withStd=True)
model = scaler.fit(dataset)
result = model.transform(dataset)
print(result)
for v in result.collect():
    print(v)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext

sc = SparkContext('local')
denseVec1 = LabeledPoint(1.0, Vectors.dense([3.0, 5.0, 1.0]))
denseVec2 = LabeledPoint(0.0, Vectors.dense([2.0, 0.0, 1.0]))
vectors = [denseVec1, denseVec2]
dataset = sc.parallelize(vectors)
print(dataset)
model = NaiveBayes.train(dataset)
denseVec = Vectors.dense([1.5, 2.0, 3.0])
print("predict: %s, theta: %s\n" % (model.predict(denseVec), model.theta))

# predict: 0.0, theta: [[-0.69314718 -1.79175947 -1.09861229]
#  [-1.09861229 -0.69314718 -1.79175947]]
Example #33
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PCA
from pyspark.mllib.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
    df = spark.createDataFrame(data, ["features"])
    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)
    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)
    # $example off$

    spark.stop()
Example #34
def build_vectors(data_user_info):
    return data_user_info \
        .map(lambda user: Vectors.dense(user))
Example #35
withstations = tidy.rdd.map(lambda row: Row(station=map_yx_to_station(row.yx),
                                            datehour=row.datehour)).toDF()

withstations.registerTempTable('stationincidents')
incidentcount = sqlc.sql(
    "select station, datehour, count(1) as incidents from stationincidents group by station, datehour"
)

print("we now have incidents by station/hour in incidentcount")
incidentcount.show(10)

# now join the two tables
joined = cleanedaverages.join(incidentcount, ['station', 'datehour'], 'outer')

# if incident data doesn't exist for that station/datehour, then it is 0
zeroed = joined.rdd.map(lambda row: Row(station=row.station,
                                        datehour=row.datehour,
                                        temp=row.temp,
                                        wind=row.wind,
                                        incidents=row.incidents
                                        if row.incidents else 0)).toDF()

# if temp/wind data doesn't exist for that station/datehour, then we can't use that row
final = zeroed.filter(zeroed.temp.isNotNull()).filter(
    zeroed.wind.isNotNull()).filter(zeroed.temp != 0)

# finally apply correlation test
vecs = final.rdd.map(
    lambda row: Vectors.dense([row.temp, row.wind, row.incidents]))
print(Statistics.corr(vecs))
def extract(x):
    return Vectors.dense(x[1])
Example #37
join = beatXyear.leftOuterJoin(crime_MR)
# replace None with 0
join = join.map(lambda x: (x[0],0) if x[1][1] is None else (x[0],x[1][1]))
## join.collect() # ((beat, year), # of crimes)

# change format -> (year, (beat, crime))
join = join.map(lambda x: (x[0][1], (x[0][0],x[1])))
# groupby year 
crimeYear = join.groupByKey()
# sort by beat within value # 
crimeYear = crimeYear.map(lambda x: (x[0], sorted(x[1], key=lambda y: y[0])))
# drop beat column -> (year, (crime #1, #2, #3 ,...)) 
crimeYear = crimeYear.map(lambda x: (x[0], [y[1] for y in x[1]]))

# vectors -> year = row, beat = col 
crimeVector = crimeYear.map(lambda x: Vectors.dense(x[1]))

# compute correlation matrix
corr = Statistics.corr(crimeVector)
## corr.shape # (303,303)

# correlation dictionary function for the lower triangle of the corr matrix
def corrDictFunc(corr):
	# output = ((row, col): corr value)
	# corr.shape[0] = corr.shape[1]= 303 
	return dict(((i,j), corr[i][j]) for i in range(corr.shape[0]) for j in range(corr.shape[0]) if i<j)

corrDict = corrDictFunc(corr)

# find beats with highest correlations 
def topK(beat, index, k):
Example #38
from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import ChiSqSelector
from pyspark.mllib.linalg import Vectors
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="ChiSqSelectorExample")
    sqlContext = SQLContext(sc)

    # $example on$
    df = sqlContext.createDataFrame([(
        7,
        Vectors.dense([0.0, 0.0, 18.0, 1.0]),
        1.0,
    ), (
        8,
        Vectors.dense([0.0, 1.0, 12.0, 0.0]),
        0.0,
    ), (
        9,
        Vectors.dense([1.0, 0.0, 15.0, 0.1]),
        0.0,
    )], ["id", "features", "clicked"])

    selector = ChiSqSelector(numTopFeatures=1,
                             featuresCol="features",
                             outputCol="selectedFeatures",
                             labelCol="clicked")
Example #39
import sys

from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

def mapper(x):
    return 0, abs(float(x))

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: spark-submit lr.py <input residuals file> <output file>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Chi Squared residuals")

    lines = sc.textFile(sys.argv[1], 1)

    resid = lines.map(mapper)

    resid = resid.collect()

    residuals = []

    for r in resid:
        residuals.append(r[1])

    vec = Vectors.dense(residuals)

    gft = Statistics.chiSqTest(vec)

    print("%s\n" % gft)

    sc.stop()
Example #40
from pyspark.mllib.linalg import Vectors, Matrices
from pyspark.mllib.regression import LabeledPoint

vector = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]

spark_vector = Vectors.dense(vector)

label = 45.0
labeled_point = LabeledPoint(label, vector)
spark_matrix = Matrices.dense(3, 2, vector)
# Getting the data structure and scaling
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_business.json").select(
        "stars", "review_count", "is_open").take(1700))
scaler = StandardScaler(inputCol="_1",
                        outputCol="scaled_1")
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
vec_df = spark.createDataFrame(
    scalerModel.transform(trial_df).select("scaled_1").rdd.map(
        lambda x: (float(x[0][0]), float(x[0][1]), float(x[0][2]))))

# Create RowMatrix from the transpose of the scaled DataFrame
spark_df = spark.createDataFrame(vec_df.toPandas().transpose()).rdd
vector_df = sc.parallelize(spark_df.map(lambda s: Vectors.dense(s)).collect())
mat = RowMatrix(vector_df)
bun = mat.rows.collect()
num_clusters = 3

pre = sc.parallelize(mat.columnSimilarities().entries.map(
    lambda e: (e.i, e.j, e.value)).collect())
model = PowerIterationClustering.train(pre, 3, 20, "random")
err = model.assignments().map(lambda x: (Vectors.dense(bun[0][x.id], bun[1][
    x.id], bun[2][x.id]), x.cluster)).collect()

# Silhouette value
ag = 0
agi = 1700
for er in err:
    avg = [0] * num_clusters
Example #42
x = DataSetRDD.take(1)
x

# COMMAND ----------

x[0][-1]

# COMMAND ----------

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors

labeledPoints = DataSetRDD.map(
    lambda x: LabeledPoint(x[-1], Vectors.dense(x[:-1])))

(trainingData, testData) = labeledPoints.randomSplit([0.7, 0.3])

# COMMAND ----------

labeledPoints.collect()

# COMMAND ----------

model = RandomForest.trainRegressor(trainingData,
                                    categoricalFeaturesInfo={},
                                    numTrees=3,
                                    featureSubsetStrategy="auto",
                                    impurity='variance',
                                    maxDepth=4,
def textToVector(x):
    array = str(x).replace('(', '').replace(')',
                                            '').replace('DenseVector',
                                                        '').split(',')
    return (int(array[0]), Vectors.dense(array[1:]))
Example #44
import numpy as np

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SparkSession

if __name__ == "__main__":
    sparkSession = SparkSession\
        .builder\
        .getOrCreate()

    # Python list
    dense_vector1 = [1.0, 0.0, 3.5, 0.0, 5.1]

    # NumPy array
    dense_vector2 = np.array([1.0, 0.0, 3.5, 0.0, 5.1])

    # Vector
    dense_vector3 = Vectors.dense([1.0, 0.0, 3.5, 0.0, 5.1])

    sparse_vector = Vectors.sparse(5, [0, 2, 4], [1.0, 3.5, 5.1])

    print("Vector 1 (Python list) : " + str(dense_vector1))
    print("Vector 2 (NumPy Array) : " + str(dense_vector2))
    print("Vector 3 (Vectors) : " + str(dense_vector3))
    print("Vector 1 (Vectors): " + str(sparse_vector))

    labeled_point = LabeledPoint(1.0, dense_vector1)
    labeled_point2 = LabeledPoint(0.0, Vectors.sparse(5, [2, 4], [5.2, 6.2]))

    print("Labeled point (Python list): " + str(labeled_point))
    print("Labeled point (Sparse vector): " + str(labeled_point2))

    sparkSession.stop()
Example #45
                                                         method='pearson')))

## sampling
# sampling methods can be performed on RDD's of key-value pairs
data = sc.parallelize([(1, 'a'), (1, 'b'), (2, 'c'), (2, 'd'), (2, 'e'),
                       (3, 'f')])

fractions = {1: 0.1, 2: 0.6, 3: 0.3}
approxSample = data.sampleByKey(False, fractions)

## hypothesis testing
from pyspark.mllib.linalg import Matrices, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics

vec = Vectors.dense(0.1, 0.15, 0.2, 0.3, 0.25)
# compute goodness of fit. either compare two vectors to each other or compare one vector to a uniform distribution
goodnessOfFitTestResults = Statistics.chiSqTest(vec)
print(goodnessOfFitTestResults)

# pearson's independence test on a matrix
mat = Matrices.dense(3, 2, [1.0, 3.0, 5.0, 2.0, 4.0, 6.0])
independenceTestResults = Statistics.chiSqTest(mat)
print(independenceTestResults)

# a contingency table can be constructed from an RDD of LabeledPoint/vector pairs. The resulting test returns
# Chi-squared test results for every feature against the label
obs = sc.parallelize([
    LabeledPoint(1.0, [1.0, 0.0, 3.0]),
    LabeledPoint(1.0, [1.0, 2.0, 0.0]),
    LabeledPoint(1.0, [-1.0, 0.0, -0.5])
Example #46
from pyspark import SparkContext
from pyspark.mllib.clustering import KMeans, StreamingKMeans
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.streaming import StreamingContext

# we make an input stream of vectors for training,
# as well as a stream of vectors for testing


sc = SparkContext()
ssc = StreamingContext(sc, 5)


trainingData = sc.textFile("data/datatraining.txt")\
    .map(lambda line: line.split(',')[2:-1]).map(lambda arr: Vectors.dense([float(x) for x in arr]))

centers = KMeans.train(trainingData, 2).centers


trainingQueue = [trainingData]


trainingStream = ssc.queueStream(trainingQueue)


# We create a model with random clusters and specify the number of clusters to find
model = StreamingKMeans(k=2, decayFactor=0.3)#.setRandomCenters(5, 1.0, 0)
model.setInitialCenters( centers, [1.0,1.0,1.0,1.0,1.0])
# Now register the streams for training and testing and start the job,
# printing the predicted cluster assignments on new data points as they arrive.
Example #47
def build(user):
    return LabeledPoint(float(user[0]), Vectors.dense(user[1]))
def GetParts(line):
    parts = line.split(',')
    return LabeledPoint(
        float(parts[4]),
        Vectors.dense(float(parts[1]), float(parts[2]), float(parts[3])))
Example #49
print(summary.variance())
print(summary.numNonzeros())
print(summary.max())
print(summary.min())
print(summary.count())
print(summary.normL1())
print(summary.normL2())

#correlation
x = sc.parallelize(np.random.randn(4, 1))
y = sc.parallelize(np.random.randn(4, 1))
print("Correlation :", str(Statistics.corr(x, y)))

#Chi-square
#For Vector
x = Vectors.dense(np.random.random_sample((5)))
y = Vectors.dense(np.random.random_sample((5)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)

# For Matrices
x = Matrices.dense(4, 2, np.random.random_sample((8)))
y = Matrices.dense(4, 2, np.random.random_sample((8)))
chisqr = Statistics.chiSqTest(x, y)
print(chisqr.statistic)
print(chisqr.degreesOfFreedom)
print(chisqr.pValue)
print(chisqr.nullHypothesis)
Example #50
def parse(lp):
    label = float(lp[lp.find('(') + 1:lp.find(',')])
    vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))
    return LabeledPoint(label, vec)
if __name__ == "__main__":
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    label = data.map(lambda x: x.label)
    features = data.map(lambda x: x.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # Without converting the features into dense vectors, transformation with zero mean will raise
    # exception on sparse vector.
    # data2 will be unit variance and zero mean.
    data2 = label.zip(
        scaler2.transform(features.map(lambda x: Vectors.dense(x.toArray()))))
    # $example off$

    print("data1:")
    for each in data1.collect():
        print(each)

    print("data2:")
    for each in data2.collect():
        print(each)

    sc.stop()
Example #52
def f1(line):
	return Vectors.dense([float(coord) for coord in line.split(" ") if len(coord) > 0])
def parseLine(line):
    parts = line.split(',')
    label = float(parts[len(parts)-1])
    features = Vectors.dense([float(parts[x]) for x in range(0,len(parts)-1)])
    return LabeledPoint(label, features)
Example #54
if __name__ == "__main__":
    sc = SparkContext(appName="StreamingKMeansExample")  # SparkContext
    ssc = StreamingContext(sc, 1)

    # $example on$
    # we make an input stream of vectors for training,
    # as well as a stream of vectors for testing
    def parse(lp):
        label = float(lp[lp.find('(') + 1:lp.find(')')])
        vec = Vectors.dense(lp[lp.find('[') + 1:lp.find(']')].split(','))

        return LabeledPoint(label, vec)

    trainingData = sc.textFile("/Users/tung/Documents/spark-2.4.3/data/mllib/kmeans_data.txt")\
        .map(lambda line: Vectors.dense([float(x) for x in line.strip().split(' ')]))

    testingData = sc.textFile(
        "/Users/tung/Documents/spark-2.4.3/data/mllib/streaming_kmeans_data_test.txt"
    ).map(parse)

    trainingQueue = [trainingData]
    testingQueue = [testingData]

    trainingStream = ssc.queueStream(trainingQueue)
    testingStream = ssc.queueStream(testingQueue)

    # We create a model with random clusters and specify the number of clusters to find
    model = StreamingKMeans(k=2, decayFactor=1.0).setRandomCenters(3, 1.0, 0)

    # Now register the streams for training and testing and start the job,
Example #55
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.mllib.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("PolynomialExpansionExample").getOrCreate()

    # $example on$
    df = spark\
        .createDataFrame([(Vectors.dense([-2.0, 2.3]),),
                          (Vectors.dense([0.0, 0.0]),),
                          (Vectors.dense([0.6, -1.1]),)],
                         ["features"])
    px = PolynomialExpansion(degree=2, inputCol="features", outputCol="polyFeatures")
    polyDF = px.transform(df)
    for expanded in polyDF.select("polyFeatures").take(3):
        print(expanded)
    # $example off$

    spark.stop()
Example #56
#Print out the cluster of each data point
print(model.predict(array([0.0, 0.0])))
print(model.predict(array([1.0, 1.0])))
print(model.predict(array([9.0, 8.0])))
print(model.predict(array([8.0, 0.0])))

#Standardizes features by removing the mean and scaling to unit variance using column summary statistics on the samples in the training set.
from pyspark.mllib.feature import Normalizer
from pyspark.mllib.linalg import Vectors
from pyspark import SparkContext
from pyspark.mllib.feature import StandardScaler

sc = SparkContext()

vs = [Vectors.dense([-2.0, 2.3, 0]), Vectors.dense([3.8, 0.0, 1.9])]

dataset = sc.parallelize(vs)

#all false, do nothing.
standardizer = StandardScaler(False, False)
model = standardizer.fit(dataset)
result = model.transform(dataset)
for r in result.collect():
    print(r)

print("\n")

#deducts the mean
standardizer = StandardScaler(True, False)
model = standardizer.fit(dataset)
Example #57
 def test_append_bias_with_vector(self):
     data = Vectors.dense([2.0, 2.0, 2.0])
     ret = MLUtils.appendBias(data)
     self.assertEqual(ret[3], 1.0)
     self.assertEqual(type(ret), DenseVector)
Example #58
registerTable(sqlCtx, Table.RAW_DATA_SAMPLE)


df = spark.sql("SELECT pickup_longitude, pickup_latitude, dropoff_longitude, dropoff_latitude, id FROM trips_sample")



print("Done, now starting K-means with K={}".format(int(CLUSTERING_SIZE / K_MEANS_FACTOR)))

# Now we compute the clusters. This might take a while.

#km = KMeans(int(CLUSTERING_SIZE / K_MEANS_FACTOR))
clusters = KMeans.train(df.select("pickup_longitude", "pickup_latitude")
                          .rdd.sample(False, (CLUSTERING_SIZE*100) / SIZE_ESTIMATE)
                          .map(lambda row: Vectors.dense(row["pickup_longitude"], row["pickup_latitude"])),
                        int(CLUSTERING_SIZE / K_MEANS_FACTOR), 1000)


print("K-means is done, clearing any existing data...")

# Clean the database before proceeding

subprocess.call(["hadoop", "fs", "-rm", "-r", "-f", "/user/csit7/ride_clusters_sample"])
subprocess.call(["hadoop", "fs", "-rm", "-r", "-f", "/user/csit7/cluster_data_sample"])

print("Done, initiating refill")

# Now we first refill cluster_data, since we will need its ids later
#for c in zip(set(kmData.tolist()), centroids):
#  cur.execute("INSERT INTO taxi.cluster_data(cluster_id, centroid_long, centroid_lat) VALUES (%s, %s, %s)",
Example #59
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import DCT
from pyspark.mllib.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("DCTExample").getOrCreate()

    # $example on$
    df = spark.createDataFrame([(Vectors.dense([0.0, 1.0, -2.0, 3.0]), ),
                                (Vectors.dense([-1.0, 2.0, 4.0, -7.0]), ),
                                (Vectors.dense([14.0, -2.0, -5.0, 1.0]), )],
                               ["features"])

    dct = DCT(inverse=False, inputCol="features", outputCol="featuresDCT")

    dctDf = dct.transform(df)

    for dcts in dctDf.select("featuresDCT").take(3):
        print(dcts)
    # $example off$

    spark.stop()
Example #60
# $example off$
from pyspark.sql import SparkSession
"""
A simple example demonstrating a bisecting k-means clustering.
"""

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PythonBisectingKMeansExample")\
        .getOrCreate()

    # $example on$
    data = spark.read.text("data/mllib/kmeans_data.txt").rdd
    parsed = data\
        .map(lambda row: Row(features=Vectors.dense([float(x) for x in row.value.split(' ')])))
    training = spark.createDataFrame(parsed)

    kmeans = BisectingKMeans().setK(2).setSeed(1).setFeaturesCol("features")

    model = kmeans.fit(training)

    # Evaluate clustering
    cost = model.computeCost(training)
    print("Bisecting K-means Cost = " + str(cost))

    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)
    # $example off$