Exemple #1
0
 def test_glr_summary(self):
     """Smoke-test the training summary of a GeneralizedLinearRegression model.

     Fits a weighted gaussian/identity model without intercept on two rows,
     then checks that every summary accessor is callable and returns the
     expected type or value, and that ``evaluate`` on the training data
     yields the same deviance.
     """
     from pyspark.mllib.linalg import Vectors
     rows = [(1.0, 2.0, Vectors.dense(1.0)),
             (0.0, 2.0, Vectors.sparse(1, [], []))]
     df = self.spark.createDataFrame(rows, ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     summary = model.summary

     # The API must be callable and return the expected types.
     # This configuration should default to a single iteration of WLS.
     self.assertEqual(summary.numIterations, 1)
     self.assertIsInstance(summary.predictions, DataFrame)
     self.assertEqual(summary.predictionCol, "prediction")
     self.assertIsInstance(summary.residuals(), DataFrame)
     self.assertIsInstance(summary.residuals("pearson"), DataFrame)

     # Per-coefficient statistics are plain lists of floats.
     for stat_name in ("coefficientStandardErrors", "tValues", "pValues"):
         stat = getattr(summary, stat_name)
         self.assertIsInstance(stat, list)
         self.assertIsInstance(stat[0], float)

     self.assertEqual(summary.degreesOfFreedom, 1)
     self.assertEqual(summary.residualDegreeOfFreedom, 1)
     self.assertEqual(summary.residualDegreeOfFreedomNull, 2)
     self.assertEqual(summary.rank, 1)
     self.assertIsInstance(summary.solver, basestring)

     # Scalar goodness-of-fit metrics are floats.
     for metric_name in ("aic", "deviance", "nullDeviance", "dispersion"):
         self.assertIsInstance(getattr(summary, metric_name), float)

     # Evaluating on the training dataset must produce a summary with the
     # same values; one check is enough here, the Scala suite runs the full test.
     same_summary = model.evaluate(df)
     self.assertAlmostEqual(same_summary.deviance, summary.deviance)
Exemple #2
0
 def test_equals(self):
     """Check ``Vectors._equals`` on (indices, values) pairs.

     One pair is the sparse form (indices 1, 2, 4) and the other is written
     out over indices 0..4; only the first candidate matches.
     """
     sparse_indices = [1, 2, 4]
     sparse_values = [1., 3., 2.]
     matching = [0., 1., 3., 0., 2.]
     self.assertTrue(
         Vectors._equals(sparse_indices, sparse_values, list(range(5)), matching))
     # Swapped values, a short value list, and an extra non-zero must all differ.
     for mismatching in ([0., 3., 1., 0., 2.],
                         [0., 3., 0., 2.],
                         [0., 1., 3., 2., 2.]):
         self.assertFalse(
             Vectors._equals(sparse_indices, sparse_values, list(range(5)), mismatching))
Exemple #3
0
 def test_logistic_regression_summary(self):
     """Smoke-test the training summary of a LogisticRegression model.

     Fits a weighted model without intercept on two rows, checks that each
     summary accessor returns the expected type or value, and verifies that
     ``evaluate`` on the training data reports the same area under ROC.
     """
     from pyspark.mllib.linalg import Vectors
     sqlContext = SQLContext(self.sc)
     rows = [(1.0, 2.0, Vectors.dense(1.0)),
             (0.0, 2.0, Vectors.sparse(1, [], []))]
     df = sqlContext.createDataFrame(rows, ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     summary = model.summary

     # The API must be callable and return the expected types.
     self.assertIsInstance(summary.predictions, DataFrame)
     self.assertEqual(summary.probabilityCol, "probability")
     self.assertEqual(summary.labelCol, "label")
     self.assertEqual(summary.featuresCol, "features")
     history = summary.objectiveHistory
     self.assertIsInstance(history, list)
     self.assertIsInstance(history[0], float)
     self.assertGreater(summary.totalIterations, 0)
     self.assertIsInstance(summary.roc, DataFrame)
     self.assertAlmostEqual(summary.areaUnderROC, 1.0, 2)
     for curve_name in ("pr", "fMeasureByThreshold",
                        "precisionByThreshold", "recallByThreshold"):
         self.assertIsInstance(getattr(summary, curve_name), DataFrame)

     # Evaluating on the training dataset must produce a summary with the
     # same values; one check is enough here, the Scala suite runs the full test.
     same_summary = model.evaluate(df)
     self.assertAlmostEqual(same_summary.areaUnderROC, summary.areaUnderROC)
Exemple #4
0
 def test_save_load(self):
     """Round-trip a CrossValidator and its fitted model through save/load.

     The reloaded CrossValidator must keep the estimator uid, evaluator uid
     and param maps of the original, and the reloaded CrossValidatorModel
     must keep the uid of the best model. The scratch directory is removed
     afterwards even when an assertion fails, so runs do not leak temp dirs.
     """
     from shutil import rmtree

     temp_path = tempfile.mkdtemp()
     try:
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
             [(Vectors.dense([0.0]), 0.0),
              (Vectors.dense([0.4]), 1.0),
              (Vectors.dense([0.5]), 0.0),
              (Vectors.dense([0.6]), 1.0),
              (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
         evaluator = BinaryClassificationEvaluator()
         cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         cvModel = cv.fit(dataset)

         # Persist and reload the (unfitted) validator.
         cvPath = temp_path + "/cv"
         cv.save(cvPath)
         loadedCV = CrossValidator.load(cvPath)
         self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
         self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
         self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

         # Persist and reload the fitted model.
         cvModelPath = temp_path + "/cvModel"
         cvModel.save(cvModelPath)
         loadedModel = CrossValidatorModel.load(cvModelPath)
         self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
     finally:
         # Best-effort cleanup of everything written under the scratch dir.
         rmtree(temp_path, ignore_errors=True)
Exemple #5
0
 def test_save_load(self):
     """Round-trip a TrainValidationSplit and its fitted model through save/load.

     The reloaded TrainValidationSplit must keep the estimator uid, evaluator
     uid and param maps of the original, and the reloaded
     TrainValidationSplitModel must keep the uid of the best model. The
     scratch directory is removed afterwards even when an assertion fails,
     so runs do not leak temp dirs.
     """
     from shutil import rmtree

     temp_path = tempfile.mkdtemp()
     try:
         sqlContext = SQLContext(self.sc)
         dataset = sqlContext.createDataFrame(
             [(Vectors.dense([0.0]), 0.0),
              (Vectors.dense([0.4]), 1.0),
              (Vectors.dense([0.5]), 0.0),
              (Vectors.dense([0.6]), 1.0),
              (Vectors.dense([1.0]), 1.0)] * 10,
             ["features", "label"])
         lr = LogisticRegression()
         grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
         evaluator = BinaryClassificationEvaluator()
         tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
         tvsModel = tvs.fit(dataset)

         # Persist and reload the (unfitted) validator.
         tvsPath = temp_path + "/tvs"
         tvs.save(tvsPath)
         loadedTvs = TrainValidationSplit.load(tvsPath)
         self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
         self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
         self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

         # Persist and reload the fitted model.
         tvsModelPath = temp_path + "/tvsModel"
         tvsModel.save(tvsModelPath)
         loadedModel = TrainValidationSplitModel.load(tvsModelPath)
         self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
     finally:
         # Best-effort cleanup of everything written under the scratch dir.
         rmtree(temp_path, ignore_errors=True)
Exemple #6
0
 def test_append_bias_with_sp_vector(self):
     """``MLUtils.appendBias`` on a sparse vector appends 1.0 and stays sparse."""
     sparse_input = Vectors.sparse(3, {0: 2.0, 2: 2.0})
     with_bias = Vectors.sparse(4, {0: 2.0, 2: 2.0, 3: 1.0})
     result = MLUtils.appendBias(sparse_input)
     self.assertEqual(result, with_bias)
     # The returned value must be a SparseVector, not a densified copy.
     self.assertEqual(type(result), SparseVector)
    def test_nnclassifier_in_pipeline(self):
        """Run an NNClassifier behind a MinMaxScaler inside a Pipeline.

        Only exercised when the Spark version string starts with "1";
        on other versions the test body is skipped silently.
        """
        if not self.sc.version.startswith("1"):
            return

        from pyspark.mllib.linalg import Vectors

        samples = [
            (Vectors.dense([2.0, 1.0]), 1.0),
            (Vectors.dense([1.0, 2.0]), 2.0),
            (Vectors.dense([2.0, 1.0]), 1.0),
            (Vectors.dense([1.0, 2.0]), 2.0),
        ]
        df = self.sqlContext.createDataFrame(samples, ["features", "label"])

        scaler = MinMaxScaler().setInputCol("features").setOutputCol("scaled")
        model = Sequential().add(Linear(2, 2))
        criterion = ClassNLLCriterion()
        classifier = NNClassifier(model, criterion, MLlibVectorToTensor([2]))
        classifier = classifier.setBatchSize(4)
        classifier = classifier.setLearningRate(0.01).setMaxEpoch(1).setFeaturesCol("scaled")

        # The classifier consumes the scaler's "scaled" output column.
        fitted_pipeline = Pipeline(stages=[scaler, classifier]).fit(df)
        transformed = fitted_pipeline.transform(df)
        assert type(transformed).__name__ == 'DataFrame'
Exemple #8
0
 def test_persistence(self):
     """Test save/load for LDA, LocalLDAModel and DistributedLDAModel.

     Each object is saved under a fresh temp directory, reloaded, and
     compared with the original via ``self._compare``. The directory is
     removed in a ``finally`` block so a failing assertion no longer leaves
     it behind.
     """
     sqlContext = SQLContext(self.sc)
     df = sqlContext.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model (the "em" optimizer yields a distributed model)
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     try:
         lda_path = path + "/lda"
         dist_model_path = path + "/distLDAModel"
         local_model_path = path + "/localLDAModel"
         # Test LDA
         lda.save(lda_path)
         lda2 = LDA.load(lda_path)
         self._compare(lda, lda2)
         # Test DistributedLDAModel
         distributedModel.save(dist_model_path)
         distributedModel2 = DistributedLDAModel.load(dist_model_path)
         self._compare(distributedModel, distributedModel2)
         # Test LocalLDAModel
         localModel.save(local_model_path)
         localModel2 = LocalLDAModel.load(local_model_path)
         self._compare(localModel, localModel2)
     finally:
         # Clean up; deliberately best-effort so a cleanup failure never
         # masks the real test outcome.
         try:
             rmtree(path)
         except OSError:
             pass
Exemple #9
0
    def test_model_transform(self):
        """``ElementwiseProduct.transform`` scales dense and sparse vectors."""
        scaling = Vectors.dense([3, 2, 1])
        transformer = ElementwiseProduct(scaling)

        dense_input = Vectors.dense([4, 5, 6])
        self.assertEqual(transformer.transform(dense_input), DenseVector([12, 10, 6]))

        # Only index 0 is set, so only the first weight applies.
        sparse_input = Vectors.sparse(3, [0], [1])
        self.assertEqual(transformer.transform(sparse_input), SparseVector(3, [0], [3]))
Exemple #10
0
 def test_right_number_of_results(self):
     """``Statistics.chiSqTest`` returns one result per feature column."""
     num_cols = 1001
     # Two sparse points, each with a single non-zero feature.
     points = [
         LabeledPoint(label, Vectors.sparse(num_cols, [(index, value)]))
         for label, index, value in ((0.0, 100, 2.0), (0.1, 200, 1.0))
     ]
     results = Statistics.chiSqTest(self.sc.parallelize(points))
     self.assertEqual(len(results), num_cols)
     self.assertIsNotNone(results[1000])
Exemple #11
0
 def test_parse_vector(self):
     """Check string formatting and parsing of dense and sparse vectors.

     The previous version used ``assertTrue(actual, expected)``, which treats
     the second argument as the failure message and therefore passed for any
     truthy ``actual``. ``assertEqual`` performs the intended comparison.
     """
     a = DenseVector([3, 4, 6, 7])
     self.assertEqual(str(a), '[3.0,4.0,6.0,7.0]')
     self.assertEqual(Vectors.parse(str(a)), a)
     a = SparseVector(4, [0, 2], [3, 4])
     self.assertEqual(str(a), '(4,[0,2],[3.0,4.0])')
     self.assertEqual(Vectors.parse(str(a)), a)
     # Parsing must tolerate extra whitespace around tokens.
     a = SparseVector(10, [0, 1], [4, 5])
     self.assertEqual(SparseVector.parse(' (10, [0,1 ],[ 4.0,5.0] )'), a)
 def _get_train_data(self):
     """Return a four-row DataFrame with columns ``id``, ``features``, ``label``.

     Ids run from 1 to 4, every row carries the same dense feature vector
     ``[1, 2, 3]``, and labels alternate 1.0 / 0.0.
     """
     labels = [1.0, 0.0, 1.0, 0.0]
     rows = [(row_id, Vectors.dense([1, 2, 3]), label)
             for row_id, label in enumerate(labels, 1)]
     return SQLContext(self.sc).createDataFrame(rows, ['id', 'features', 'label'])
Exemple #13
0
 def test_output_columns(self):
     """A fitted OneVsRest model adds only a ``prediction`` column."""
     rows = [(0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5))]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     base_classifier = LogisticRegression(maxIter=5, regParam=0.01)
     fitted = OneVsRest(classifier=base_classifier).fit(df)
     transformed = fitted.transform(df)
     self.assertEqual(transformed.columns, ["label", "features", "prediction"])
Exemple #14
0
 def test_idf_model(self):
     """An IDF model fitted on 11-dimensional vectors yields 11 idf values."""
     term_counts = [
         [1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3],
         [1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1],
         [1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0],
         [2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9],
     ]
     documents = [Vectors.dense(counts) for counts in term_counts]
     # Spread the four documents over two partitions.
     fitted = IDF().fit(self.sc.parallelize(documents, 2))
     self.assertEqual(len(fitted.idf()), 11)
def load_data_rdd(csv_file, shuffle=True, train=True):
    """Load ``data_path + csv_file`` into an RDD of ``(features, label)`` pairs.

    :param csv_file: file name appended to the module-level ``data_path``.
    :param shuffle: when true, ``shuffle_csv(csv_file)`` is called first.
    :param train: when true, the last column is the label (with the
        ``'Class_'`` prefix stripped) and columns 1..-2 are the features;
        otherwise columns 1.. are the features and the label is ``"1"``.
    :return: RDD of ``(DenseVector, str)`` tuples; rows whose first field
        equals ``'id'`` (the header) are dropped.
    """
    if shuffle:
        shuffle_csv(csv_file)

    def to_train_pair(fields):
        # First column is skipped; last column carries the class label.
        features = Vectors.dense(np.asarray(fields[1:-1]).astype(np.float32))
        return features, str(fields[-1]).replace('Class_', '')

    def to_test_pair(fields):
        # No label column: every row gets the placeholder label "1".
        return Vectors.dense(np.asarray(fields[1:]).astype(np.float32)), "1"

    rows = (sc.textFile(data_path + csv_file)
            .map(lambda line: line.split(','))
            .filter(lambda fields: fields[0] != 'id'))
    return rows.map(to_train_pair if train else to_test_pair)
def parseEntry(xx, reference_year=2015):
    """Parse one tab-separated log line into a ``Row``.

    Field layout, as read by this function: 0 ``a_virtual``, 1 browser,
    2 referrer, 3 ``a_user_key``, 4 birth year, 5 gender, 6 registration date
    (``%Y-%m-%d`` or the literal ``'NAN'``), 7 device, 8 date (``%d-%m-%Y``),
    9 hour, 10+ integer metrics (a ``',0'`` suffix is stripped before
    conversion). Metrics 2, 3 and 4 are used as page views, non-home page
    views and paid page views respectively.

    :param xx: the raw line.
    :param reference_year: year used to turn the birth year into an age
        (default 2015, the previously hard-coded value).
    :return: a ``Row``; ``age`` and ``days_since_registration`` are ``-1``
        when the birth year is not an integer or the registration date is
        missing.
    :raises ValueError: on malformed date, hour or metric fields.
    :raises IndexError: when the line has too few fields.
    """
    mindate = datetime.datetime(datetime.MINYEAR, 1, 1, 1, 1)
    xx = xx.split('\t')
    a_virtual = xx[0]
    browser = xx[1]
    referrer = xx[2]
    a_user_key = xx[3]
    try:
        age = reference_year - int(xx[4])
    except ValueError:
        # Non-numeric birth year: flag the age as unknown.
        age = -1
    gender = xx[5]
    if xx[6] != 'NAN':
        reg_date = datetime.datetime.strptime(xx[6], '%Y-%m-%d')
    else:
        # Sentinel meaning "no registration date".
        reg_date = mindate
    device = xx[7]
    date = datetime.datetime.strptime(xx[8], '%d-%m-%Y')
    hour = int(xx[9])
    date = date + datetime.timedelta(hours=hour)
    day = date.day
    weekday = date.weekday()

    if reg_date > mindate:
        days_since_registration = (date - reg_date).days
    else:
        days_since_registration = -1

    metrics = [int(x.replace(',0', '')) for x in xx[10:]]
    pageviews = metrics[2]
    pageview_nothome = metrics[3]
    pageview_betalt = metrics[4]

    # One-hot style sparse vectors keyed by the (weekday, hour) time slot.
    slot = intervalIndDict[(weekday, hour)]
    timegroup_pvs = Vectors.sparse(maxInd, [(slot, pageviews)])
    timegroup_visit = Vectors.sparse(maxInd, [(slot, 1.)])

    return Row(browser=browser, a_user_key=a_user_key, age=age,
               day=day, hour=hour, date=date, weekday=weekday, pv=pageviews,
               pv_nh=pageview_nothome, pv_bet=pageview_betalt, referrer=referrer,
               device=device, gender=gender,
               days_since_registration=days_since_registration,
               reg_date=reg_date, timegroup_pvs=timegroup_pvs,
               timegroup_visit=timegroup_visit, a_virtual=a_virtual)
Exemple #17
0
 def test_copy(self):
     """``copy`` with extra params must not alter the original OneVsRest or model."""
     rows = [(0.0, Vectors.dense(1.0, 0.8)),
             (1.0, Vectors.sparse(2, [], [])),
             (2.0, Vectors.dense(0.5, 0.5))]
     df = self.spark.createDataFrame(rows, ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)

     # Copying with an override changes the copy only.
     ovr_copy = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr_copy.getClassifier().getMaxIter(), 10)

     # The same applies to a fitted model.
     fitted = ovr.fit(df)
     fitted_copy = fitted.copy({fitted.predictionCol: "indexed"})
     self.assertEqual(fitted_copy.getPredictionCol(), "indexed")
Exemple #18
0
def load_data_frame(csv_file, shuffle=True, train=True):
    """Load a CSV file into a DataFrame with ``features``, ``category``, ``label``.

    The first column of each row is read as the integer label, the remaining
    columns as float32 features; ``category`` is ``'class_' + <first column>``.
    Rows whose first field equals ``'label'`` (the header) are dropped.

    :param csv_file: file name under ``/home/minglu/dist_spark/data/``.
    :param shuffle: when true, ``shuffle_csv(csv_file)`` is called first.
    :param train: accepted for interface compatibility. The original code
        parsed train and test rows with two identical branches, so this flag
        has never affected the result; test files therefore need the same
        leading label column as train files.
    :return: DataFrame built through the module-level ``sqlcontext``.
    """
    if shuffle:
        shuffle_csv(csv_file)

    def to_record(fields):
        features = Vectors.dense(np.asarray(fields[1:]).astype(np.float32))
        return features, 'class_' + str(fields[0]), int(fields[0])

    # This is an RDD, which is transformed into a data frame below.
    data = sc.textFile('/home/minglu/dist_spark/data/' + csv_file)
    data = data.filter(lambda x: x.split(',')[0] != 'label').map(lambda line: line.split(','))
    data = data.map(to_record)
    return sqlcontext.createDataFrame(data, ['features', 'category', 'label'])
Exemple #19
0
def create_rows_for_rdd(x):
    """Turn a ``(meta_data, values)`` pair into a labelled ``Row``.

    :param x: indexable pair; ``x[0]`` is the meta data and ``x[1]`` an
        iterable whose last element is the label and whose other elements
        are the features.
    :return: ``Row(label=float, features=DenseVector, meta_data=DenseVector)``.
    """
    values = list(x[1])
    # The label is stored as the trailing element of the value sequence.
    label = float(values.pop())
    return Row(label=label,
               features=Vectors.dense(values),
               meta_data=Vectors.dense(x[0]))
Exemple #20
0
 def remove_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply inverse operations to obtain the original series of underlying errors.

     Parameters
     ----------
     ts:
         Time series of observations with this model's characteristics as a Numpy array

     returns the time series with removed time-dependent effects as a Numpy array
     """
     # Zero-filled destination vector of the same length, passed to the JVM
     # model alongside the input series.
     zeros = Vectors.dense(np.array([0] * len(ts)))
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jdest = _py2java(self._ctx, zeros)
     jresult = self._jmodel.removeTimeDependentEffects(jts, jdest)
     return _java2py(self._ctx, jresult.toArray())
Exemple #21
0
 def add_time_dependent_effects(self, ts):
     """
     Given a timeseries, apply a model to it.

     Parameters
     ----------
     ts:
         Time series of i.i.d. observations as a Numpy array

     returns the time series with added time-dependent effects as a Numpy array.
     """
     # Zero-filled destination vector of the same length, passed to the JVM
     # model alongside the input series.
     zeros = Vectors.dense([0] * len(ts))
     jts = _py2java(self._ctx, Vectors.dense(ts))
     jdest = _py2java(self._ctx, zeros)
     jresult = self._jmodel.addTimeDependentEffects(jts, jdest)
     return _java2py(self._ctx, jresult.toArray())
 def ztest_toPandas(self):
     """Convert a DataFrame of dense/sparse vectors with ``self.converter.toPandas``.

     NOTE(review): the ``z`` prefix keeps unittest from collecting this
     method; presumably it was disabled on purpose - confirm before renaming.
     """
     rows = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(rows, ["features"])
     self.assertEqual(df.count(), 3)

     pdf = self.converter.toPandas(df)
     self.assertEqual(len(pdf), 3)

     first = pdf.features[0]
     self.assertTrue(isinstance(first, csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(first))
     self.assertEqual(first.shape[0], 3)
     self.assertEqual(first.shape[1], 2)
     self.assertEqual(first[0,0], 0.1)
     self.assertEqual(first[0,1], 0.2)
Exemple #23
0
def add_svec(sv1, sv2):
    """Add two sparse vectors of equal size and return a new sparse vector.

    Performs a two-pointer merge over the ``indices`` arrays of both inputs:
    entries present in both are summed, entries present in only one are
    copied. The merge relies on each ``indices`` array being in ascending
    order.

    :raises AssertionError: if the vectors differ in length.
    """
    assert len(sv1) == len(sv2), "dimension mismatch"
    idx_a, val_a = sv1.indices, sv1.values
    idx_b, val_b = sv2.indices, sv2.values
    count_a, count_b = len(idx_a), len(idx_b)

    indices, values = [], []
    pos_a = pos_b = 0
    while pos_a < count_a and pos_b < count_b:
        left, right = idx_a[pos_a], idx_b[pos_b]
        if left < right:
            indices.append(left)
            values.append(val_a[pos_a])
            pos_a += 1
        elif left == right:
            indices.append(left)
            values.append(val_a[pos_a] + val_b[pos_b])
            pos_a += 1
            pos_b += 1
        else:
            indices.append(right)
            values.append(val_b[pos_b])
            pos_b += 1

    # At most one of the two tails is non-empty at this point.
    indices.extend(idx_a[pos_a:])
    values.extend(val_a[pos_a:])
    indices.extend(idx_b[pos_b:])
    values.extend(val_b[pos_b:])
    return Vectors.sparse(len(sv1), indices, values)
Exemple #24
0
def save_pca_parameters(pca_model, data_dim):
    """Write the PCA projection of the identity matrix to ``PCA_OUT_PATH``.

    Since there's no good way of reading the parameters directly in Python,
    each row of a ``data_dim`` x ``data_dim`` identity matrix is pushed through
    ``pca_embed`` and the resulting ``pca`` column is saved as text with six
    decimal places.
    """
    identity_rows = [(Vectors.dense(row),) for row in np.eye(data_dim).tolist()]
    identity_df = sqlContext.createDataFrame(identity_rows, ('features',))
    embedded = pca_embed(identity_df, pca_model)
    projections = embedded.select('pca').rdd.map(lambda r: r[0]).collect()
    np.savetxt(PCA_OUT_PATH, np.matrix(projections), fmt='%.6f')
Exemple #25
0
 def forecast(self, ts, nfuture):
     """
     Provide fitted values for timeseries ts as 1-step ahead forecasts, based on current
     model parameters, and then provide `nfuture` periods of forecast. We assume AR terms
     prior to the start of the series are equal to the model's intercept term (or 0.0, if fit
     without an intercept term). Meanwhile, MA terms prior to the start are assumed to be 0.0.
     If there is differencing, the first d terms come from the original series.

     Parameters
     ----------
     ts:
         Timeseries to use as gold-standard. Each value (i) in the returning series
         is a 1-step ahead forecast of ts(i). We use the difference between ts(i) -
         estimate(i) to calculate the error at time i, which is used for the moving
         average terms. Numpy array.
     nfuture:
         Periods in the future to forecast (beyond length of ts)

     Returns a series consisting of fitted 1-step ahead forecasts for historicals and then
     `nfuture` periods of forecasts. Note that in the future values error terms become
     zero and prior predictions are used for any AR terms.

     """
     java_series = _py2java(self._ctx, Vectors.dense(ts))
     return _java2py(self._ctx, self._jmodel.forecast(java_series, nfuture))
Exemple #26
0
def to_vector(np_array):
    """Convert a one-dimensional numpy array to an MLlib dense Vector.

    :param np_array: array-like object exposing ``shape``.
    :return: ``Vectors.dense(np_array)``.
    :raises ValueError: if the array is not one-dimensional. ``ValueError``
        subclasses ``Exception``, so callers catching the previously raised
        bare ``Exception`` keep working. The message no longer carries the
        newline and indentation that leaked in from the source layout.
    """
    if len(np_array.shape) != 1:
        raise ValueError("An MLLib Vector can only be created "
                         "from a one-dimensional numpy array")
    return Vectors.dense(np_array)
def buildLabeledPoint(s, classification):
    """Build a ``LabeledPoint`` from the paired attributes of ``s``.

    For every name in the module-level ``attributes`` the value of
    ``<name>_1`` is collected first, followed by every ``<name>_2`` value;
    the concatenation becomes the dense feature vector.
    """
    features = [getattr(s, attr + suffix)
                for suffix in ('_1', '_2')
                for attr in attributes]
    return LabeledPoint(classification, Vectors.dense(features))
Exemple #28
0
def createSparseVector(histogram):
	"""Build a size-2000 sparse vector from ``(index, count)`` pairs.

	The pairs are ordered with ``getKey`` before being split into the index
	and value lists handed to ``Vectors.sparse``.
	"""
	ordered = sorted(histogram, key=getKey)
	indexList = [histogramIndex for histogramIndex, _ in ordered]
	countList = [count for _, count in ordered]
	return Vectors.sparse(2000, indexList, countList)
Exemple #29
0
    def scoreOnePoint(self, x):
        """
        Score a single data point against the current mixture model.

        Parameters
        ----------
        x : a single data point. When ``self.isSparse == 1`` it must expose
            ``toArray()``, ``size``, ``indices`` and ``values``; otherwise it
            is indexed directly like an array.

        Returns
        -------
        log_likelihood_x : log likelihood of 'x' under the mixture
        prob_x : responsibility of each component for 'x'
        temp_wt : outer product of the responsibilities with 'x'
        temp_avg : outer product of the responsibilities with 'x' squared
            element-wise
        """
        weighted_log_prob = (self.log_multivariate_normal_density_diag_Nd(x)
                             + np.log(self.Weights))
        log_likelihood_x = logsumexp(weighted_log_prob)
        # Normalise in log space, then exponentiate to get responsibilities.
        prob_x = np.exp(weighted_log_prob - log_likelihood_x)

        if self.isSparse == 1:
            dense_x = x.toArray()
            dense_sq = Vectors.sparse(x.size, x.indices, x.values**2).toArray()
            temp_wt = np.dot(prob_x[:, np.newaxis], dense_x[np.newaxis, :])
            temp_avg = np.dot(prob_x.T[:, np.newaxis], dense_sq[np.newaxis, :])
        else:
            resp_column = prob_x.T[:, np.newaxis]
            temp_wt = np.dot(resp_column, x[np.newaxis, :])
            temp_avg = np.dot(resp_column, (x*x)[np.newaxis, :])

        return log_likelihood_x, prob_x, temp_wt, temp_avg
Exemple #30
0
def load_cut_to_rdd(input_file, result_file):
    """Compute dense TF-IDF document vectors with Spark and write them to a file.

    ``input_file`` is read as text and each line is parsed by ``parseKV`` into
    ``((tid, term), tf)`` records, which are summed per key. The IDF of a term
    is ``log(num_doc / (df + 1))`` where ``df`` is the number of documents
    containing it. The output starts with a ``"<num_doc> <num_term>"`` header
    line, followed by one tab-separated line of TF-IDF values per document.

    Changes from the earlier version: tuple-unpacking lambdas (a syntax error
    on Python 3) are replaced by index access, the per-record ``list.index``
    scan is replaced by a dictionary lookup, the output file is closed through
    a ``with`` block, and the SparkContext is stopped even if the job fails.

    :param input_file: path of the input text file.
    :param result_file: local path of the output file.
    :return: None
    """
    sc = SparkContext(appName='PythonKMeans', master="mesos://219.224.135.91:5050")
    try:
        lines = sc.textFile(input_file)
        data = lines.map(parseKV).cache()

        # Records have the shape ((tid, term), tf).
        doc_term_tf = data.reduceByKey(add).cache()

        num_doc = doc_term_tf.map(lambda rec: rec[0][0]).distinct().count()
        terms_list = doc_term_tf.map(lambda rec: rec[0][1]).distinct().collect()
        num_term = len(terms_list)
        # terms_list is distinct, so this mapping equals terms_list.index(term).
        term_index = {term: position for position, term in enumerate(terms_list)}

        term_idf = doc_term_tf.map(
            lambda rec: (rec[0][1], 1.0)
        ).reduceByKey(add).mapValues(lambda df: math.log(float(num_doc) / (df + 1)))
        # Joined records have the shape (term, ((tid, tf), idf)).
        tfidf_join = doc_term_tf.map(
            lambda rec: (rec[0][1], (rec[0][0], rec[1]))).join(term_idf)
        tfidf = tfidf_join.map(
            lambda rec: (rec[1][0][0], (term_index[rec[0]], rec[1][0][1] * rec[1][1])))

        doc_vec = tfidf.groupByKey().mapValues(
            lambda feature: Vectors.sparse(num_term, feature).toArray()).cache()

        with open(result_file, 'w') as f:
            f.write('%s %s\r\n' % (num_doc, num_term))
            for _tid, feature in doc_vec.collect():
                for num in feature:
                    f.write(str(num) + "\t")
                f.write("\n")
    finally:
        sc.stop()

    return
Exemple #31
0
    # Score the decision-tree predictions against the "MPG" label column
    # using root mean squared error.
    decisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="rmse")
    rmse = decisionTree_model_evaluator.evaluate(
        decisionTree_model_predictions)
    print(
        "Root Mean Squared Error (RMSE) for Decision Tree on test data = %g" %
        rmse)
    # NOTE(review): "ecisionTree_model_evaluator" looks like a typo of
    # "decisionTree_model_evaluator". The chained assignment binds both names
    # to a new R2 evaluator, so r2_dt is an evaluator object, not a score;
    # the score itself is only computed inside the print call below.
    r2_dt = ecisionTree_model_evaluator = RegressionEvaluator(
        labelCol="MPG", predictionCol="prediction", metricName="r2")
    print("R Squared (R2) for Decision Tree on test data = %g" %
          r2_dt.evaluate(decisionTree_model_predictions))

    ############################---RANDOM FOREST REGRESSION---##################################

    # Convert each DataFrame row to a LabeledPoint: the last column is the
    # label and all preceding columns form the dense feature vector.
    train_rdd_rf = train_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
    test_rdd_rf = test_df.rdd.map(
        lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))

    # Empty categoricalFeaturesInfo: no feature is declared categorical.
    RandomForest_model = RandomForest.trainRegressor(
        train_rdd_rf,
        categoricalFeaturesInfo={},
        numTrees=50,
        featureSubsetStrategy="auto",
        maxDepth=10,
        maxBins=100)

    # Predict on the test features, then pair each true label with its
    # prediction.
    predictions = RandomForest_model.predict(
        test_rdd_rf.map(lambda x: x.features))
    labelsAndPredictions = test_rdd_rf.map(lambda lp: lp.label).zip(
        predictions)
Exemple #32
0
def parse(lp):
    """Parse a textual labeled point into a ``LabeledPoint``.

    The label is the text between the first ``'('`` and the first ``')'``;
    the features are the comma-separated items between the first ``'['`` and
    the first ``']'``.
    """
    label_text = lp[lp.find('(') + 1:lp.find(')')]
    feature_text = lp[lp.find('[') + 1:lp.find(']')]
    return LabeledPoint(float(label_text), Vectors.dense(feature_text.split(',')))
Exemple #33
0
import sys
# $example off$

if __name__ == "__main__":
    # Scale the features of a LibSVM file (path given as the first
    # command-line argument) in two ways and print both results.
    sc = SparkContext(appName="StandardScalerExample")  # SparkContext

    # $example on$
    data = MLUtils.loadLibSVMFile(sc, sys.argv[1])
    label = data.map(lambda point: point.label)
    features = data.map(lambda point: point.features)

    scaler1 = StandardScaler().fit(features)
    scaler2 = StandardScaler(withMean=True, withStd=True).fit(features)

    # data1 will be unit variance.
    data1 = label.zip(scaler1.transform(features))

    # data2 will be unit variance and zero mean. The features are converted
    # to dense vectors before being handed to the mean-centering scaler.
    dense_features = features.map(lambda vec: Vectors.dense(vec.toArray()))
    data2 = label.zip(scaler2.transform(dense_features))
    # $example off$

    for title, scaled in (("data1:", data1), ("data2:", data2)):
        print(title)
        for record in scaled.collect():
            print(record)

    sc.stop()
Exemple #34
0
# Persist the processed training data both as CSV (with header) and Parquet.
df_train.write.options(
    header="true").csv("hdfs://node1:9000/user/root/exp4/procd_train_real.csv")
df_train.write.parquet(
    "hdfs://node1:9000/user/root/exp4/procd_train_real.parquet")

# %%
# Fill in missing values.
# First strategy: fill every null in the last 8 features with 0.
df_train_filled = df_train.fillna(0)
df_train_filled.show()

# %%
# Convert the data into a suitable format.
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
# First convert to an RDD.
df_train_rdd = df_train_filled.rdd
# Reshape into (label, features) form: column 2 is the label and
# columns 3 onward are the features.
df_train_rdd = df_train_rdd.map(
    lambda line: LabeledPoint(line[2], Vectors.dense(line[3:])))

# %%
# Save in LibSVM file format so later training steps can load it easily.
from pyspark.mllib.util import MLUtils
MLUtils.saveAsLibSVMFile(df_train_rdd,
                         "hdfs://node1:9000/user/root/exp4/procd_train_real")

# %%
# Don't forget to stop the session.
spark.stop()
Exemple #35
0
from test_helper import Test
# Sanity check: the first three labels of irisDFZeroIndex must all be 0.
Test.assertEquals(irisDFZeroIndex.select('label').map(lambda r: r[0]).take(3), [0, 0, 0],
                  'incorrect value for irisDFZeroIndex')

# COMMAND ----------

# MAGIC %md
# MAGIC You'll also notice that we have four values for features and that those values are stored as a `SparseVector`.  We'll reduce those down to two values (for visualization purposes) and convert them to a `DenseVector`.  To do that we'll need to create a `udf` and apply it to our dataset.  Here's a `udf` reference for [Python](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.udf) and for [Scala](https://spark.apache.org/docs/latest/api/scala/index.html#org.apache.spark.sql.UserDefinedFunction).
# MAGIC  
# MAGIC Note that you can call the `toArray` method on a `SparseVector` to obtain an array, and you can convert an array into a `DenseVector` using the `Vectors.dense` method.

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import udf
# Note that VectorUDT and MatrixUDT are found in linalg while other types are in sql.types
# VectorUDT should be the return type of the udf
from pyspark.mllib.linalg import Vectors, VectorUDT

# Take the first two values from a SparseVector and convert them to a DenseVector
firstTwoFeatures = udf(lambda sv: Vectors.dense(sv.toArray()[:2]), VectorUDT())

# Apply the udf to the 'features' column, keep 'label' unchanged, and cache
# the resulting two-feature DataFrame.
irisTwoFeatures = irisDFZeroIndex.select(firstTwoFeatures('features').alias('features'), 'label').cache()
display(irisTwoFeatures)

# COMMAND ----------

# TEST
# The string form of the first row must match the expected two-feature
# DenseVector and label exactly.
Test.assertEquals(str(irisTwoFeatures.first()), 'Row(features=DenseVector([-0.5556, 0.25]), label=0.0)',
                  'incorrect definition of firstTwoFeatures')
Exemple #36
0
# Inspect the xyz coordinates of each atom in the file
t.xyz

# Inspect the current shape of the coordinate data
t.xyz.shape

# Keep only the first 1000 frames of xyz data
t_1k = t.xyz[:1000]

# Flatten every atom of every frame into a one-column row so Spark ML can
# run PCA on it.
# TODO: try to find a way to optimize the vectorization
from pyspark.mllib.linalg import Vectors
data = [(Vectors.dense(atom),) for frame in t_1k for atom in frame]

# Fit a 2-component PCA on the atom coordinates and look at the first
# projected row.
from pyspark.ml.feature import PCA
df = sqlContext.createDataFrame(data, ["features"])
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(df)
model.transform(df).collect()[0].pca_features


# Replace the data with a tiny two-row sample.
data = [(Vectors.dense([1.0, 0.0]),), (Vectors.dense([0.0, -1.0]),)]


### NEW PCA MODEL TO GET COMPONENTS AND EIGENVALUES
import numpy as np
Exemple #37
0
    lambda x: x).distinct().collect()
# Map every distinct feature to a 1-based index.
featur_index = {v: index for index, v in enumerate(featurs, 1)}
# NOTE(review): `.value` is read back immediately on the driver, so this name
# holds a plain dict rather than a Broadcast handle - confirm that is intended.
featur_index_value = sc.broadcast(featur_index).value

# Same 1-based indexing (and immediate `.value` read-back) for chi_index.
chi_index_map = {v: index for index, v in enumerate(chi_index, 1)}
chi_index_value = sc.broadcast(chi_index_map).value

# Write "<label> <indexed features>" and "<tel> <indexed features>" lines
# as text files.
rdd.map(lambda x: x.label + ' ' + get_feature_index(
    x.feature, featur_index_value)).saveAsTextFile('/user/zlj/tmp/cat3_libsvm')
rdd.map(lambda x: x.tel + ' ' + get_feature_index(
    x.feature, featur_index_value)).saveAsTextFile(
        '/user/zlj/tmp/cat3_libsvm_tel')

# Build LabeledPoints with 40000-dimensional sparse features from the same
# "<label> <indexed features>" lines.
# NOTE(review): MLUtils._parse_libsvm_line is a private helper - confirm it is
# safe to depend on.
lp=rdd.map(lambda x:x.label+' '+get_feature_index(x.feature,featur_index_value))\
    .map(lambda x:MLUtils._parse_libsvm_line(x))\
    .map(lambda x:LabeledPoint(x[0],Vectors.sparse(40000, x[1], x[2])))

# Fit a chi-squared selector that keeps 100 features.
model = ChiSqSelector(100).fit(lp)

# NOTE(review): the results of the next two expressions are discarded; they
# look like leftover interactive experiments.
lp.map(lambda x: (x[0], model.transform(x[1])))
model.transform(lp)

# NOTE(review): take(30)[0] is a single line (a string), so parallelize
# iterates over its characters - confirm this is what was meant.
sc.parallelize(
    sc.textFile('/user/zlj/tmp/cat3_libsvm/part-00092').take(30)
    [0]).saveAsTextFile('/user/zlj/tmp/test1')

# Second element of the parsed result for the 4th of the first 20 records of
# t1 (t1 is defined outside this view).
values = MLUtils._parse_libsvm_line(t1.take(20)[3])[1]


def check(value):
    size = len(value)
Exemple #38
0
    def fit(self, data, n_components, n_iter=100, ct=1e-3):
        """
        Estimate model parameters with the expectation-maximization
        algorithm.

        Parameters
        ----------
        data - RDD of data points (dense points are indexed directly;
            points exposing ``indices`` are treated as sparse vectors)
        n_components - Number of components
        n_iter - Maximum number of EM iterations. Defaults to 100
        ct - Threshold value to check the convergence criteria.
            Defaults to 1e-3

        Attributes
        ----------

        min_covar : Floor on the diagonal of the covariance matrix to prevent
            overfitting.  Set to 1e-3.

        converged : True once converged False otherwise.

        isSparse : 1 when the data points are sparse vectors, 0 otherwise.

        Weights : array of shape (1,  n_components)
            weights for each mixture component.

        Means : array of shape (n_components, n_dim)
            Mean parameters for each mixture component.

        Covars : array of shape (n_components, n_dim)
            Covariance parameters for each mixture component.
            Only diagonal covariance is supported.

        Returns
        -------
        self

        Raises
        ------
        ValueError : if the dataset is empty or has fewer points than
            ``n_components``.
        """
        sc = data.context
        self.converged = False
        self.min_covar = 1e-3

        #  observation statistics
        self.s0 = 0
        self.s1 = 0

        # Validate the size before touching data.first(), so an empty dataset
        # reports the intended error message.
        n_points = data.count()
        if n_points == 0:
            raise ValueError('Dataset cannot be empty')
        if n_points < n_components:
            raise ValueError(
                'Not possible to make (%s) components from (%s) datapoints' %
                (n_components, n_points))

        # Fetch the first point once; it gives both the dimensionality and
        # the sparse/dense flavour of the data.
        first_point = data.first()
        n_dim = first_point.size

        # Initialize Covars (diagonal covariance matrix)
        if hasattr(first_point, 'indices'):
            self.isSparse = 1

            def convert_to_kvPair(eachV):
                # Emit (index, (value, value^2)) for every stored entry.
                return [
                    (eachV.indices[k],
                     (eachV.values[k], eachV.values[k] * eachV.values[k]))
                    for k in range(eachV.indices.size)
                ]

            def computeVariance(x):
                # Var = E[x^2] - E[x]^2, averaging over all points.
                mean = x[1][0] / n_points
                sumSq = x[1][1] / n_points
                return x[0], sumSq - mean * mean

            kvPair = data.flatMap(convert_to_kvPair)
            res = kvPair.reduceByKey(np.add).map(computeVariance)
            cov = Vectors.sparse(n_dim, res.collectAsMap()).toArray() + self.min_covar
            self.Covars = np.tile(cov, (n_components, 1))

        else:
            self.isSparse = 0
            cov = []
            for i in range(n_dim):
                # The job runs immediately, so the closure sees the current i.
                cov.append(
                    data.map(lambda m: m[i]).variance() + self.min_covar)
            self.Covars = np.tile(cov, (n_components, 1))

        # Initialize Means using MLlib KMeans
        self.Means = np.array(KMeans().train(data,
                                             n_components).clusterCenters)
        # Initialize Weights with the value 1/n_components for each component
        self.Weights = np.tile(1.0 / n_components, n_components)
        #  EM algorithm
        # loop until number of iterations  or convergence criteria is satisfied
        for i in range(n_iter):

            logging.info("GMM running iteration %s " % i)
            # broadcasting means,covars and weights
            self.meansBc = sc.broadcast(self.Means)
            self.covarBc = sc.broadcast(self.Covars)
            self.weightBc = sc.broadcast(self.Weights)
            # Expectation Step
            EstepOut = data.map(self.scoreOnePoint)
            # Maximization step: sum the four per-point statistics element-wise
            MstepIn = EstepOut.reduce(
                lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]))
            self.s0 = self.s1
            self.mStep(MstepIn[0], MstepIn[1], MstepIn[2], MstepIn[3])

            #  Check for convergence.
            if i > 0 and abs(self.s1 - self.s0) < ct:
                self.converged = True
                logging.info("Converged at iteration %s" % i)
                break

        return self
Exemple #39
0
def load_cut_to_rdd(input_file, result_file, cluster_num=CLUSTER_NUM, clu_iter=CLUSTERING_ITER,\
        ini_iter=INITIAL_ITER, rb_iter=RB_ITER, con_dist=convergeDist, filter_scale=FILTER_SCALE):
    # Build TF-IDF document vectors from `input_file`, split them into
    # clusters by repeated bisection, and write the results to disk.
    #
    # Parameters:
    #   input_file   -- path handed to sc.textFile(); each line goes through
    #                   parseKV (defined elsewhere).
    #   result_file  -- path of the summary file: one header line (iteration
    #                   count, document count, term count) followed by the
    #                   retained terms, tab-separated.
    #   cluster_num  -- upper bound of the bisection loop,
    #                   range(2, cluster_num + 1).
    #   clu_iter     -- forwarded to clustering() as its last argument.
    #   ini_iter     -- number of trial runs for the initial split.
    #   rb_iter      -- number of trial runs per sub-cluster while bisecting.
    #   con_dist     -- forwarded to clustering() (presumably the convergence
    #                   distance -- confirm against clustering()).
    #   filter_scale -- controls the document-frequency band of kept terms.
    #
    # Returns None. Side effects: creates and stops a SparkContext, writes
    # `result_file` plus one 'results/cluster_<n>' file per remaining cluster,
    # and prints progress to stdout.
    #
    # NOTE: the Mesos master URL below is hard-coded.
    sc = SparkContext(appName='PythonKMeans',
                      master="mesos://219.224.135.91:5050")
    lines = sc.textFile(input_file)
    # The lambdas below unpack each record as ((tid, term), tf).
    data = lines.map(parseKV).cache()

    # Combine the values of identical (tid, term) keys with `add`
    # (presumably operator.add -- it is imported outside this function).
    doc_term_tf = data.reduceByKey(add).cache()

    # Number of distinct document ids.
    num_doc = doc_term_tf.map(lambda ((tid, term), tf): tid).distinct().count()

    # Per term, the number of (tid, term) pairs, i.e. in how many documents it
    # occurs. Despite the name this is a raw count, not yet an IDF weight.
    initial_term_idf = doc_term_tf.map(lambda ((tid, term), tf):
                                       (term, 1.0)).reduceByKey(add)
    # filter
    initial_num_term = initial_term_idf.count()
    print 'initial_num_term', initial_num_term
    idf_sum = initial_term_idf.values().sum()
    print 'idf_sum', idf_sum

    # Keep only terms whose document count lies strictly between idf_average
    # and idf_average * (filter_scale - 1), then turn the count into
    # log(num_doc / (count + 1)).
    idf_average = idf_sum / (initial_num_term * filter_scale)
    term_idf = initial_term_idf.filter(
        lambda (term, idf): idf_average < idf <
        (idf_average * (filter_scale - 1))).mapValues(
            lambda idf: math.log(float(num_doc) / (idf + 1)))
    # The position of a term in this driver-side list is its feature index.
    terms_list = term_idf.keys().collect()
    num_term = len(terms_list)
    print 'num_term', num_term

    # The join drops every (tid, term) pair whose term was filtered out above.
    tfidf_join = doc_term_tf.map(lambda ((tid, term), tf):
                                 (term, (tid, tf))).join(term_idf)
    # NOTE(review): terms_list.index(term) is a linear scan per record and the
    # whole list is captured by the closure.
    tfidf = tfidf_join.map(lambda (term, ((tid, tf), idf)):
                           (tid, (terms_list.index(term), tf * idf)))

    # One vector per document: the (index, weight) pairs are assembled with
    # Vectors.sparse, densified with toArray(), then wrapped in a csr_matrix.
    doc_vec = tfidf.groupByKey().mapValues(lambda feature: csr_matrix(
        Vectors.sparse(num_term, feature).toArray())).cache()
    # Sum of (vector / num_doc) over all documents, i.e. the mean vector.
    global_center = doc_vec.mapValues(lambda x: x / num_doc).values().reduce(
        add)
    g_length = vector_length(global_center)

    # initial 2-way clustering
    # Run clustering() ini_iter times and keep the centers with the largest
    # obj_value = total_variance[0] / ex_value. `K` is a module-level name,
    # not a parameter of this function.
    maximum_total_variance = 0
    best_kPoints = []
    print 'initial', now()
    for i in range(ini_iter):
        kPoints, tempDist, iter_count = clustering(doc_vec, K, con_dist,
                                                   clu_iter)
        # evaluation
        cluster_variance, total_variance = cluster_evaluation(doc_vec, kPoints)
        ex_value = external_evaluation(kPoints, global_center, g_length)
        obj_value = total_variance[0] / ex_value

        # choose the best initial cluster
        if obj_value > maximum_total_variance:
            maximum_total_variance = obj_value
            best_kPoints = kPoints
    # global_distance = sum(cosine_dist(best_kPoints[x][1], global_center, best_kPoints[x][2], g_length) for x in range(len(best_kPoints)))

    # Summary file: header line, then every retained term.
    # NOTE(review): iter_count is only bound inside the loop above, so
    # ini_iter == 0 raises NameError here. The handle is also not closed if a
    # write raises.
    f = open(result_file, 'w')
    f.write(
        str(iter_count) + "\t" + str(num_doc) + "\t" + str(num_term) + "\n")
    for index in range(len(terms_list)):
        f.write(terms_list[index].encode('utf-8') + '\t')
    # The triple-quoted string below is disabled debugging output kept as a
    # bare string expression; it has no effect at runtime.
    """
    for (term, ((tid,tf), idf)) in tfidf_join.collect():
        f.write(term.encode('utf-8')+'\t'+str(tid)+'\t'+str(tf)+'\t'+str(idf)+'\n')
    print >> f, "%0.9f" % tempDist
    print >> f, "total_variance", total_variance[0], total_variance[1]
    print >> f, "global_dist", global_distance
    f.write("center:"+"\t")
    for dim in global_center:
        f.write(str(dim)+"\t")
    f.write("\n")
    for i in range(len(best_kPoints)):
        f.write(str(i))
        for unit in best_kPoints[i][1]:
            f.write("\t")
            f.write(str(unit))
        f.write("\n")
    for (index, (dist, num)) in cluster_variance.collect():
        f.write(str(index))
        f.write("\t")
        f.write(str(dist))
        f.write("\t")
        f.write(str(num))
        f.write("\n")
    """
    f.close()
    # Repeated bisection: each pass removes the cluster stored under the
    # current best score and replaces it with its K sub-clusters.

    # Both dicts are keyed by a score: updated_dict maps score -> cluster RDD,
    # updated_points_dict maps score -> the centers that split that cluster.
    # They are seeded with the whole corpus under score 0.
    # NOTE(review): two clusters with an identical score overwrite each other.
    updated_dict = {}
    updated_points_dict = {}
    total_delta_variance = 0
    updated_dict[total_delta_variance] = doc_vec
    updated_points_dict[total_delta_variance] = best_kPoints

    print 'repeated', now()
    for j in range(2, cluster_num + 1):
        if not (total_delta_variance in updated_dict):
            print "no cluster to divide"
            break

        print 'cluster to divide', total_delta_variance, updated_dict[
            total_delta_variance]
        best_cluster = updated_dict[total_delta_variance]
        global_best_kPoints = updated_points_dict[total_delta_variance]
        del updated_dict[total_delta_variance]
        del updated_points_dict[total_delta_variance]
        # Tag every document of the chosen cluster with the index returned by
        # closestPoint() for the stored centers.
        closest = best_cluster.map(lambda (tid, feature): (closestPoint(
            feature, global_best_kPoints), (tid, feature))).cache()
        print 'total_count', closest.count()

        # Recompute the best score among the clusters still in the dict.
        total_delta_variance = float("-inf")  # reset to the lowest possible score
        for key in updated_dict:
            if key > total_delta_variance:
                total_delta_variance = key

        for i in range(K):
            # Documents assigned to center i.
            single_cluster = closest.filter(
                lambda (index, (tid, feature)): index == i).values().cache()
            print 'count', i, single_cluster.count()

            maximum_total_variance = 0
            best_kPoints = []
            # Score of the sub-cluster before it is split any further.
            in_value = cal_cluster_variance(single_cluster)
            ex_value = cosine_dist(global_best_kPoints[i][1], global_center,
                                   global_best_kPoints[i][2], g_length)
            initial_distance = in_value / ex_value
            # NOTE: this inner loop reuses the name `j` of the enclosing loop.
            # The outer range iterator is unaffected, but `j` no longer holds
            # the outer counter after this point.
            for j in range(rb_iter):
                # clustering
                kPoints, tempDist, iter_count = clustering(
                    single_cluster, K, con_dist, clu_iter)
                # evaluation
                cluster_variance, total_variance = cluster_evaluation(
                    single_cluster, kPoints)
                ex_value = external_evaluation(kPoints, global_center,
                                               g_length)
                obj_value = total_variance[0] / ex_value

                if obj_value > maximum_total_variance:
                    maximum_total_variance = obj_value
                    best_kPoints = kPoints

            # Gain from splitting this sub-cluster; it becomes the dict key.
            improvement = maximum_total_variance - initial_distance
            updated_dict[improvement] = single_cluster  # update dict
            updated_points_dict[improvement] = best_kPoints
            print 'improvement', improvement, maximum_total_variance, initial_distance

            if improvement > total_delta_variance:
                total_delta_variance = improvement
                print 'length', cluster_variance.count()

    # Dump every cluster still in the dict to 'results/cluster_<n>': first its
    # score and variance, then the non-zero entries of the summed vectors,
    # then one document id per line.
    count = 0
    for key in updated_dict:
        count += 1
        print 'key', key
        per_cluster = updated_dict[key]

        total_similarity = cal_cluster_variance(per_cluster)
        f = open('results/cluster_' + str(count), 'w')
        print >> f, key, total_similarity

        results_list = per_cluster.values().reduce(add).toarray()
        for row in results_list:
            for index in range(len(row)):
                value = row[index]
                if value != 0:
                    f.write('(' + str(index) + ',' + str(value) + ')\t')
        f.write('\n')
        # collect() pulls the whole cluster to the driver.
        for (tid, feature) in per_cluster.collect():
            f.write(tid)
            """
            for row in feature.toarray():
                for unit in range(len(row)):
                    f.write('\t')
                    f.write(str(row[unit]))
            """
            f.write('\n')
        f.close()

    sc.stop()
    return
from __future__ import print_function

# $example on$
from pyspark.ml.feature import PolynomialExpansion
from pyspark.mllib.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    # Obtain (or reuse) the Spark session this example runs in.
    session_builder = SparkSession.builder.appName("PolynomialExpansionExample")
    spark = session_builder.getOrCreate()

    # $example on$
    # Three 2-dimensional points, one per row, in a single "features" column.
    raw_points = [[-2.0, 2.3], [0.0, 0.0], [0.6, -1.1]]
    rows = [(Vectors.dense(point),) for point in raw_points]
    df = spark.createDataFrame(rows, ["features"])

    # Expand every feature vector into its degree-2 polynomial terms.
    px = PolynomialExpansion(degree=2,
                             inputCol="features",
                             outputCol="polyFeatures")
    polyDF = px.transform(df)

    expanded_rows = polyDF.select("polyFeatures").take(3)
    for expanded in expanded_rows:
        print(expanded)
    # $example off$

    spark.stop()
Exemple #41
0
        for i in range(1, k):
            if 'f:'+str(i) in line:
                indexList.append(i)
                valList.append(line['f:'+str(i)])
        label = int(line['l:'+str(col)])
        if label == -1:
            label = 0
        features.append((Vectors.sparse(k, indexList, valList),label))
    features = sc.parallelize(features)
    #sclines = sc.parallelize(lines)
    #features = sclines.map(featuresToSparseVecFromLine)
    featureDataFrame = spark.createDataFrame(features, ["features", "label"])
    pca = PCA(k=100, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(featureDataFrame)
    #pcaresult = model.transform(featureDataFrame).select("pcaFeatures").collect()
    #lp = []
    #c = 0
    #for com in pcaresult:
    #    lp.append(LabeledPoint(lines[c]['l:' + str(col)], mllibVectors.fromML(com.pcaFeatures)))
    #    c += 1
    #lp = sc.parallelize(lp)
    pcaresult = model.transform(featureDataFrame).rdd
    lp = pcaresult.map(lambda r: LabeledPoint(r.label, mllibVectors.fromML(r.pcaFeatures)))
    model = SVMWithSGD.train(lp)
    model.save(sc, "svm/SVM" + str(col))
    labelsAndPreds = lp.map(lambda p: (p.label, model.predict(p.features)))
    err = labelsAndPreds.filter(lambda (v, p): v != p).count() / float(parsedData.count())
    print("err at node " + str(col) + " = " + str(err))

sc.stop()
Exemple #42
0
        wordsFiltered.append(w)

# Collapse the filtered words into one lower-cased, space-separated string.
txt = " ".join(wordsFiltered).lower()

# A one-element RDD: the whole text becomes a single document whose id is the
# index assigned by zipWithIndex() and whose words are the space-split tokens.
data = sc.parallelize([
    txt
]).zipWithIndex().map(lambda val: Row(idd=val[1], words=val[0].split(" ")))

# Turn the word lists into term-count vectors.
docDF = spark.createDataFrame(data)
Vector = CountVectorizer(inputCol="words", outputCol="vectors")
model = Vector.fit(docDF)
result = model.transform(docDF)

# LDA.train takes [id, vector] pairs; Vectors.fromML converts each value of
# the "vectors" column before it is handed over.
corpus = result.select(
    "idd",
    "vectors").rdd.map(lambda val: [val[0], Vectors.fromML(val[1])]).cache()

# Cluster the documents into three topics using LDA
ldaModel = LDA.train(corpus, k=3, maxIterations=700, optimizer='online')
topics = ldaModel.topicsMatrix()
# Vocabulary learned by the CountVectorizer model.
vocabArray = model.vocabulary

wordNumbers = 5  # number of words per topic
# Top terms of every topic, re-distributed as an RDD.
topicIndices = sc.parallelize(
    ldaModel.describeTopics(maxTermsPerTopic=wordNumbers))


def topic_render(topic):  # specify vector id of words to actual words
    terms = topic[0]
    result = []
    for i in range(wordNumbers):
Exemple #43
0
sc = SparkContext(conf=conf)

# Load the tab-separated ratings file and keep its first three fields as
# Rating(int, int, float).
row_data = sc.textFile(
    "/user-program/python/MachineLearningSpark/Data/ml-100k/u.data")
row_ratings = row_data.map(lambda line: line.split('\t')).map(
    lambda r: Rating(int(r[0]), int(r[1]), float(r[2])))
print(row_ratings.first())

# Cache the ratings before they are passed to ALS.train.
row_ratings.cache()

# Factorize the ratings; the positional arguments are 50, 10 and 0.1
# (rank, iterations and lambda_ in the pyspark.mllib ALS.train signature).
als_model = ALS.train(row_ratings, 50, 10, 0.1)
# Wrap every factor array in a dense vector, then drop the ids.
# NOTE: the tuple-unpacking lambdas below are Python 2 only syntax.
movie_factors = als_model.productFeatures().map(lambda (id, factor):
                                                (id, Vectors.dense(factor)))
movie_vectors = movie_factors.map(lambda (id, vector): vector)
#print(movie_vectors.first())
user_factors = als_model.userFeatures().map(lambda (id, factor):
                                            (id, Vectors.dense(factor)))
user_vectors = user_factors.map((lambda (id, vector): vector))
#print(user_vectors.first())

# Cluster the movie factor vectors into 5 groups.
movie_cluster_model = KMeans().train(movie_vectors,
                                     k=5,
                                     maxIterations=10,
                                     runs=3)
print("movie cluster model kmeans :")
print(movie_cluster_model)
user_cluster_model = KMeans().train(user_vectors,
Exemple #44
0
 def parseTrainingData(line):
     """Parse a comma-separated line into a dense vector of its first two fields."""
     fields = line.split(",")
     first = float(fields[0])
     second = float(fields[1])
     return Vectors.dense([first, second])
 def __str__(self):
     """Render the point as ``(label,features)`` using Vectors.stringify for the features."""
     label_text = str(self.label)
     features_text = Vectors.stringify(self.features)
     return "(" + ",".join((label_text, features_text)) + ")"
Exemple #46
0
    .appName("KMeans") \
    .config("spark.some.config.option", "Angadpreet-KMeans") \
    .getOrCreate()
# Reference date used to turn "yelping_since" into an age in days.
today = dt.datetime.today()
# Read three columns of the user dataset, replace the date with the number of
# days elapsed until `today`, collect everything to the driver, keep the first
# 1200 rows and distribute them again.
# NOTE(review): collect() materializes the full dataset on the driver before
# the slice is taken.
spark_df = sc.parallelize(
    spark.read.json("Data/yelp_academic_dataset_user.json").select(
        "review_count", "average_stars", "yelping_since").rdd.map(lambda x: (x[
            0], x[1], (today - par.parse(x[2])).days)).collect()[:1200])
# Min-max scaler reading column "_1" and writing column "scaled_1".
scaler = MinMaxScaler(inputCol="_1",\
         outputCol="scaled_1")
# Getting the input data
# Each 3-tuple becomes one dense vector wrapped in a 1-tuple, so toDF() yields
# a single column (presumably auto-named "_1", matching the scaler's inputCol).
trial_df = spark_df.map(lambda x: pyspark.ml.linalg.Vectors.dense(x)).map(
    lambda x: (x, )).toDF()
scalerModel = scaler.fit(trial_df)
# NOTE(review): x here is a Row holding one vector; confirm that
# Vectors.dense(x) yields the intended flat 3-element vector.
vector_df = scalerModel.transform(trial_df).select("scaled_1").rdd.map(
    lambda x: Vectors.dense(x))

# Initialize GMM
# Train a 4-component Gaussian mixture and print the elapsed training time.
start = timer()
gmm = GaussianMixture.train(vector_df, k=4, maxIterations=20, seed=2018)
end = timer()
print(end - start)
# Collect every vector together with its predicted component, one row at a
# time, into a pandas DataFrame.
# NOTE(review): the row list is [features, cluster]; verify it lines up with
# the column order pandas derives from the dict above.
df = pandas.DataFrame({'features': [], 'cluster': []})
i = 0
for v in vector_df.collect():
    df.loc[i] = [[float(v[0]), float(v[1]), float(v[2])], int(gmm.predict(v))]
    i += 1

# NOTE: print statement, Python 2 only syntax.
print df

# Round-trip through Spark to get a list of (first column, int(second column)).
err = spark.createDataFrame(df).rdd.map(lambda x: (x[0], int(x[1]))).collect()
Exemple #47
0
## Notice the differences between the uncorrelated plots (PCA uniform, PCA gaussian2)
## and the source plots (Uniform, Gaussian). In the Gaussian case they look alike, while
## the uncorrelated Uniform data still needs a rotation to match its source. By removing
## the correlation in the Gaussian case, we have achieved independence between the variables.
## If the source variables are Gaussian, ICA is not required and PCA is sufficient.
    
    
# Code for PCA and whitening the dataset.

from pyspark.mllib.linalg.distributed import IndexedRowMatrix, IndexedRow, BlockMatrix
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.linalg import Vectors, DenseMatrix, Matrix
from sklearn import datasets
# Create the standardizer model used to center the dataset
# (withMean=True subtracts the mean, withStd=False leaves the scale as is).

# One dense vector per row of X.
X_rdd = sc.parallelize(X).map(lambda x:Vectors.dense(x) )
# NOTE(review): the scaler is fitted on `iris_rdd` but applied to `X_rdd`
# below; `iris_rdd` is not defined in the visible code -- confirm this is the
# intended dataset.
scaler = StandardScaler(withMean = True, withStd = False).fit(iris_rdd)

X_sc = scaler.transform(X_rdd)


# Create the IndexedRowMatrix from the RDD: zipWithIndex() yields
# (vector, index), which is swapped into (index, vector).
X_rm = IndexedRowMatrix(X_sc.zipWithIndex().map(lambda x: (x[1], x[0])))

# Compute the SVD factorization of the matrix. The first argument is the
# number of singular values to keep (here: the number of columns); the second
# is a boolean stating whether to compute U or not.
svd_o = X_rm.computeSVD(X_rm.numCols(), True)

# svd_o.V has shape n * k, not k * n (as in sklearn).

# Local copy of V as an array.
P_comps = svd_o.V.toArray().copy()
Exemple #48
0
    movie_factors = cvModel.bestModel.itemFactors
    print movie_factors
    movie_factors.show()

    movie_factors.registerTempTable('movie_factors')

    midDF = sqlContext.sql("""
        SELECT id, features
        FROM movie_factors
        """)

    midRDD = midDF.rdd
    #midRDD.collect()
    vectorRDD = midRDD.map(
        lambda (x, y): Row(id=x, features=Vectors.dense(y))).cache()
    vectorRDD.collect()
    kmeans_input = sqlContext.createDataFrame(vectorRDD).cache()
    kmeans = KMeans(featuresCol="features",
                    predictionCol="prediction").setK(50)
    kmeans_df = kmeans.fit(kmeans_input)

    kmeans_transformed = kmeans_df.transform(kmeans_input)
    kmeans_transformed.show()

    kmeans_transformed.registerTempTable('kmeans_table')

    movie_items = sc.textFile("u.item")
    movienameRDD = movie_items.map(lambda x: x.split('|')).map(
        lambda p: Row(movieId=int(p[0]), movieName=p[1]))
    movienamesDF = sqlContext.createDataFrame(movienameRDD).cache()
Exemple #49
0
# Vector assembler: concatenates the nine encoded columns into "features".
fAssembler = VectorAssembler(
    inputCols=["C1Vector", "C15Vector", "C16Vector", "C18Vector", "C19Vector", "C21Vector", "i_app_category_Vector", "i_device_type_Vector", "i_site_category_Vector"],
    outputCol="features")

# Pipeline chaining the stages defined earlier (the *I and *E objects,
# described by the original author as StringIndexers and OneHotEncoders)
# followed by the VectorAssembler.
data_P = Pipeline(stages=[c1I, c15I, c16I, c18I, c19I, c21I, appcatI, devtypeI, sitecatI, 
	c1E, c15E, c16E, c18E, c19E, c21E, appcatE, devtypeE, sitecatE, fAssembler])

# `model` is the fitted pipeline here; it is rebound to the classifier below.
model = data_P.fit(df)
data_t = model.transform(df)

###### Part 1 ends here #####

# Build LabeledPoints for logistic regression: label = click as float,
# features = the assembled vector converted to a dense mllib vector.
parsedData=data_t.select('click', 'features').rdd.map(lambda row: LabeledPoint(float(row.click),Vectors.dense((row.features).toArray())))

# Split the dataset 60/40 into training and test sets.
# NOTE: `11L` is a Python 2 long literal and a syntax error on Python 3.
training,test = parsedData.randomSplit([0.6, 0.4], seed=11L)
training.cache()

# Train logistic regression with Stochastic Gradient Descent (SGD):
# step 0.1, mini-batch fraction 0.1, no regularization (regType=None).
model = LogisticRegressionWithSGD.train(training, step=0.1, miniBatchFraction=0.1, regType=None)


##### PART 3 ######
# Using the stochastic gradient descent solution
# Test the model using the test data - Getting the Accuracy , FPR and AU - ROC

# 1- Accuracy
# Pairs are ordered (prediction, label).
labelsAndPreds = test.map(lambda p: (float(model.predict(p.features)), p.label))
Exemple #50
0
def parse_line(line):
	"""Turn one comma-separated record into a LabeledPoint.

	The last field is the label; every preceding field is a feature.
	"""
	fields = line.split(',')
	target = float(fields[-1])
	feature_values = [float(field) for field in fields[:-1]]
	return LabeledPoint(target, Vectors.dense(feature_values))
Exemple #51
0
      .filter(lambda year: year[17] in ['2015', '2014', '2013', '2012', '2011'])\
      .map(lambda x: ((x[2][0:2] + x[2][5:10]), x[10]))
  
  # identify all beats
  beats = lines.map(lambda x: x[1])\
      .distinct().collect()
  
  # key = beats, values = list of crime month/year
  unfilled = lines.reduceByKey(lambda x, y: x + "," + y)\
      .map(lambda x: (x[0], x[1].split(",")))

  # count number of crimes per day per beat, fill no-crime values with zero
  filled = unfilled.map(lambda x: (x[0], fill(x[1], beats)))

  # convert to vectors
  vectors = filled.map(lambda x: Vectors.dense(x[1]))

  # calculate correlation
  pearsonCorr = Statistics.corr(vectors)

  # identify top 30 correlated beats
  pearsonCorr = pd.DataFrame(pearsonCorr, index = beats, columns = beats)
  unstacked = pearsonCorr.unstack()
  unstacked = pd.DataFrame(unstacked).reset_index()
  unstacked.columns = ["beat1", "beat2", "correlation"]
  unstacked = unstacked[unstacked.beat1 != unstacked.beat2]
  final = unstacked.nlargest(300, "correlation")

  # write final to csv
  final.to_csv("greenwood_2b.csv", index=False)
Exemple #52
0
def to_sparse(v):
  """Return a sparse vector of size ``v.size`` holding only the non-zero entries of *v*."""
  nonzero = {}
  for position, entry in enumerate(v):
    if entry != 0:
      nonzero[position] = entry
  return Vectors.sparse(v.size, nonzero)
Exemple #53
0
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession	
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForestModel
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.evaluation import MulticlassMetrics
from prettytable import PrettyTable

sc = SparkContext()
spark = SparkSession(sc)
# Load the semicolon-separated CSV with a header row and inferred column types.
inputDF = spark.read.csv('s3://assignmentcs643/TrainingDataset.csv',header='true', inferSchema='true', sep=';')


# Last column is the label; all preceding columns are the features.
datadf= inputDF.rdd.map(lambda row: LabeledPoint(row[-1], Vectors.dense(row[0:-1])))
# Load a previously saved random-forest model.
model = RandomForestModel.load(sc,"s3://assignmentcs643/randomforestmodel.model")

# Predict on the feature vectors only.
predictions = model.predict(datadf.map(lambda x: x.features))

# Pair every label with its prediction, ordered (label, prediction).
labels_and_predictions = datadf.map(lambda x: x.label).zip(predictions)
# Fraction of rows whose prediction equals the label.
acc = labels_and_predictions.filter(lambda x: x[0] == x[1]).count() / float(datadf.count())


# NOTE(review): MulticlassMetrics documents its input as (prediction, label)
# pairs, while this RDD is (label, prediction) -- confirm the order before
# using any per-label metric.
metrics = MulticlassMetrics(labels_and_predictions)
f1 = metrics.fMeasure()
recall = metrics.recall()
precision = metrics.precision()

# Report the accuracy as a percentage.
print("Model accuracy: %.3f%%" % (acc * 100))
Exemple #54
0
def processFeatures(raw):
    """Parse one whitespace-separated input line into ``(book_id, [features])``.

    The first token is the integer book id; every following token is parsed as
    a float feature. Raises IndexError on a blank line and ValueError on a
    token that is not numeric.
    """
    tokens = raw.split()
    return (int(tokens[0]), [float(token) for token in tokens[1:]])


def formatSimilarityRow(index, values):
    """Render one result row as ``"<index> <v0> <v1> ..."``.

    The row index and every value are separated by a single space, so the
    index can always be told apart from the first value.
    """
    return ' '.join([str(index)] + [str(value) for value in values])


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(
            "Usage: spark-submit generate_similarity_matrix.py <input path to hdfs file> <hdfs output path>",
            file=sys.stderr)
        # sys.exit is always available; the bare exit() helper is injected by
        # the site module and is not guaranteed to exist.
        sys.exit(-1)

    sc = SparkContext(appName="BookRecSystem")
    spark = SQLContext(sc)
    # Convert and process raw input to (bookid, [features]).
    featureRdd = sc.textFile(sys.argv[1])
    featureRdd = featureRdd.map(processFeatures)
    # (row index, dense vector) pairs, the row format IndexedRowMatrix accepts.
    data = featureRdd.map(lambda x: (x[0], Vectors.dense(x[1])))
    # Block-matrix form is needed for the distributed multiplication below.
    mat = IndexedRowMatrix(data).toBlockMatrix()
    # mat * mat^T holds the pairwise dot products of the rows.
    # NOTE(review): this equals cosine similarity only if the input rows are
    # already L2-normalized; no normalization happens here.
    dot = mat.multiply(mat.transpose()).toIndexedRowMatrix().rows.map(
        lambda x: (x.index, x.vector.toArray())).sortByKey().map(
            lambda x: formatSimilarityRow(x[0], x[1]))
    dot.saveAsTextFile(sys.argv[2])  # save output
    sc.stop()
#creation of model using mllib 
from pyspark.mllib.linalg import Vectors
from pyspark.ml.regression import RandomForestRegressor
from pyspark.mllib.regression import LabeledPoint
from pyspark import SparkContext, SparkConf
from pyspark.sql.session import SparkSession	
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.tree import RandomForest






# Start (or attach to) the Spark session used for training.
session_builder = SparkSession.builder.appName('wine_model')
spark_session = session_builder.getOrCreate()

# Load the semicolon-separated training CSV with a header row and inferred types.
file1 = spark_session.read.csv(
    's3://cloud-proj2/TrainingDataset.csv',
    header='true',
    inferSchema='true',
    sep=';')

# Every column name except the 'quality' target.
select_col = [name for name in file1.columns if name != 'quality']

# Last column is the label; all preceding columns are the features.
data_set = file1.rdd.map(
    lambda record: LabeledPoint(record[-1], Vectors.dense(record[0:-1])))

# Train a 10-class random forest and persist it.
forest_options = dict(
    numClasses=10,
    categoricalFeaturesInfo={},
    numTrees=50,
    maxBins=64,
    maxDepth=20,
    seed=33)
model = RandomForest.trainClassifier(data_set, **forest_options)
model.save(spark_session.sparkContext, "s3://cloud-proj2/model_created.model")

                    rating=temp['rating']))

        cats = (set(
            pd.read_csv('yelp_dataset/cat100.csv', squeeze=True).unique()) -
                regions - {'Food', 'Restaurants'})
        v = v[v['categories'].isin(cats)]

        le = LabelEncoder()
        v['categories'] = le.fit_transform(v['categories'])

        v2 = v.groupby(level=0).apply(
            lambda g: {x: y
                       for x, y in zip(g['categories'], g['rating'])})

        rdd = sc.parallelize(
            v2.tolist()).map(lambda x: Vectors.sparse(len(cats), x))
        rdd.cache()
        mat = RowMatrix(rdd)
        svd = mat.computeSVD(len(regions), computeU=True)
        U = svd.U  # The U factor is a RowMatrix.
        s = svd.s  # The singular values are stored in a local dense vector.
        V = svd.V  # The V factor is a local dense matrix.
        vectors = V.toArray()

        cat_df = pd.DataFrame(
            {'category': le.inverse_transform(np.arange(vectors.shape[0]))})
        cluster = AgglomerativeClustering(n_clusters=len(regions),
                                          affinity='cosine',
                                          linkage='complete')
        cat_df = cat_df.assign(cat34_label=cluster.fit_predict(
            vectors)).set_index('category').cat34_label
Exemple #57
0
# -*- coding: utf-8 -*-
from pyspark import SparkContext
from pyspark.mllib.clustering import LDA, LDAModel
from pyspark.mllib.linalg import Vectors
from pyspark.sql import SQLContext, Row
sc = SparkContext()
# Input file is a term-document matrix generated by make_tdm.py.
data = sc.textFile(
    "/Users/Zhen/Desktop/Courses/BigData/stackexchange/topicModeling/result/matrix.csv"
)
header = data.first()  # first line of the file, treated as the header
# Drops every line that is equal to the header text, not only the first one.
data = data.filter(lambda x: x != header)
# Each remaining line becomes a dense vector of its comma-separated floats.
data = data.map(
    lambda line: Vectors.dense([float(x) for x in line.strip().split(',')]))

# Index documents with unique IDs; each element becomes [id, vector].
corpus = data.zipWithIndex().map(lambda x: [x[1], x[0]]).cache()

# Cluster the documents into k = 30 topics using LDA.
ldaModel = LDA.train(corpus, k=30)

# Output topics. Each is a distribution over words (matching word count vectors)
print("Learned topics (as distributions over vocab of " +
      str(ldaModel.vocabSize()) + " words):")
topics = ldaModel.topicsMatrix()
# for topic in range(3):
#     print("Topic " + str(topic) + ":")
#     for word in range(0, ldaModel.vocabSize()):
#         print(" " + str(topics[word]))

import numpy
Exemple #58
0
# Replace the location `l` of every (d, h, l) record with the result of
# locate(), which receives a KDTree over the 13 coordinate pairs below and the
# 13 matching labels; the output tuple is reordered to (label, d, h).
# NOTE(review): the KDTree is constructed inside the lambda, i.e. once per
# record. NOTE: tuple-unpacking lambdas are Python 2 only syntax.
located = remapped.map(lambda (d, h, l): (locate(l, \
spatial.KDTree(array( \
[[37.7816834,-122.3887657],\
[37.7469112,-122.4821759],\
[37.7411022,-120.804151],\
[37.4834543,-122.3187302],\
[37.7576436,-122.3916382],\
[37.7970013,-122.4140409],\
[37.748496,-122.4567461],\
[37.7288155,-122.4210133],\
[37.5839487,-121.9499339],\
[37.7157156,-122.4145311],\
[37.7329613,-122.5051491],\
[37.7575891,-122.3923824],\
[37.7521169,-122.4497687]])),
                                                 ["SF18", "SF04", "SF15", "SF17", "SF36", "SF37",\
"SF07", "SF11", "SF12", "SF14", "SF16", "SF19", "SF34"] ),d,h))

# Count the records per (l, d, h) key.
counted = located.map(lambda (l, d, h): ((l, d, h), 1))
incidentsreduced = counted.reduceByKey(lambda a, b: a + b)

# Join with `windaveraged` (built earlier); the lambda below unpacks each
# joined record as ((s, d, h), ((t, w), i)).
joined = windaveraged.join(incidentsreduced)

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.stat import Statistics

# One [t, w, i] vector per joined key, then print the correlation matrix
# (Statistics.corr is called without an explicit method argument).
vecs = joined.map(lambda ((s, d, h), ((t, w), i)): Vectors.dense([t, w, i]))
print(Statistics.corr(vecs))
Exemple #59
0
 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
 )
 print("Start Creating Customer Preferences Block Matrix")
 print(
     "###################################################################################################"
 )
 index_ct = customer_persona.drop("analytic_id")
 index_anaId = customer_persona.select("id", "analytic_id")
 index_ct.registerTempTable("index_ct")
 ontop_pref_price = ontop_preferences.select("id", "Price_XS", "Price_S",
                                             "Price_M", "Price_L",
                                             "Price_XL")
 ontop_pref_price = ontop_pref_price.orderBy(asc("id"))
 bmB_1 = IndexedRowMatrix(
     ontop_pref_price.rdd.map(lambda x: IndexedRow(x[0], Vectors.dense(x[
         1:])))).toBlockMatrix(rowsPerBlock=222)
 count = customer_persona.count()
 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
 )
 print("Finished Creating Customer Preferences Block Matrix")
 print(
     "###################################################################################################"
 )
 loop = int(count / 200000)
 startId = 1
 i = 0
 res = index_ct
 del customer_persona
 print(
     "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n###################################################################################################"
Exemple #60
0
from pyspark import SparkConf, SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayesModel, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.rdd import RDD

# Local Spark context for this example.
conf = SparkConf().setAppName("myApp").setMaster("local")
sc = SparkContext(conf=conf)

# A dense 6-element vector and a sparse 6-element vector whose entries at
# indices 0, 1, 2, 3 and 5 are 1.
vMale = Vectors.dense(1, 0, 1, 0, 1, 0)
length = 6
index = [0, 1, 2, 3, 5]
values = [1, 1, 1, 1, 1]
vFemale = Vectors.sparse(length, index, values)

# Three training points: one with label 1.0 and two with label 2.0.
train_one = LabeledPoint(1.0, vMale)
train_two = LabeledPoint(2.0, vFemale)
train_three = LabeledPoint(2.0, Vectors.dense(0, 1, 1, 1, 0, 1))

trains = list()
trains.append(train_one)
trains.append(train_two)
trains.append(train_three)
trainingRDD = sc.parallelize(trains)
# NOTE(review): `nb` is not used in the visible lines; training goes through
# the NaiveBayes.train call on the class below.
nb = NaiveBayes()
nb_model = NaiveBayes.train(trainingRDD)

evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# Raw feature values of a test sample.
dTest = [0, 1, 1, 0, 0, 1]