Example #1
    def test_save_load_simple_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)
        self.assertEqual(loadedCV.getEstimatorParamMaps(), cv.getEstimatorParamMaps())

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #2
    def test_save_load_simple_estimator(self):
        # This tests saving and loading the trained model only.
        # Save/load for TrainValidationSplit will be added later: SPARK-13786
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        tvsModel = tvs.fit(dataset)

        tvsPath = temp_path + "/tvs"
        tvs.save(tvsPath)
        loadedTvs = TrainValidationSplit.load(tvsPath)
        self.assertEqual(loadedTvs.getEstimator().uid, tvs.getEstimator().uid)
        self.assertEqual(loadedTvs.getEvaluator().uid, tvs.getEvaluator().uid)
        self.assertEqual(loadedTvs.getEstimatorParamMaps(), tvs.getEstimatorParamMaps())

        tvsModelPath = temp_path + "/tvsModel"
        tvsModel.save(tvsModelPath)
        loadedModel = TrainValidationSplitModel.load(tvsModelPath)
        self.assertEqual(loadedModel.bestModel.uid, tvsModel.bestModel.uid)
Example #3
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])
        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                                   collectSubModels=True)
        tvsModel = tvs.fit(dataset)
        self.assertEqual(len(tvsModel.subModels), len(grid))

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testTrainValidationSplitSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        tvsModel.save(savingPathWithSubModels)
        tvsModel3 = TrainValidationSplitModel.load(savingPathWithSubModels)
        self.assertEqual(len(tvsModel3.subModels), len(grid))
        tvsModel4 = tvsModel3.copy()
        self.assertEqual(len(tvsModel4.subModels), len(grid))

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        tvsModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        tvsModel2 = TrainValidationSplitModel.load(savingPathWithoutSubModels)
        self.assertEqual(tvsModel2.subModels, None)

        for i in range(len(grid)):
            self.assertEqual(tvsModel.subModels[i].uid, tvsModel3.subModels[i].uid)
Example #4
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
Example #5
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     dt = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=dt, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
Example #6
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #7
    def test_offset(self):

        df = self.spark.createDataFrame(
            [(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0)),
             (0.5, 2.1, 0.5, Vectors.dense(1.0, 2.0)),
             (0.9, 0.4, 1.0, Vectors.dense(2.0, 1.0)),
             (0.7, 0.7, 0.0, Vectors.dense(3.0, 3.0))], ["label", "weight", "offset", "features"])

        glr = GeneralizedLinearRegression(family="poisson", weightCol="weight", offsetCol="offset")
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [0.664647, -0.3192581],
                                    atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, -1.561613, atol=1E-4))
Example #8
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #9
    def test_binomial_logistic_regression_with_bound(self):

        df = self.spark.createDataFrame(
            [(1.0, 1.0, Vectors.dense(0.0, 5.0)),
             (0.0, 2.0, Vectors.dense(1.0, 2.0)),
             (1.0, 3.0, Vectors.dense(2.0, 1.0)),
             (0.0, 4.0, Vectors.dense(3.0, 3.0)), ], ["label", "weight", "features"])

        lor = LogisticRegression(regParam=0.01, weightCol="weight",
                                 lowerBoundsOnCoefficients=Matrices.dense(1, 2, [-1.0, -1.0]),
                                 upperBoundsOnIntercepts=Vectors.dense(0.0))
        model = lor.fit(df)
        self.assertTrue(
            np.allclose(model.coefficients.toArray(), [-0.2944, -0.0484], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.0, atol=1E-4))
Example #10
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #11
 def test_kmeans_summary(self):
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 1)
Example #12
def reduce(inputpath, alg, k):
	n_data = 0
	n_features = 0
	result = "successful!"
	inputdir = os.path.dirname(inputpath)
	print "inputdir: " + inputdir + result
	inputfile = open(inputpath, 'r')
	for line in inputfile:
		input_n = len(line.split(" "))
		n_data += 1
		#print "Selected data set has " + str(input_n) + " features"
		#break
	inputfile.close()

	# result = "File: " + os.path.basename(output_data) + '</br>'
	# result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
	# result += "Dimension: " + str(n_data) + " x " + str(n_features) + "</br>"
	# context = {'result': result}
	# yield context

	if int(k) >= input_n:
		print "reduced features must be smaller than input features."
		result = "reduced features must be smaller than input features."
	else:
		# os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'")
		# conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
		# sc = SparkContext(conf=conf)
		# sqlContext = SQLContext(sc)
		lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
		lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
		df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

		if alg == "pca":
			output_data = pca(inputdir, df, alg, k)
			#os.system("spark-submit /home/ubuntu/yi-imPro/imagepro/pca.py " + inputpath + " " + k)

		output_data = inputdir + "/" + alg + str(k) + "_Data"
		inputfile = open(output_data, 'r')
		file_size = str(os.stat(output_data).st_size)
		counter = 0
		n_features = '0'
		for line in inputfile:
			input_n = len(line.split(" "))
			n_features = str(input_n)
			counter += 1

		inputfile.close()
		n_data = str(counter)

		result = "File: " + os.path.basename(output_data) + '</br>'
		result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
		result += "Dimension: " + n_data + " x " + n_features + "</br>"
		result += "Size: " + file_size + ' bytes'
		print result
		# sc.stop()

	print "Dimension reduction finished!"

	context = {'n_data': n_data, 'n_features': n_features, 'result': result}
	return context
Example #13
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #14
    def test_tweedie_distribution(self):

        df = self.spark.createDataFrame(
            [(1.0, Vectors.dense(0.0, 0.0)),
             (1.0, Vectors.dense(1.0, 2.0)),
             (2.0, Vectors.dense(0.0, 0.0)),
             (2.0, Vectors.dense(1.0, 1.0)), ], ["label", "features"])

        glr = GeneralizedLinearRegression(family="tweedie", variancePower=1.6)
        model = glr.fit(df)
        self.assertTrue(np.allclose(model.coefficients.toArray(), [-0.4645, 0.3402], atol=1E-4))
        self.assertTrue(np.isclose(model.intercept, 0.7841, atol=1E-4))

        model2 = glr.setLinkPower(-1.0).fit(df)
        self.assertTrue(np.allclose(model2.coefficients.toArray(), [-0.6667, 0.5], atol=1E-4))
        self.assertTrue(np.isclose(model2.intercept, 0.6667, atol=1E-4))
Example #15
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #16
 def test_kmean_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     data = [(Vectors.dense([0.0, 0.0]),), (Vectors.dense([1.0, 1.0]),),
             (Vectors.dense([9.0, 8.0]),), (Vectors.dense([8.0, 9.0]),)]
     df = self.spark.createDataFrame(data, ["features"])
     kmeans = KMeans(k=2, seed=1)
     model = kmeans.fit(df)
     path = tempfile.mkdtemp()
     km_path = path + "/km-pmml"
     model.write().format("pmml").save(km_path)
     pmml_text_list = self.sc.textFile(km_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #17
    def test_vector_size_hint(self):
        df = self.spark.createDataFrame(
            [(0, Vectors.dense([0.0, 10.0, 0.5])),
             (1, Vectors.dense([1.0, 11.0, 0.5, 0.6])),
             (2, Vectors.dense([2.0, 12.0]))],
            ["id", "vector"])

        sizeHint = VectorSizeHint(
            inputCol="vector",
            handleInvalid="skip")
        sizeHint.setSize(3)
        self.assertEqual(sizeHint.getSize(), 3)

        output = sizeHint.transform(df).head().vector
        expected = DenseVector([0.0, 10.0, 0.5])
        self.assertEqual(output, expected)
Example #18
 def test_parallel_evaluation(self):
     dataset = self.spark.createDataFrame(
         [(Vectors.dense([0.0]), 0.0),
          (Vectors.dense([0.4]), 1.0),
          (Vectors.dense([0.5]), 0.0),
          (Vectors.dense([0.6]), 1.0),
          (Vectors.dense([1.0]), 1.0)] * 10,
         ["features", "label"])
     lr = LogisticRegression()
     grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
     evaluator = BinaryClassificationEvaluator()
     tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
     tvs.setParallelism(1)
     tvsSerialModel = tvs.fit(dataset)
     tvs.setParallelism(2)
     tvsParallelModel = tvs.fit(dataset)
     self.assertEqual(tvsSerialModel.validationMetrics, tvsParallelModel.validationMetrics)
Example #19
 def test_clustering_evaluator_with_cosine_distance(self):
     featureAndPredictions = map(lambda x: (Vectors.dense(x[0]), x[1]),
                                 [([1.0, 1.0], 1.0), ([10.0, 10.0], 1.0), ([1.0, 0.5], 2.0),
                                  ([10.0, 4.4], 2.0), ([-1.0, 1.0], 3.0), ([-100.0, 90.0], 3.0)])
     dataset = self.spark.createDataFrame(featureAndPredictions, ["features", "prediction"])
     evaluator = ClusteringEvaluator(predictionCol="prediction", distanceMeasure="cosine")
     self.assertEqual(evaluator.getDistanceMeasure(), "cosine")
     self.assertTrue(np.isclose(evaluator.evaluate(dataset),  0.992671213, atol=1e-5))
Example #20
def convert_to_flat_by_sparkpy_v3(df):
    vectorize = udf(lambda vs: Vectors.dense(list(chain.from_iterable(vs))), VectorUDT())
    spark_df = df
    spark_df = df.orderBy("key", "subkey")
    spark_df = spark_df.groupBy("key").agg(first(col("parameter")).alias("label"), collect_list("reference").alias("features"))
    spark_df = spark_df.withColumn('features', vectorize('features'))
    spark_df = spark_df.select("label", "features")
    return spark_df
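
A hedged usage sketch for the helper above. The toy DataFrame, its column names, and the imports are assumptions about the schema the function appears to expect (a `key`/`subkey` ordering, a per-key `parameter` label, and array-valued `reference` rows); they are not part of the original snippet.

from itertools import chain

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, first, collect_list
from pyspark.ml.linalg import Vectors, VectorUDT

spark = SparkSession.builder.getOrCreate()

# Two keys ("a" and "b"); each key's references get flattened into one dense vector.
toy = spark.createDataFrame(
    [("a", 1, 0.0, [1.0, 2.0]), ("a", 2, 0.0, [3.0, 4.0]),
     ("b", 1, 1.0, [5.0, 6.0]), ("b", 2, 1.0, [7.0, 8.0])],
    ["key", "subkey", "parameter", "reference"])

# Each key collapses to one row: label = first parameter, features = flattened references.
convert_to_flat_by_sparkpy_v3(toy).show(truncate=False)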
Example #21
 def test_gaussian_mixture_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     gmm = GaussianMixture(k=2)
     model = gmm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertTrue(isinstance(s.probability, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 3)
Example #22
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #23
    def test_type_error(self):
        df = self.spark.createDataFrame([("a", 0), ("b", 0)]).toDF("features", "key")
        keyedPCA = KeyedEstimator(sklearnEstimator=PCA())
        self.assertRaises(TypeError, keyedPCA.fit, df)

        df = self.spark.createDataFrame([(Vectors.dense([i]), [i], 0) for i in range(10)])
        df = df.toDF("features", "y", "key")
        keyedLR = KeyedEstimator(sklearnEstimator=LinearRegression(), yCol="y")
        self.assertRaises(TypeError, keyedLR.fit, df)
Example #24
def mldemo():

    spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
Example #25
    def test_expose_sub_models(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()

        numFolds = 3
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
                            numFolds=numFolds, collectSubModels=True)

        def checkSubModels(subModels):
            self.assertEqual(len(subModels), numFolds)
            for i in range(numFolds):
                self.assertEqual(len(subModels[i]), len(grid))

        cvModel = cv.fit(dataset)
        checkSubModels(cvModel.subModels)

        # Test the default value for option "persistSubModel" to be "true"
        testSubPath = temp_path + "/testCrossValidatorSubModels"
        savingPathWithSubModels = testSubPath + "cvModel3"
        cvModel.save(savingPathWithSubModels)
        cvModel3 = CrossValidatorModel.load(savingPathWithSubModels)
        checkSubModels(cvModel3.subModels)
        cvModel4 = cvModel3.copy()
        checkSubModels(cvModel4.subModels)

        savingPathWithoutSubModels = testSubPath + "cvModel2"
        cvModel.write().option("persistSubModels", "false").save(savingPathWithoutSubModels)
        cvModel2 = CrossValidatorModel.load(savingPathWithoutSubModels)
        self.assertEqual(cvModel2.subModels, None)

        for i in range(numFolds):
            for j in range(len(grid)):
                self.assertEqual(cvModel.subModels[i][j].uid, cvModel3.subModels[i][j].uid)
Example #26
 def applyEstimator(estimator, x):
     if not estimator:
         return None
     if oneDimensional:
         x = [[x]]
     else:
         x = x.toArray().reshape(1, -1)
     if shouldPredict:
         return cast(estimator.predict(x)[0])
     else:
         return Vectors.dense(estimator.transform(x)[0])
Example #27
 def setUp(self):
     super(MLlibTestCase, self).setUp()
     self.sc = self.spark.sparkContext
     self.sql = self.spark
     self.X = np.array([[1,2,3],
                        [-1,2,3], [1,-2,3], [1,2,-3],
                        [-1,-2,3], [1,-2,-3], [-1,2,-3],
                        [-1,-2,-3]])
     self.y = np.array([1, 0, 1, 1, 0, 1, 0, 0])
     data = [(float(self.y[i]), Vectors.dense(self.X[i])) for i in range(len(self.y))]
     self.df = self.sql.createDataFrame(data, ["label", "features"])
Example #28
    def test_parallel_evaluation(self):
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        lr = LogisticRegression()
        grid = ParamGridBuilder().addGrid(lr.maxIter, [5, 6]).build()
        evaluator = BinaryClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator)
        cv.setParallelism(1)
        cvSerialModel = cv.fit(dataset)
        cv.setParallelism(2)
        cvParallelModel = cv.fit(dataset)
        self.assertEqual(cvSerialModel.avgMetrics, cvParallelModel.avgMetrics)
Example #29
    def test_apply_binary_term_freqs(self):

        df = self.spark.createDataFrame([(0, ["a", "a", "b", "c", "c", "c"])], ["id", "words"])
        n = 10
        hashingTF = HashingTF()
        hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
        output = hashingTF.transform(df)
        features = output.select("features").first().features.toArray()
        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]).toArray()
        for i in range(0, n):
            self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + str(i) +
                                   ": expected " + str(expected[i]) + ", got " + str(features[i]))
Example #30
    def test_save_load_nested_estimator(self):
        temp_path = tempfile.mkdtemp()
        dataset = self.spark.createDataFrame(
            [(Vectors.dense([0.0]), 0.0),
             (Vectors.dense([0.4]), 1.0),
             (Vectors.dense([0.5]), 0.0),
             (Vectors.dense([0.6]), 1.0),
             (Vectors.dense([1.0]), 1.0)] * 10,
            ["features", "label"])

        ova = OneVsRest(classifier=LogisticRegression())
        lr1 = LogisticRegression().setMaxIter(100)
        lr2 = LogisticRegression().setMaxIter(150)
        grid = ParamGridBuilder().addGrid(ova.classifier, [lr1, lr2]).build()
        evaluator = MulticlassClassificationEvaluator()

        # test save/load of CrossValidator
        cv = CrossValidator(estimator=ova, estimatorParamMaps=grid, evaluator=evaluator)
        cvModel = cv.fit(dataset)
        cvPath = temp_path + "/cv"
        cv.save(cvPath)
        loadedCV = CrossValidator.load(cvPath)
        self.assertEqual(loadedCV.getEstimator().uid, cv.getEstimator().uid)
        self.assertEqual(loadedCV.getEvaluator().uid, cv.getEvaluator().uid)

        originalParamMap = cv.getEstimatorParamMaps()
        loadedParamMap = loadedCV.getEstimatorParamMaps()
        for i, param in enumerate(loadedParamMap):
            for p in param:
                if p.name == "classifier":
                    self.assertEqual(param[p].uid, originalParamMap[i][p].uid)
                else:
                    self.assertEqual(param[p], originalParamMap[i][p])

        # test save/load of CrossValidatorModel
        cvModelPath = temp_path + "/cvModel"
        cvModel.save(cvModelPath)
        loadedModel = CrossValidatorModel.load(cvModelPath)
        self.assertEqual(loadedModel.bestModel.uid, cvModel.bestModel.uid)
Example #31
import numpy as np
import os
from pyspark.ml.linalg import Vectors

npz_files_path = "/home/lmtruong1512/Codes/BTL_CSDLDPT/extracted_files/extracted_SIFT100"

file_names = os.listdir(npz_files_path)[0:10]
np_arrs = [
    np.load(os.path.join(npz_files_path, file_name))['arr_0']
    for file_name in file_names
]
dataset = [Vectors.dense(x) for x in np_arrs]
print(dataset)
Example #32
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 19:58:59 2019

@author: amitabh.gunjan
"""

import tempfile

from pyspark.sql import Row, SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import NaiveBayes


spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

df = spark.createDataFrame([Row(label=0.0, weight=0.1, features=Vectors.dense([0.0, 0.0])),
                            Row(label=0.0, weight=0.5, features=Vectors.dense([0.0, 1.0])),
                            Row(label=1.0, weight=1.0, features=Vectors.dense([1.0, 0.0]))])
nb = NaiveBayes(smoothing=1.0, modelType="multinomial", weightCol="weight")
model = nb.fit(df)
model.pi
model.theta
test0 = sc.parallelize([Row(features=Vectors.dense([1.0, 0.0]))]).toDF()
result = model.transform(test0).head()
result.prediction
result.probability
result.rawPrediction
test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
model.transform(test1).head().prediction
temp_path = tempfile.mkdtemp()
nb_path = temp_path + "/nb"
nb.save(nb_path)
nb2 = NaiveBayes.load(nb_path)
Example #33
    print(
        '***** Creating opcodes list for each document *********************************************'
    )
    # >> (opcode, ((docid, hash, label), cnt))
    rdd_opcode = rdd_opcode_cnt.map(lambda x: (x[0][1], (x[0][0], x[1])))\
                        .leftOuterJoin(rdd_opcode_distinct)\
                        .map(lambda x: (x[1][0][0], (x[1][1], x[1][0][1])))\
                        .groupByKey().map(lambda x: (x[0], list(x[1])))

    print(
        '***** Creating opcodes list with document information *************************************'
    )
    # >> (docid, hash, label, vector.dense(opcode))
    opcode = rdd_train.map(lambda x: (x[1], (x[0], x[2]))).leftOuterJoin(rdd_opcode)\
                    .map(lambda x: (x[1][0][0], x[0], x[1][0][1], list(numpy_cartesian(x[1][1], N))))\
                    .map(lambda x: (x[0], x[1], x[2], Vectors.dense(x[3])))

    print(
        '***** RF feature selection ****************************************************************'
    )
    opcode_imp = RF_features_select(opcode)
    # >> (index, feature_importance)
    rdd_opcode_imp = sc.parallelize(opcode_imp)
    # opcode_r >> (docid, hash, label, vectors.dense(opcode))
    # rdd_opcode_distinct_r >> (opcode, index_r)
    opcode_r, rdd_opcode_distinct_r, N_r = feature_filter(
        rdd_opcode_imp, rdd_opcode_distinct, rdd_opcode_cnt, rdd_train)

    print(
        '***** Transforming RDD into Dateframe *****************************************************'
    )
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.appName('data').getOrCreate()

df = spark.read.csv(
    'hdfs:///user/maria_dev/MachineLearning/fake_customers.csv',
    inferSchema=True,
    header=True)
df.show()

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["user_id", "category"])
df.show()

indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
indexed = indexer.fit(df).transform(df)
indexed.show()

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])
dataset.show()

assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

output = assembler.transform(dataset)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show()
from pyspark.ml.regression import AFTSurvivalRegression
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

training = spark.createDataFrame([(1.218, 1.0, Vectors.dense(1.560, -0.605)),
                                  (2.949, 0.0, Vectors.dense(0.346, 2.158)),
                                  (3.627, 0.0, Vectors.dense(1.380, 0.231)),
                                  (0.273, 1.0, Vectors.dense(0.520, 1.151)),
                                  (4.199, 0.0, Vectors.dense(0.795, -0.226))],
                                 ["label", "censor", "features"])

quantileProbabilities = [0.3, 0.6]

aft = AFTSurvivalRegression(quantileProbabilities=quantileProbabilities,
                            quantilesCol="quantiles")
model = aft.fit(training)

print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))
print("Scale: " + str(model.scale))
model.transform(training).show(truncate=False)

spark.stop()
Example #36
def f(x, y):
    ret = {}
    ret['features'] = Vectors.dense(float(x[0]), float(x[1]), float(x[2]),
                                    float(x[3]))
    ret['label'] = str(y)
    return ret
Example #37
def join_vec(term_len, tf_title, tf_desc):
    return Vectors.dense([int(term_len), int(tf_title), int(tf_desc)])
Example #38
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("EstimatorTransformerParamExample")\
        .getOrCreate()

    # $example on$
    # Prepare training data from a list of (label, features) tuples.
    training = spark.createDataFrame([
        (1.0, Vectors.dense([0.0, 1.1, 0.1])),
        (0.0, Vectors.dense([2.0, 1.0, -1.0])),
        (0.0, Vectors.dense([2.0, 1.3, 1.0])),
        (1.0, Vectors.dense([0.0, 1.2, -0.5]))], ["label", "features"])

    # Create a LogisticRegression instance. This instance is an Estimator.
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    # Print out the parameters, documentation, and any default values.
    print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

    # Learn a LogisticRegression model. This uses the parameters stored in lr.
    model1 = lr.fit(training)

    # Since model1 is a Model (i.e., a transformer produced by an Estimator),
    # we can view the parameters it used during fit().
    # This prints the parameter (name: value) pairs, where names are unique IDs for this
    # LogisticRegression instance.
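    # (Continuation sketch, not part of the original excerpt: inspect the fitted
    # parameters via the standard Params API.)
    print("Model 1 was fit using parameters: ")
    print(model1.extractParamMap())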
Example #39
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("MinMaxScalerExample")\
        .getOrCreate()

#    # $example on$
#    dataFrame = spark.createDataFrame([
#        (0, Vectors.dense([1.0, 0.1, -1.0]),),
#        (1, Vectors.dense([2.0, 1.1, 1.0]),),
#        (2, Vectors.dense([3.0, 10.1, 3.0]),)
#    ], ["id", "features"])

    dataFrame = spark.createDataFrame([
        (0, Vectors.dense([1.0, 0.1, -8.0]),),
        (1, Vectors.dense([2.0, 1.0, -4.0]),),
        (2, Vectors.dense([4.0, 10.0, 8.0]),)
    ], ["id", "features"])
    dataFrame.show()




    scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")

    # Compute summary statistics and generate MinMaxScalerModel
    scalerModel = scaler.fit(dataFrame)

    # rescale each feature to range [min, max].
    scaledData = scalerModel.transform(dataFrame)
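
    # (Continuation sketch, not part of the original excerpt.) Show the rescaled
    # output; getMin()/getMax() are standard MinMaxScaler params (defaults 0.0 and 1.0).
    print("Features scaled to range: [%f, %f]" % (scaler.getMin(), scaler.getMax()))
    scaledData.select("features", "scaledFeatures").show()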
Example #40
from __future__ import print_function

from pyspark.sql import SparkSession
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import FValueTest
# $example off$

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("FValueTestExample") \
        .getOrCreate()

    # $example on$
    data = [(4.6, Vectors.dense(6.0, 7.0, 0.0, 7.0, 6.0, 0.0)),
            (6.6, Vectors.dense(0.0, 9.0, 6.0, 0.0, 5.0, 9.0)),
            (5.1, Vectors.dense(0.0, 9.0, 3.0, 0.0, 5.0, 5.0)),
            (7.6, Vectors.dense(0.0, 9.0, 8.0, 5.0, 6.0, 4.0)),
            (9.0, Vectors.dense(8.0, 9.0, 6.0, 5.0, 4.0, 4.0)),
            (9.0, Vectors.dense(8.0, 9.0, 6.0, 4.0, 0.0, 0.0))]
    df = spark.createDataFrame(data, ["label", "features"])

    ftest = FValueTest.test(df, "features", "label").head()
    print("pValues: " + str(ftest.pValues))
    print("degreesOfFreedom: " + str(ftest.degreesOfFreedom))
    print("fvalue: " + str(ftest.fValues))
    # $example off$

    spark.stop()
Example #41
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import *
from pyspark.ml.clustering import BisectingKMeans, BisectingKMeansModel, BisectingKMeansSummary
#from pyspark.mllib.clustering import KMeans, KMeansModel
import numpy as np
from math import sqrt
from operator import add

df = spark.read.parquet("regex_table.parquet")
df1 = df.rdd.map(lambda x: (x[0],x[1],Vectors.dense(x[2])))
df1 = df1.toDF().withColumnRenamed('_1','table').withColumnRenamed('_2','colunm').withColumnRenamed('_3','features')
#df = spark.createDataFrame([["a", "a1", Vectors.dense([0.5,0.5,0.0,0.0])],\
#["a", "a2", Vectors.dense([0.1,0.2,0.3,0.4])],\
#["a", "a3", Vectors.dense([0.2,0.1,0.3,0.4])],\
#["b", "b1", Vectors.dense([0.3,0.1,0.2,0.4])],\
#["b", "b2", Vectors.dense([0.4,0.1,0.2,0.3])],\
#["b", "b3", Vectors.dense([0.5,0.5,0.0,0.0])]],\
#["table", "column", "features"])

#vso = df.rdd.map(lambda x:np.array((x[0],x[1]),x[2]))
transformed.sort(col("prediction").asc()).show()

def model_list():
    clist = []
    df2 = df1.select('features')
    df2.cache()
    df1.cache()
    for i in range(2,20):
        kmeans = BisectingKMeans(k=i, minDivisibleClusterSize=1.0)
        model = kmeans.fit(df2)
        WSSSE = model.computeCost(df1)
Example #42
def vectorize(data):
    return data.rdd.map(lambda r: [r[0], Vectors.dense(r[1:])]).toDF(['label','features'])
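
A hedged usage sketch for `vectorize` above; the SparkSession and the toy numeric DataFrame are assumptions, the only requirement being that the label sits in the first column and the remaining columns are numeric features.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

raw = spark.createDataFrame([(1.0, 0.1, 0.2), (0.0, 0.4, 0.5)], ["label", "x1", "x2"])
vectorize(raw).show()   # two rows with a 'label' and a dense 'features' column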
from pyspark.ml.classification import LinearSVCModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

if __name__ == "__main__":

    spark_session = SparkSession \
        .builder \
        .appName("Spark ML SVM") \
        .getOrCreate()

    model = LinearSVCModel.load("SVMModel")
    print("Model loaded")

    test = spark_session.createDataFrame([
        (0, Vectors.dense([1.0, 1.2])),
        (1, Vectors.dense([5.3, 2.4])),
        (2, Vectors.dense([1.2, 1.3])),
        (3, Vectors.dense([5.1, 2.3]))],
        ["label", "features"]) \
        .cache()

    for row in test.collect():
        print(row)

    prediction = model.transform(test)
    prediction.printSchema()
    prediction.show()

    selected = prediction.select("label", "prediction")
    selected.printSchema()
Example #44
from __future__ import print_function

from pyspark.ml.regression import LinearRegression

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

if __name__ == "__main__":

    # Create a SparkSession (Note, the config section is only for Windows!)
    spark = SparkSession.builder.appName("LinearRegression").getOrCreate()

    # Load up our data and convert it to the format MLLib expects.
    inputLines = spark.sparkContext.textFile("data/regression.txt")
    data = inputLines.map(lambda x: x.split(",")).map(
        lambda x: (float(x[0]), Vectors.dense(float(x[1]))))

    # Convert this RDD to a DataFrame
    colNames = ["label", "features"]
    df = data.toDF(colNames)

    # Note, there are lots of cases where you can avoid going from an RDD to a DataFrame.
    # Perhaps you're importing data from a real database. Or you are using structured streaming
    # to get your data.

    # Let's split our data into training data and testing data
    trainTest = df.randomSplit([0.5, 0.5])
    trainingDF = trainTest[0]
    testDF = trainTest[1]

    # Now create our linear regression model
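    # (Continuation sketch, not part of the original excerpt; the hyperparameters
    # below are illustrative, not the author's.)
    lir = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    model = lir.fit(trainingDF)

    # Score the held-out split and compare predictions with the labels.
    model.transform(testDF).select("prediction", "label").show(5)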
# Copyright 2017 Mario Juez. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example of making predictions with the linear regression model.
# It must be run from PySpark, in the cluster's SSH console.
#
# Author: Mario Juez <*****@*****.**>

from pyspark.sql import Row
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import LinearRegressionModel

test = spark.sparkContext.parallelize(
    [Row(features=Vectors.dense(27, 30, 41, 63, 9))]).toDF()
model = LinearRegressionModel.load(
    "gs://seminario-gcp/ml-models/pyspark-natality-lr-model")
result = model.transform(test).head()

print result.prediction
def concat_rf_feats(line):
        all_feature = list(line["chi"]) + list(line["as"]) + list(line["geo"]) + list(line["mob"])
        return Row(feats=Vectors.dense(all_feature), label=line["label"])
    sc.parallelize([("Preprocessing", preprocess_time),
                    ("Training time", training_time),
                    ("Testing time", testing_time),
                    ("Total time", total_timetaken)],
                   1).saveAsTextFile(sys.argv[2] + "_Time_taken")

    data1 = crimes.where(crimes.Latitude.isNotNull()
                         & crimes.Longitude.isNotNull()
                         & crimes.ID.isNotNull())

    # Choosing latitude and longitude after removing rows with null values and outliers for better results
    data_frame = data1 \
               .rdd \
               .filter(lambda x: (40.0<float(x[19])<42.0))\
               .filter(lambda x: (-88.0<float(x[20])<-86.0))\
               .map(lambda x: (x[0], Vectors.dense(float(x[19]), float(x[20])))).toDF(["ID", "features"])

    (trainingData1, testData1) = data_frame.randomSplit([0.7, 0.3], seed=100)

    # Number of cluster choosen
    k = 6

    kmeans = KMeans().setK(k).setSeed(1)
    kmeans_model = kmeans.fit(trainingData1)

    # Evaluate clustering by computing Within Set Sum of Squared Errors.
    wssse = kmeans_model.computeCost(trainingData1)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    # cluster center reults
    centers = kmeans_model.clusterCenters()
Example #48
 def parsePoint(line):
     return (line[-1], Vectors.dense(line[:-1]))
                                                                  float(p[17]), float(p[18]), float(p[19]), float(p[20]), float(p[21])))


# In[338]:

# Create a DataFrame
lending_df = spark.createDataFrame(lend_RDD)
lending_df.show(10)


# In[339]:

# Convert feature type to vector
lending_df_vectors = lending_df.rdd.map(lambda row: Row(
    label=row["lable"],
    features = Vectors.dense(row["featuresList"])
)).toDF()


# In[340]:

lending_df_vectors


# In[341]:

# Scale the data 
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures",
                        withStd=True, withMean=False)

Example #50
def fill_nan(vec: np.ndarray, num=0):
    return Vectors.dense(np.nan_to_num(vec, nan=num))
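
A small usage sketch for the corrected helper above (the example arrays are made up, and the `nan=` keyword assumes NumPy >= 1.17):

import numpy as np
from pyspark.ml.linalg import Vectors

print(fill_nan(np.array([1.0, np.nan, 3.0])))          # DenseVector([1.0, 0.0, 3.0])
print(fill_nan(np.array([np.nan, 2.0]), num=-1.0))     # DenseVector([-1.0, 2.0])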
Example #51
def predict(features_tab, tab_out, model_path, veh):
    # 1. Configuration
    spark = SparkSession \
        .builder \
        .master("yarn") \
        .appName("tianzw_vol_fading_predict_second_versions") \
        .config("spark.sql.warehouse.dir","hdfs://neicluster/user/hive/warehouse") \
        .enableHiveSupport() \
        .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    # 2. Prepare the data
    sql = """
        SELECT  vin             ,
                sta_time        ,
                mils_1000km     ,
                sta_soc         ,
                charge_c        ,
                hours           ,
                temp            ,
                days            ,
                mils_dif        ,
                cnt_cha         ,
                vol_cha         ,
                vol_avg_cha     ,
                hou_cha         ,
                c_avg           ,
                sta_soc_avg_cha ,
                end_soc_avg_cha ,
                dep_soc_avg_cha ,
                sta_soc_mid_cha ,
                end_soc_mid_cha ,
                dep_soc_mid_cha ,
                cnt_tem         ,
                tem_mid_yea     ,
                tem_avg_yea     ,
                tem_dif_yea     ,
                tem_var_yea     
        FROM    """ + features_tab + """
        WHERE   veh_head = SUBSTR('""" + veh + """',0,1)
        AND     veh = '""" + veh + """'
        AND     SUBSTR(vin,-1,1) = '""" + vin_tail + """'
            """
    rdd_origin = spark.sql(sql).rdd
    features_rdd = rdd_origin.map(lambda x: (
        x.vin,
        x.sta_time,
        Vectors.dense([
            x.mils_1000km, x.sta_soc, x.charge_c, x.hours, x.temp, x.days, x.
            mils_dif, x.cnt_cha, x.vol_cha, x.vol_avg_cha, x.hou_cha, x.c_avg,
            x.sta_soc_avg_cha, x.end_soc_avg_cha, x.dep_soc_avg_cha, x.
            sta_soc_mid_cha, x.end_soc_mid_cha, x.dep_soc_mid_cha, x.cnt_tem, x
            .tem_mid_yea, x.tem_avg_yea, x.tem_dif_yea, x.tem_var_yea
        ]),
    ))
    features_list = features_rdd.collect()
    print("数据提取成功")
    spark_df = spark.createDataFrame(features_list,
                                     ["vin", "sta_time", "features"])
    # 3. Model prediction
    # model = GBTRegressor.load(model_path)
    model = GBTRegressionModel.load(model_path)
    print("模型导入成功")
    predictions = model.transform(spark_df)
    print("计算成功")
    new_list = [(x.vin, x.sta_time, x.prediction)
                for x in predictions.collect()]
    result_df = spark.createDataFrame(new_list,
                                      ["vin", "sta_time", "vol_fading"])
    result_df = result_df.repartition(1)
    result_df.createOrReplaceTempView("table_temp")
    # Write the results to the Hive table
    # createSQL = """
    #             CREATE TABLE IF NOT EXISTS """ + tab_out + """
    #             (
    #                 vin         STRING      COMMENT 'VIN',
    #                 sta_time    BIGINT      COMMENT 'charge start time (s)',
    #                 vol_fading  DOUBLE      COMMENT 'predicted capacity fading percentage (2 decimal places)'
    #             )
    #             PARTITIONED BY(veh        STRING     COMMENT 'vehicle model name',
    #                            vin_tail   STRING     COMMENT 'last digit of the VIN')
    #             ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
    # """
    insertSql = """
                INSERT OVERWRITE TABLE """ + tab_out + """
                PARTITION(veh = '""" + veh + """',vin_tail = '""" + vin_tail + """')
                SELECT      vin,
                            sta_time,
                            ROUND(vol_fading,2)     AS vol_fading
                FROM        table_temp
    """
    # spark.sql("DROP TABLE IF EXISTS " + tab_out)
    # spark.sql(createSQL)
    spark.sql(insertSql)
    print(tab_out + "写入成功")
Example #52
def cluster(inputpath, alg, k):

	n_data = 0
	n_features = 0
	result = "successful!"
	inputdir = os.path.dirname(inputpath)
	print "inputdir: " + inputdir + result
	inputfile = open(inputpath, 'r')
	for line in inputfile:
		input_n = len(line.split(" "))
		n_data += 1
		#print "Selected data set has " + str(input_n) + " features"
		#break
	inputfile.close()

	# result = "File: " + os.path.basename(output_data) + '</br>'
	# result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
	# result += "Dimension: " + str(n_data) + " x " + str(n_features) + "</br>"
	# context = {'result': result}
	# yield context

	if int(k) == 1:
		print "k should be greater than 1"
		result = "k should be greater than 1"
	else:
		# os.system("export _JAVA_OPTIONS='-Xms1g -Xmx40g'")
		# conf = (SparkConf().set("spark.driver.maxResultSize", "5g"))
		# sc = SparkContext(conf=conf)
		# sqlContext = SQLContext(sc)
		lines = sc.textFile(inputpath).map(lambda x: x.split(" "))
		lines = lines.map(lambda x: (x[0], [float(y) for y in x[1:]]))
		df = lines.map(lambda x: Row(labels=x[0], features=Vectors.dense(x[1]))).toDF()

		if alg == "kmeans":
			output_data = kmeans(inputdir, df, alg, k)
			#os.system("spark-submit /home/ubuntu/yi-imPro/imagepro/pca.py " + inputpath + " " + k)

		output_data = inputdir + "/" + alg + str(k) + "_Data"
		inputfile = open(output_data, 'r')
		file_size = str(os.stat(output_data).st_size)
		counter = 0
		n_features = '0'
		for line in inputfile:
			input_n = len(line.split(" "))
			n_features = str(input_n)
			counter += 1

		inputfile.close()
		n_data = str(counter)

		result = "File: " + os.path.basename(output_data) + '</br>'
		result += "Path: " + os.path.dirname(output_data) + '/' + alg + str(k) + "_Features/" + '</br>'
		result += "Dimension: " + n_data + " x " + n_features + "</br>"
		result += "Size: " + file_size + ' bytes'
		print result
		# sc.stop()

	print "Clustering finished!"

	context = {'n_data': n_data, 'n_features': n_features, 'result': result}
	return context
Example #53
def tmpDouble2vec(x):
    return Vectors.dense(x)
Example #54
 def func(x):
     features_data = []
     for feature in feature_indexs:
         features_data.append(x[feature])
     return Row(label=x[label_index], features=Vectors.dense(features_data))
Example #55
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

data = [(Vectors.dense([21.0, 110, 3.90]), ),
        (Vectors.dense([22.8, 93, 3.85]), ),
        (Vectors.dense([18.1, 105, 2.76]), )]

df = spark.createDataFrame(data, ['features'])
r1 = Correlation.corr(df, 'features', 'pearson').head()
r2 = Correlation.corr(df, 'features', 'spearman').head()

print 'Data:'
df.show()

print 'Pearson Correlation:'
print str(r1[0])

print 'Spearman Correlation:'
print str(r2[0])

spark.stop()
Example #56
def vector_from_inputs(r):
  return (r["weight_pounds"], Vectors.dense(float(r["mother_age"]),
                                            float(r["father_age"]),
                                            float(r["gestation_weeks"]),
                                            float(r["weight_gain_pounds"]),
                                            float(r["apgar_5min"])))
Example #57
def get_vectors(row):
    '''Impute the feature set according to what is available in the raw record.'''
    if row['pe_feature_12']:
        pe_feature_12_risk = row['pe_feature_12']['risk']
    else:
        pe_feature_12_risk = 0.
    if row['pe_feature_13']:
        pe_feature_13_risk = row['pe_feature_13']['risk']
    else:
        pe_feature_13_risk = 0.

    if row['feature_26']:
        return (Vectors.dense([
            row['pe_feature_1'],
            row['pe_feature_2'],
            row['pe_feature_3'],
            row['pe_feature_4'],
            row['pe_feature_5'],
            row['pe_feature_6'],
            row['pe_feature_7'],
            row['pe_feature_8'],
            row['pe_feature_9'],
            row['pe_feature_10'],
            row['pe_feature_11'],
            pe_feature_12_risk,
            pe_feature_13_risk,
            row['feature_1']['r'],
            row['feature_2']['c'],
            row['feature_3']['z'],
            row['feature_4']['k'],
            row['feature_5']['l'],
            row['feature_6']['z'],
            row['feature_7']['z'],
            row['feature_8']['y'],
            row['feature_9']['n'],
            row['feature_10']['z'],
            row['feature_11']['z'],
            row['feature_12']['z'],
            row['feature_13']['z'],
            row['feature_14']['z'],
            row['feature_15']['r'],
            row['feature_16']['r'],
            row['feature_17']['r'],
            row['feature_18']['z'],
            row['feature_19']['z'],
            row['feature_20']['g'],
            row['feature_21']['y'],
            row['feature_22']['z'],
            row['feature_23']['y'],
            row['feature_24']['z'],
            row['feature_26']['a_1'],
            row['feature_26']['a_4'],
            row['feature_26']['a_5'],
            row['feature_26']['a_6'],
            row['feature_26']['b_1'],
            row['feature_26']['b_2'],
            row['feature_26']['b_3'],
            row['feature_26']['c_1'],
            row['feature_26']['d_2'],
        ]), row['bbd_qyxx_id'], row['company_name'])
    elif row['feature_1']:
        return (Vectors.dense([
            row['pe_feature_1'],
            row['pe_feature_2'],
            row['pe_feature_3'],
            row['pe_feature_4'],
            row['pe_feature_5'],
            row['pe_feature_6'],
            row['pe_feature_7'],
            row['pe_feature_8'],
            row['pe_feature_9'],
            row['pe_feature_10'],
            row['pe_feature_11'],
            pe_feature_12_risk,
            pe_feature_13_risk,
            row['feature_1']['r'],
            row['feature_2']['c'],
            row['feature_3']['z'],
            row['feature_4']['k'],
            row['feature_5']['l'],
            row['feature_6']['z'],
            row['feature_7']['z'],
            row['feature_8']['y'],
            row['feature_9']['n'],
            row['feature_10']['z'],
            row['feature_11']['z'],
            row['feature_12']['z'],
            row['feature_13']['z'],
            row['feature_14']['z'],
            row['feature_15']['r'],
            row['feature_16']['r'],
            row['feature_17']['r'],
            row['feature_18']['z'],
            row['feature_19']['z'],
            row['feature_20']['g'],
            row['feature_21']['y'],
            row['feature_22']['z'],
            row['feature_23']['y'],
            row['feature_24']['z'],
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
        ]), row['bbd_qyxx_id'], row['company_name'])
    else:
        return (Vectors.dense([
            row['pe_feature_1'],
            row['pe_feature_2'],
            row['pe_feature_3'],
            row['pe_feature_4'],
            row['pe_feature_5'],
            row['pe_feature_6'],
            row['pe_feature_7'],
            row['pe_feature_8'],
            row['pe_feature_9'],
            row['pe_feature_10'],
            row['pe_feature_11'],
            pe_feature_12_risk,
            pe_feature_13_risk,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
            0.,
        ]), row['bbd_qyxx_id'], row['company_name'])
Example #58
    StructField("pdef", DoubleType(), True),
    StructField("pbeau", DoubleType(), True),
    StructField("pnum", IntegerType(), True),
    StructField("s_term_score", DoubleType(), True),
    StructField("sumclick", LongType(), True),
    StructField("sumshow", LongType(), True),
    StructField("uid", LongType(), True)
])

print "begin to map input"
train_set = spark.read.csv(
    "gs://dataproc-0e3e0110-db09-4037-98cc-dc355651aba0-asia-southeast1/tensorflow/data/picfeed/train_feature_test/part-*",
    schema=fieldSchema)
train_set_r = train_set.rdd.map(
    lambda p: Row(label=p.label,
                  features=Vectors.dense(p.ctr, p.pnum, p.pdef, p.pbeau, p.
                                         s_term_score, p.sumclick, p.sumshow)))
print train_set_r.take(5)

print "finish map input"
train_set_d = spark.createDataFrame(train_set_r)
(training, test) = train_set_d.randomSplit([0.9, 0.1])
#train
lr = LogisticRegression(maxIter=10, regParam=0.3)
lrModel = lr.fit(training)
print "coefficients"
print lrModel.coefficients
print "intercept"
print lrModel.intercept
#summary
# $example on$
# Extract the summary from the returned LogisticRegressionModel instance trained
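# (Continuation sketch, not part of the original excerpt.) The training summary
# exposes the objective history and, for binary labels, areaUnderROC.
trainingSummary = lrModel.summary
print("objectiveHistory: " + str(trainingSummary.objectiveHistory))
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))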
Example #59
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("e").getOrCreate()

training = spark.createDataFrame([(1.0, Vectors.dense([0.0, 1.1, 0.1])),
                                  (0.0, Vectors.dense([2.0, 1.0, -1.0])),
                                  (0.0, Vectors.dense([2.0, 1.3, 1.0])),
                                  (1.0, Vectors.dense([0.0, 1.2, -0.5]))],
                                 ["label", "features"])

lr = LogisticRegression(maxIter=10, regParam=0.01)
model_1 = lr.fit(training)

param_map = dict()
param_map[lr.maxIter] = 30
param_map.update({lr.regParam: 0.1, lr.threshold: 0.55})

param_map_new = {lr.probabilityCol: "my_probability"}
param_map_combined = param_map.copy()
param_map_combined.update(param_map_new)

model_2 = lr.fit(training, params=param_map_combined)

test = spark.createDataFrame([(1.0, Vectors.dense([-1.0, 1.5, 1.3])),
                              (0.0, Vectors.dense([3.0, 2.0, -0.1])),
                              (1.0, Vectors.dense([0.0, 2.2, -1.5]))],
                             ["label", "features"])

predict = model_2.transform(test)
Example #60
import pyspark
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import mlflow
import mlflow.pyspark.ml

if __name__ == '__main__':
    print("MLflow version: {}".format(mlflow.__version__))
    spark = pyspark.sql.SparkSession.builder.appName("BestParams") \
        .getOrCreate()
    dataset = spark.createDataFrame([(Vectors.dense([0.0]), 0.0),
                                     (Vectors.dense([0.4]), 1.0),
                                     (Vectors.dense([0.5]), 0.0),
                                     (Vectors.dense([0.6]), 1.0),
                                     (Vectors.dense([1.0]), 1.0)] * 10,
                                    ["features", "label"])
    lr = LogisticRegression()
    grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=lr,
                        estimatorParamMaps=grid,
                        evaluator=evaluator,
                        parallelism=2)

    mlflow.pyspark.ml.autolog()
    cvModel = cv.fit(dataset)

    print("Average Metric: {}".format(cvModel.avgMetrics[0]))
    print("Number of folds: {}".format(cvModel.getNumFolds()))