Example #1
 def ztest_toPandas(self):
     data = [(Vectors.dense([0.1, 0.2]),),
             (Vectors.sparse(2, {0:0.3, 1:0.4}),),
             (Vectors.sparse(2, {0:0.5, 1:0.6}),)]
     df = self.sql.createDataFrame(data, ["features"])
     self.assertEqual(df.count(), 3)
     pd = self.converter.toPandas(df)
     self.assertEqual(len(pd), 3)
     self.assertTrue(isinstance(pd.features[0], csr_matrix),
                     "Expected pd.features[0] to be csr_matrix but found: %s" %
                     type(pd.features[0]))
     self.assertEqual(pd.features[0].shape[0], 3)
     self.assertEqual(pd.features[0].shape[1], 2)
     self.assertEqual(pd.features[0][0,0], 0.1)
     self.assertEqual(pd.features[0][0,1], 0.2)
Example #2
    def test_java_object_gets_detached(self):
        df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                         (0.0, 2.0, Vectors.sparse(1, [], []))],
                                        ["label", "weight", "features"])
        lr = LinearRegression(maxIter=1, regParam=0.0, solver="normal", weightCol="weight",
                              fitIntercept=False)

        model = lr.fit(df)
        summary = model.summary

        self.assertIsInstance(model, JavaWrapper)
        self.assertIsInstance(summary, JavaWrapper)
        self.assertIsInstance(model, JavaParams)
        self.assertNotIsInstance(summary, JavaParams)

        error_no_object = 'Target Object ID does not exist for this gateway'

        self.assertIn("LinearRegression_", model._java_obj.toString())
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        model.__del__()

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        self.assertIn("LinearRegressionTrainingSummary", summary._java_obj.toString())

        try:
            summary.__del__()
        except:
            pass

        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            model._java_obj.toString()
        with self.assertRaisesRegexp(py4j.protocol.Py4JError, error_no_object):
            summary._java_obj.toString()
Example #3
 def test_persistence(self):
     # Test save/load for LDA, LocalLDAModel, DistributedLDAModel.
     df = self.spark.createDataFrame([
         [1, Vectors.dense([0.0, 1.0])],
         [2, Vectors.sparse(2, {0: 1.0})],
     ], ["id", "features"])
     # Fit model
     lda = LDA(k=2, seed=1, optimizer="em")
     distributedModel = lda.fit(df)
     self.assertTrue(distributedModel.isDistributed())
     localModel = distributedModel.toLocal()
     self.assertFalse(localModel.isDistributed())
     # Define paths
     path = tempfile.mkdtemp()
     lda_path = path + "/lda"
     dist_model_path = path + "/distLDAModel"
     local_model_path = path + "/localLDAModel"
     # Test LDA
     lda.save(lda_path)
     lda2 = LDA.load(lda_path)
     self._compare(lda, lda2)
     # Test DistributedLDAModel
     distributedModel.save(dist_model_path)
     distributedModel2 = DistributedLDAModel.load(dist_model_path)
     self._compare(distributedModel, distributedModel2)
     # Test LocalLDAModel
     localModel.save(local_model_path)
     localModel2 = LocalLDAModel.load(local_model_path)
     self._compare(localModel, localModel2)
     # Clean up
     try:
         rmtree(path)
     except OSError:
         pass
Example #4
def mldemo():

    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .getOrCreate()
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])
    
    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))
    
    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
Example #5
 def test_output_columns(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, parallelism=1)
     model = ovr.fit(df)
     output = model.transform(df)
     self.assertEqual(output.columns, ["label", "features", "rawPrediction", "prediction"])
Example #6
 def test_copy(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     ovr1 = ovr.copy({lr.maxIter: 10})
     self.assertEqual(ovr.getClassifier().getMaxIter(), 5)
     self.assertEqual(ovr1.getClassifier().getMaxIter(), 10)
     model = ovr.fit(df)
     model1 = model.copy({model.predictionCol: "indexed"})
     self.assertEqual(model1.getPredictionCol(), "indexed")
Example #7
 def test_parallelism_doesnt_change_output(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))],
                                     ["label", "features"])
     ovrPar1 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=1)
     modelPar1 = ovrPar1.fit(df)
     ovrPar2 = OneVsRest(classifier=LogisticRegression(maxIter=5, regParam=.01), parallelism=2)
     modelPar2 = ovrPar2.fit(df)
     for i, model in enumerate(modelPar1.models):
         self.assertTrue(np.allclose(model.coefficients.toArray(),
                                     modelPar2.models[i].coefficients.toArray(), atol=1E-4))
         self.assertTrue(np.allclose(model.intercept, modelPar2.models[i].intercept, atol=1E-4))
Example #8
 def test_support_for_weightCol(self):
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8), 1.0),
                                      (1.0, Vectors.sparse(2, [], []), 1.0),
                                      (2.0, Vectors.dense(0.5, 0.5), 1.0)],
                                     ["label", "features", "weight"])
     # classifier inherits hasWeightCol
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr, weightCol="weight")
     self.assertIsNotNone(ovr.fit(df))
     # classifier doesn't inherit hasWeightCol
     dt = DecisionTreeClassifier()
     ovr2 = OneVsRest(classifier=dt, weightCol="weight")
     self.assertIsNotNone(ovr2.fit(df))
Example #9
 def test_bisecting_kmeans_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     bkm = BisectingKMeans(k=2)
     model = bkm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 20)
Example #10
 def test_linear_regression_pmml_basic(self):
     # Most of the validation is done in the Scala side, here we just check
     # that we output text rather than parquet (e.g. that the format flag
     # was respected).
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=1)
     model = lr.fit(df)
     path = tempfile.mkdtemp()
     lr_path = path + "/lr-pmml"
     model.write().format("pmml").save(lr_path)
     pmml_text_list = self.sc.textFile(lr_path).collect()
     pmml_text = "\n".join(pmml_text_list)
     self.assertIn("Apache Spark", pmml_text)
     self.assertIn("PMML", pmml_text)
Example #11
def parse(line):
    obj = json.loads(line)
    fc = obj[featureCol]
    if "size" not in fc and "type" not in fc:
        feature_size = len(fc)
        dic = [(i, a) for i, a in enumerate(fc)]
        sv = SparseVector(len(fc), dic)
    elif "size" not in fc and "type" in fc and fc["type"] == 1:
        values = fc["values"]
        feature_size = len(values)
        dic = [(i, a) for i, a in enumerate(values)]
        sv = SparseVector(len(values), dic)

    else:
        feature_size = fc["size"]
        sv = Vectors.sparse(fc["size"], list(zip(fc["indices"], fc["values"])))
    return sv
Example #12
 def test_gaussian_mixture_summary(self):
     data = [(Vectors.dense(1.0),), (Vectors.dense(5.0),), (Vectors.dense(10.0),),
             (Vectors.sparse(1, [], []),)]
     df = self.spark.createDataFrame(data, ["features"])
     gmm = GaussianMixture(k=2)
     model = gmm.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertTrue(isinstance(s.probability, DataFrame))
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     self.assertTrue(isinstance(s.cluster, DataFrame))
     self.assertEqual(len(s.clusterSizes), 2)
     self.assertEqual(s.k, 2)
     self.assertEqual(s.numIter, 3)
Example #13
 def test_onevsrest(self):
     temp_path = tempfile.mkdtemp()
     df = self.spark.createDataFrame([(0.0, Vectors.dense(1.0, 0.8)),
                                      (1.0, Vectors.sparse(2, [], [])),
                                      (2.0, Vectors.dense(0.5, 0.5))] * 10,
                                     ["label", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01)
     ovr = OneVsRest(classifier=lr)
     model = ovr.fit(df)
     ovrPath = temp_path + "/ovr"
     ovr.save(ovrPath)
     loadedOvr = OneVsRest.load(ovrPath)
     self._compare_pipelines(ovr, loadedOvr)
     modelPath = temp_path + "/ovrModel"
     model.save(modelPath)
     loadedModel = OneVsRestModel.load(modelPath)
     self._compare_pipelines(model, loadedModel)
Example #14
 def test_binary_logistic_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight", fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.probabilityCol, "probability")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     self.assertEqual(s.predictionCol, "prediction")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.labels, list))
     self.assertTrue(isinstance(s.truePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.falsePositiveRateByLabel, list))
     self.assertTrue(isinstance(s.precisionByLabel, list))
     self.assertTrue(isinstance(s.recallByLabel, list))
     self.assertTrue(isinstance(s.fMeasureByLabel(), list))
     self.assertTrue(isinstance(s.fMeasureByLabel(1.0), list))
     self.assertTrue(isinstance(s.roc, DataFrame))
     self.assertAlmostEqual(s.areaUnderROC, 1.0, 2)
     self.assertTrue(isinstance(s.pr, DataFrame))
     self.assertTrue(isinstance(s.fMeasureByThreshold, DataFrame))
     self.assertTrue(isinstance(s.precisionByThreshold, DataFrame))
     self.assertTrue(isinstance(s.recallByThreshold, DataFrame))
     self.assertAlmostEqual(s.accuracy, 1.0, 2)
     self.assertAlmostEqual(s.weightedTruePositiveRate, 1.0, 2)
     self.assertAlmostEqual(s.weightedFalsePositiveRate, 0.0, 2)
     self.assertAlmostEqual(s.weightedRecall, 1.0, 2)
     self.assertAlmostEqual(s.weightedPrecision, 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(), 1.0, 2)
     self.assertAlmostEqual(s.weightedFMeasure(1.0), 1.0, 2)
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned, Scala version runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.areaUnderROC, s.areaUnderROC)
Example #15
 def test_linear_regression_summary(self):
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     lr = LinearRegression(maxIter=5, regParam=0.0, solver="normal", weightCol="weight",
                           fitIntercept=False)
     model = lr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertGreater(s.totalIterations, 0)
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.labelCol, "label")
     self.assertEqual(s.featuresCol, "features")
     objHist = s.objectiveHistory
     self.assertTrue(isinstance(objHist, list) and isinstance(objHist[0], float))
     self.assertAlmostEqual(s.explainedVariance, 0.25, 2)
     self.assertAlmostEqual(s.meanAbsoluteError, 0.0)
     self.assertAlmostEqual(s.meanSquaredError, 0.0)
     self.assertAlmostEqual(s.rootMeanSquaredError, 0.0)
     self.assertAlmostEqual(s.r2, 1.0, 2)
     self.assertAlmostEqual(s.r2adj, 1.0, 2)
     self.assertTrue(isinstance(s.residuals, DataFrame))
     self.assertEqual(s.numInstances, 2)
     self.assertEqual(s.degreesOfFreedom, 1)
     devResiduals = s.devianceResiduals
     self.assertTrue(isinstance(devResiduals, list) and isinstance(devResiduals[0], float))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned
     # The child class LinearRegressionTrainingSummary runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.explainedVariance, s.explainedVariance)
Example #16
 def test_glr_summary(self):
     from pyspark.ml.linalg import Vectors
     df = self.spark.createDataFrame([(1.0, 2.0, Vectors.dense(1.0)),
                                      (0.0, 2.0, Vectors.sparse(1, [], []))],
                                     ["label", "weight", "features"])
     glr = GeneralizedLinearRegression(family="gaussian", link="identity", weightCol="weight",
                                       fitIntercept=False)
     model = glr.fit(df)
     self.assertTrue(model.hasSummary)
     s = model.summary
     # test that api is callable and returns expected types
     self.assertEqual(s.numIterations, 1)  # this should default to a single iteration of WLS
     self.assertTrue(isinstance(s.predictions, DataFrame))
     self.assertEqual(s.predictionCol, "prediction")
     self.assertEqual(s.numInstances, 2)
     self.assertTrue(isinstance(s.residuals(), DataFrame))
     self.assertTrue(isinstance(s.residuals("pearson"), DataFrame))
     coefStdErr = s.coefficientStandardErrors
     self.assertTrue(isinstance(coefStdErr, list) and isinstance(coefStdErr[0], float))
     tValues = s.tValues
     self.assertTrue(isinstance(tValues, list) and isinstance(tValues[0], float))
     pValues = s.pValues
     self.assertTrue(isinstance(pValues, list) and isinstance(pValues[0], float))
     self.assertEqual(s.degreesOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedom, 1)
     self.assertEqual(s.residualDegreeOfFreedomNull, 2)
     self.assertEqual(s.rank, 1)
     self.assertTrue(isinstance(s.solver, basestring))
     self.assertTrue(isinstance(s.aic, float))
     self.assertTrue(isinstance(s.deviance, float))
     self.assertTrue(isinstance(s.nullDeviance, float))
     self.assertTrue(isinstance(s.dispersion, float))
     # test evaluation (with training dataset) produces a summary with same values
     # one check is enough to verify a summary is returned
     # The child class GeneralizedLinearRegressionTrainingSummary runs full test
     sameSummary = model.evaluate(df)
     self.assertAlmostEqual(sameSummary.deviance, s.deviance)
Example #17
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

prediction_features = [
    'overall', 'doctor', 'specialty', 'procedure', 'priority'
]
change_to_month_func = udf(
    lambda record: int(
        datetime.strftime(datetime.strptime(record, '%d/%m/%Y'), '%Y%m')),
    IntegerType())
change_to_date_func = udf(
    lambda record: datetime.strptime(str(record), '%Y%m'), DateType())
change_date_to_month = udf(
    lambda record: datetime(record.year, record.month, 1), DateType())
to_vector = udf(lambda record: Vectors.dense(record), VectorUDT())
to_vectors = udf(lambda col_a, col_b: Vectors.sparse(col_a, col_b))

# Creating Spark Context and Spark Session
scobj = SparkContext.getOrCreate()
spark = SparkSession(scobj).builder.config('spark.sql.crossJoin.enabled',
                                           'true').getOrCreate()


def perform_prediction(csv, predict_by='Overall', predict_period=3):
    """
    The function is the entry point to the prediction module.
    :param csv: --string: path to the csv file containing the data
    :param predict_by: -- string: The choices should be 'Overall', 'Doctor', 'Specialty', 'Procedure',
           and 'Priority'
    :param predict_period: -- integer: The choices should be 3, 6, 12, 24, 36
    :return:
Example #18
    Returns a dataframe with the corresponding cluster for each data point.
    Input:  - model
            - a dataframe matching the training input
    Output: dataframe with the columns features and pca_features.
    '''
    transformed = model.transform(df)
    return transformed


# =============================================================================
# Test and examples
# =============================================================================

print()
print("Data========================")
data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])

data2 = [(Vectors.sparse(5, [(4, 10.0), (3, 7.0)]), ),
         (Vectors.dense([20.0, 8.0, 0.3, 400.0, 5.0]), ),
         (Vectors.dense([40.0, 10.0, 20.0, 600.0, 700.0]), ),
         (Vectors.dense([4.0, 10.0, 200.0, 600.0, 700.0]), ),
         (Vectors.dense([3.0, 100.0, 0.0, 6000.0, 7000.0]), )]
df2 = spark.createDataFrame(data2, ["features"])

print()
print('data')
df.show()
print('data2')
Example #19
                    matrix[cnt].setdefault(shingle, shingles.get(shingle))
                else:
                    shingles.setdefault(shingle, sh_count)
                    matrix[cnt].setdefault(shingle, sh_count)
                    sh_count += 1

        line = fp.readline().split(" ")
        cnt += 1
size = len(list(shingles))
cnt = 0
for key, value in tqdm(matrix.items()):
    aux = []
    for index, sh in value.items():
        aux.append(sh)
    data.append(
        (key, Vectors.sparse(size, sorted(list(aux)),
                             np.ones(len(list(aux))))))
next_prime = sieve_of_eratosthenes(size * 2, size)
sc = spark.sparkContext
distData = sc.parallelize(data)

#df = spark.createDataFrame(data, ["id", "features"])
df = spark.createDataFrame(distData, ["id", "features"])

key = Vectors.dense([1.0, 0.0])

mh = MinHashLSH(inputCol="features",
                outputCol="hashes",
                numHashTables=5,
                seed=next_prime)
model = mh.fit(df)
dft = model.transform(df)
Example #20
    .cols.unnest(["col_int"])\
    .table()

# ### Splits in 3 parts

df\
    .cols.unnest(["two strings"], splits= 3, mark = "-")\
    .table()

# ### Unnest a Vector

# +
from pyspark.ml.linalg import Vectors

df1 = op.sc.parallelize([("assert", Vectors.dense([1, 2, 3])),
                         ("require", Vectors.sparse(3, {1: 2}))
                         ]).toDF(["word", "vector"])
# -

df1\
    .cols.unnest(["vector"])\
    .table()

df = df.cols.append("new_col_1", 1)

# ## Impute

# ### Fill missing data

# +
df_fill = op.spark.createDataFrame([(1.0, float("nan")), (2.0, float("nan")),
Example #21
 def build(self):
     vec = Vectors.sparse(self.size, self.indices, self.data)
Example #22
    if not hasattr(os, "mlsql_models"):
        setattr(os, "mlsql_models", {})
    if modelPath not in os.mlsql_models:
        print("Load sklearn model %s" % modelPath)
        os.mlsql_models[modelPath] = pickle.load(open(modelPath, "rb"))

    model = os.mlsql_models[modelPath]
    rawVector = pickle.loads(items[0])
    feature = VectorUDT().deserialize(rawVector)
    y = model.predict([feature.toArray()])
    return [VectorUDT().serialize(Vectors.dense(y))]


if run_for_test:
    import json

    model_path = '/tmp/__mlsql__/3242514c-4113-4105-bdc5-9987b28f9764/0'
    data_path = '/Users/allwefantasy/Downloads/data1/part-00000-03769d42-1948-499b-8d8f-4914562bcfc8-c000.json'

    with open(file=data_path) as f:
        for line in f.readlines():
            s = []
            wow = json.loads(line)['features']
            feature = Vectors.sparse(wow["size"],
                                     list(zip(wow["indices"], wow["values"])))
            s.insert(0, pickle.dumps(VectorUDT().serialize(feature)))
            s.insert(1, pickle.dumps([model_path]))
            print(VectorUDT().deserialize(predict(1, s)[0]))

python_fun.udf(predict)
Example #23
def sparse_vec(r, count):
    # collect the distinct indices in sorted order and mark each with 1.0
    indices = sorted(set(r[1]))
    ones = [1.0] * len(indices)
    return r[0], Vectors.sparse(count, indices, ones)
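
# Added usage note: e.g. sparse_vec(("doc1", [2, 0, 2]), 4)
# returns ("doc1", SparseVector(4, {0: 1.0, 2: 1.0})).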
Example #24
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
# datetime:2019/3/1 14:28
from mmlspark import LightGBMRegressor
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession
# spark=SparkSession.builder \
#       .appName("normalizer") \
#       .master("local[2]") \
#       .getOrCreate()

svec=Vectors.sparse(4,{1:4.0,3:3.0})
dvec=Vectors.dense([3.0,-4.0])
print(svec)
print(dvec)
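
# Added illustration (not in the original snippet): the same sparse vector can be
# built in three equivalent ways -- from a dict, from parallel index/value lists,
# or from (index, value) pairs.
print(Vectors.sparse(4, {1: 4.0, 3: 3.0}))
print(Vectors.sparse(4, [1, 3], [4.0, 3.0]))
print(Vectors.sparse(4, [(1, 4.0), (3, 3.0)]))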

Example #25
countTokens = udf(lambda words: len(words), IntegerType())
# Note: each transform adds new columns; it does not drop the earlier ones
tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

##################################################################################
# Extract the main features with principal component analysis (PCA)
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
df = spark.createDataFrame(data, ["features"])

pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
model = pca.fit(df)

result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)

# Polynomial features
from pyspark.ml.feature import PolynomialExpansion
from pyspark.ml.linalg import Vectors

df = spark.createDataFrame([(Vectors.dense([2.0, 1.0]), ),
Example #26
from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0)
size = 3
idx = [1, 2] # locations of non-zero elements in vector
values = [2.0, 3.0]
sparseVec = Vectors.sparse(size, idx, values)


# COMMAND ----------

df = spark.read.json("/data/simple-ml")
df.orderBy("value2").show()


# COMMAND ----------

from pyspark.ml.feature import RFormula
supervised = RFormula(formula="lab ~ . + color:value1 + color:value2")


# COMMAND ----------

fittedRF = supervised.fit(df)
preparedDF = fittedRF.transform(df)
preparedDF.show()


# COMMAND ----------

train, test = preparedDF.randomSplit([0.7, 0.3])
Example #27
    def data_describe(self):
        print('start to read data for rdd:')
        rawRdd_nlp = self.read_rdd('track2_title.txt').map(lambda line : eval(line))

        # print(rawRdd_nlp.take(10))
        # convert to a DataFrame; without an explicit schema, it is inferred automatically
        sqlContext = SQLContext(self.sc)
        labels=[
            ('item_id',typ.IntegerType()),
            ('title_features',typ.MapType(typ.StringType(), typ.IntegerType()))]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df = sqlContext.createDataFrame(rawRdd_nlp,Schema)
        # df.show(10)
        # df.printSchema()

        print("统计title中不同词的个数unique,以及title的长度")
        gdf=df.select("item_id",fn.explode(fn.col("title_features"))).groupBy("item_id")
        df2=gdf.agg(fn.count("key").alias("title_words_unique"))

        df3=gdf.agg(fn.sum("value").alias("title_length"))

        df=df.join(df2,"item_id","left") \
             .join(df3,"item_id","left")
        df=df.drop("title_features")
        df.printSchema()


        print('start to deal with the title_features col,and compute the title topic')

        tokens=df.rdd.map(lambda d:d[1]).map(lambda d:list(d.keys()))  # the tokens of each title

        local_tokens=tokens.flatMap(lambda d :[int(token) for token in d]).distinct()
        print('largest value in local_tokens')
        print(local_tokens.top(1))
        vocab_size=max(local_tokens.top(1))+1

        # convert the title_features column into a sparse vector
        toInt=udf(lambda counts :{int(token) :float(counts[token]) for token in counts}, typ.StringType())
        df = df.withColumn("title_features_1", toInt(df.title_features))


        toVector=udf(lambda vs: Vectors.sparse(vocab_size,vs), VectorUDT())
        rescaledData = df.withColumn("features", toVector(df.title_features_1)).select("item_id", "features")

        df=df.drop("title_features_1")
        # del df
        # gc.collect()
        rescaledData.cache()
        lda = LDA(k=50,maxIter=200)
        #lda = LDA(k=2,maxIter=5)
        ldaModel = lda.fit(rescaledData)


        transformed = ldaModel.transform(rescaledData)   #.select("topicDistribution")
        # the result shows each document's weight for every topic; see the columns of transformed
        # convert the topic-distribution vector into a single topic label
        # transformed.show(truncate=False)

        def to_array(col):
            def to_array_(v):
                return v.toArray().tolist()
            return psf.udf(to_array_, typ.ArrayType(typ.DoubleType()))(col)
        df_topic=transformed.withColumn("topic", to_array(psf.col("topicDistribution"))).select(["item_id"] + [psf.col("topic")[i] for i in range(50)])

        topicCol=df_topic.columns
        topicCol.remove("item_id")
        print('inspect the column names')
        print(topicCol)
        def getTopicID(p):  # build a key-value dict, then find the key of the largest value
            d={}
            for c in topicCol:  # build the dict
                d[c]=p[c]
            z = list(d.keys())[list(d.values()).index(max(d.values()))]
            return int(z.replace("topic[",'').replace("]",''))
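
        # Added sketch (not part of the original pipeline): the dominant topic can
        # also be taken as the argmax of the topicDistribution vector directly,
        # which avoids exploding it into 50 "topic[i]" columns first, e.g.:
        #   get_topic = udf(lambda v: int(np.argmax(v.toArray())), typ.IntegerType())
        #   df_topic_alt = transformed.withColumn("title_topic", get_topic("topicDistribution"))
        # (assumes numpy is imported as np)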

        df_topic1=df_topic.rdd.map(lambda p: (p.item_id, getTopicID(p)))

        labels=[
            ('item_id',typ.IntegerType()),
            ('title_topic',typ.IntegerType())]
        Schema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])
        df_topic2 = sqlContext.createDataFrame(df_topic1,Schema)

        # df_topic2 = df_topic1.toDF(['item_id','topic'])
        # print('check whether topic is in the desired data format; it is stored in df_topic2')
        df_topic2.show(5)

        df_nlp=df.join(df_topic2,"item_id","left")   #UnboundLocalError: local variable 'df' referenced before assignment
        df_nlp.printSchema()
        #item_id|title_features |title_words_unique|title_length|title_features1 |title_topic|

        print('------- 5. save the preprocessed results -------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'nlp_topic_feature2'
        os.system("hadoop fs -rm -r {}".format(file_path))
        df_nlp.rdd.map(tuple).saveAsPickleFile(file_path)
        print('data saved')


        print('start to read act data  only for uid and item_id :')
        rawRdd_train = self.read_rdd('final_track2_train.txt').map(lambda line : line.split('\t'))
        rawRdd_test = self.read_rdd('final_track2_test_no_anwser.txt').map(lambda line : line.split('\t'))
        actionLogRdd_train = rawRdd_train.map(
            lambda x :(int(x[0]), int(x[2])))
        # total = actionLogRdd_train.count()
        # print('total: ' + str(total))
        actionLogRdd_test = rawRdd_test.map(
            lambda x :(int(x[0]), int(x[2])))

        sqlContext = SQLContext(self.sc)
        labels=[('uid',typ.IntegerType()),
            ('item_id',typ.IntegerType())
            ]

        actionLogSchema=typ.StructType([typ.StructField(e[0],e[1],True) for e in labels])

        dfactionLog_train = sqlContext.createDataFrame(actionLogRdd_train, actionLogSchema)
        dfactionLog_test = sqlContext.createDataFrame(actionLogRdd_test, actionLogSchema)

        # join on item_id
        # item_id|title_features||title_words_unique|title_length|title_features_1|title_topic
        df_nlp=df_nlp.select(["item_id","title_words_unique","title_length"])

        df_uid_nlp_test=dfactionLog_test.select(["uid","item_id"]).join(df_nlp,'item_id','left').drop("item_id")
        df_uid_nlp_train=dfactionLog_train.select(["uid","item_id"]).join(df_nlp,'item_id','left').drop("item_id")
        del dfactionLog_test
        del dfactionLog_train
        gc.collect()

        # aggregate per uid
        gdf=df_uid_nlp_train.groupby("uid")
        df1=gdf.agg(fn.max("title_words_unique").alias("uid_max_title_words_unique"),fn.avg("title_words_unique").alias("uid_avg_title_words_unique"),\
                    fn.max("title_length").alias("uid_max_title_length"),fn.avg("title_length").alias("uid_avg_title_length")
                    )
        df1.show(1,truncate=False)
        df_uid_train=df_uid_nlp_train.join(df1,'uid','left').drop("title_words_unique").drop("title_length")
        df_uid_test=df_uid_nlp_test.join(df1,'uid','left').drop("title_words_unique").drop("title_length")

        print("理论上应该只有uid,uid_max_beauty,uid_avg_beauty,uid_male_ratio")
        df_uid_train.printSchema()
        df_uid_test.printSchema()

        print('------- save the df_uid_nlp data -------')
        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_train'
        os.system("hadoop fs -rm -r {}".format(file_path))  # os.system(command): command is the shell command to run
        df_uid_train.rdd.map(tuple).saveAsPickleFile(file_path)

        file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'df_uid_nlp_test'
        os.system("hadoop fs -rm -r {}".format(file_path))  #os.system(command) 其参数含义如下所示: command 要执行的命令
        df_uid_test.rdd.map(tuple).saveAsPickleFile(file_path)
        print('data saved')

        # validate the hyper-parameters of the LDA model built above: higher log-likelihood (ll) and lower log-perplexity (lp) are better
        '''
        ll = ldaModel.logLikelihood(rescaledData)
        lp = ldaModel.logPerplexity(rescaledData)
        print(ll)
        print(lp)
        '''

        # save the ldaModel so it can be loaded directly when transforming the training set; for now saving df_topic would be enough
        print("start saving the model")
        distributed_model_path = self.parser.get("hdfs_path", "hdfs_data_path") + "lda_distributed_model"
        ldaModel.save(distributed_model_path)
        print("保存模型结束")
        #加载的语句
        print("加载模型")
        sameLdaModel = DistributedLDAModel.load(distributed_model_path)
        print("加载模型结束")

        # --------------------------------- 3. model and its description ------------------------------
        # the model is described via describeTopics and topicsMatrix
        '''
        topicIndices = ldaModel.describeTopics(maxTermsPerTopic=5)
        topicIndices.show(truncate=False)
        # * topic    indices of the most important words in the topic    weight of each word
        '''


        '''
Example #28
 def to_sparse_vector(indices, values):
     indices, values = zip(*sorted(zip(indices, values)))
     return Vectors.sparse(max_id, indices, values)
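
     # Added note: max_id is a free variable taken from the enclosing scope, so every
     # vector built here shares the same fixed dimensionality.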
Example #29
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("VectorSlicerExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame([
        Row(userFeatures=Vectors.sparse(3, {0: -2.0, 1: 2.3}),),
        Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]),)])

    slicer = VectorSlicer(inputCol="userFeatures", outputCol="features", indices=[1])

    output = slicer.transform(df)

    output.select("userFeatures", "features").show()
    # $example off$

    spark.stop()
Example #30
"""
An example demonstrating MinHashLSH.
Run with:
  bin/spark-submit examples/src/main/python/ml/min_hash_lsh_example.py
"""

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # $example on$
    dataA = [(
        0,
        Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),
    ), (
        1,
        Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),
    ), (
        2,
        Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),
    )]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(
        3,
        Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),
    ), (
        4,
        Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),
Example #31
Run with:
  bin/spark-submit examples/src/main/python/ml/correlation_example.py
"""
# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]), ),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]), ),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]), ),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]), )]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
Example #32
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation, Summarizer
from pyspark.mllib.stat import Statistics
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark_session = SparkSession \
        .builder \
        .getOrCreate()

    logger = spark_session._jvm.org.apache.log4j
    logger.LogManager.getLogger("org").setLevel(logger.Level.WARN)

    data_list = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]

    data_frame = spark_session.createDataFrame(data_list, ["features"])
    data_frame.printSchema()
    data_frame.show()

    r1 = Correlation.corr(data_frame, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(data_frame, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))

    rdd_data = data_frame.rdd
    print(rdd_data.collect())
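
    # Added sketch (not in the original excerpt): the imported Summarizer can compute
    # per-column statistics for the same vector column.
    summarizer = Summarizer.metrics("mean", "variance")
    data_frame.select(summarizer.summary(data_frame.features)).show(truncate=False)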
Example #33
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Spark MLlib") \
        .config("spark.master", "local") \
        .getOrCreate()

######################################################
#   Example 1 - Dense & Sparse Vectors
######################################################

from pyspark.ml.linalg import Vectors
denseVec = Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0, 6.0)
size = 12
idx = [1, 2, 10, 11]  # locations of non-zero elements in vector
values = [12.0, 32.0, 110.0, 27.0]
sparseVec = Vectors.sparse(size, idx, values)

print("denseVec: ", denseVec)
print("sparseVec: ", sparseVec)

spark.stop()
Example #34
def make_click_pattern_vector(features, size):
    vec = Vectors.sparse(size, features)
Example #35
Covariance matrix
PCA seeks a projection whose projected values are as spread out (i.e. have as much variance) as possible
Covariance


"""
try:
    from pyspark.ml.feature import PCA
    from pyspark.ml.linalg import Vectors
    from pyspark.sql import SparkSession

    print("Successfully  imported Spark Modules")
except ImportError as e:
    print("Can not import Spark Modules", e)
    sys.exit(1)

spark = SparkSession.builder.appName("PACExample").getOrCreate()

data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
        (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
        (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )]
print(data)
df = spark.createDataFrame(data, ["features"])
pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
# fit the PCA model on the data in df
model = pca.fit(df)
# once the model is trained, transform can be used to reduce the dimensionality of new input data.
result = model.transform(df).select("pcaFeatures")
result.show(truncate=False)
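
# Added check (sketch, not in the original snippet): PCAModel also exposes the
# variance explained by each principal component and the components themselves.
print(model.explainedVariance)
print(model.pc)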
spark.stop()
Example #36
def toSparseVector(index, values):
    day_list_index, qty_list_values = zip(*sorted(zip(index, values)))
    #367 for bisextile year (1 to 366 +1)
    return Vectors.sparse(366, day_list_index, qty_list_values)
Example #37
def array2vec(genreIndexes, indexSize):
    genreIndexes.sort()
    fill_list = [1.0 for _ in range(len(genreIndexes))]
    return Vectors.sparse(indexSize, genreIndexes, fill_list)
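
# Added usage note: e.g. array2vec([3, 1], 5) returns SparseVector(5, {1: 1.0, 3: 1.0}).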
Example #38
from pyspark.ml.linalg import Vectors
from pyspark.ml.classification import LogisticRegression

bdf = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

blor = LogisticRegression(regParam=0.01, weightCol="weight")
blorModel = blor.fit(bdf)
blorModel.coefficients
blorModel.intercept

test1 = sc.parallelize([Row(features=Vectors.sparse(2, [0], [1.0]))]).toDF()
blorModel.transform(test1).head().prediction

save_path = "C:\\PySpark\\spark_ml\\saved_models\\logistic_regression_example_1\\"
estimator_path = save_path + "lr"
# Save the estimator
blor.save(estimator_path)
lr2 = LogisticRegression.load(estimator_path)
lr2.getRegParam()

#save the model
model_path = save_path + "lr_model"
blorModel.save(model_path)

from pyspark.ml.classification import LogisticRegressionModel
model2 = LogisticRegressionModel.load(model_path)
Example #39
def sparseify(users_num, user_index, ratings):
    feature = Vectors.sparse(users_num, user_index, ratings)
    return feature
Example #40
# Task 1: Correlation between fields

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# How to represent vectors in Spark
from pyspark.ml.linalg import Vectors
# 1.	Dense vector
#create a vector of 4 features
Vectors.dense([1, 2, 3, 4])

# 2.	Sparse vector
#create a vector of 4 features
Vectors.sparse(4, [(0, 1), (2, 3)])
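
# Added check (not part of the original task): the two representations are
# interchangeable; the sparse vector above densifies to [1.0, 0.0, 3.0, 0.0].
print(Vectors.sparse(4, [(0, 1), (2, 3)]).toArray())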

#Read data
# df = spark.read.csv("/home/s_kante/spark/data/developers_survey_training.csv", header='true')
df = spark.read.csv(
    "/home/student/jac_spark/lecture2/data/Task1_2_3/developers_survey_training.csv",
    header='true')

#Replace IsDeveloper value with integer 1 or 0

#Approach1
df.createOrReplaceTempView("inputData")
df1 = spark.sql(
    "SELECT CASE IsDeveloper WHEN 'Yes' THEN 1 ELSE 0 END AS IsDeveloper, CAST(YearsOfExp AS FLOAT) AS YearsOfExp, CAST(Salary AS FLOAT) AS Salary FROM inputData "
)

#Approach2
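# A possible sketch of Approach2 (the original snippet is cut off here), using the
# standard when/otherwise column expressions instead of SQL:
#   from pyspark.sql.functions import when, col
#   df2 = df.withColumn("IsDeveloper", when(col("IsDeveloper") == "Yes", 1).otherwise(0))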
Example #41
def trans2sparse(line):
    indices = line["chi"]["indices"]
    values = line["chi"]["values"]
    vec = DenseVector(Vectors.sparse(2000, indices, values).toArray())
    return Row(chi=vec, window=line["window"])
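
# Added note: despite its name, trans2sparse returns a *dense* 2000-dimensional
# vector, built by densifying the given sparse (indices, values) pair.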
Example #42
def dataset_multinomial(spark_session):
    return spark_session.createDataFrame(
        [(1.0, Vectors.dense(1.0)), (0.0, Vectors.sparse(1, [], [])),
         (2.0, Vectors.dense(0.5))] * 100,
        ["label", "features"],
    ).cache()
Example #43
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark import SparkConf

spark = SparkSession.builder.appName(
    "CorrelationExample").getOrCreate()

data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),), (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
        # (c1, c2, c3, ...): the values inside each tuple's parentheses are the columns
        (Vectors.dense([6.0, 7.0, 0.0, 8.0]),), (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
# df = spark.createDataFrame(data, ["features"])  # every row is a features vector
df = spark.createDataFrame(data, ['features'])
print(df.collect())

r1 = Correlation.corr(df, "features").head()
print("Pearson correlation matrix:\n" + str(r1[0]))

r2 = Correlation.corr(df, "features", "spearman").head()
print("Spearman correlation matrix:\n" + str(r2[0]))
# $example off$

spark.stop()
Example #44
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

# $example on$
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PCAExample")\
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
            (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),),
            (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]),)]
    df = spark.createDataFrame(data, ["features"])
    pca = PCA(k=3, inputCol="features", outputCol="pcaFeatures")
    model = pca.fit(df)
    result = model.transform(df).select("pcaFeatures")
    result.show(truncate=False)
    # $example off$

    spark.stop()
Example #45
    def data_describe(self):
        print('start to read data for rdd:')
        #rawRdd_nlp = self.read_rdd('track2_title.txt').map(lambda line : eval(line))
        #rawRdd_nlp = self.read_rdd('track2_title_500.txt').map(lambda line : eval(line))
        rawRdd_nlp = self.sc.textFile(
            '/user/hadoop/icmechallenge2019/track2/test/track2_title_500.txt'
        ).map(lambda line: eval(line))

        # print(rawRdd_nlp.take(10))

        sqlContext = SQLContext(self.sc)
        labels = [('item_id', typ.IntegerType()),
                  ('title_features',
                   typ.MapType(typ.StringType(), typ.IntegerType()))]
        Schema = typ.StructType(
            [typ.StructField(e[0], e[1], True) for e in labels])
        df = sqlContext.createDataFrame(rawRdd_nlp, Schema)
        df.show(5)
        # df.printSchema()

        print(
            'start to deal with the title_features col,and compute the title topic'
        )

        tokens = df.rdd.map(lambda d: d[1]).map(lambda d: list(d.keys()))

        local_tokens = tokens.flatMap(
            lambda d: [int(token) for token in d]).distinct()

        print(local_tokens.top(1))
        vocab_size = max(local_tokens.top(1)) + 1

        toInt = udf(
            lambda counts:
            {int(token): float(counts[token])
             for token in counts}, typ.StringType())
        df = df.withColumn("title_features_1", toInt(df.title_features))

        toVector = udf(lambda vs: Vectors.sparse(vocab_size, vs), VectorUDT())
        rescaledData = df.withColumn("features",
                                     toVector(df.title_features_1)).select(
                                         "item_id", "features")

        rescaledData.cache()
        # lda = LDA(k=50,maxIter=200)
        lda = LDA(k=2, maxIter=5)
        ldaModel = lda.fit(rescaledData)

        print("begin save model")
        distributed_model_path = "/user/hadoop/icmechallenge2019/track2/test/" + "lda_distributed_model_pyspark"
        ldaModel.write().overwrite().save(distributed_model_path)
        print("model saved")

        print("load model")
        sameLdaModel = LocalLDAModel.load(distributed_model_path)
        print("model loaded")

        transformed = sameLdaModel.transform(
            rescaledData)  #.select("topicDistribution")

        transformed.show(truncate=False)
Example #46
data1 = sc.parallelize([
    Row(label=1.0, features=Vectors.dense(1.0, 1.0, 1.0)),
    Row(label=0.0, features=Vectors.dense(1.0, 2.0, 3.0)),
    Row(label=1.0, features=Vectors.dense(2.0, 2.0, 3.0)),
    Row(label=0.0, features=Vectors.dense(4.0, 2.0, 3.0))
]).toDF()

data2 = sc.parallelize([
    Row(label=1.0, weight=1.0, features=Vectors.dense(0.0, 5.0)),
    Row(label=0.0, weight=2.0, features=Vectors.dense(1.0, 2.0)),
    Row(label=1.0, weight=3.0, features=Vectors.dense(2.0, 1.0)),
    Row(label=0.0, weight=4.0, features=Vectors.dense(3.0, 3.0))
]).toDF()

data3 = spark.createDataFrame([(1.0, Vectors.dense(1.0)),
                               (0.0, Vectors.sparse(1, [], []))],
                              ["label", "features"])


def svc_classifier(df, conf):
    max_iter = conf["params"].get("maxIter")
    reg_param = conf["params"].get("regParam")
    svm = LinearSVC(maxIter=max_iter, regParam=reg_param)
    if conf["tuning"].get("crossval"):
        grid = ParamGridBuilder().addGrid(svm.maxIter, [0, 1]).build()
        evaluator = BinaryClassificationEvaluator()
        cv = CrossValidator(estimator=svm,
                            estimatorParamMaps=grid,
                            evaluator=evaluator)
        model = cv.fit(df)
    else:
Example #47
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.linalg import Vectors
from pyspark.sql.types import Row

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

df = spark.createDataFrame([
    Row(userFeatures=Vectors.sparse(3, {
        0: -2.0,
        1: 2.3
    })),
    Row(userFeatures=Vectors.dense([-2.0, 2.3, 0.0]))
])

slicer = VectorSlicer(inputCol="userFeatures",
                      outputCol="features",
                      indices=[1])
output = slicer.transform(df)
output.select("userFeatures", "features").show()

spark.stop()
Example #48
# $example on$
from pyspark.ml.feature import MinHashLSH
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("MinHashLSHExample") \
        .getOrCreate()

    # $example on$
    dataA = [(0, Vectors.sparse(6, [0, 1, 2], [1.0, 1.0, 1.0]),),
             (1, Vectors.sparse(6, [2, 3, 4], [1.0, 1.0, 1.0]),),
             (2, Vectors.sparse(6, [0, 2, 4], [1.0, 1.0, 1.0]),)]
    dfA = spark.createDataFrame(dataA, ["id", "features"])

    dataB = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
             (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
             (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
    dfB = spark.createDataFrame(dataB, ["id", "features"])

    key = Vectors.sparse(6, [1, 3], [1.0, 1.0])

    mh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)
    model = mh.fit(dfA)

    # Feature Transformation
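    # Sketch of the typical continuation (the original excerpt stops here); these are
    # standard MinHashLSHModel calls, shown commented out for reference only.
    # model.transform(dfA).show()
    # model.approxSimilarityJoin(dfA, dfB, 0.6, distCol="JaccardDistance").show()
    # model.approxNearestNeighbors(dfA, key, 2).show()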
Example #49
def main(sc):
    sqlContext = SQLContext(sc)
    # In[1]:
    input_path = ''
    model_path = ''
    model_info_path = model_path + ''
    model_scaler_path = model_path + ''
    model_train_set_path = model_path + ''

    # Import the table of features and labels into dataframes
    df_data = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true', inferschema='true').load(input_path)

    # Convert all features to double type except for ID and Label, which remain as strings
    # This is done because the Random Forest Algorithm requires features to be numbers
    df_data = df_data.select(
        *(col(c).cast("double").alias(c) for c in df_data.columns[1:-1]),
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'))

    # Defines that the first column is the unique ID, the last one contains the labels and all the ones in between are the given features
    df_master = df_data.rdd.map(lambda r: Row(
        cust_id=r[-2], label=r[-1], features=Vectors.dense(r[:-2]))).toDF()

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master.randomSplit([0.5, 0.5],
                                                              seed=123)

    # Set the Random Forest input to the training set
    rf_init_data = df_master_train

    # Indexing labels for Random Forest Algorithm
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexed_label")
    model = labelIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Indexing features for Random Forest Algorithm
    featureIndexer = VectorIndexer(inputCol="features",
                                   outputCol="indexed_features",
                                   maxCategories=2)
    model = featureIndexer.fit(rf_init_data)
    rf_init_data = model.transform(rf_init_data)

    # Configures inbuilt Random Forest Classifier function with 500 trees,
    # max depth = 8 and 32 bins
    rf_init = RandomForestClassifier(labelCol="indexed_label",
                                     featuresCol="indexed_features",
                                     numTrees=500,
                                     impurity="gini",
                                     maxDepth=8,
                                     maxBins=32)

    rf_init_data.persist()  # Cache the data set
    rf_init_model = rf_init.fit(
        rf_init_data)  # Run the Random Forest Algorithm

    rf_init_data.unpersist()

    # Extract a list of feature importances from the output of the Random Forest
    # Algorithm with each element corresponding to a feature
    rf_init_varimp = np.sqrt(rf_init_model.featureImportances.toArray())

    # Creates a list containing the 6 most important features to be used later
    # to subset our entire data from 146 features to just 6!

    # Create a list containing the names of all features
    column_names = df_data.columns[:-2]

    #Creating a dictionary mapping feature names to their respective importances
    NameToImp = dict()
    for i in range(len(column_names)):
        key = column_names[i]
        value = rf_init_varimp[i]
        NameToImp[key] = value

    # Sorted list in reverse order according to the variable importances
    sorted_varimp = sorted(NameToImp.values(), reverse=True)

    # Collect importances of 6 most important features
    sorted_top_varimp = sorted_varimp[:6]

    # Sorted list of column names in reverse order according to varimp
    sorted_colnames = sorted(NameToImp, key=NameToImp.get, reverse=True)

    # Collect colnames of 6 most imp features
    col_names = sorted_colnames[:6]

    # Pulling data for the 6 most important features
    df_data_new = df_data.select(
        df_data.u_msisdn.cast('string'), df_data.tag.cast('string'),
        *(col(c).cast("double").alias(c) for c in col_names))

    # Defines that the first column is the unique ID, the last one contains the labels and all the ones in between are the given features
    df_master_new = df_data_new.rdd.map(lambda r: Row(
        cust_id=r[0], label=r[1], features=Vectors.dense(r[2:]))).toDF()

    # Scale and normalize the features so that all features can be compared
    # and create a new column for the features
    scaler = StandardScaler(inputCol="features",
                            outputCol="scaled_features",
                            withStd=True,
                            withMean=True)

    # Compute summary statistics by fitting the StandardScaler
    scalerModel = scaler.fit(df_master_new)

    # Normalize each feature to have unit standard deviation.
    df_master_new = scalerModel.transform(df_master_new)

    #The old features have been replaced with their scaled versions and thus
    # we no longer care about the old, unbalanced features
    df_master_new = df_master_new.drop('features')

    # Randomly Split the data into a test and train set
    (df_master_train, df_master_test) = df_master_new.randomSplit([0.5, 0.5],
                                                                  seed=123)

    test_all = df_master_test

    sqlContext.registerDataFrameAsTable(df_master_train,
                                        "df_master_train_table")

    # Remove the negative labels as only the positive ones are important
    train_all = sqlContext.sql(
        'select * from df_master_train_table where label = 1')

    # Multiply feature values with corresponding importances
    m = ElementwiseProduct(scalingVec=Vectors.dense(sorted_top_varimp),
                           inputCol="scaled_features",
                           outputCol="scaled_weighted_features")

    train_all = m.transform(train_all)

    test_all = m.transform(test_all)

    sqlContext.dropTempTable("df_master_train_table")

    # Create a list of tasks containing tuples of number of neighbours and
    # cutoff frequencies to be passed to KNN algorithm
    number_of_neighbours = [250, 550, 750, 1000]
    popshared = 0.30
    num_indices = int(popshared * (test_all.count()))
    tasks = []
    for num_neighbour in number_of_neighbours:
        tasks = tasks + [(num_neighbour, num_indices)]

    # Partitioning the tasks for parallel processing
    tasksRDD = sc.parallelize(tasks, numSlices=len(tasks))
    tasksRDD.collect()

    train_pd = train_all.toPandas()
    test_pd = test_all.toPandas()

    train_pd['indices'] = train_pd.index
    test_pd['indices'] = test_pd.index

    # Converting features into SparseVector format
    l_train = list()
    for k in train_pd.scaled_weighted_features:
        l_train.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

    l_test = list()
    for k in test_pd.scaled_weighted_features:
        l_test.append(
            Vectors.sparse(len(k),
                           [(i, j) for i, j in enumerate(k) if j != 0]))

        # Converting to a numpy array
    knn_train = np.asarray(l_train)
    knn_test = np.asarray(l_test)
    # Broadcasting the training and test sets to all partitions
    train_broadcast = sc.broadcast(knn_train)
    test_broadcast = sc.broadcast(knn_test)

    # Calling K Nearest Neighbour search on each partition
    tree_type = "kd_tree"
    resultsRDD = tasksRDD.map(lambda nc: findNearestNeighbour(
        train_broadcast, test_broadcast, nc[0], nc[1], test_pd, tree_type))
    resultsRDD.cache()
    resultsRDD.count()

    resultsPD = resultsRDD.toDF().toPandas()

    resultsPD["popshared"] = popshared
    resultsPD = resultsPD.rename(columns={'_1': 'Recall'})
    resultsPD = resultsPD.rename(columns={'_2': 'Number of Neighbors'})

    bestResult = (resultsPD.sort_values(by=["Recall"], ascending=[0])).iloc[0]
    bestNN = int(bestResult["Number of Neighbors"])
    bestRecall = bestResult["Recall"]

    # saving the model info - varimp,recall,NN,col_names to model_path
    column_names = [i for i in col_names]
    model_info = sc.parallelize([{
        "varimp": sorted_top_varimp,
        "recall": bestRecall,
        "NN": bestNN,
        "col_names": column_names
    }])
    model_info.saveAsPickleFile(path=model_info_path)

    # saving the scaler model to model_path
    scalerModel.write().overwrite().save(model_scaler_path)

    # saving the train set to model_path
    df_master_new.rdd.saveAsPickleFile(path=model_train_set_path)
Example #50
"""
from __future__ import print_function

# $example on$
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("CorrelationExample") \
        .getOrCreate()

    # $example on$
    data = [(Vectors.sparse(4, [(0, 1.0), (3, -2.0)]),),
            (Vectors.dense([4.0, 5.0, 0.0, 3.0]),),
            (Vectors.dense([6.0, 7.0, 0.0, 8.0]),),
            (Vectors.sparse(4, [(0, 9.0), (3, 1.0)]),)]
    df = spark.createDataFrame(data, ["features"])

    r1 = Correlation.corr(df, "features").head()
    print("Pearson correlation matrix:\n" + str(r1[0]))

    r2 = Correlation.corr(df, "features", "spearman").head()
    print("Spearman correlation matrix:\n" + str(r2[0]))
    # $example off$

    spark.stop()
Example #51
import os
import sys
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import PCA
from pyspark.ml.linalg import Vectors
from ml2rt import save_sparkml
from ml2rt import utils

executable = sys.executable
os.environ["SPARK_HOME"] = pyspark.__path__[0]
os.environ["PYSPARK_PYTHON"] = executable
os.environ["PYSPARK_DRIVER_PYTHON"] = executable
spark = SparkSession.builder.appName("redisai_trial").getOrCreate()

data = spark.createDataFrame([(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]), ),
                              (Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]), ),
                              (Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0]), )],
                             ["features"])
pca = PCA(k=2, inputCol="features", outputCol="pca_features")
model = pca.fit(data)

feature_count = data.first()[0].size
N = data.count()

featurestype = utils.guess_onnx_tensortype(node_name='features',
                                           dtype='float32',
                                           shape=(N, feature_count))
save_sparkml(model,
             'spark.onnx',
             initial_types=[featurestype],