def _generate_data(self):
     df = self.spark.range(10)
     output = (df
               .select(
                'id', F.rand(42).alias('a'), F.randn(1).alias('b'),
                F.round(10 * F.rand(42)).alias('Prediction'),
                F.rand().alias('distance'))
               .withColumn('is_outlier', F.when(F.col('distance') >= 0.7, 1.0).otherwise(0.))
               .withColumn('computed_boundary', F.randn())
               )
     return output
Example no. 2
    def trick2(self):
        @F.udf('integer')
        def random(v):
            return Random().randint(0, 3)

        df = self.session.range(0, 100).withColumn("v", random(
            F.col("id"))).select("id", "v",
                                 F.rand(seed=10).alias("uniform"),
                                 F.randn(seed=27).alias("normal"))

        @F.pandas_udf(df.schema, F.PandasUDFType.GROUPED_MAP)
        def subtract_mean(pdf):
            return pdf.assign(uniform=pdf.uniform - pdf.uniform.mean())

        df.groupby('v').apply(subtract_mean).show()

        @F.pandas_udf(
            StructType([
                StructField(name="v", dataType=IntegerType()),
                StructField(name="add_all", dataType=DoubleType())
            ]), F.PandasUDFType.GROUPED_MAP)
        def addAll(pdf):
            return pd.DataFrame(
                data={
                    "v": pdf.v[0],
                    'add_all': [pdf.uniform.sum() + pdf.normal.sum()]
                })

        df.groupby('v').apply(addAll).show()
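Note: the GROUPED_MAP pandas_udf style used above is deprecated on Spark 3.x. A minimal equivalent sketch with applyInPandas (assuming the same df, and plain undecorated Python functions instead of pandas UDFs):

def subtract_mean(pdf):
    return pdf.assign(uniform=pdf.uniform - pdf.uniform.mean())

df.groupby("v").applyInPandas(subtract_mean, schema=df.schema).show()

def add_all(pdf):
    # one output row per group: the group key plus the sum of both random columns
    return pd.DataFrame({"v": [pdf.v.iloc[0]],
                         "add_all": [pdf.uniform.sum() + pdf.normal.sum()]})

df.groupby("v").applyInPandas(add_all, schema="v int, add_all double").show()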
Example no. 3
 def trick3(self):
     df = self.session.range(0, 1000000).select(
         "id",
         F.rand(seed=10).alias("uniform"),
         F.randn(seed=27).alias("normal"))
     # Less memory and faster speed
     TimeProfile.profile(lambda: df.toPandas())()
     TimeProfile.print_prof_data(clear=True)
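The comment about lower memory use and higher speed presumably refers to Arrow-backed conversion. A minimal sketch of enabling it before calling toPandas (assuming spark is the active SparkSession; on Spark 2.x the key is spark.sql.execution.arrow.enabled):

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
df.toPandas()   # the conversion now goes through Arrow record batches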
Example no. 4
 def test_rand_functions(self):
     df = self.df
     from pyspark.sql import functions
     rnd = df.select('key', functions.rand()).collect()
     for row in rnd:
         assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
     rndn = df.select('key', functions.randn(5)).collect()
     for row in rndn:
         assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]
Example no. 6
    def test_rand_functions(self):
        df = self.df
        from pyspark.sql import functions
        rnd = df.select('key', functions.rand()).collect()
        for row in rnd:
            assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
        rndn = df.select('key', functions.randn(5)).collect()
        for row in rndn:
            assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]

        # If the specified seed is 0, we should use it.
        # https://issues.apache.org/jira/browse/SPARK-9691
        rnd1 = df.select('key', functions.rand(0)).collect()
        rnd2 = df.select('key', functions.rand(0)).collect()
        self.assertEqual(sorted(rnd1), sorted(rnd2))

        rndn1 = df.select('key', functions.randn(0)).collect()
        rndn2 = df.select('key', functions.randn(0)).collect()
        self.assertEqual(sorted(rndn1), sorted(rndn2))
Example no. 8
    def _transform(self, data):
        mean = self.getMean()
        stddev = self.getStddev()
        inputCol = self.getInputCol()
        outputCol = self.getOutputCol()

        df = data.withColumn(outputCol,
                             when(col(inputCol).isNull(),
                                  stddev * randn() + mean). \
                             otherwise(col(inputCol)))
        return df
def benchmark2():
    print("===Benchmark 2===")
    print(
        "Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets"
    )
    print("")

    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("bemchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print(
        "creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers"
    )
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(
            0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("bemchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)"
    )
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
Example no. 10
def test_ks(sdf):
    # generates uniform
    sdf = sdf.withColumn('rand', F.rand(42))
    # compares with uniform,it should NOT reject
    pval = KolmogorovSmirnovTest(sdf, 'rand', dist='uniform').pValue
    npt.assert_equal(pval > .05, True)
    # compares with normal, it SHOULD reject
    pval = KolmogorovSmirnovTest(sdf, 'rand').pValue
    npt.assert_equal(pval < .05, True)

    # generates normal
    sdf = sdf.withColumn('rand', F.randn(42))
    # compares with normal, it should NOT reject
    pval = KolmogorovSmirnovTest(sdf, 'rand').pValue
    npt.assert_equal(pval > .05, True)
    # compares with uniform, it SHOULD reject
    pval = KolmogorovSmirnovTest(sdf, 'rand', dist='uniform').pValue
    npt.assert_equal(pval < .05, True)
Example no. 11
def randnMultiGaussian(meanArray, covMatrix, seed=0):
    """
    Samples from multivariate gaussian as vector

    :param meanArray: mean of the distribution, either List[Float] or numpy array
    :param covMatrix: covariance of the distribution, either List[List[Float]]] (row major) or numpy 2d array
    :param seed: seed of the rand
    :return: DenseVector column
    """
    root = np.linalg.cholesky(np.array(covMatrix))
    rows, columns = root.shape

    root = arrayToMatrix(
        F.lit(rows), F.lit(columns),
        F.array([F.lit(el) for el in root.reshape(int(rows*columns), order="F").tolist()]))
    mean = arrayToVector(F.array([F.lit(float(el)) for el in meanArray]))

    samples = arrayToVector(F.array([F.randn(seed=seed+el) for el in range(0, len(meanArray))]))
    return _function_factory([mean, root, samples], _spark_functions().scaleToMultiGaussian())
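For intuition, the same Cholesky trick in plain NumPy (a minimal sketch, independent of the Spark helpers above): with z ~ N(0, I) and L the Cholesky factor of the covariance, mean + L @ z is one draw from N(mean, cov).

import numpy as np

mean = np.array([1.0, -2.0])
cov = np.array([[2.0, 0.3],
                [0.3, 1.0]])
L = np.linalg.cholesky(cov)                      # cov = L @ L.T
z = np.random.default_rng(0).standard_normal(2)  # z ~ N(0, I)
sample = mean + L @ z                            # one draw from N(mean, cov)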
Example no. 12
def visualize_time_lines(patient_event, concept_id, num_patients=50):
    
    ra_patient = patient_event.where(F.col('standard_concept_id') == concept_id) \
        .groupBy('person_id', 'standard_concept_id').agg(F.min('date').alias('index_date')) \
        .withColumn('random_num', F.randn()) \
        .withColumn('rank', F.dense_rank().over(Window.orderBy('random_num'))) \
        .where(F.col('rank') <= num_patients)
    
    join_collection_udf = F.udf(lambda its: ' '.join(sorted([str(it[1]) for it in its], key=lambda x: (x[0], x[1]))), T.StringType())
    
    patient_timeline_pd = patient_event \
        .join(ra_patient, 'person_id') \
        .where(F.col('index_date').between(F.col('lower_bound'), F.col('upper_bound')))\
        .withColumn('date_concept_id', F.struct(F.col('index_date'), patient_event['standard_concept_id']))\
        .groupBy('person_id').agg(join_collection_udf(F.collect_list('date_concept_id')).alias('sequence'), 
                                  F.size(F.collect_list('date_concept_id')).alias('size')) \
        .where(F.col('size') > 1) \
        .select('person_id', 'sequence').toPandas()
    
    return patient_timeline_pd
Example no. 13
    def trick1(self):
        df = self.session.range(0, 1000000).select(
            "id",
            F.rand(seed=10).alias("uniform"),
            F.randn(seed=27).alias("normal"))

        @F.udf('double')
        def plus_one(v):
            return v + 1

        TimeProfile.profile(
            lambda: df.withColumn('v2', plus_one(df.uniform)).count())()
        TimeProfile.print_prof_data(clear=True)

        @F.pandas_udf('double', F.PandasUDFType.SCALAR)
        def pandas_plus_one(v):
            return v + 1

        TimeProfile.profile(
            lambda: df.withColumn('v2', pandas_plus_one(df.uniform)).count())()
        TimeProfile.print_prof_data(clear=True)
Example no. 14
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.functions import randn

sc = SparkContext()
sqlContext = SQLContext(sc)
spark = SparkSession(sc)

# Let's create our dataset with 5 normally distributed columns (mean=0, std=1).

# In[3]:

df1 = sqlContext.range(0, 4000000).withColumn(
    'normal1', func.abs(func.round(100 * randn(seed=1), 2))).withColumn(
        'normal2', func.abs(func.round(100 * randn(seed=2), 2))).withColumn(
            'normal3',
            func.abs(func.round(100 * randn(seed=3), 2))).withColumn(
                'normal4',
                func.abs(func.round(100 * randn(seed=4), 2))).withColumn(
                    'normal5', func.abs(func.round(100 * randn(seed=5), 2)))

# The contents of the SparkContext can be inspected below. Clicking the Spark UI link
# shows the running jobs. The master is my own local machine.

# In[4]:

sc
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, randn
from pyspark.sql.functions import mean, min, max

spark = SparkSession \
    .builder \
    .appName("Summary and descriptive statistics") \
    .getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

# A slightly different way to generate the two random columns.
df = sqlContext.range(0, 10) \
               .withColumn("uniform", rand(seed=10)) \
               .withColumn("normal", randn(seed=27))

df.describe().show()

# If you have a DataFrame with a large number of columns, you can also run
# describe on a subset of the columns:
df.describe("uniform", "normal").show()

# Of course, while describe works well for quick exploratory data analysis,
# you can also control the list of descriptive statistics and the columns
# they apply to using the normal select on a DataFrame:
df.select([mean("uniform"), min("uniform"), max("uniform")]).show()

spark.stop()
Example no. 16
    def gridIsingModel(self, n, vStd=1.0, eStd=1.0):
        """Grid Ising model with random parameters.

        Ising models are probabilistic graphical models over binary variables x\ :sub:`i`.
        Each binary variable x\ :sub:`i` corresponds to one vertex, and it may take values -1 or +1.
        The probability distribution P(X) (over all x\ :sub:`i`) is parameterized by vertex factors
        a\ :sub:`i` and edge factors b\ :sub:`ij`:

           P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]

        where Z is the normalization constant (partition function). See `Wikipedia
        <https://en.wikipedia.org/wiki/Ising_model>`__ for more information on Ising models.

        Each vertex is parameterized by a single scalar a\ :sub:`i`.
        Each edge is parameterized by a single scalar b\ :sub:`ij`.

        :param n: Length of one side of the grid.  The grid will be of size n x n.
        :param vStd: Standard deviation of normal distribution used to generate vertex factors "a".
                     Default of 1.0.
        :param eStd: Standard deviation of normal distribution used to generate edge factors "b".
                     Default of 1.0.
        :return: GraphFrame. Vertices have columns "id" and "a". Edges have columns "src", "dst",
            and "b".  Edges are directed, but they should be treated as undirected in any algorithms
            run on this model. Vertex IDs are of the form "i,j".  E.g., vertex "1,3" is in the
            second row and fourth column of the grid.
        """
        assert n >= 1,\
            "Grid graph must have size >= 1, but was given invalid value n = {}".format(n)

        # create coordinates grid
        coordinates = self._sql.createDataFrame(itertools.product(
            range(n), range(n)),
                                                schema=('i', 'j'))

        # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
        # avoid Cartesian join due to SPARK-15425: use generator since n should be small
        toIDudf = sqlfunctions.udf(lambda i, j: '{},{}'.format(i, j))

        # create the vertex DataFrame
        # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
        vIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j'))
        # add random parameters generated from a normal distribution
        seed = 12345
        vertices = (coordinates.withColumn('id', vIDcol).withColumn(
            'a',
            sqlfunctions.randn(seed) * vStd))

        # create the edge DataFrame
        # create SQL expression for converting coordinates (i,j+1) and (i+1,j) to string IDs
        rightIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j') + 1)
        downIDcol = toIDudf(sqlfunctions.col('i') + 1, sqlfunctions.col('j'))
        horizontalEdges = (coordinates.filter(
            sqlfunctions.col('j') != n - 1).select(vIDcol.alias('src'),
                                                   rightIDcol.alias('dst')))
        verticalEdges = (coordinates.filter(
            sqlfunctions.col('i') != n - 1).select(vIDcol.alias('src'),
                                                   downIDcol.alias('dst')))
        allEdges = horizontalEdges.unionAll(verticalEdges)
        # add random parameters from a normal distribution
        edges = allEdges.withColumn('b', sqlfunctions.randn(seed + 1) * eStd)

        # create the GraphFrame
        g = GraphFrame(vertices, edges)

        # materialize graph as workaround for SPARK-13333
        g.vertices.cache().count()
        g.edges.cache().count()

        return g
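A typical call, sketched under the assumption that this method is the one shipped as graphframes.examples.Graphs.gridIsingModel and that sqlContext is already available:

from graphframes.examples import Graphs

g = Graphs(sqlContext).gridIsingModel(3, vStd=1.0, eStd=1.0)
g.vertices.show()   # columns: id, a
g.edges.show()      # columns: src, dst, b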
Example no. 17
def randn(df, c, mu=0.0, sigma=1.0, seed=None):
    return df.withColumn(c, F.randn(seed) * sigma + mu)
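A quick usage sketch for this helper (hypothetical DataFrame and column name; assumes an active SparkSession named spark):

df = spark.range(5)
df_noisy = randn(df, 'noise', mu=10.0, sigma=2.0, seed=42)  # adds a column ~ N(10, 4)
df_noisy.show()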
Example no. 18
# Random data generation is useful for testing of existing algorithms and
# implementing randomized algorithms, such as random projection. We provide
# methods under sql.functions for generating columns that contains i.i.d.
# values drawn from a distribution, e.g., uniform (rand), and standard
# normal (randn).

from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import rand, randn

spark = SparkSession \
    .builder \
    .appName("Random data generation") \
    .getOrCreate()

sqlContext = SQLContext(spark.sparkContext)

# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal
# distribution.
df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

spark.stop()
Example no. 19
#FITS
#gal=spark.read.format("fits").option("hdu",1)\
#     .load(os.environ['FITSDIR'])\
#     .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))

#PKT   
PARQUET="hdfs://134.158.75.222:8020/user/julien.peloton/LSST10Y_shuffled_uncomp"
gal=spark.read.parquet(PARQUET)\
	.select(F.col("RA"), F.col("DEC").alias("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))
      
gal.printSchema()
timer.step()
timer.print("load")
#######
gal=gal.withColumn("zrec",(gal.z+0.03*(1+gal.z)*randn()).astype('float'))
gal.show(5)
timer.step()
timer.print("show")
##cache
gal=gal.cache()
print("N={}".format(gal.count()))
timer.step()
timer.print("data loaded")
####

zshell=[0.0,0.13,0.27,0.43,0.63,0.82,1.05,1.32,1.61,1.95,2.32]
#zshell=[0.1,0.2,0.3,0.4,0.5]

#writemap
write=False
Example no. 20
# Use the `range` method to generate a sequence of integers and add new
# columns as appropriate.
spark.range(1000).show(5)

# Use the `rand` function to generate a uniform random variable:
from pyspark.sql.functions import rand
df_uniform = spark \
  .range(1000) \
  .withColumn("uniform", rand(12345))
df_uniform.show(5)
df_uniform.describe("uniform").show()

# Or a Bernoulli random variable with $p = 0.25$:
df_bernoulli = spark \
  .range(1000) \
  .withColumn("bernoulli", (rand(12345) < 0.25).cast("int"))
df_bernoulli.show(5)
df_bernoulli.groupby("bernoulli").count().show()

# Use the `randn` function to generate a normal random variable:
from pyspark.sql.functions import randn

df_normal = spark.range(1000).withColumn("normal", 42 + 2 * randn(54321))
df_normal.show(5)
df_normal.describe("normal").show()

# ## Cleanup

# Stop the SparkSession:
spark.stop()
Example no. 21
    # OLS problem, states to be estimated are a, b and c
    # z = a*x + b * y + c + w, where w ~ N(0, 1)
    a = 0.5
    b = 0.2
    c = 1.2
    noise_param = 1
    label_expression = F.col("x") * a + F.col("y") * b + c + F.col("w")

    input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("x", (F.col("value")/num_states).cast("Integer").cast("Double"))\
        .withColumn("y", F.sqrt("x"))\
        .withColumn("bias", F.lit(1.0))\
        .withColumn("w", F.randn(0) * noise_param)\
        .withColumn("label", label_expression)

    rls = RecursiveLeastSquaresFilter()\
        .setStateKeyCol("stateKey")\
        .setFeatureSize(3)\
        .setInitialEstimate(Vectors.dense([0.0, 0.0, 0.0]))\
        .setRegularizationMatrixFactor(10E6)\
        .setForgettingFactor(0.99)

    assembler = VectorAssembler(inputCols=["x", "y", "bias"],
                                outputCol="features")

    measurements = assembler.transform(input_df)
    query = rls.transform(measurements)\
        .writeStream\
Example no. 22
def get_baseline_scores(train_df, val_df, evaluator, eval_name):
    stats_rating_df = (
        train_df
        .agg(
            F.avg('rating').alias('avg_rating'),
            F.stddev_samp('rating').alias('stddev_rating')
        )
    )

    stats_row = stats_rating_df.head()

    print('[plot_scores Train] Avg: {}'.format(stats_row[0]))
    print('[plot_scores Train] Std Dev: {}'.format(stats_row[1]))

    # Naive model: random normal rating centered on average rating and scaled
    # with standard deviation of training data.
    train_predict_df = (
        train_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select(
            'user',
            'item',
            'rating',
            'prediction'
        )
    )

    val_predict_df = (
        val_df
        .crossJoin(stats_rating_df)
        .withColumn(
            'prediction',
            F.col('avg_rating') + F.randn() * F.col('stddev_rating')
        )
        .select(
            'user',
            'item',
            'rating',
            'prediction'
        )
    )

    naive_score_train = evaluator.evaluate(train_predict_df)
    naive_score_val = evaluator.evaluate(val_predict_df)

    print('Train Naive {} score: {}'.format(eval_name, naive_score_train))
    print('Validation Naive {} score: {}'.format(eval_name, naive_score_val))

    estimator = Recommender(
        lambda_1=0.0,
        lambda_2=0.0,
        lambda_3=0.0,
        useALS=False,
        useBias=True,
        userCol='user',
        itemCol='item',
        ratingCol='rating'
    )

    model = estimator.fit(train_df)
    baseline_score_train = evaluator.evaluate(model.transform(train_df))
    baseline_score_val = evaluator.evaluate(model.transform(val_df))

    print('Train Baseline {} score: {}'.format(eval_name, baseline_score_train))
    print('Validation Baseline {} score: {}'.format(eval_name, baseline_score_val))

    return (
        naive_score_train, naive_score_val,
        baseline_score_train, baseline_score_val
    )
Example no. 23
# Let's create our dataset with the following columns:
# normal1: normally distributed column
# normal2: normally distributed column
# normal3: normally distributed column
# normal4: normally distributed column
# normal5: normally distributed column
# normal6: normally distributed column
# Y: a column taking the values 0 and 1

# In[4]:

df1 = sqlContext.range(0, 1000000).withColumn(
    'normal1', func.abs(10 * func.round(randn(seed=1), 2))).withColumn(
        'normal2', func.abs(100 * func.round(randn(seed=2), 2))).withColumn(
            'normal3', func.abs(func.round(randn(seed=3), 2))).withColumn(
                'normal4', func.abs(func.round(randn(seed=4), 2))).withColumn(
                    'normal5',
                    func.abs(func.round(randn(seed=5), 2))).withColumn(
                        'normal6', func.abs(func.round(randn(seed=6), 2)))
df1.cache()

# Let's relate Y to the other variables so that our output variable Y is meaningful
# and the model produces meaningful results.

# In[5]:
from pyspark.sql.functions import rand, randn
# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# COMMAND ----------

display(df)

# COMMAND ----------

# Generate two other columns using uniform distribution and normal distribution.
df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

# COMMAND ----------

display(
    df.select("id",
              rand(seed=10).alias("uniform"),
              randn(seed=27).alias("normal")))

# COMMAND ----------

# MAGIC %md ### Summary and Descriptive Statistics
# MAGIC
# MAGIC The first operation to perform after importing data is to get some sense of what it looks like. For numerical columns, knowing the descriptive summary statistics can help a lot in understanding the distribution of your data. The function `describe` returns a DataFrame containing information such as number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.

# COMMAND ----------
Example no. 25
from pyspark.mllib.stat import Statistics

parallelData = sc.parallelize([1.0, 2.0, 5.0, 4.0, 3.0, 3.3, 5.5])

# run a KS test for the sample versus a standard normal distribution
testResult = Statistics.kolmogorovSmirnovTest(parallelData, "norm", 0, 1)
print(testResult)

from pyspark.sql import SQLContext
sqlCtx = SQLContext(sc)

from pyspark.sql.functions import rand, randn

# Create a DataFrame with one int column and 10 rows.
df = sqlCtx.range(0, 10)
df.show()

df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()
df.describe().show()

df = sqlCtx.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
print(df.stat.corr('rand1', 'rand2'))
print(df.stat.corr('id', 'id'))

names = ["Alice", "Bob", "Mike"]
items = ["milk", "bread", "butter", "apples", "oranges"]
df = sqlCtx.createDataFrame([(names[i % 3], items[i % 5]) for i in range(100)], ["name", "item"])
df.show(10)

df = sqlCtx.createDataFrame([(1, 2, 3) if i % 2 == 0 else (i, 2 * i, i % 4) for i in range(100)], ["a", "b", "c"])
df.show(10)
freq = df.stat.freqItems(["a", "b", "c"], 0.4)
Example no. 26
timer = Timer()
ddt = []

ana = "1: load(HDU)"
gal=spark.read.format("fits").option("hdu",1)\
  .load(ff)\
  .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))

gal.printSchema()
ddt.append(timer.step())
timer.print(ana)

##### gauss
gal = gal.withColumn("zrec_g",
                     (gal.z + 0.03 * (1 + gal.z) * F.randn()).astype('float'))

####full PZ
ana = "2b: PZ full + show(5)"

# read the inverse-cumulative file
cuminv = np.loadtxt('scripts/cuminv_gauss.txt')
#cuminv=np.loadtxt('scripts/cuminv_gauss.txt')
#cuminv=np.loadtxt('scripts/cuminv_bdt.txt')
# we know the binnings that were used
dz = 0.01
du = 1 / 1000.


#find indices and return the table values
@pandas_udf('float', PandasUDFType.SCALAR)
Example no. 27
              "hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000/")

# generate the worker nodes.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
sc = spark.sparkContext

from pyspark.sql.functions import randn, round as roundNum

data = [(i, i) for i in range(10)]  # random data

columns = ['id', 'txt']  # add your columns label here

df = spark.createDataFrame(data, columns)
df = df.drop('txt')
for i in range(10):
    df = df.withColumn('col' + str(i), roundNum(randn(), 3))
df.show()

URI = sc._gateway.jvm.java.net.URI
Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
Configuration = sc._gateway.jvm.org.apache.hadoop.conf.Configuration

fs = FileSystem.get(
    URI("hdfs://hadoop-hadoop-hdfs-nn.spark.svc.cluster.local:9000"),
    Configuration())

status = fs.listStatus(Path('/'))

for fileStatus in status:
    print(fileStatus.getPath())
Example no. 28
            "Usage: lkf_rate_source_llt.py <num_states> <measurements_per_sec>",
            file=sys.stderr)
        sys.exit(-1)

    num_states = int(sys.argv[1])
    mps = int(sys.argv[2])

    spark = SparkSession.builder.appName("RateSourceLKF").getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    noise_param = 1

    input_df = spark.readStream.format("rate").option("rowsPerSecond", mps).load()\
        .withColumn("mod", F.col("value") % num_states)\
        .withColumn("stateKey", F.col("mod").cast("String"))\
        .withColumn("trend", (F.col("value")/num_states).cast("Integer") + F.randn() * noise_param)

    lkf = LinearKalmanFilter(2, 1)\
        .setStateKeyCol("stateKey")\
        .setMeasurementCol("measurement")\
        .setInitialCovariance(Matrices.dense(2, 2, [10000.0, 0.0, 0.0, 10000.0]))\
        .setProcessModel(Matrices.dense(2, 2, [1.0, 0.0, 1.0, 1.0]))\
        .setProcessNoise(Matrices.dense(2, 2, [0.0001, 0.0, 0.0, 0.0001]))\
        .setMeasurementNoise(Matrices.dense(1, 1, [noise_param]))\
        .setMeasurementModel(Matrices.dense(1, 2, [1.0, 0.0]))

    assembler = VectorAssembler(inputCols=["trend"], outputCol="measurement")

    measurements = assembler.transform(input_df)
    query = lkf.transform(measurements)\
        .writeStream\
Example no. 29
    def gridIsingModel(self, n, vStd=1.0, eStd=1.0):
        """Grid Ising model with random parameters.

        Ising models are probabilistic graphical models over binary variables x\ :sub:`i`.
        Each binary variable x\ :sub:`i` corresponds to one vertex, and it may take values -1 or +1.
        The probability distribution P(X) (over all x\ :sub:`i`) is parameterized by vertex factors
        a\ :sub:`i` and edge factors b\ :sub:`ij`:

           P(X) = (1/Z) * exp[ \sum_i a_i x_i + \sum_{ij} b_{ij} x_i x_j ]

        where Z is the normalization constant (partition function). See `Wikipedia
        <https://en.wikipedia.org/wiki/Ising_model>`__ for more information on Ising models.

        Each vertex is parameterized by a single scalar a\ :sub:`i`.
        Each edge is parameterized by a single scalar b\ :sub:`ij`.

        :param n: Length of one side of the grid.  The grid will be of size n x n.
        :param vStd: Standard deviation of normal distribution used to generate vertex factors "a".
                     Default of 1.0.
        :param eStd: Standard deviation of normal distribution used to generate edge factors "b".
                     Default of 1.0.
        :return: GraphFrame. Vertices have columns "id" and "a". Edges have columns "src", "dst",
            and "b".  Edges are directed, but they should be treated as undirected in any algorithms
            run on this model. Vertex IDs are of the form "i,j".  E.g., vertex "1,3" is in the
            second row and fourth column of the grid.
        """
        # check param n
        if n < 1:
            raise ValueError(
                "Grid graph must have size >= 1, but was given invalid value n = {}"
                .format(n))

        # create coordinates grid
        coordinates = self._sql.createDataFrame(
            itertools.product(range(n), range(n)),
            schema=('i', 'j'))

        # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
        # avoid Cartesian join due to SPARK-15425: use generator since n should be small
        toIDudf = sqlfunctions.udf(lambda i, j: '{},{}'.format(i,j))

        # create the vertex DataFrame
        # create SQL expression for converting coordinates (i,j) to a string ID "i,j"
        vIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j'))
        # add random parameters generated from a normal distribution
        seed = 12345
        vertices = (coordinates.withColumn('id', vIDcol)
            .withColumn('a', sqlfunctions.randn(seed) * vStd))

        # create the edge DataFrame
        # create SQL expression for converting coordinates (i,j+1) and (i+1,j) to string IDs
        rightIDcol = toIDudf(sqlfunctions.col('i'), sqlfunctions.col('j') + 1)
        downIDcol = toIDudf(sqlfunctions.col('i') + 1, sqlfunctions.col('j'))
        horizontalEdges = (coordinates.filter(sqlfunctions.col('j') != n - 1)
            .select(vIDcol.alias('src'), rightIDcol.alias('dst')))
        verticalEdges = (coordinates.filter(sqlfunctions.col('i') != n - 1)
            .select(vIDcol.alias('src'), downIDcol.alias('dst')))
        allEdges = horizontalEdges.unionAll(verticalEdges)
        # add random parameters from a normal distribution
        edges = allEdges.withColumn('b', sqlfunctions.randn(seed + 1) * eStd)

        # create the GraphFrame
        g = GraphFrame(vertices, edges)

        # materialize graph as workaround for SPARK-13333
        g.vertices.cache().count()
        g.edges.cache().count()

        return g
Example no. 30
def benchmark(ff):
    timer=Timer()
    ddt=[]
    
    ana="1: load(HDU)"

    gal=spark.read.format("fits").option("hdu",1)\
         .load(ff)\
         .select(F.col("RA"), F.col("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))

    
    #PARQUET="hdfs://134.158.75.222:8020/user/julien.peloton/LSST10Y_shuffled_uncomp"
    #gal=spark.read.parquet(PARQUET)\
    #  .select(F.col("RA"), F.col("DEC").alias("Dec"), (F.col("Z_COSMO")+F.col("DZ_RSD")).alias("z"))


    gal.printSchema()
    ddt.append(timer.step())
    timer.print(ana)
    #######
    ana="2: gauss PZ + show(5)"
    gal=gal.withColumn("zrec",(gal.z+0.03*(1+gal.z)*F.randn()).astype('float'))
    gal.show(5)
    ddt.append(timer.step())
    timer.print(ana)

    ####
    ana="3: cache (count)"
    gal=gal.cache()#.persist(StorageLevel.MEMORY_ONLY_SER)
    print("N={}".format(gal.count()))
    ddt.append(timer.step())
    timer.print(ana)

    #####
    ana="4: statistics z"
    gal.describe(['z']).show()
    ddt.append(timer.step())
    timer.print(ana)

    ana="5: statistics all"
    # get all statistics on z
    gal.describe().show()
    ddt.append(timer.step())
    timer.print(ana)

    ana="6: minmax"
    minmax=gal.select(F.min("z"),F.max("z")).first()
    zmin=minmax[0]
    zmax=minmax[1]
    Nbins=100
    dz=(zmax-zmin)/Nbins
    ddt.append(timer.step())
    timer.print(ana)

    ###############
    ana="7: histo df"
    #df on z 
    #zbin=gal.select(gal.z,((gal['z']-zmin)/dz).astype('int').alias('bin'))
    zbin=gal.select(gal.z,((gal['z']-zmin-dz/2)/dz).cast(IntegerType()).alias('bin'))
    h=zbin.groupBy("bin").count().orderBy(F.asc("bin"))
    p=h.select("bin",(zmin+dz/2+h['bin']*dz).alias('zbin'),"count").drop("bin").toPandas()
    #p.to_csv("p.csv")
    ddt.append(timer.step())
    timer.print(ana)
    #
    #ana="histo p3"
    #import df_tools
    #p3=df_tools.hist_df(gal,"zrec",Nbins,bounds=minmax).toPandas()
    #p3.to_csv("prec3.csv")
    #timer.print(ana)
    #p3.to_csv("prec3.csv")
    #ana="histo p5 (on the fly)"
    #p5=df_tools.hist_df(gal.withColumn("zrec2",gal.z+0.05*randn()*(1+gal.z)),"zrec2",Nbins,bounds=minmax).toPandas()
    #timer.print(ana)
    #p5.to_csv("prec5.csv")

    #ana="8a: histo (UDF)"
    #binNumber_udf=F.udf(lambda z: int((z-zmin)/dz))
    #p_udf=gal.select(gal.z,binNumber_udf(gal.z).alias('bin')).groupBy("bin").count().orderBy(F.asc("bin")).toPandas()
    #ddt.append(timer.step())
    #timer.print(ana)

    
    ana="8b: histo (pandas UDF)"
    @pandas_udf("float", PandasUDFType.SCALAR)
    def binFloat(z):
        return pd.Series((z-zmin)/dz)
    # don't know how to cast in pandas, so do it later
    p_udf=gal.select(gal.z,binFloat("z").astype('int').alias('bin')).groupBy("bin").count().orderBy(F.asc("bin")).toPandas()
    ddt.append(timer.step())
    timer.print(ana)


    #via rdd
    #ana="9: histo (rdd) reducebykey"
    #from operator import add
    #h=zbin.select("bin").rdd.map(lambda r:(r.bin,1)).reduceByKey(add).sortByKey().map(lambda x: (zmin+dz/2 +x[0]*dz,x[1]))
    #h=zbin.select("bin").rdd.map(lambda r:(r[0],1)).countByKey()
    #h.collect()
    #plt.plot(h.keys(),k,values())
    #ddt.append(timer.step())
    #timer.print(ana)

##    ana="10: RDD histogram"
##    #p_rdd=gal.select(gal.z).rdd.flatMap(list).histogram(Nbins)
##    p_rdd=gal.select(gal.z).rdd.map(lambda r: r.z).histogram(Nbins)
##    ddt.append(timer.step())
##    timer.print(ana)

   ## ana="11:tomographie"
##    shell=gal.filter(gal['zrec'].between(0.1,0.2))
##    nside=512
##    @pandas_udf('int', PandasUDFType.SCALAR)
##    def Ang2Pix(ra,dec):
##        return pd.Series(hp.ang2pix(nside,np.radians(90-dec),np.radians(ra)))
##    map=shell.select(Ang2Pix("RA","Dec").alias("ipix")).groupBy("ipix").count().toPandas()

    #back to python world
    #myMap = np.zeros(12 * nside**2)
    #myMap[map['ipix'].values]=map['count'].values

    #ddt.append(timer.step())
    #timer.print(ana)

    return ddt
Example no. 31
## Run this to clear predicted quality tables, in case you want to try again
clear_for_demo()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Generate data for demo

# COMMAND ----------

df = spark.range(1, 8000)
# Setup Temperature, Pressure, Duration
df = df.select("id",
               F.rand(seed=10).alias("temp_raw"),
               F.randn(seed=27).alias("pressure_raw"),
               F.rand(seed=45).alias("duration_raw"),
               F.randn(seed=54).alias("temp_n"),
               F.randn(seed=78).alias("pressure_n"),
               F.randn(seed=96).alias("duration_n"),
               F.round(F.rand() * 7.5 * 60, 0).alias("timestamp_n"))
df = df.withColumn('pid', (100000 + df["id"]))
df = (df.withColumn("temp_raw", (10.0 * df["temp_raw"]) + 350).withColumn(
    "pressure_raw", (2.0 * df["pressure_raw"]) + 12).withColumn(
        "duration_raw", (4.0 * df["duration_raw"]) + 28.5).withColumn(
            "timestamp", ((df["id"] * 7.5 * 60) + 1561939200 +
                          df["timestamp_n"]).cast('timestamp')))
df = df.withColumn("process_time", df["timestamp"])
df = df.withColumn("qualitycheck_time",
                   F.date_trunc("day", F.date_add(df["timestamp"], 2)))
Example no. 32
# id: id column
# uniform: uniformly distributed column
# uniform1: uniformly distributed column
# normal: normally distributed column
# normal1: normally distributed column
# Y: a column containing 0 and 1
# NOTE: func.round is used to round the random numbers we generate
# NOTE: Spark jobs can be monitored at http://localhost:4040/jobs/

# In[33]:

df1 = sqlContext.range(0, 1000000).withColumn(
    'uniform', func.round(rand(seed=10), 2)).withColumn(
        'uniform1', func.round(rand(seed=9), 2)).withColumn(
            'normal', func.round(randn(seed=22), 2)).withColumn(
                'normal1',
                func.round(randn(seed=23),
                           2)).withColumn('Y',
                                          when(rand() > 0.5, 1).otherwise(0))

# Number of rows in df1

# In[34]:

# number of rows in df1
df1.count()

Example no. 33
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark.context import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
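For completeness, the matching correlation calls on the same DataFrame (a short sketch; correlation is covariance rescaled to the range [-1, 1]):

df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')   # a column is perfectly correlated with itself (1.0)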
Example no. 34
 def break_ties(target):
     return (funcs.col(target) + (funcs.randn(conf['random_state']) / funcs.lit(10000000000))).alias(target)
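A hypothetical usage sketch (assumes pyspark.sql.functions imported as funcs, a conf dict with a 'random_state' entry, and a DataFrame df with a 'score' column): the tiny scaled noise breaks exact ties in an ordering without visibly changing the values.

from pyspark.sql import functions as funcs

conf = {'random_state': 42}
df.select('id', break_ties('score')).orderBy('score').show()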