def SearchTiles_and_Factorize(n):
    global globalmergedtiles
    global globalcoordinates
    global factors_accum
    global spcon

    spcon = SparkContext("local[4]", "Spark_TileSearch_Optimized")

    if persisted_tiles:
        tileintervalsf = open("/home/shrinivaasanka/Krishna_iResearch_OpenSource/GitHub/asfer-github-code/cpp-src/miscellaneous/DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.tileintervals", "r")

        tileintervalslist = tileintervalsf.read().split("\n")
        # print("tileintervalslist=", tileintervalslist)
        tileintervalslist_accum = spcon.accumulator(tileintervalslist, VectorAccumulatorParam())
        paralleltileintervals = spcon.parallelize(tileintervalslist)
        paralleltileintervals.foreach(tilesearch)
    else:
        factorsfile = open("DiscreteHyperbolicFactorizationUpperbound_TileSearch_Optimized.factors", "w")
        hardy_ramanujan_ray_shooting_queries(n)
        hardy_ramanujan_prime_number_theorem_ray_shooting_queries(n)
        baker_harman_pintz_ray_shooting_queries(n)
        cramer_ray_shooting_queries(n)
        zhang_ray_shooting_queries(n)
        factors_accum = spcon.accumulator(factors_of_n, FactorsAccumulatorParam())
        # spcon.parallelize(xrange(1, n)).foreach(tilesearch_nonpersistent)
        spcon.parallelize(spcon.range(1, n).collect()).foreach(tilesearch_nonpersistent)
        print("factors_accum.value = ", factors_accum.value)
        factors = []
        factordict = {}
        for f in factors_accum.value:
            factors += f
        factordict[n] = factors
        json.dump(factordict, factorsfile)
        return factors
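
# Hypothetical driver sketch (not part of the original module): it assumes that
# persisted_tiles, factors_of_n, VectorAccumulatorParam, FactorsAccumulatorParam and
# the tilesearch*/ray-shooting helpers referenced above are defined elsewhere in the
# same module.
if __name__ == "__main__":
    print("factors of 100:", SearchTiles_and_Factorize(100))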
Example #2
from __future__ import print_function

from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="PythonStreamingNetworkWordCount")
    rdd = sc.range(1, 1000)

    counts = rdd.map(lambda i: i * 2)
    counts.saveAsTextFile("s3://uryyyyyyy-sandbox/py.log")
from pyspark import SparkContext
from pyspark.sql import SQLContext

# setup spark context
from pyspark.sql.types import StructType, StructField, StringType, LongType

sc = SparkContext("local", "data_processor")
sqlC = SQLContext(sc)
# create dummy data frames

rdd1 = sc.range(0,10000000).map(lambda x: ("key "+str(x), x)).repartition(100)
rdd2 = sc.range(0,10000).map(lambda x: ("key "+str(x), x)).repartition(10)



# Define schema
schema = StructType([
    StructField("Id", StringType(), True),
    StructField("Packsize", LongType(), True)   # the values produced above are integers
])

schema2 = StructType([
    StructField("Id2", StringType(), True),
    StructField("Packsize", LongType(), True)
])

df1 = sqlC.createDataFrame(rdd1,schema)
df2 = sqlC.createDataFrame(rdd2,schema2)

print(df1.rdd.getNumPartitions())
print(df2.rdd.getNumPartitions())
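
# Sketch of a possible next step (not in the original snippet): join the two frames
# on their key columns and check how many keys overlap.
joined = df1.join(df2, df1.Id == df2.Id2, "inner")
print(joined.count())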
Example #4
from pyspark import SparkContext
import os
os.environ['SPARK_HOME']='F:/hadoop/spark-2.3.2-bin-hadoop2.7'
os.environ['PYSPARK_PYTHON']='D:/ProgramData/Anaconda3/envs/tfColne/python.exe'


if __name__ == '__main__':
    sc = SparkContext('local', 'test')
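    # Key 0..99 by residue mod 5 and sum each group; build a second keyed RDD,
    # reduce it into value pairs, sort it by key in descending order, then join
    # the two keyed RDDs.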
    test_data = sc.range(0, 100)
    td = test_data.map(lambda d: (d % 5, d))
    ts = td.reduceByKey(lambda a,b: a + b)
    dt = sc.range(10)
    dm = dt.map(lambda d: (d % 5, d))
    dg = dm.reduceByKey(lambda a, b: (a, b))
    ds = dg.sortByKey(ascending=False)
    print(ts.collect())
    print(ds.collect())
    print(ts.join(ds).collect())

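    # Sum the elements of a small RDD via an accumulator updated in foreach on the
    # executors (the name "broad" is kept from the original, although this is a
    # plain RDD rather than a broadcast variable).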
    broad = sc.parallelize([1, 8, 2, 4])
    acc = sc.accumulator(0)
    broad.foreach(lambda x: acc.add(x))
    print(acc.value)
Example #5
from pyspark.sql import functions as F

# changedTypedf and df_header are assumed to be DataFrames created earlier (not shown).
df = changedTypedf.withColumn(
    "Literal", F.lit(0)
)

df.withColumn("meta", df.setosa_dbl * 2).show()
df.withColumn("meta", df['setosa_dbl'] * 2).show()

# Spark DataFrames are immutable: columns are added with withColumn rather than item
# assignment, and column aggregates go through agg().
changedTypedf = changedTypedf.withColumn("mittens", changedTypedf['setosa_dbl'] * 10)
changedTypedf.agg(F.sum('setosa_dbl')).show()

df_header.take(2)
df_header.head(4)

# Displays the content of the DataFrame to stdout
df.show()
result_pdf = df.select("*").toPandas()
result_pdf.dtypes

df.write.parquet("output/proto.parquet")

# Writing a Delta table needs a DataFrame, so use the SparkSession's range() here
# (SparkContext.range() returns an RDD, which has no .write); "spark" is assumed to
# be an existing SparkSession.
data = spark.range(0, 5)
data.write.format("delta").save("/tmp/delta-table")
Example #6
# upper_entries and lower_entries (RDDs of MatrixEntry over the affinity graph) are
# assumed to have been built above this excerpt.
degrees = upper_entries.map(lambda entry: (entry.i, entry.value)).reduceByKey(
    lambda a, b: a + b)
W = CoordinateMatrix(upper_entries.union(lower_entries), numCols=N, numRows=N)

# XXX:
laplacian = sys.argv[1]

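# Build the chosen graph Laplacian from the degree and affinity matrices:
#   unnormalized:  L = D - W
#   normalized:    L = I - D^{-1} W          (random-walk normalization)
#   symmetric:     L = I - D^{-1/2} W D^{-1/2}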
if laplacian == 'unnormalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], x[1]))
    D = CoordinateMatrix(entries, numCols=N, numRows=N)
    L = D.toBlockMatrix().subtract(W.toBlockMatrix()).toCoordinateMatrix()
elif laplacian == 'normalized':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / x[1]))
    D_inv = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    L = I.subtract(D_inv.multiply(W.toBlockMatrix())).toCoordinateMatrix()
elif laplacian == 'symmetric':
    entries = degrees.map(lambda x: MatrixEntry(x[0], x[0], 1 / sqrt(x[1])))
    D_invsq = CoordinateMatrix(entries, numCols=N, numRows=N).toBlockMatrix()
    I = CoordinateMatrix(sc.range(N).map(lambda i: MatrixEntry(i, i, 1.0)),
                         numCols=N,
                         numRows=N).toBlockMatrix()
    tmp = D_invsq.multiply(W.toBlockMatrix()).multiply(D_invsq)
    L = I.subtract(tmp).toCoordinateMatrix()
else:
    raise ValueError('Unknown type of Laplacian.')

## SVD, and transform from dense matrix to dataframe.
svd = L.toRowMatrix().computeSVD(k=K, computeU=False)
V = svd.V.toArray().tolist()
Example #7
# The executor-count detection logic is not shown in this excerpt; fall back to a
# default value here. (sc and pp are assumed to be defined in the omitted part of
# this script.)
nexecutors = 4
print('Cannot determine number of executors, using default value {}'.format(nexecutors))

print('\n')


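# Importing inside the mapped function ensures that "mod" is imported in the executor
# processes where the map actually runs, not only on the driver.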
def prox_method(x):
    from mod import spark_method
    return spark_method(x)


from time import time

start = time()

results = sc.range(nexecutors, numSlices=nexecutors).map(prox_method).collect()
print('Results:')
pp.pprint(results)
print('Result length: {}'.format(len(results)))
print('Duration: {:.2f} s'.format(time() - start))
#print('\nUnique results:')
#print(set(results))

#nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 20])
#print(nums.collect())

#sumAndCount = nums.map(lambda x: (x, 1)).fold((0, 0), (lambda x, y: (x[0] + y[0], x[1] + y[1])))

#print(sumAndCount)
Example #8
from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs
from math import hypot
import sys

sc = SparkContext()

# Project Euler Problem 1

print (sc.range(1000).filter(lambda candidate: candidate%3==0 or candidate%5==0).sum())
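# (The well-known answer to Project Euler Problem 1, the sum of multiples of 3 or 5
# below 1000, is 233168.)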

# Approximating Pi using Monte Carlo integration

radius = 1
def dist(p):
    return hypot(p[0], p[1])

num_samples = int(sys.argv[1])
unit_square = RandomRDDs.uniformVectorRDD(sc, num_samples, 2)
hit = unit_square.map(dist).filter(lambda d: d < radius).count()
fraction = hit / num_samples

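# The samples are uniform on the unit square, so the hit fraction approximates the
# quarter-circle area pi/4; multiplying by (2*radius)**2 = 4 gives an estimate of pi.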
print (fraction * (2*radius)**2)
Example #9
#!/usr/bin/env python3
#-*- coding: utf-8 -*-

import sys

from operator import add
from random import random

from pyspark import SparkContext


def f(_):
    x = random() * 2 - 1
    y = random() * 2 - 1
    return 1 if x**2 + y**2 < 1 else 0


if __name__ == "__main__":
    """
        Usage: pi [partitions]
    """
    sc = SparkContext(appName="PySpark_Pi")
    partitions = int(sys.argv[1]) if len(sys.argv) > 1 else 2
    n = 100000 * partitions

    count = sc.range(1, n + 1, numSlices=partitions).map(f).reduce(add)
    print("Pi is roughly ", 4.0 * count / n)

    sc.stop()
import math as m
import os
import socket

from pyspark import SparkConf, SparkContext

# The plastic number, $\rho$, is the unique real solution to
# the cubic equation $x^3 = x + 1$.
# We use it for generating a quasi-random sequence in 2D for
# the initial set of complex numbers later.
rho = ((9 + m.sqrt(69)) / 18)**(1 / 3) + ((9 - m.sqrt(69)) / 18)**(1 / 3)
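
# Quick sanity check (added sketch, not in the original): rho should satisfy the
# defining cubic x^3 = x + 1 up to floating-point error.
assert abs(rho**3 - (rho + 1)) < 1e-9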

# Obtain the Spark Context
sc = SparkContext(conf=SparkConf())
print(sc)

# Print configurations.
print(sc.getConf().getAll())

# Create an RDD with 64 elements.
n_rdd = sc.range(1, 2**6 + 1)

# Print how the RDD gets mapped.
print(
    n_rdd.map(lambda x: (socket.gethostname(), os.getppid(), os.getpid())).
    distinct().collect())

# Mark for cache() in memory
points_rdd = n_rdd.map(lambda n: (n / rho % 1) / 10 +
                       (n / (rho * rho) % 1) / 10 * 1j).cache()

# Print the total number of partitions
print(points_rdd.getNumPartitions())

# Print the number of elements in each partition
print(points_rdd.glom().map(len).collect())
Example #11
"""
Purpose: 
Date created: 2020-04-19

Contributor(s):
    Mark M.
"""

from __future__ import print_function

from pyspark import SparkContext

# from pyspark.sql import SparkSession
# spark = SparkSession.builder.appName("test1").getOrCreate()

sc = SparkContext(appName="matrices1")

rdd = sc.parallelize([
    1,
    2,
])
# Cartesian product of the RDD with itself.
print(sorted(rdd.cartesian(rdd).collect()))

n = 10
rng = sc.range(1, n + 1)
sum_ = rng.sum()
print(f"The sum of the numbers from 1 to 10 is: {sum_}")

sc.stop()