def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context globs = pyspark.sql.context.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = sqlCtx = SQLContext(sc) globs['rdd'] = rdd = sc.parallelize([ Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3") ]) globs['df'] = sqlCtx.createDataFrame(rdd) jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod(pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context globs = pyspark.sql.context.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['rdd'] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) globs['df'] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.group globs = pyspark.sql.group.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df3'] = sc.parallelize([ Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85) ]).toDF() globs['df4'] = sc.parallelize([ Row(course="dotNET", year=2012, earnings=10000), Row(course="Java", year=2012, earnings=20000), Row(course="dotNET", year=2012, earnings=5000), Row(course="dotNET", year=2013, earnings=48000), Row(course="Java", year=2013, earnings=30000) ]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.group, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import os import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.context.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["sc"] = sc globs["sqlContext"] = SQLContext(sc) globs["rdd"] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) globs["df"] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}', ] globs["jsonStrings"] = jsonStrings globs["json"] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE ) globs["sc"].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.group globs = pyspark.sql.group.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df3'] = sc.parallelize([Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85)]).toDF() globs['df4'] = sc.parallelize([Row(course="dotNET", year=2012, earnings=10000), Row(course="Java", year=2012, earnings=20000), Row(course="dotNET", year=2012, earnings=5000), Row(course="dotNET", year=2013, earnings=48000), Row(course="Java", year=2013, earnings=30000)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.group, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from array import array from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) jsonStrings = ['{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field2": "row2", "field3":{"field4":22}}', '{"field1" : 3, "field2": "row3", "field3":{"field4":33}}'] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([ {"f1" : array('i', [1, 2]), "f2" : {"row1" : 1.0}}, {"f1" : array('i', [2, 3]), "f2" : {"row2" : 2.0}}]) globs['nestedRdd2'] = sc.parallelize([ {"f1" : [[1, 2], [2, 3]], "f2" : set([1, 2]), "f3" : (1, 2)}, {"f1" : [[2, 3], [3, 4]], "f2" : set([2, 3]), "f3" : (2, 3)}]) (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import os import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.context.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['rdd'] = rdd = sc.parallelize([ Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3") ]) globs['df'] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['df'] = sc.parallelize( [Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df2'] = sc.parallelize( [Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() globs['df3'] = sc.parallelize([ Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85) ]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() globs['df3'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df4'] = sc.parallelize([Row(name='Alice', age=10, height=80), Row(name='Bob', age=5, height=None), Row(name='Tom', age=None, height=None), Row(name=None, age=None, height=None)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')])\ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) globs['df2'] = sc.parallelize( [Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() globs['df3'] = sc.parallelize([ Row(name='Alice', age=2, height=80), Row(name='Bob', age=5, height=85) ]).toDF() globs['df4'] = sc.parallelize([ Row(name='Alice', age=10, height=80), Row(name='Bob', age=5, height=None), Row(name='Tom', age=None, height=None), Row(name=None, age=None, height=None) ]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["sc"] = sc globs["sqlContext"] = SQLContext(sc) globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF( StructType([StructField("age", IntegerType()), StructField("name", StringType())]) ) globs["df2"] = sc.parallelize([Row(name="Tom", height=80), Row(name="Bob", height=85)]).toDF() globs["df4"] = sc.parallelize( [ Row(name="Alice", age=10, height=80), Row(name="Bob", age=5, height=None), Row(name="Tom", age=None, height=None), Row(name=None, age=None, height=None), ] ).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, ) globs["sc"].stop() if failure_count: exit(-1)
def _test(): import doctest from array import array from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext("local[4]", "PythonTest", batchSize=2) globs["sc"] = sc globs["sqlCtx"] = SQLContext(sc) globs["rdd"] = sc.parallelize( [{"field1": 1, "field2": "row1"}, {"field1": 2, "field2": "row2"}, {"field1": 3, "field2": "row3"}] ) globs["nestedRdd1"] = sc.parallelize( [{"f1": array("i", [1, 2]), "f2": {"row1": 1.0}}, {"f1": array("i", [2, 3]), "f2": {"row2": 2.0}}] ) globs["nestedRdd2"] = sc.parallelize( [ {"f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2)}, {"f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3)}, ] ) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs["sc"].stop() if failure_count: exit(-1)
class PyEdgeRDDTestCase(unittest.TestCase): """ Test collect, take, count, mapValues, filter and innerJoin for EdgeRDD """ def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") def tearDown(self): self.sc.stop() # TODO def collect(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) # TODO def take(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) # TODO def count(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) # TODO def mapValues(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) # TODO def filter(self): return # TODO def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2)
def _test(): import doctest from array import array from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['rdd'] = sc.parallelize([{ "field1": 1, "field2": "row1" }, { "field1": 2, "field2": "row2" }, { "field1": 3, "field2": "row3" }]) jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]}, "field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings globs['json'] = sc.parallelize(jsonStrings) globs['nestedRdd1'] = sc.parallelize([{ "f1": array('i', [1, 2]), "f2": { "row1": 1.0 } }, { "f1": array('i', [2, 3]), "f2": { "row2": 2.0 } }]) globs['nestedRdd2'] = sc.parallelize([{ "f1": [[1, 2], [2, 3]], "f2": set([1, 2]), "f3": (1, 2) }, { "f1": [[2, 3], [3, 4]], "f2": set([2, 3]), "f3": (2, 3) }]) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
def execute(self): print("execute ", self.__class__) from pyspark.context import SparkContext from pyspark.sql import SparkSession sc = SparkContext(appName='test PySparkTask') b = sc.broadcast([1, 2, 3, 4, 5]) sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() spark = SparkSession.builder \ .master("local") \ .appName("Word Count") \ .getOrCreate() data = [('Alice', 1), ('Monica', 2)] spark.createDataFrame(data).collect() spark.createDataFrame(data, ['name', 'age']).collect()
def _test(): import os import doctest from pyspark.context import SparkContext from pyspark.sql import Row import pyspark.sql.session os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.session.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['spark'] = SparkSession(sc) globs['rdd'] = rdd = sc.parallelize([ Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3") ]) globs['df'] = rdd.toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.session, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: sys.exit(-1)
def spark_run(): conf = SparkConf()\ .setAppName("Tissue detector")\ .setMaster("spark://spark-master:7077") #conf.set('spark.scheduler.mode', 'FAIR') sc = SparkContext(conf=conf) spark = SparkSession(sc) srcdir = '/data/promort/rois.test' slidedir='/data/promort/prom2/slides' tissuedir='/data/promort/tissue.test' # must have normal and tumor subdirs suf = 'tissue' dlist = os.scandir(srcdir) basenames = [e.name for e in dlist if e.is_dir()] # build job list job_list = [] for basename in basenames: slide = os.path.join(slidedir, basename + '.mrxs') job_list.append((slide, tissuedir, basename, suf)) procs = sc.defaultParallelism rdd = sc.parallelize(job_list, numSlices=procs) # run tissue detector for each slide rdd.foreachPartition(tissue_kernel())
def spark_run(): conf = SparkConf()\ .setAppName("Mask generator")\ .setMaster("spark://spark-master:7077") #conf.set('spark.scheduler.mode', 'FAIR') sc = SparkContext(conf=conf) spark = SparkSession(sc) coredir = '/data/o/svs_review/cores' frdir = '/data/o/svs_review/focus_regions' slidedir='/data/o/slides' maskdir='/data/o/masks' # must have normal and tumor subdirs work = [['NORMAL', 'cores.csv', coredir], ['TUMOR', 'focus_regions.csv', frdir]] suf = 'mask' # build job list job_list = [] dlist = os.scandir(coredir) basenames = [e.name for e in dlist if e.is_dir()] for basename in basenames: for lab, csvfile, srcdir in work: slide = os.path.join(slidedir, basename + '.mrxs') csvroot = os.path.join(srcdir, basename) job_list.append((slide, csvroot, maskdir, basename, lab, suf, csvfile)) # run jobs procs = sc.defaultParallelism rdd = sc.parallelize(job_list, numSlices=procs) rdd.foreach(mask_kernel)
def run():
    try:
        from private_data import cass_pass
    except ImportError:
        cass_pass = getpass('Insert Cassandra password: ')
    # The SparkConf setup (app name and master URL) was masked out in the source;
    # a plain SparkConf is used here as a placeholder.
    conf = SparkConf()
    conf.set('spark.scheduler.mode', 'FAIR')
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)
    parts_0 = 48  # get list of patches
    parts_1 = 24  # extract patches
    parts_2 = 24  # write to cassandra
    samples = next(os.walk(os.path.join(masks_root, 'normal')))[2]
    samples = [s.split('_')[0] for s in samples]
    par_samples = sc.parallelize(samples, numSlices=parts_0)
    cols = ['sample_name', 'sample_rep', 'x', 'y', 'label', 'data', 'patch_id']
    data = par_samples\
        .map(get_job_list)\
        .flatMapValues(lambda x: x)\
        .repartition(parts_1)\
        .flatMap(get_tiles)
    # save to Cassandra tables
    data.coalesce(parts_2)\
        .foreachPartition(write_to_cassandra(cass_pass))
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.readwriter globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},' '"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", ' '"field3":{"field4":33, "field5": []}}' ] globs['jsonStrings'] = jsonStrings (failure_count, test_count) = doctest.testmod( pyspark.sql.readwriter, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['df'] = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]).toDF() globs['df2'] = sc.parallelize([Row(name='Tom', height=80), Row(name='Bob', height=85)]).toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE) globs['sc'].stop() if failure_count: exit(-1)
def main(): ''' some standard spark functions examples ''' # Starting a spark session sc = SparkContext() sc.setLogLevel("OFF") spark = SparkSession.builder.master("local").getOrCreate() # Creating a dataframe from data l = [(1, 'a', 'b', 'c', 'd'), (1, 'a', 'b', 'c', 'd')] df0 = spark.createDataFrame(l, ['col1', 'col2', 'col3', 'col4', 'col5']) # Creating a dataframe using rdd l = [(2, 'f', 'g'), (2, 'f', 'g')] rdd = sc.parallelize(l) schema = StructType([ StructField("col6", IntegerType(), True), StructField("col7", StringType(), True), StructField("col8", StringType(), True) ]) df1 = spark.createDataFrame(rdd, schema) # Joining both df0 and df1 indexedDf0 = add_column_index(df0) indexedDf1 = add_column_index(df1) df2 = indexedDf0.join(indexedDf1, indexedDf1.idx == indexedDf0.idx, 'inner').drop("idx") df2.write.csv("/tmp/file.csv", mode='overwrite', header=True, nullValue='NA', quoteAll=False) # Read a CSV file into a dataframe (multiLine=True to avoid splitting data with '\n' df = spark.read.csv("/tmp/file.csv", header=True, quote='"', escape='"', multiLine=True) # Print the Schema df.printSchema() # Count the number of rows print('Number of rows: {}'.format(df.count())) # Show columns print('Columns: {}'.format(df.columns)) # Display the data df.show() # Count Total Number of records in csv files in a given path path = '/tmp' count_total(spark, path)
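# main() above calls add_column_index(), which is not defined in this snippet.
# A minimal sketch of such a helper (an assumption, not the original implementation):
# it tags every row with its position via zipWithIndex so that df0 and df1 can be
# joined positionally on the "idx" column.
from pyspark.sql.types import StructType, StructField, LongType

def add_column_index(df):
    # Append an "idx" column holding each row's position within the DataFrame
    schema_with_idx = StructType(df.schema.fields + [StructField("idx", LongType(), False)])
    return df.rdd.zipWithIndex().map(
        lambda row_index: tuple(row_index[0]) + (row_index[1],)
    ).toDF(schema_with_idx)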
def generate():
    ss = SparkSession.builder.appName("generator").getOrCreate()
    df = ss.read.json("../ds/person.json")
    resultsDf = df.select("results")
    # Reuse the session's SparkContext: constructing a second SparkContext here would
    # fail, and the original no-argument parallelize() call was invalid.
    sc = ss.sparkContext
    resultsDf.printSchema()
    nestedArray = resultsDf.select(fn.explode('results'))
    nestedArray.select("col.*").show()
    profiles = nestedArray.select("col.id", "col.name")\
        .rdd \
        .map(lambda e: {"id": e.id, "name": e.name,
                        "birth_year": random.randint(1960, 2000),
                        "hometown": "NA"})\
        .collect()
    with open("../ds/profile.json", "w") as f:
        json.dump(profiles, f)
def main(): conf = SparkConf().setAppName('Home run count').setMaster('local') sc = SparkContext(conf=conf) spark = SparkSession.builder.appName("pyspark sql").getOrCreate() batting_path = "hdfs://localhost:8020/user/baseball/Batting.csv" batting_path="hdfs://localhost:9000/data/Batting.csv" data = spark.read.csv(batting_path, inferSchema=True, header=True) df = data.filter(data.yearID == '2018').select('playerID', 'teamID') # filter players that play on two or more teams players_vertices = df.groupBy("playerID").count().filter("count > 1").select("playerID") edges = df.withColumnRenamed("playerID", "src") edges = edges.withColumnRenamed("teamID", "dst") edges=players_vertices.join(edges, players_vertices.playerID == edges.src, "inner").select("src","dst") players_vertices=players_vertices.withColumnRenamed("playerID","id") teams_vertices = edges.select("dst").distinct().withColumnRenamed("dst","id") vertices = players_vertices.union(teams_vertices) # add one column with auto increasing id vertices = vertices.withColumn('num', monotonically_increasing_id()) graph=GraphFrame(vertices,edges) # motif 1 motif = graph.find("(a)-[]->(b); (a)-[]->(c)").filter("c.num > b.num") calculate(motif) # motif 2 motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c)").filter("c.num > b.num and d.num > a.num") calculate(motif) # motif 3 motif = graph.find("(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c)").filter( "c.num > b.num and d.num > a.num and e.num > d.num").distinct() calculate(motif) # motif 4 motif = graph.find( "(a)-[]->(b); (a)-[]->(c);(d)-[]->(b); (d)-[]->(c);(e)-[]->(b); (e)-[]->(c);(f)-[]->(b);(f)-[]->(c)").filter( "c.num > b.num and d.num > a.num and e.num > d.num and f.num > e.num").distinct() calculate(motif) output_path = "/user/Wang/graphframe" # format the output final_result=[] for key in result.keys(): line="" key_split=key.split("_") for i in range(len(key_split)): line += " " + key_split[i] for team in result[key]: line += " " + team final_result.append(line) data = sc.parallelize(final_result) data.saveAsTextFile(output_path)
def main(): parser = argparse.ArgumentParser(description="Find Dependency inclusions") parser.add_argument('--path', type=str) parser.add_argument('--cores', type=str) args = parser.parse_args() sc = SparkContext(appName="DDM") sc.getConf().set("spark.executor.cores", args.cores) sc.getConf().set("spark.driver.cores", args.cores) sc.getConf().set("spark.worker.cores", args.cores) sc.getConf().set("spark.deploy.defaultCores", args.cores) sc.getConf().set("spark.driver.memory", "15g") global number_of_columns data = [] file_headers = [] for file in os.listdir(args.path): if file.endswith(".csv"): rdd = sc.textFile(os.path.join(args.path, file)).map(lambda line: line[1:-1].split("\";\"")) file_data = rdd.collect() file_header = file_data[0] del file_data[0] file_data = [(number_of_columns, x) for x in file_data] data += file_data file_headers += file_header number_of_columns = number_of_columns + len(file_header) header_dummies = list(range(0, number_of_columns)) rdd = sc.parallelize(data) values_as_key = rdd.flatMap(lambda el: list(zip(el[1], range(el[0], el[0] + len(el[1]))))) unique_values = values_as_key.map(lambda x: (x[0], x[1])).groupByKey().mapValues(set) unique_values = unique_values.map(lambda x: (tuple(x[1]), 0)).reduceByKey(sum_func) matrix_per_key = unique_values.map(lambda x: make_candidate_matrix(x[0])) result_matrix = matrix_per_key.reduce(lambda x, y: matrix_and(x, y)) assert len(result_matrix) == number_of_columns output = [] for i in range(0, number_of_columns): assert len(result_matrix[i]) == number_of_columns output.append([]) for i in range(0, len(result_matrix)): for j in range(0, len(result_matrix[i])): if i != j and result_matrix[i][j]: output[j].append(file_headers[i]) for i in range(0, len(output)): row = output[i] if len(row) != 0: output_string = str(row[0]) for j in range(1, len(row)): output_string += (", " + str(row[j])) print(str(file_headers[i]) + " < " + output_string) sc.stop()
def rdd_basic():
    conf = SparkConf().setAppName("appName").setMaster("local")
    sc = SparkContext(conf=conf)
    # Create an RDD for the above array
    rdd = sc.parallelize([
        "10", "21", "90", "34", "40", "98", "21", "44", "59", "21", "90",
        "34", "29", "19", "21", "34", "29", "49", "78"
    ])
    # Display the array
    print("The RDD elements are:", rdd.collect())
    # Display the first element of the array
    print("The first element of the RDD:", rdd.first())
    # Display the sorted output (ascending and descending) through an RDD.
    # sortByKey only applies to key-value RDDs; for a plain RDD of strings use sortBy.
    sort_asc_rdd = rdd.sortBy(lambda x: x)
    sort_desc_rdd = rdd.sortBy(lambda x: x, ascending=False)
    print("The ascending order of the RDD:", sort_asc_rdd.collect())
    print("The descending order of the RDD:", sort_desc_rdd.collect())
    # Display the distinct elements of the array using an RDD
    distinct_rdd = rdd.distinct()
    print("The distinct RDD list:", distinct_rdd.collect())
    # Display distinct elements without keeping a new RDD variable
    print("The distinct elements are:", rdd.distinct().collect())
    # Display maximum and minimum of the given array using the RDD
    print("The maximum value of the RDD element is:", rdd.max())
    print("The minimum value of the RDD element is:", rdd.min())
    # Display the top 5 list elements using the RDD
    print("The top 5 elements of the RDD are:", rdd.top(5))
    # Combine the above array with a new array {30, 35, 45, 60, 75, 85} and display the output
    new_rdd = sc.parallelize(["30", "35", "45", "60", "75", "85"])
    print("After joining the new elements:", rdd.union(new_rdd).collect())
def initialize(): global items, inputfile, sc, filterThreshold, t, totalEdges, cost_dict, strict_totalNodes, adjacency_listMain t = time.time() sc_conf = SparkConf() sc_conf.setAppName("Task1") sc_conf.setMaster('local[*]') # sc_conf.set("spark.driver.bindAddress", "127.0.0.1") sc = SparkContext(conf=sc_conf) sc.setLogLevel("ERROR") csvread = sc.textFile(inputfile) columnName = csvread.first().split(',') # print(columnName) items = csvread.map(lambda line: line.split(",")).filter( lambda line: (line) != columnName) # Getting user and their business count user_business = items.groupByKey().mapValues(set).collect() tuple_edge_list = [] for i in range(0, len(user_business) - 1): for j in range(i + 1, len(user_business)): inter = user_business[i][1] & user_business[j][1] if len(inter) >= filterThreshold: tuple_edge_list.append( (str(user_business[i][0]), str(user_business[j][0]))) tuple_edge_list.append( (str(user_business[j][0]), str(user_business[i][0]))) totalEdges = float(len(tuple_edge_list) / 2) adjacency_list = sc.parallelize(tuple_edge_list).groupByKey().mapValues( list).collectAsMap() adjacency_listMain = copy.deepcopy(adjacency_list) totalNodes = list(adjacency_list.keys()) # ------------------------Newly added line------------------------ strict_totalNodes = copy.deepcopy(totalNodes) # print(len(totalNodes)) # ----------------------Part 1--------------------- bfs(totalNodes, adjacency_list) print("Writing Betweenness to File....") # Converting into sorted List Initial Betweenness list_val = list(cost_dict.items()) list_val.sort(key=lambda x: (-x[1], x[0])) writeToFile(list_val) totalNodes = copy.deepcopy(strict_totalNodes) # print(len(totalNodes)) # ----------------------Part 2---------------------- print("Creating Partitions....") create_components(list_val, adjacency_listMain, totalNodes, totalEdges) # ---------------------EoC--------------------------- print("Duration: " + str(time.time() - t))
def insert(self, in_local_path, out_hdfs_path, num_partitions=10): # FOR TESTING input = np.random.rand(101, 3) input[0] = np.array([100, 200, 300]) #input = np.loadtxt(in_local_path, delimiter=',', dtype=str) header = input[0] data = input[1:] sc = SparkContext(conf=SparkConf().setAppName("data_setup")) # Split Data into Partitions #header_rdd = sc.parallelize(np.array([header for i in range(0,num_partitions)]), num_partitions).cache() header_rdd = sc.parallelize(header, 1).cache() data_rdd = sc.parallelize(data, num_partitions).cache() # Save Data Partitions header_rdd.map(self.to_csv).saveAsTextFile(out_hdfs_path + "/csv/header") data_rdd.map(self.to_csv).saveAsTextFile(out_hdfs_path + "/csv/data")
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.dataframe globs = pyspark.sql.dataframe.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlCtx'] = sqlCtx = SQLContext(sc) rdd2 = sc.parallelize([Row(name='Alice', age=2), Row(name='Bob', age=5)]) rdd3 = sc.parallelize( [Row(name='Tom', height=80), Row(name='Bob', height=85)]) globs['df'] = sqlCtx.inferSchema(rdd2) globs['df2'] = sqlCtx.inferSchema(rdd3) (failure_count, test_count) = doctest.testmod(pyspark.sql.dataframe, globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
class PySparkTestCase(unittest.TestCase):
    def setUp(self):
        class_name = self.__class__.__name__
        self.sc = SparkContext('local', class_name)

    def tearDown(self):
        self.sc.stop()

    def test_should_be_able_to_word_count(self):
        rdd = self.sc.parallelize(["This is a text", "Another text", "More text", "a text"])
        result = python_word_count.wordcount(rdd)
        expected = [('a', 2), ('This', 1), ('text', 4), ('is', 1), ('Another', 1), ('More', 1)]
        self.assertEqual(expected, result.collect())
def _test() -> None: import os import doctest import tempfile from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext import pyspark.sql.context os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.context.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["tempfile"] = tempfile globs["os"] = os globs["sc"] = sc globs["sqlContext"] = SQLContext(sc) globs["rdd"] = rdd = sc.parallelize([ Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3"), ]) globs["df"] = rdd.toDF() jsonStrings = [ '{"field1": 1, "field2": "row1", "field3":{"field4":11}}', '{"field1" : 2, "field3":{"field4":22, "field5": [10, 11]},"field6":[{"field7": "row2"}]}', '{"field1" : null, "field2": "row3", "field3":{"field4":33, "field5": []}}', ] globs["jsonStrings"] = jsonStrings globs["json"] = sc.parallelize(jsonStrings) (failure_count, test_count) = doctest.testmod( pyspark.sql.context, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE, ) globs["sc"].stop() if failure_count: sys.exit(-1)
def _test(): import doctest from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['rdd'] = sc.parallelize([{"field1" : 1, "field2" : "row1"}, {"field1" : 2, "field2": "row2"}, {"field1" : 3, "field2": "row3"}]) (failure_count, test_count) = doctest.testmod(globs=globs,optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
class PyVertexRDDTestCase(unittest.TestCase): """ Test collect, take, count, mapValues, diff, filter, mapVertexPartitions, innerJoin and leftJoin for VertexRDD """ def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") def tearDown(self): self.sc.stop() def collect(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.take(1) self.assertEqual(results, [(3, ("rxin", "student"))]) def take(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) def count(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.count() self.assertEqual(results, 2) def mapValues(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.mapValues(lambda x: x + ":" + x) self.assertEqual(results, [(3, ("rxin:rxin", "student:student")), (7, ("jgonzal:jgonzal", "postdoc:postdoc"))]) def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.innerJoin(vertices1).collect() self.assertEqual(results, []) def leftJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2)
def run_sim(data, function, n):
    sc = SparkContext()
    input_data = sc.parallelize([data() for _ in range(n)])
    transformed_data = input_data.map(lambda x: function(x))
    kvp_data = transformed_data.map(lambda x: (x, 1))
    counts = kvp_data.reduceByKey(lambda a, b: a + b)
    counts = counts.collectAsMap()
    # Either key may be absent if every trial had the same outcome; default to 0
    # instead of using a bare except that would hide unrelated errors.
    num_true = counts.get(True, 0)
    num_false = counts.get(False, 0)
    return num_true / n
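# A possible way to exercise run_sim() above (illustrative only): estimate the
# probability that two dice sum to seven. two_dice and is_seven are made-up helpers,
# not part of the original code.
import random

def two_dice():
    return random.randint(1, 6) + random.randint(1, 6)

def is_seven(total):
    return total == 7

# run_sim builds its own SparkContext; for a large n the estimate approaches 1/6.
print(run_sim(two_dice, is_seven, 100000))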
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import SQLContext import pyspark.sql.column globs = pyspark.sql.column.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') globs['sc'] = sc globs['sqlContext'] = SQLContext(sc) globs['df'] = sc.parallelize([(2, 'Alice'), (5, 'Bob')]) \ .toDF(StructType([StructField('age', IntegerType()), StructField('name', StringType())])) (failure_count, test_count) = doctest.testmod( pyspark.sql.column, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF) globs['sc'].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext from pyspark.sql import SQLContext import pyspark.sql.column globs = pyspark.sql.column.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["sc"] = sc globs["sqlContext"] = SQLContext(sc) globs["df"] = sc.parallelize([(2, "Alice"), (5, "Bob")]).toDF( StructType([StructField("age", IntegerType()), StructField("name", StringType())]) ) (failure_count, test_count) = doctest.testmod( pyspark.sql.column, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF, ) globs["sc"].stop() if failure_count: exit(-1)
def _test(): import os import doctest from pyspark.context import SparkContext from pyspark.sql import Row import pyspark.sql.session os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.session.__dict__.copy() sc = SparkContext("local[4]", "PythonTest") globs["sc"] = sc globs["spark"] = SparkSession(sc) globs["rdd"] = rdd = sc.parallelize( [Row(field1=1, field2="row1"), Row(field1=2, field2="row2"), Row(field1=3, field2="row3")] ) globs["df"] = rdd.toDF() (failure_count, test_count) = doctest.testmod( pyspark.sql.session, globs=globs, optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE ) globs["sc"].stop() if failure_count: exit(-1)
def _test(): import doctest from pyspark.context import SparkContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext('local[4]', 'PythonTest', batchSize=2) globs['sc'] = sc globs['sqlCtx'] = SQLContext(sc) globs['rdd'] = sc.parallelize([{ "field1": 1, "field2": "row1" }, { "field1": 2, "field2": "row2" }, { "field1": 3, "field2": "row3" }]) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) globs['sc'].stop() if failure_count: exit(-1)
def _create_model(self, java_model): return Word2VecModel(java_model) class Word2VecModel(JavaModel): """ Model fitted by Word2Vec. """ if __name__ == "__main__": import doctest from pyspark.context import SparkContext from pyspark.sql import Row, SQLContext globs = globals().copy() # The small batch size here ensures that we see multiple batches, # even in these small test examples: sc = SparkContext("local[2]", "ml.feature tests") sqlContext = SQLContext(sc) globs['sc'] = sc globs['sqlContext'] = sqlContext testData = sc.parallelize([Row(id=0, label="a"), Row(id=1, label="b"), Row(id=2, label="c"), Row(id=3, label="a"), Row(id=4, label="a"), Row(id=5, label="c")], 2) globs['stringIndDf'] = sqlContext.createDataFrame(testData) (failure_count, test_count) = doctest.testmod(globs=globs, optionflags=doctest.ELLIPSIS) sc.stop() if failure_count: exit(-1)
class TestStreamingContextSuite(unittest.TestCase): """ Should we have conf property in SparkContext? @property def conf(self): return self._conf """ def setUp(self): self.master = "local[2]" self.appName = self.__class__.__name__ self.batachDuration = Milliseconds(500) self.sparkHome = "SomeDir" self.envPair = {"key": "value"} self.ssc = None self.sc = None def tearDown(self): # Do not call pyspark.streaming.context.StreamingContext.stop directly because # we do not wait to shutdown py4j client. # We need change this simply calll streamingConxt.Stop #self.ssc._jssc.stop() if self.ssc is not None: self.ssc.stop() if self.sc is not None: self.sc.stop() # Why does it long time to terminate StremaingContext and SparkContext? # Should we change the sleep time if this depends on machine spec? time.sleep(1) @classmethod def tearDownClass(cls): # Make sure tp shutdown the callback server SparkContext._gateway._shutdown_callback_server() def test_from_no_conf_constructor(self): self.ssc = StreamingContext(master=self.master, appName=self.appName, duration=self.batachDuration) # Alternative call master: ssc.sparkContext.master # I try to make code close to Scala. self.assertEqual(self.ssc.sparkContext._conf.get("spark.master"), self.master) self.assertEqual(self.ssc.sparkContext._conf.get("spark.app.name"), self.appName) def test_from_no_conf_plus_spark_home(self): self.ssc = StreamingContext(master=self.master, appName=self.appName, sparkHome=self.sparkHome, duration=self.batachDuration) self.assertEqual(self.ssc.sparkContext._conf.get("spark.home"), self.sparkHome) def test_from_no_conf_plus_spark_home_plus_env(self): self.ssc = StreamingContext(master=self.master, appName=self.appName, sparkHome=self.sparkHome, environment=self.envPair, duration=self.batachDuration) self.assertEqual(self.ssc.sparkContext._conf.get("spark.executorEnv.key"), self.envPair["key"]) def test_from_existing_spark_context(self): self.sc = SparkContext(master=self.master, appName=self.appName) self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration) def test_existing_spark_context_with_settings(self): conf = SparkConf() conf.set("spark.cleaner.ttl", "10") self.sc = SparkContext(master=self.master, appName=self.appName, conf=conf) self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration) self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10) def test_from_conf_with_settings(self): conf = SparkConf() conf.set("spark.cleaner.ttl", "10") conf.setMaster(self.master) conf.setAppName(self.appName) self.ssc = StreamingContext(conf=conf, duration=self.batachDuration) self.assertEqual(int(self.ssc.sparkContext._conf.get("spark.cleaner.ttl")), 10) def test_stop_only_streaming_context(self): self.sc = SparkContext(master=self.master, appName=self.appName) self.ssc = StreamingContext(sparkContext=self.sc, duration=self.batachDuration) self._addInputStream(self.ssc) self.ssc.start() self.ssc.stop(False) self.assertEqual(len(self.sc.parallelize(range(5), 5).glom().collect()), 5) def test_stop_multiple_times(self): self.ssc = StreamingContext(master=self.master, appName=self.appName, duration=self.batachDuration) self._addInputStream(self.ssc) self.ssc.start() self.ssc.stop() self.ssc.stop() def _addInputStream(self, s): # Make sure each length of input is over 3 and # numSlice is 2 due to deserializer problem in pyspark.streaming test_inputs = map(lambda x: range(1, x), range(5, 101)) test_stream = s._testInputStream(test_inputs, 2) # Register fake output operation 
result = list() test_stream._test_output(result)
class PySparkStreamingTestCase(unittest.TestCase): timeout = 20 # seconds duration = 1 def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") # TODO: decrease duration to speed up tests self.ssc = StreamingContext(self.sc, self.duration) def tearDown(self): self.ssc.stop() def wait_for(self, result, n): start_time = time.time() while len(result) < n and time.time() - start_time < self.timeout: time.sleep(0.01) if len(result) < n: print("timeout after", self.timeout) def _take(self, dstream, n): """ Return the first `n` elements in the stream (will start and stop). """ results = [] def take(_, rdd): if rdd and len(results) < n: results.extend(rdd.take(n - len(results))) dstream.foreachRDD(take) self.ssc.start() self.wait_for(results, n) return results def _collect(self, dstream, n, block=True): """ Collect each RDDs into the returned list. :return: list, which will have the collected items. """ result = [] def get_output(_, rdd): if rdd and len(result) < n: r = rdd.collect() if r: result.append(r) dstream.foreachRDD(get_output) if not block: return result self.ssc.start() self.wait_for(result, n) return result def _test_func(self, input, func, expected, sort=False, input2=None): """ @param input: dataset for the test. This should be list of lists. @param func: wrapped function. This function should return PythonDStream object. @param expected: expected output for this testcase. """ if not isinstance(input[0], RDD): input = [self.sc.parallelize(d, 1) for d in input] input_stream = self.ssc.queueStream(input) if input2 and not isinstance(input2[0], RDD): input2 = [self.sc.parallelize(d, 1) for d in input2] input_stream2 = self.ssc.queueStream(input2) if input2 is not None else None # Apply test function to stream. if input2: stream = func(input_stream, input_stream2) else: stream = func(input_stream) result = self._collect(stream, len(expected)) if sort: self._sort_result_based_on_key(result) self._sort_result_based_on_key(expected) self.assertEqual(expected, result) def _sort_result_based_on_key(self, outputs): """Sort the list based on first value.""" for output in outputs: output.sort(key=lambda x: x[0])
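# An illustrative concrete test built on the helpers above (a sketch, not part of the
# original suite): push three batches through DStream.map and compare each batch with
# the expected output via _test_func.
class BasicOperationTests(PySparkStreamingTestCase):

    def test_map(self):
        """Basic operation test for DStream.map."""
        input = [range(1, 5), range(5, 9), range(9, 13)]

        def func(dstream):
            return dstream.map(str)

        expected = [list(map(str, x)) for x in input]
        self._test_func(input, func, expected)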
class StreamingContext(object): """ Main entry point for Spark Streaming functionality. A StreamingContext represents the connection to a Spark cluster, and can be used to create L{DStream}s and broadcast variables on that cluster. """ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None, gateway=None, sparkContext=None, duration=None): """ Create a new StreamingContext. At least the master and app name and duration should be set, either through the named parameters here or through C{conf}. @param master: Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). @param appName: A name for your job, to display on the cluster web UI. @param sparkHome: Location where Spark is installed on cluster nodes. @param pyFiles: Collection of .zip or .py files to send to the cluster and add to PYTHONPATH. These can be paths on the local file system or HDFS, HTTP, HTTPS, or FTP URLs. @param environment: A dictionary of environment variables to set on worker nodes. @param batchSize: The number of Python objects represented as a single Java object. Set 1 to disable batching or -1 to use an unlimited batch size. @param serializer: The serializer for RDDs. @param conf: A L{SparkConf} object setting Spark properties. @param gateway: Use an existing gateway and JVM, otherwise a new JVM will be instatiated. @param sparkContext: L{SparkContext} object. @param duration: A L{Duration} object for SparkStreaming. """ if not isinstance(duration, Duration): raise TypeError("Input should be pyspark.streaming.duration.Duration object") if sparkContext is None: # Create the Python Sparkcontext self._sc = SparkContext(master=master, appName=appName, sparkHome=sparkHome, pyFiles=pyFiles, environment=environment, batchSize=batchSize, serializer=serializer, conf=conf, gateway=gateway) else: self._sc = sparkContext # Start py4j callback server. # Callback sever is need only by SparkStreming; therefore the callback sever # is started in StreamingContext. SparkContext._gateway.restart_callback_server() self._set_clean_up_handler() self._jvm = self._sc._jvm self._jssc = self._initialize_context(self._sc._jsc, duration._jduration) # Initialize StremaingContext in function to allow subclass specific initialization def _initialize_context(self, jspark_context, jduration): return self._jvm.JavaStreamingContext(jspark_context, jduration) def _set_clean_up_handler(self): """ set clean up hander using atexit """ def clean_up_handler(): SparkContext._gateway.shutdown() atexit.register(clean_up_handler) # atext is not called when the program is killed by a signal not handled by # Python. for sig in (SIGINT, SIGTERM): signal(sig, clean_up_handler) @property def sparkContext(self): """ Return SparkContext which is associated with this StreamingContext. """ return self._sc def start(self): """ Start the execution of the streams. """ self._jssc.start() def awaitTermination(self, timeout=None): """ Wait for the execution to stop. @param timeout: time to wait in milliseconds """ if timeout is None: self._jssc.awaitTermination() else: self._jssc.awaitTermination(timeout) def remember(self, duration): """ Set each DStreams in this context to remember RDDs it generated in the last given duration. DStreams remember RDDs only for a limited duration of time and releases them for garbage collection. 
This method allows the developer to specify how to long to remember the RDDs ( if the developer wishes to query old data outside the DStream computation). @param duration pyspark.streaming.duration.Duration object. Minimum duration that each DStream should remember its RDDs """ if not isinstance(duration, Duration): raise TypeError("Input should be pyspark.streaming.duration.Duration object") self._jssc.remember(duration._jduration) # TODO: add storageLevel def socketTextStream(self, hostname, port): """ Create an input from TCP source hostname:port. Data is received using a TCP socket and receive byte is interpreted as UTF8 encoded '\n' delimited lines. """ return DStream(self._jssc.socketTextStream(hostname, port), self, UTF8Deserializer()) def textFileStream(self, directory): """ Create an input stream that monitors a Hadoop-compatible file system for new files and reads them as text files. Files must be wrriten to the monitored directory by "moving" them from another location within the same file system. File names starting with . are ignored. """ return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) def stop(self, stopSparkContext=True, stopGraceFully=False): """ Stop the execution of the streams immediately (does not wait for all received data to be processed). """ self._jssc.stop(stopSparkContext, stopGraceFully) if stopSparkContext: self._sc.stop() # Shutdown only callback server and all py3j client is shutdowned # clean up handler SparkContext._gateway._shutdown_callback_server() def _testInputStream(self, test_inputs, numSlices=None): """ This function is only for unittest. It requires a list as input, and returns the i_th element at the i_th batch under manual clock. """ test_rdds = list() test_rdd_deserializers = list() for test_input in test_inputs: test_rdd = self._sc.parallelize(test_input, numSlices) test_rdds.append(test_rdd._jrdd) test_rdd_deserializers.append(test_rdd._jrdd_deserializer) # All deserializers have to be the same. # TODO: add deserializer validation jtest_rdds = ListConverter().convert(test_rdds, SparkContext._gateway._gateway_client) jinput_stream = self._jvm.PythonTestInputStream(self._jssc, jtest_rdds).asJavaDStream() return DStream(jinput_stream, self, test_rdd_deserializers[0])
class StreamingContext(object): """ Main entry point for Spark Streaming functionality. A StreamingContext represents the connection to a Spark cluster, and can be used to create L{DStream}s and broadcast variables on that cluster. """ def __init__(self, master=None, appName=None, sparkHome=None, pyFiles=None, environment=None, batchSize=1024, serializer=PickleSerializer(), conf=None, gateway=None, sparkContext=None, duration=None): """ Create a new StreamingContext. At least the master and app name and duration should be set, either through the named parameters here or through C{conf}. @param master: Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local[4]). @param appName: A name for your job, to display on the cluster web UI. @param sparkHome: Location where Spark is installed on cluster nodes. @param pyFiles: Collection of .zip or .py files to send to the cluster and add to PYTHONPATH. These can be paths on the local file system or HDFS, HTTP, HTTPS, or FTP URLs. @param environment: A dictionary of environment variables to set on worker nodes. @param batchSize: The number of Python objects represented as a single Java object. Set 1 to disable batching or -1 to use an unlimited batch size. @param serializer: The serializer for RDDs. @param conf: A L{SparkConf} object setting Spark properties. @param gateway: Use an existing gateway and JVM, otherwise a new JVM will be instatiated. @param sparkContext: L{SparkContext} object. @param duration: A L{Duration} object for SparkStreaming. """ if not isinstance(duration, Duration): raise TypeError( "Input should be pyspark.streaming.duration.Duration object") if sparkContext is None: # Create the Python Sparkcontext self._sc = SparkContext(master=master, appName=appName, sparkHome=sparkHome, pyFiles=pyFiles, environment=environment, batchSize=batchSize, serializer=serializer, conf=conf, gateway=gateway) else: self._sc = sparkContext # Start py4j callback server. # Callback sever is need only by SparkStreming; therefore the callback sever # is started in StreamingContext. SparkContext._gateway.restart_callback_server() self._set_clean_up_handler() self._jvm = self._sc._jvm self._jssc = self._initialize_context(self._sc._jsc, duration._jduration) # Initialize StremaingContext in function to allow subclass specific initialization def _initialize_context(self, jspark_context, jduration): return self._jvm.JavaStreamingContext(jspark_context, jduration) def _set_clean_up_handler(self): """ set clean up hander using atexit """ def clean_up_handler(): SparkContext._gateway.shutdown() atexit.register(clean_up_handler) # atext is not called when the program is killed by a signal not handled by # Python. for sig in (SIGINT, SIGTERM): signal(sig, clean_up_handler) @property def sparkContext(self): """ Return SparkContext which is associated with this StreamingContext. """ return self._sc def start(self): """ Start the execution of the streams. """ self._jssc.start() def awaitTermination(self, timeout=None): """ Wait for the execution to stop. @param timeout: time to wait in milliseconds """ if timeout is None: self._jssc.awaitTermination() else: self._jssc.awaitTermination(timeout) def remember(self, duration): """ Set each DStreams in this context to remember RDDs it generated in the last given duration. DStreams remember RDDs only for a limited duration of time and releases them for garbage collection. 
This method allows the developer to specify how to long to remember the RDDs ( if the developer wishes to query old data outside the DStream computation). @param duration pyspark.streaming.duration.Duration object. Minimum duration that each DStream should remember its RDDs """ if not isinstance(duration, Duration): raise TypeError( "Input should be pyspark.streaming.duration.Duration object") self._jssc.remember(duration._jduration) # TODO: add storageLevel def socketTextStream(self, hostname, port): """ Create an input from TCP source hostname:port. Data is received using a TCP socket and receive byte is interpreted as UTF8 encoded '\n' delimited lines. """ return DStream(self._jssc.socketTextStream(hostname, port), self, UTF8Deserializer()) def textFileStream(self, directory): """ Create an input stream that monitors a Hadoop-compatible file system for new files and reads them as text files. Files must be wrriten to the monitored directory by "moving" them from another location within the same file system. File names starting with . are ignored. """ return DStream(self._jssc.textFileStream(directory), self, UTF8Deserializer()) def stop(self, stopSparkContext=True, stopGraceFully=False): """ Stop the execution of the streams immediately (does not wait for all received data to be processed). """ self._jssc.stop(stopSparkContext, stopGraceFully) if stopSparkContext: self._sc.stop() # Shutdown only callback server and all py3j client is shutdowned # clean up handler SparkContext._gateway._shutdown_callback_server() def _testInputStream(self, test_inputs, numSlices=None): """ This function is only for unittest. It requires a list as input, and returns the i_th element at the i_th batch under manual clock. """ test_rdds = list() test_rdd_deserializers = list() for test_input in test_inputs: test_rdd = self._sc.parallelize(test_input, numSlices) test_rdds.append(test_rdd._jrdd) test_rdd_deserializers.append(test_rdd._jrdd_deserializer) # All deserializers have to be the same. # TODO: add deserializer validation jtest_rdds = ListConverter().convert( test_rdds, SparkContext._gateway._gateway_client) jinput_stream = self._jvm.PythonTestInputStream( self._jssc, jtest_rdds).asJavaDStream() return DStream(jinput_stream, self, test_rdd_deserializers[0])
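# A minimal usage sketch for the StreamingContext wrapper defined above. It assumes the
# Milliseconds duration helper used in the test code earlier in this file can be
# imported from pyspark.streaming.duration; treat that import path as an assumption.
from pyspark.streaming.duration import Milliseconds

ssc = StreamingContext(master="local[2]", appName="NetworkWordCount",
                       duration=Milliseconds(1000))
lines = ssc.socketTextStream("localhost", 9999)   # DStream of '\n'-delimited text
ssc.start()
ssc.awaitTermination(10000)   # timeout is in milliseconds, per awaitTermination() above
ssc.stop(stopSparkContext=True)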
class SparkExecutor(Executor): def __init__(self): # Setup PySpark. This is needed until PySpark becomes available on PyPI, # after which we can simply add it to requirements.txt. _setup_pyspark() from pyspark.conf import SparkConf from pyspark.context import SparkContext from pyspark.serializers import MarshalSerializer # Create a temporary .zip lib file for Metis, which will be copied over to # Spark workers so they can unpickle Metis functions and objects. metis_lib_file = tempfile.NamedTemporaryFile(suffix='.zip', delete=False) metis_lib_file.close() _copy_lib_for_spark_workers(metis_lib_file.name) # Also ship the Metis lib file so worker nodes can deserialize Metis # internal data structures. conf = SparkConf() conf.setMaster(app.config['SPARK_MASTER']) conf.setAppName('chronology:metis') parallelism = int(app.config.get('SPARK_PARALLELISM', 0)) if parallelism: conf.set('spark.default.parallelism', parallelism) self.context = SparkContext(conf=conf, pyFiles=[metis_lib_file.name], serializer=MarshalSerializer()) # Delete temporary Metis lib file. os.unlink(metis_lib_file.name) # We'll use this to parallelize fetching events in KronosSource. # The default of 8 is from: # https://spark.apache.org/docs/latest/configuration.html self.parallelism = parallelism or 8 def __getstate__(self): # Don't pickle the `SparkContext` object. state = self.__dict__.copy() del state['context'] return state def finalize(self, rdd): return rdd.collect() def execute_aggregate(self, node): def finalize(event): # `event` is of the form (key, event). return node.finalize_func(event[1]) return (self.execute(node.source) .map(node.group_func) .reduceByKey(node.reduce_func) .map(finalize)) def execute_filter(self, node): return self.execute(node.source).filter(generate_filter(node.condition)) def execute_join(self, node): left_alias = node.left.alias or 'left' right_alias = node.right.alias or 'right' def merge(events): event1, event2 = events if isinstance(event1, types.StringType): # Join case: events = (key, (event1, event2)) event1, event2 = event2 event = deepcopy(event1) event.update(event2) else: # Cartesian case: events = (event1, event2) event = {} for key, value in event1.iteritems(): event['%s.%s' % (left_alias, key)] = value for key, value in event2.iteritems(): event['%s.%s' % (right_alias, key)] = value return event def get_equijoin_key_values(condition): # condition must be a *leaf* condition. if getattr(condition, 'op', None) != Condition.Op.EQ: return None # Get properties being accessed by left and right side of the # conditional. left_properties = get_properties_accessed_by_value(condition.left) right_properties = get_properties_accessed_by_value(condition.right) if not (left_properties and right_properties): return None # Only return getters if both sides of the conditional read from different # sources. You can't use this optimization say if the condition is # (left.x + right.y = 10) # XXX: This isn't kosher for non-deterministic functions. if (all(p.startswith('%s.' % left_alias) for p in left_properties) and all(p.startswith('%s.' % right_alias) for p in right_properties)): return {'left': condition.left, 'right': condition.right} if (all(p.startswith('%s.' % right_alias) for p in left_properties) and all(p.startswith('%s.' 
% left_alias) for p in right_properties)): return {'left': condition.right, 'right': condition.left} return None def map_equijoin(alias, key_values): def map(event): new_event = {} for key, value in event.iteritems(): new_event['%s.%s' % (alias, key)] = value key = json.dumps([get_value(new_event, value) for value in key_values]) return (key, new_event) return map def setup_join(): eq_join_key_values = [] # TODO(usmanm): Right now we only optimize if the conditional is an EQ or # if it's an AND and has some EQ in the top level. We don't do any # recursive searching in condition trees. Improve that. condition = node.condition _type = getattr(condition, 'type', None) if _type == Condition.Type.AND: filter_conditions = [] for c in condition.conditions: values = get_equijoin_key_values(c) if values: eq_join_key_values.append(values) else: filter_conditions.append(c) if filter_conditions: condition.conditions = filter_conditions else: condition = None elif _type != Condition.Type.OR: # Ignore ORs for now. value = get_equijoin_key_values(condition) if value: eq_join_key_values.append(value) condition = None return eq_join_key_values, (generate_filter(condition) if condition else None) eq_join_key_values, filter_function = setup_join() if eq_join_key_values: mapped_left = (self.execute(node.left) .map(map_equijoin( left_alias, [value['left'] for value in eq_join_key_values]))) mapped_right = (self.execute(node.right) .map(map_equijoin( right_alias, [value['right'] for value in eq_join_key_values]))) joined = mapped_left.join(mapped_right).map(merge) else: # Naive O(n^2) cartesian product. joined = (self.execute(node.left).cartesian(self.execute(node.right)) .map(merge)) if filter_function: joined = joined.filter(filter_function) return joined def execute_limit(self, node): # TODO(usmanm): Is there a better way than to collect and redistribute all # events? return self.context.parallelize(self.execute(node.source).take(node.limit)) def execute_order_by(self, node): return (self.execute(node.source) .keyBy(lambda e: tuple(get_value(e, field) for field in node.fields)) .sortByKey(ascending=node.order == node.ResultOrder.ASCENDING) .map(lambda e: e[1])) def execute_project(self, node): def project(event): if node.merge: new_event = deepcopy(event) else: new_event = {} for field in node.fields: new_event[field.alias] = get_value(event, field) return new_event return self.execute(node.source).map(project)
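The equijoin fast path above boils down to keying both sides on their serialized join values and letting RDD.join do the work. Below is a stripped-down, self-contained sketch of that idea in plain PySpark with hypothetical example data; it mirrors map_equijoin() and merge() but involves no Metis classes.

import json
from pyspark import SparkContext

sc = SparkContext("local[2]", "equijoin-sketch")

# Hypothetical event dicts standing in for the left and right sources.
left = sc.parallelize([{'id': 1, 'x': 'a'}, {'id': 2, 'x': 'b'}])
right = sc.parallelize([{'id': 1, 'y': 10}, {'id': 2, 'y': 20}])

def keyed(alias, key_field):
    # Prefix each field with its alias and key the event on the join value,
    # mirroring map_equijoin() above (json.dumps keeps composite keys hashable).
    def _map(event):
        aliased = {'%s.%s' % (alias, k): v for k, v in event.items()}
        return (json.dumps([event[key_field]]), aliased)
    return _map

def merge(pair):
    # Join case: pair is (key, (event1, event2)); merge the two aliased dicts.
    _, (e1, e2) = pair
    out = dict(e1)
    out.update(e2)
    return out

joined = (left.map(keyed('left', 'id'))
              .join(right.map(keyed('right', 'id')))
              .map(merge))
print(joined.collect())
# e.g. [{'left.id': 1, 'left.x': 'a', 'right.id': 1, 'right.y': 10}, ...]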
def avg_model(sgd, slices): sgd.coef_ /= slices sgd.intercept_ /= slices return sgd if __name__ == "__main__": if len(sys.argv) < 2: print >> sys.stderr, \ "Usage: PythonLR <master> <iterations>" exit(-1) sc = SparkContext(sys.argv[1], "PythonLR") ITERATIONS = int(sys.argv[2]) if len(sys.argv) > 2 else ITERATIONS slices = int(sys.argv[3]) if len(sys.argv) == 4 else 2 data = generate_data(N) print len(data) # initializing SGD sgd = lm.SGDClassifier(loss='log') for ii in range(ITERATIONS): sgd = sc.parallelize(data, numSlices=slices) \ .mapPartitions(lambda x: train(x, sgd)) \ .reduce(lambda x, y: merge(x, y)) sgd = avg_model(sgd, slices) # averaging weight vector for IPM update print "Iteration %d:" % (ii + 1) print "Model: " print sgd.coef_ print sgd.intercept_ print ""
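The loop above leans on helpers (generate_data, train, merge) defined elsewhere in the script. As a purely illustrative sketch of the train-per-partition-then-average pattern, and under the assumption that each data element is a (features, label) pair, those two helpers might look roughly like this (hypothetical, not the script's actual code):

import numpy as np

def train(iterator, sgd):
    # Fit this partition's deserialized copy of the model on its
    # (features, label) pairs; yield it so mapPartitions returns models.
    data = list(iterator)
    X = np.array([d[0] for d in data])
    y = np.array([d[1] for d in data])
    sgd.partial_fit(X, y, classes=np.array([0, 1]))
    yield sgd

def merge(left, right):
    # Sum coefficients across partition models; avg_model() above then
    # divides by the number of slices to complete the average.
    left.coef_ += right.coef_
    left.intercept_ += right.intercept_
    return left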
#Feat = np.load('mllib-scripts/w2v_may1_may19_june1_june11.npy') #words = np.load('mllib-scripts/word_may1_may19_june1_june11.npy') #wordToModel = 'data' #maxWordsVis = 10 print "\n=================================================" print "Size of the Word2Vec matrix is: ", Feat.shape print "Number of words in the model: ", words.shape print "=================================================\n" ## Spark Context sc = SparkContext('local', 'visualize-words') ## Read the Word2Vec model # the next line should read from / be stored on HDFS if the matrix is large Feat = sc.parallelize(Feat) # map the feature matrix to Spark vectors from pyspark.mllib.linalg import Vectors Feat = Feat.map(lambda vec: (Vectors.dense(vec),)) ## Define a DataFrame with the feature matrix from pyspark.sql import SQLContext sqlContext = SQLContext(sc) dfFeat = sqlContext.createDataFrame(Feat, ["features"]) dfFeat.printSchema() ## PCA to project the feature matrix down to numComponents dimensions from pyspark.ml.feature import PCA numComponents = 3 pca = PCA(k=numComponents, inputCol="features", outputCol="pcaFeatures")
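The snippet stops right after constructing the PCA estimator; the standard fit/transform follow-up (continuing from the dfFeat and pca defined above) is sketched here:

## Fit the PCA model and project the feature matrix (sketch continuing from above)
pcaModel = pca.fit(dfFeat)
dfComp = pcaModel.transform(dfFeat).select("pcaFeatures")
dfComp.show(3, truncate=False)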
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_save_as_textfile_with_utf8(self): x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x.encode("utf-8")]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5]) def test_itemgetter(self): rdd = self.sc.parallelize([range(10)]) from operator import itemgetter self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) def test_namedtuple_in_rdd(self): from collections import namedtuple Person = namedtuple("Person", "id firstName lastName") jon = Person(1, "Jon", "Doe") jane = Person(2, "Jane", "Doe") theDoes = self.sc.parallelize([jon, jane]) self.assertEquals([jon, jane], theDoes.collect()) def test_large_broadcast(self): N = 100000 data = [[float(i) for i in range(300)] for i in range(N)] bdata = self.sc.broadcast(data) # 270MB m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() self.assertEquals(N, m) def test_zip_with_different_serializers(self): a = self.sc.parallelize(range(5)) b = self.sc.parallelize(range(100, 105)) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) a = 
a._reserialize(BatchedSerializer(PickleSerializer(), 2)) b = b._reserialize(MarshalSerializer()) self.assertEqual(a.zip(b).collect(), [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)]) def test_zip_with_different_number_of_items(self): a = self.sc.parallelize(range(5), 2) # different number of partitions b = self.sc.parallelize(range(100, 106), 3) self.assertRaises(ValueError, lambda: a.zip(b)) # different number of batched items in JVM b = self.sc.parallelize(range(100, 104), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # different number of items in one pair b = self.sc.parallelize(range(100, 106), 2) self.assertRaises(Exception, lambda: a.zip(b).count()) # same total number of items, but different distributions a = self.sc.parallelize([2, 3], 2).flatMap(range) b = self.sc.parallelize([3, 2], 2).flatMap(range) self.assertEquals(a.count(), b.count()) self.assertRaises(Exception, lambda: a.zip(b).count()) def test_histogram(self): # empty rdd = self.sc.parallelize([]) self.assertEquals([0], rdd.histogram([0, 10])[1]) self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1]) self.assertRaises(ValueError, lambda: rdd.histogram(1)) # out of range rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0], rdd.histogram([0, 10])[1]) self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1]) # in range with one bucket rdd = self.sc.parallelize(range(1, 5)) self.assertEquals([4], rdd.histogram([0, 10])[1]) self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1]) # in range with one bucket exact match self.assertEquals([4], rdd.histogram([1, 4])[1]) # out of range with two buckets rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1]) # out of range with two uneven buckets rdd = self.sc.parallelize([10.01, -0.01]) self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1]) # in range with two buckets rdd = self.sc.parallelize([1, 2, 3, 5, 6]) self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1]) # in range with two bucket and None rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')]) self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1]) # in range with two uneven buckets rdd = self.sc.parallelize([1, 2, 3, 5, 6]) self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1]) # mixed range with two uneven buckets rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01]) self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1]) # mixed range with four uneven buckets rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1]) self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) # mixed range with uneven buckets and NaN rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1, None, float('nan')]) self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1]) # out of range with infinite buckets rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")]) self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1]) # invalid buckets self.assertRaises(ValueError, lambda: rdd.histogram([])) self.assertRaises(ValueError, lambda: rdd.histogram([1])) self.assertRaises(ValueError, lambda: rdd.histogram(0)) self.assertRaises(TypeError, lambda: rdd.histogram({})) # without buckets rdd = self.sc.parallelize(range(1, 5)) self.assertEquals(([1, 4], [4]), rdd.histogram(1)) # without buckets single element rdd = self.sc.parallelize([1]) self.assertEquals(([1, 1], [1]), rdd.histogram(1)) # without bucket no range rdd = 
self.sc.parallelize([1] * 4) self.assertEquals(([1, 1], [4]), rdd.histogram(1)) # without buckets basic two rdd = self.sc.parallelize(range(1, 5)) self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2)) # without buckets with more requested than elements rdd = self.sc.parallelize([1, 2]) buckets = [1 + 0.2 * i for i in range(6)] hist = [1, 0, 0, 0, 1] self.assertEquals((buckets, hist), rdd.histogram(5)) # invalid RDDs rdd = self.sc.parallelize([1, float('inf')]) self.assertRaises(ValueError, lambda: rdd.histogram(2)) rdd = self.sc.parallelize([float('nan')]) self.assertRaises(ValueError, lambda: rdd.histogram(2)) # string rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2) self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1]) self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1)) self.assertRaises(TypeError, lambda: rdd.histogram(2)) # mixed RDD rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2) self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1]) self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1]) self.assertEquals(([1, "b"], [5]), rdd.histogram(1)) self.assertRaises(TypeError, lambda: rdd.histogram(2))
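As a quick reference for the two call forms exercised by these tests: histogram(buckets) counts values into user-supplied bucket boundaries (only the last bucket is closed on the right), while histogram(n) derives n evenly spaced buckets from the RDD's min and max. A small hand-worked example:

rdd = sc.parallelize([1, 2, 3, 4])
rdd.histogram([0, 2, 4])   # -> ([0, 2, 4], [1, 3]); 1 falls in [0, 2), 2-4 in [2, 4]
rdd.histogram(2)           # -> ([1, 2.5, 4], [2, 2]); two even buckets between min and max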
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5])
__author__ = "Chenweijia" import os import sys import pandas from operator import add from pyspark.sql import * from pyspark.context import SparkContext os.environ["SPARK_HOME"] = "F:/spark" sys.path.append("F:/spark/python") sc = SparkContext("local", "test") l = [("Alice", 1)] sqlContext = SQLContext(sc) rdd = sc.parallelize(l) Person = Row("name", "age") person = rdd.map(lambda r: Person(*r)) df2 = sqlContext.createDataFrame(person) df = sqlContext.createDataFrame(rdd, ["name", "age"]) from pyspark.sql.types import * schema = StructType([StructField("name", StringType(), True), StructField("age", IntegerType(), True)]) df3 = sqlContext.createDataFrame(rdd, schema) # print rdd # print sqlContext.createDataFrame(l).collect() # print sqlContext.createDataFrame(l, ['name', 'age']).collect() # print sqlContext.createDataFrame(rdd).collect() # print df.collect() # print df2.collect() # print df3.collect()
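For reference, all three construction paths above should produce the same rows when collected (outputs as in the standard SQLContext.createDataFrame examples):

print df.collect()   # [Row(name=u'Alice', age=1)]
print df2.collect()  # [Row(name=u'Alice', age=1)]
print df3.collect()  # [Row(name=u'Alice', age=1)]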
st = RDD.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) task1 = st.mapValues(lambda x: x[0] / x[1]).sortByKey(False) task1f = task1.top(statecount, key=lambda x: x[1]) t1_output_header = 'state,stars' with open(output_path1, 'w') as file_op: file_op.write(t1_output_header) for item in task1f: file_op.write("\n%s" % str(item).replace("(", "").replace( ")", "").replace("'", "").replace(" ", "")) t2 = time.time() t2a = task1.collect() print("Task 2B m1:", sorted(t2a, key=lambda x: x[1], reverse=True)[:5]) t3 = time.time() - t2 t0 = time.time() t2b = sc.parallelize(task1f).take(5) print("Task 2B m2: ", t2b) t1 = time.time() - t0 json_fileop = {} json_fileop['m1'] = t3 json_fileop['m2'] = t1 json_fileop[ 'explanation'] = "collect() takes more time because it brings every element of the RDD back into the driver (master node) memory, so the whole dataset has to be materialized there before the top five can be selected. take() is faster because it only fetches the first n items of the RDD." with open(output_path2, "w") as f: json.dump(OrderedDict(json_fileop), f)
class TestRDDFunctions(PySparkTestCase): def test_failed_sparkcontext_creation(self): # Regression test for SPARK-1550 self.sc.stop() self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name")) self.sc = SparkContext("local") def test_save_as_textfile_with_unicode(self): # Regression test for SPARK-970 x = u"\u00A1Hola, mundo!" data = self.sc.parallelize([x]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsTextFile(tempFile.name) raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*"))) self.assertEqual(x, unicode(raw_contents.strip(), "utf-8")) def test_transforming_cartesian_result(self): # Regression test for SPARK-1034 rdd1 = self.sc.parallelize([1, 2]) rdd2 = self.sc.parallelize([3, 4]) cart = rdd1.cartesian(rdd2) result = cart.map(lambda (x, y): x + y).collect() def test_transforming_pickle_file(self): # Regression test for SPARK-2601 data = self.sc.parallelize(["Hello", "World!"]) tempFile = tempfile.NamedTemporaryFile(delete=True) tempFile.close() data.saveAsPickleFile(tempFile.name) pickled_file = self.sc.pickleFile(tempFile.name) pickled_file.map(lambda x: x).collect() def test_cartesian_on_textfile(self): # Regression test for path = os.path.join(SPARK_HOME, "python/test_support/hello.txt") a = self.sc.textFile(path) result = a.cartesian(a).collect() (x, y) = result[0] self.assertEqual("Hello World!", x.strip()) self.assertEqual("Hello World!", y.strip()) def test_deleting_input_files(self): # Regression test for SPARK-1025 tempFile = tempfile.NamedTemporaryFile(delete=False) tempFile.write("Hello World!") tempFile.close() data = self.sc.textFile(tempFile.name) filtered_data = data.filter(lambda x: True) self.assertEqual(1, filtered_data.count()) os.unlink(tempFile.name) self.assertRaises(Exception, lambda: filtered_data.count()) def testAggregateByKey(self): data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2) def seqOp(x, y): x.add(y) return x def combOp(x, y): x |= y return x sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect()) self.assertEqual(3, len(sets)) self.assertEqual(set([1]), sets[1]) self.assertEqual(set([2]), sets[3]) self.assertEqual(set([1, 3]), sets[5]) def test_itemgetter(self): rdd = self.sc.parallelize([range(10)]) from operator import itemgetter self.assertEqual([1], rdd.map(itemgetter(1)).collect()) self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect()) def test_namedtuple_in_rdd(self): from collections import namedtuple Person = namedtuple("Person", "id firstName lastName") jon = Person(1, "Jon", "Doe") jane = Person(2, "Jane", "Doe") theDoes = self.sc.parallelize([jon, jane]) self.assertEquals([jon, jane], theDoes.collect()) def test_large_broadcast(self): N = 100000 data = [[float(i) for i in range(300)] for i in range(N)] bdata = self.sc.broadcast(data) # 270MB m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum() self.assertEquals(N, m)
class PyGraphXTestCase(unittest.TestCase): """ Test vertices, edges, partitionBy, numEdges, numVertices, inDegrees, outDegrees, degrees, triplets, mapVertices, mapEdges, mapTriplets, reverse, subgraph, groupEdges, joinVertices, outerJoinVertices, collectNeighborIds, collectNeighbors, mapReduceTriplets, triangleCount for Graph """ def setUp(self): class_name = self.__class__.__name__ conf = SparkConf().set("spark.default.parallelism", 1) self.sc = SparkContext(appName=class_name, conf=conf) self.sc.setCheckpointDir("/tmp") def tearDown(self): self.sc.stop() def collect(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) def take(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, [(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) def count(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) def mapValues(self): vertexData = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertices = VertexRDD(vertexData) results = vertices.collect() self.assertEqual(results, 2) def diff(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2) def innerJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2) def leftJoin(self): vertexData0 = self.sc.parallelize([(3, ("rxin", "student")), (7, ("jgonzal", "postdoc"))]) vertexData1 = self.sc.parallelize([(1, ("rxin", "student")), (2, ("jgonzal", "postdoc"))]) vertices0 = VertexRDD(vertexData0) vertices1 = VertexRDD(vertexData1) results = vertices0.diff(vertices1) self.assertEqual(results, 2)
# values() m = sc.parallelize([(1, 2), (3, 4)]).values() m.collect() # variance() sc.parallelize([1, 2, 3]).variance() # zip(other) x = sc.parallelize(range(0,5)) y = sc.parallelize(range(1000, 1005)) x.zip(y).collect() # zipWithIndex() sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect() # zipWithUniqueId() sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect() ### BROADCAST from pyspark.context import SparkContext sc = SparkContext('local', 'test') b = sc.broadcast([1, 2, 3, 4, 5]) b.value sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() b.unpersist() large_broadcast = sc.broadcast(range(10000))
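A compact, runnable recap of the calls above; the expected values are the ones given in the standard RDD and Broadcast doctests (the zipWithUniqueId result depends on how parallelize slices the five elements over three partitions):

from pyspark import SparkContext

sc = SparkContext('local', 'rdd-api-recap')
assert sc.parallelize([(1, 2), (3, 4)]).values().collect() == [2, 4]
assert sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect() == \
    [('a', 0), ('b', 1), ('c', 2), ('d', 3)]
assert sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect() == \
    [('a', 0), ('b', 1), ('c', 4), ('d', 2), ('e', 5)]
b = sc.broadcast([1, 2, 3, 4, 5])
assert sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect() == \
    [1, 2, 3, 4, 5, 1, 2, 3, 4, 5]
sc.stop()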
df.groupBy("word").count().show() ############### # Quick code to understand the complete example # Q1.0 from pyspark.sql import Row from pyspark.sql.functions import split, explode # Q2.0 lines = ["Good morning. Nice day", "OK bye bye", "Good work", "Good day"] # Q3.0 Transform the list to an RDD and apply the Row() function # to each element lines = sc.parallelize(lines).map(lambda x: Row(x)) # Q4.0 Convert it to a DataFrame with column name 'value' lines = sqlContext.createDataFrame(lines, ['value']) lines.collect() lines.show(truncate=False) # Q5.0 What do split and explode do? # explode: Returns a new row for each element in the given array lines.select(split(lines.value, " ")).show(truncate=False) lines.select(explode(split(lines.value, " "))).show(truncate=False) lines.select(explode(split(lines.value, " ")).alias("word")).show(truncate=False) # Q6.0
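A minimal sketch tying Q3.0-Q5.0 together into the word count shown at the top of this block, reusing the lines DataFrame, split, and explode defined above:

words = lines.select(explode(split(lines.value, " ")).alias("word"))
words.groupBy("word").count().show()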