class BroadcastTest(unittest.TestCase):

    def tearDown(self):
        if getattr(self, "sc", None) is not None:
            self.sc.stop()
            self.sc = None

    def _test_encryption_helper(self, vs):
        """
        Creates a broadcast variable for each value in vs, and runs a simple job to make sure
        the value is the same when it's read in the executors. Also makes sure there are no
        task failures.
        """
        bs = [self.sc.broadcast(value=v) for v in vs]
        exec_values = self.sc.parallelize(range(2)).map(lambda x: [b.value for b in bs]).collect()
        for ev in exec_values:
            self.assertEqual(ev, vs)
        # make sure there are no task failures
        status = self.sc.statusTracker()
        for jid in status.getJobIdsForGroup():
            for sid in status.getJobInfo(jid).stageIds:
                stage_info = status.getStageInfo(sid)
                self.assertEqual(0, stage_info.numFailedTasks)

    def _test_multiple_broadcasts(self, *extra_confs):
        """
        Test that broadcast variables make it to the executors. Tests multiple broadcast
        variables, and also multiple jobs.
        """
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        self._test_encryption_helper([5])
        self._test_encryption_helper([5, 10, 20])

    def test_broadcast_with_encryption(self):
        self._test_multiple_broadcasts(("spark.io.encryption.enabled", "true"))

    def test_broadcast_no_encryption(self):
        self._test_multiple_broadcasts()

    def _test_broadcast_on_driver(self, *extra_confs):
        conf = SparkConf()
        for key, value in extra_confs:
            conf.set(key, value)
        conf.setMaster("local-cluster[2,1,1024]")
        self.sc = SparkContext(conf=conf)
        bs = self.sc.broadcast(value=5)
        self.assertEqual(5, bs.value)

    def test_broadcast_value_driver_no_encryption(self):
        self._test_broadcast_on_driver()

    def test_broadcast_value_driver_encryption(self):
        self._test_broadcast_on_driver(("spark.io.encryption.enabled", "true"))
def execute(self):
    print("execute ", self.__class__)
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession

    sc = SparkContext(appName='test PySparkTask')
    b = sc.broadcast([1, 2, 3, 4, 5])
    sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()

    spark = SparkSession.builder \
        .master("local") \
        .appName("Word Count") \
        .getOrCreate()
    data = [('Alice', 1), ('Monica', 2)]
    spark.createDataFrame(data).collect()
    spark.createDataFrame(data, ['name', 'age']).collect()
# parse raw user artist data
userArtistDataFile = filePath + 'user_artist_data.txt'
rawUserArtistData = sc.textFile(userArtistDataFile)

# parse artist data file
artistDataFile = filePath + 'artist_data.txt'
rawArtistData = sc.textFile(artistDataFile)
artistById = rawArtistData.map(parseArtistByIdData).filter(lambda (k, v): k != -1)

# parse artist alias file
artistAliasDataFile = filePath + 'artist_alias.txt'
rawArtistAliasData = sc.textFile(artistAliasDataFile)
artistAlias = rawArtistAliasData.map(parseArtistAliasData).filter(lambda (k, v): k != -1).collectAsMap()

# broadcast variable
bArtistAlias = sc.broadcast(artistAlias)


def processTrainData(line):
    (userId, artistId, count) = map(int, line.split(' '))
    artistAliasId = bArtistAlias.value.get(artistId)
    if artistAliasId is None:
        artistAliasId = artistId
    return Rating(userId, artistAliasId, count)


trainData = rawUserArtistData.map(processTrainData).cache()
model = ALS.trainImplicit(trainData, 10)
print model.productFeatures()
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'source_bucket', 'report_folder_prefix',
    'index_name_prefix_template', 'index_pattern_prefix', 'es_domain_url'
])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Parameter init
source_bucket = args['source_bucket']
report_folder_prefix = args['report_folder_prefix']
index_name_prefix_template = args['index_name_prefix_template']
index_pattern_prefix = args['index_pattern_prefix']
es_domain_url = args['es_domain_url']

es_domain_url_shared = sc.broadcast(es_domain_url)
succeed = sc.accumulator(0)
failed = sc.accumulator(0)

now = datetime.datetime.now()
index_name_base = index_name_prefix_template.format(str(now.year), str(now.month))
index_name = index_name_base + "-" + str(now.day)
index_name_shared = sc.broadcast(index_name)


def doc_generator(source):
    for row in source:
        updated_row = row.asDict()
        index_name = index_name_shared.value
        new_row = {
            '_index': index_name,
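# The broadcast URL/index name and the accumulators above are typically consumed inside a
# foreachPartition, so that one Elasticsearch client is created per partition and the
# (truncated) doc_generator above supplies the bulk actions. A minimal sketch under that
# assumption; the client setup and the DataFrame `df` read from source_bucket are not part
# of the original snippet:
from elasticsearch import Elasticsearch, helpers


def index_partition(rows):
    # one client per partition, pointed at the broadcast domain URL
    es = Elasticsearch([es_domain_url_shared.value])
    try:
        ok, _ = helpers.bulk(es, doc_generator(rows))
        succeed.add(ok)
    except Exception:
        failed.add(1)


# df.rdd.foreachPartition(index_partition)  # hypothetical DataFrame read from source_bucket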
class TestRDDFunctions(PySparkTestCase):

    def test_failed_sparkcontext_creation(self):
        # Regression test for SPARK-1550
        self.sc.stop()
        self.assertRaises(Exception, lambda: SparkContext("an-invalid-master-name"))
        self.sc = SparkContext("local")

    def test_save_as_textfile_with_unicode(self):
        # Regression test for SPARK-970
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_save_as_textfile_with_utf8(self):
        x = u"\u00A1Hola, mundo!"
        data = self.sc.parallelize([x.encode("utf-8")])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsTextFile(tempFile.name)
        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))

    def test_transforming_cartesian_result(self):
        # Regression test for SPARK-1034
        rdd1 = self.sc.parallelize([1, 2])
        rdd2 = self.sc.parallelize([3, 4])
        cart = rdd1.cartesian(rdd2)
        result = cart.map(lambda (x, y): x + y).collect()

    def test_transforming_pickle_file(self):
        # Regression test for SPARK-2601
        data = self.sc.parallelize(["Hello", "World!"])
        tempFile = tempfile.NamedTemporaryFile(delete=True)
        tempFile.close()
        data.saveAsPickleFile(tempFile.name)
        pickled_file = self.sc.pickleFile(tempFile.name)
        pickled_file.map(lambda x: x).collect()

    def test_cartesian_on_textfile(self):
        # Regression test for
        path = os.path.join(SPARK_HOME, "python/test_support/hello.txt")
        a = self.sc.textFile(path)
        result = a.cartesian(a).collect()
        (x, y) = result[0]
        self.assertEqual("Hello World!", x.strip())
        self.assertEqual("Hello World!", y.strip())

    def test_deleting_input_files(self):
        # Regression test for SPARK-1025
        tempFile = tempfile.NamedTemporaryFile(delete=False)
        tempFile.write("Hello World!")
        tempFile.close()
        data = self.sc.textFile(tempFile.name)
        filtered_data = data.filter(lambda x: True)
        self.assertEqual(1, filtered_data.count())
        os.unlink(tempFile.name)
        self.assertRaises(Exception, lambda: filtered_data.count())

    def testAggregateByKey(self):
        data = self.sc.parallelize([(1, 1), (1, 1), (3, 2), (5, 1), (5, 3)], 2)

        def seqOp(x, y):
            x.add(y)
            return x

        def combOp(x, y):
            x |= y
            return x

        sets = dict(data.aggregateByKey(set(), seqOp, combOp).collect())
        self.assertEqual(3, len(sets))
        self.assertEqual(set([1]), sets[1])
        self.assertEqual(set([2]), sets[3])
        self.assertEqual(set([1, 3]), sets[5])

    def test_itemgetter(self):
        rdd = self.sc.parallelize([range(10)])
        from operator import itemgetter
        self.assertEqual([1], rdd.map(itemgetter(1)).collect())
        self.assertEqual([(2, 3)], rdd.map(itemgetter(2, 3)).collect())

    def test_namedtuple_in_rdd(self):
        from collections import namedtuple
        Person = namedtuple("Person", "id firstName lastName")
        jon = Person(1, "Jon", "Doe")
        jane = Person(2, "Jane", "Doe")
        theDoes = self.sc.parallelize([jon, jane])
        self.assertEquals([jon, jane], theDoes.collect())

    def test_large_broadcast(self):
        N = 100000
        data = [[float(i) for i in range(300)] for i in range(N)]
        bdata = self.sc.broadcast(data)  # 270MB
        m = self.sc.parallelize(range(1), 1).map(lambda x: len(bdata.value)).sum()
        self.assertEquals(N, m)

    def test_zip_with_different_serializers(self):
        a = self.sc.parallelize(range(5))
        b = self.sc.parallelize(range(100, 105))
        self.assertEqual(a.zip(b).collect(),
                         [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])
        a = a._reserialize(BatchedSerializer(PickleSerializer(), 2))
        b = b._reserialize(MarshalSerializer())
        self.assertEqual(a.zip(b).collect(),
                         [(0, 100), (1, 101), (2, 102), (3, 103), (4, 104)])

    def test_zip_with_different_number_of_items(self):
        a = self.sc.parallelize(range(5), 2)
        # different number of partitions
        b = self.sc.parallelize(range(100, 106), 3)
        self.assertRaises(ValueError, lambda: a.zip(b))
        # different number of batched items in JVM
        b = self.sc.parallelize(range(100, 104), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # different number of items in one pair
        b = self.sc.parallelize(range(100, 106), 2)
        self.assertRaises(Exception, lambda: a.zip(b).count())
        # same total number of items, but different distributions
        a = self.sc.parallelize([2, 3], 2).flatMap(range)
        b = self.sc.parallelize([3, 2], 2).flatMap(range)
        self.assertEquals(a.count(), b.count())
        self.assertRaises(Exception, lambda: a.zip(b).count())

    def test_histogram(self):
        # empty
        rdd = self.sc.parallelize([])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])
        self.assertRaises(ValueError, lambda: rdd.histogram(1))

        # out of range
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0], rdd.histogram([0, 10])[1])
        self.assertEquals([0, 0], rdd.histogram((0, 4, 10))[1])

        # in range with one bucket
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals([4], rdd.histogram([0, 10])[1])
        self.assertEquals([3, 1], rdd.histogram([0, 4, 10])[1])

        # in range with one bucket exact match
        self.assertEquals([4], rdd.histogram([1, 4])[1])

        # out of range with two buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 5, 10])[1])

        # out of range with two uneven buckets
        rdd = self.sc.parallelize([10.01, -0.01])
        self.assertEquals([0, 0], rdd.histogram([0, 4, 10])[1])

        # in range with two buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two buckets and None
        rdd = self.sc.parallelize([1, 2, 3, 5, 6, None, float('nan')])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 10])[1])

        # in range with two uneven buckets
        rdd = self.sc.parallelize([1, 2, 3, 5, 6])
        self.assertEquals([3, 2], rdd.histogram([0, 5, 11])[1])

        # mixed range with two uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.0, 11.01])
        self.assertEquals([4, 3], rdd.histogram([0, 5, 11])[1])

        # mixed range with four uneven buckets
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0, 199.0, 200.0, 200.1])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # mixed range with uneven buckets and NaN
        rdd = self.sc.parallelize([-0.01, 0.0, 1, 2, 3, 5, 6, 11.01, 12.0,
                                   199.0, 200.0, 200.1, None, float('nan')])
        self.assertEquals([4, 2, 1, 3], rdd.histogram([0.0, 5.0, 11.0, 12.0, 200.0])[1])

        # out of range with infinite buckets
        rdd = self.sc.parallelize([10.01, -0.01, float('nan'), float("inf")])
        self.assertEquals([1, 2], rdd.histogram([float('-inf'), 0, float('inf')])[1])

        # invalid buckets
        self.assertRaises(ValueError, lambda: rdd.histogram([]))
        self.assertRaises(ValueError, lambda: rdd.histogram([1]))
        self.assertRaises(ValueError, lambda: rdd.histogram(0))
        self.assertRaises(TypeError, lambda: rdd.histogram({}))

        # without buckets
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 4], [4]), rdd.histogram(1))

        # without buckets single element
        rdd = self.sc.parallelize([1])
        self.assertEquals(([1, 1], [1]), rdd.histogram(1))

        # without bucket no range
        rdd = self.sc.parallelize([1] * 4)
        self.assertEquals(([1, 1], [4]), rdd.histogram(1))

        # without buckets basic two
        rdd = self.sc.parallelize(range(1, 5))
        self.assertEquals(([1, 2.5, 4], [2, 2]), rdd.histogram(2))

        # without buckets with more requested than elements
        rdd = self.sc.parallelize([1, 2])
        buckets = [1 + 0.2 * i for i in range(6)]
        hist = [1, 0, 0, 0, 1]
        self.assertEquals((buckets, hist), rdd.histogram(5))

        # invalid RDDs
        rdd = self.sc.parallelize([1, float('inf')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))
        rdd = self.sc.parallelize([float('nan')])
        self.assertRaises(ValueError, lambda: rdd.histogram(2))

        # string
        rdd = self.sc.parallelize(["ab", "ac", "b", "bd", "ef"], 2)
        self.assertEquals([2, 2], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals((["ab", "ef"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))

        # mixed RDD
        rdd = self.sc.parallelize([1, 4, "ab", "ac", "b"], 2)
        self.assertEquals([1, 1], rdd.histogram([0, 4, 10])[1])
        self.assertEquals([2, 1], rdd.histogram(["a", "b", "c"])[1])
        self.assertEquals(([1, "b"], [5]), rdd.histogram(1))
        self.assertRaises(TypeError, lambda: rdd.histogram(2))
"mapred.bq.input.dataset.id": "picfeed", "mapred.bq.input.table.id": "data_set_term", "spark.sql.shuffle.partitions": "1000", "spark.default.parallelism": "1000", } # Read the data from BigQuery into Spark as an RDD. table_data_data_combine_uinfo = spark.sparkContext.newAPIHadoopRDD( "com.google.cloud.hadoop.io.bigquery.JsonTextBigQueryInputFormat", "org.apache.hadoop.io.LongWritable", "com.google.gson.JsonObject", conf=conf2) print "start create broadcase value" b_uinfo = sc.broadcast(table_data_data_combine_uinfo) print "finish create broadcase value" def process_uinfo(line): line = (line.uid, line.ts, line.label, b_uinfo.get(line.urlid)) return line print "start map" out_rdd = table_data_train_set.map(process_uinfo) print "finish map" print out_rdd.take(2)
        return []
    else:
        try:
            return [(int(k[0]), int(k[1]))]
        except:
            return []


# In[12]:

artistAlias = alias.flatMap(lambda x: aliases(x)).collectAsMap()
artist_id = dict(artist_data.flatMap(lambda k: artist(k)).collect())


# In[13]:

lookup = sc.broadcast(artistAlias)


def mapper(x):
    userID, artistID, count = map(lambda lineItem: int(lineItem), x.split())
    finalArtistID = lookup.value.get(artistID)
    if finalArtistID is None:
        finalArtistID = artistID
    return Rating(userID, finalArtistID, count)


Data = user_data.map(lambda x: mapper(x))
Data.cache()


# In[14]:
test_data = "hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/share/MNIST/Test-label-28x28.csv"
test_df = spark.read.csv(test_data, header=False, inferSchema="true")

assembler_test = VectorAssembler(inputCols=test_df.columns[1:784], outputCol="features")
test_vectors = assembler_test.transform(test_df).select(test_df.columns[0], "features")
pca_test = train_model.transform(test_vectors).select(test_vectors.columns[0], 'pca')
p_number = pca_test.rdd.count()

train_np1 = np.array(pca_train.select('pca').collect())
train_label = np.array(pca_train.select('_c0').collect())
a, b, c = train_np1.shape
train_set = train_np1.reshape(a, c)

# share the training features and labels with every executor
train_np = sc.broadcast(train_np1)
label = sc.broadcast(train_label)

result = pca_test.rdd.map(knn)
predict_rdd = result.map(lambda x: x[0])
predict_rdd.saveAsTextFile("hdfs://soit-hdp-pro-1.ucc.usyd.edu.au/user/czho9311/123")

acc = result.filter(lambda x: x[0] == x[1]).count() / float(p_number)
metrics = MulticlassMetrics(result)
label_metrics = ['0.0', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0']
pre_dict = {}
recall_dict = {}
f_score = {}
for i in label_metrics:
    pre_dict[i] = metrics.precision(i)
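# The map above relies on a knn function that the snippet does not show. A minimal sketch of
# what it might look like, assuming each test row carries the label in its first column and
# the PCA features in 'pca', and that it uses the broadcast train_np/label from above; the
# internals are assumptions, only the name comes from the original.
import numpy as np


def knn(row, k=5):
    # distances from this test point to every broadcast training point
    test_vec = np.asarray(row['pca'])
    train = train_np.value.reshape(train_np.value.shape[0], -1)
    dists = np.linalg.norm(train - test_vec, axis=1)
    # majority vote among the k nearest training labels
    nearest = label.value.reshape(-1)[np.argsort(dists)[:k]]
    values, counts = np.unique(nearest, return_counts=True)
    prediction = float(values[np.argmax(counts)])
    # return (prediction, true_label) so accuracy and MulticlassMetrics work downstream
    return (prediction, float(row[0]))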
# values()
m = sc.parallelize([(1, 2), (3, 4)]).values()
m.collect()

# variance()
sc.parallelize([1, 2, 3]).variance()

# zip(other)
x = sc.parallelize(range(0, 5))
y = sc.parallelize(range(1000, 1005))
x.zip(y).collect()

# zipWithIndex()
sc.parallelize(["a", "b", "c", "d"], 3).zipWithIndex().collect()

# zipWithUniqueId()
sc.parallelize(["a", "b", "c", "d", "e"], 3).zipWithUniqueId().collect()

### BROADCAST
from pyspark.context import SparkContext

sc = SparkContext('local', 'test')
b = sc.broadcast([1, 2, 3, 4, 5])
b.value
sc.parallelize([0, 0]).flatMap(lambda x: b.value).collect()
b.unpersist()

large_broadcast = sc.broadcast(range(10000))
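# Lifecycle of a broadcast, sketched from the calls above: .value reads the payload in tasks,
# unpersist() drops the cached copies on the executors, and destroy() releases the variable
# entirely (it cannot be used afterwards).
b2 = sc.broadcast({'a': 1, 'b': 2})
print(sc.parallelize(['a', 'b', 'a']).map(lambda k: b2.value[k]).collect())  # [1, 2, 1]
b2.unpersist()   # free executor-side copies; the variable itself still exists
b2.destroy()     # fully release it once no more jobs need it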
# Getting relevant columns - the last two contain review and rating - also remove punctuation
word_counted = Amazon.map(lambda x: (x[-1], x[-2].translate({ord(char): None for char in string.punctuation})))


# In[17]:

# combining all comments for the same key
word_counted_1 = word_counted.reduceByKey(lambda x, y: x + y)


# In[18]:

# count of reviews per key so that it can be used to get the average
word_freq_1 = sc.broadcast(word_counted.countByKey())


# In[19]:

word_counted.countByKey()


# In[24]:

# Getting the average - total length divided by count
word_avg = word_counted_1.map(lambda x: (x[0], len(x[1].split()) / word_freq_1.value[x[0]])).sortByKey()
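# countByKey() pulls the per-rating counts back to the driver and re-broadcasts them. A sketch
# of essentially the same average computed in one pass with (sum, count) pairs and no driver
# round trip, assuming the same word_counted RDD of (rating, review_text) pairs:
word_avg_alt = (word_counted
                .mapValues(lambda review: (len(review.split()), 1))
                .reduceByKey(lambda u, v: (u[0] + v[0], u[1] + v[1]))
                .mapValues(lambda sum_count: sum_count[0] / float(sum_count[1]))
                .sortByKey())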
training_rdd_cleaned = training_rdd.subtractByKey(validation_rdd)
training_rdd_cleaned = training_rdd_cleaned.map(lambda x: (x[0][0], (x[0][1], x[1])))
test_rdd = test_rdd_with_res.map(lambda x: (x[0][0], x[0][1])).sortByKey()

users_rdd1 = training_rdd_cleaned.groupByKey().sortByKey().mapValues(list).collectAsMap()
user_business_rdd1 = training_rdd_cleaned.map(
    lambda x: ((x[0], x[1][0]), x[1][1])).sortByKey().collectAsMap()
business_rdd1 = training_rdd_cleaned.map(
    lambda x: (x[1][0], (x[0], x[1][1]))).groupByKey().sortByKey().mapValues(list).collectAsMap()
business_rdd2 = training_rdd_cleaned.map(
    lambda x: (x[1][0], x[0])).groupByKey().sortByKey().mapValues(list).collectAsMap()

# the collected lookup tables are broadcast so every task can read them locally
users_rdd = sc.broadcast(users_rdd1)
user_business_rdd = sc.broadcast(user_business_rdd1)
business_rdd = sc.broadcast(business_rdd1)

all_user_train = training_rdd.map(lambda x: x[0][0]).distinct().collect()
all_user_val = test_rdd_with_res.map(lambda x: x[0][0]).distinct().collect()
all_business_train = training_rdd.map(lambda x: x[0][1]).distinct().collect()
all_business_val = test_rdd_with_res.map(lambda x: x[0][1]).distinct().collect()

all_business_dic = dict()
all_user_dic = dict()
if len(tmp_ind) > 0 and itr < prune_stop_iter:
    # run at window @6
    _tmp_c = np.array(len(crates_list) * [-1.])
    for t_name in tmp_ind:
        _tmp_c[layer_inds[t_name]] = crates[t_name]
    apply_prune(solver.net, _tmp_c)

# if len(tmp_ind) > 1 and itr < prune_stop_iter:
if itr % 1000 == 0 and len(tmp_ind) > 1 and itr < prune_stop_iter:
    # run at window @3
    accuracy_ = test_net(solver.net, _count=1, _start="ip1")
    es = {}
    # reference_model = sc.broadcast(solver.net)  # does not work, the net cannot be pickled
    # share the model with the parallel individuals through a file instead
    solver.net.save(parallel_file_name)
    # print(solver.net.blobs['data'].data.shape)
    the_input_batch = sc.broadcast(solver.net.blobs['data'].data)
    if es_method == 'ncs':
        __C = edict()
        __C.parameters = {
            'reset_xl_to_pop': False,
            'init_value': tmp_crates,
            'stepsize': ncs_stepsize,
            'bounds': [0.0, 10.],
            'ftarget': 0,
            'tmax': 1600,
            'popsize': 10,
            'best_k': 1
        }
        es = ncs.NCS(__C.parameters)
        print('***************NCS initialization***************')
        tmp_x_ = np.array(crates_list)
def main():
    start = time.time()

    # Step 2: handle input parameters
    parser = argparse.ArgumentParser(description='Process some integers.')
    # filenames, K and N
    parser.add_argument('-filenames', type=str, nargs='+', help='the list of fasta files', required=True)
    parser.add_argument('-K', type=int, nargs=1, help='value of K in k-mer', required=True)
    parser.add_argument('-N', type=int, nargs=1, help='value of N in top-n', required=True)
    args = parser.parse_args()

    # Step 3: create a Spark context object (ctx)
    ctx = SparkContext(appName="Kmer count")

    # Step 4: broadcast K and N as global shared objects
    files = ctx.broadcast(args.filenames)
    k = ctx.broadcast(args.K)
    n = ctx.broadcast(args.N)
    print(files.value)
    print(k.value)
    print(n.value)

    # Step 5: read FASTQ file from HDFS and create the first RDD
    records = ctx.textFile(files.value[0])
    # remove file if exists
    # try:
    #     shutil.rmtree("kmers/output/1")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 1 did not exist, creating now")
    # records.saveAsTextFile("kmers/output/1")

    # Step 6: filter redundant records
    # specChar = re.compile('[A-Za-z]')
    pattern = re.compile('^[ACGTNacgn]+$')
    records = records.filter(lambda x: re.match(pattern, x) is not None)
    # for i in filterRDD.collect():
    #     print(i)
    # try:
    #     shutil.rmtree("kmers/output/1.5")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 1.5 did not exist, creating now")
    # filterRDD.saveAsTextFile("kmers/output/1.5")

    # Step 7: generate K-mers
    kVal = k.value[0]
    kmers = records.map(lambda x: (x[0:kVal], 1))
    # for k in kmers.collect():
    #     print(k)
    # try:
    #     shutil.rmtree("kmers/output/2")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 2 did not exist, creating now")
    # kmers.saveAsTextFile("kmers/output/2")

    # Step 8: combine/reduce frequent K-mers
    grouped = kmers.reduceByKey(lambda x, y: x + y)
    # try:
    #     shutil.rmtree("kmers/output/2.5")
    #     print("removed old output")
    # except OSError:
    #     print("kmers / output / 2.5 did not exist, creating now")
    # grouped.saveAsTextFile("kmers/output/2.5")

    # Step 9: create a local top N for all partitions
    sortedKmers = grouped.map(lambda x: (int(-x[1]), x[0])).sortByKey().map(
        lambda x: (x[1], -1 * int(x[0])))
    try:
        shutil.rmtree("kmers/output/3")
        print("removed old output")
    except OSError:
        print("kmers / output / 3 did not exist, creating now")
    sortedKmers.saveAsTextFile("kmers/output/3")

    # Step 10: get top N
    print("Top N={} {}-mers:".format(n.value[0], k.value[0]))
    for val in sortedKmers.take(n.value[0]):
        print(val)
    # print("Bottom N={} {}-mers:".format(n.value[0], k.value[0]))
    # for val in sortedKmers.takeOrdered(n.value[0], key=lambda x: -1 * int(x[1])):
    #     print(val)

    end = time.time()
    print(end - start)
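# The map in Step 7 keeps only the first k-mer of each read. If the intent is to count every
# k-mer in a read, a flatMap over a sliding window does it; a sketch reusing the records and
# kVal names from the function above (the intent itself is an assumption):
def all_kmers(read, k):
    # emit every k-length substring of the read
    return [(read[i:i + k], 1) for i in range(len(read) - k + 1)]


kmers = records.flatMap(lambda read: all_kmers(read, kVal))
grouped = kmers.reduceByKey(lambda x, y: x + y)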
"double"), ("clm15_days_supply", "long", "daysSupply", "long"), ("clm27_unit_dose_indicator", "long", "dosageUnit", "long"), ("drg13_route_description", "string", "route", "string"), ("drg14_strength", "string", "dosageStrength", "string"), ("drg25_dosage_form", "string", "doseForm", "string")], transformation_ctx="ds2") # Convert to spark DF df = ds.toDF() # Create MemberNumber -> patientId Map temp_dict = {} for row in df.select("memberNumber").distinct().collect(): temp_dict[str(row.memberNumber)] = str(uuid.uuid4()) # Share this dictionary with all workers patient_id_dict = sc.broadcast(temp_dict) def get_patient_id(member_number): return patient_id_dict.value[str(member_number)] # Add new column patientId based on memberNumber get_patient_id_udf = udf(get_patient_id, StringType()) df = df.withColumn("patientId", get_patient_id_udf(df["memberNumber"])) # Convert date string to a datetime convert_date_udf = udf( lambda date_str: str(datetime.strptime(date_str, "%Y-%m-%d")), StringType()) df = df.withColumn("fillDate", convert_date_udf(df["fillDate"])) # Add new column ndc9 based on substring of product_service_identification
######### Global variables ######### (gross)
# The following variables are broadcast to the spark
# cluster and can be used in the functions below
songTable = 'song_data'
sc = SparkContext('local[*]', 'lastfm_recommender')
sqlContext = SQLContext(sc)

### Set up database connections for metadata and similar artists
### This is starting to get really ugly.
### Broadcasting this data is probably not a good idea.
artist_engine = create_engine('sqlite:///' + sys.argv[1])
sims = pd.read_sql_query('SELECT * FROM similarity', artist_engine)

# Broadcasting these variables is probably a bad idea since
# they are quite big.
similars = sc.broadcast(sims.similar)
similar_groups = sc.broadcast(sims.groupby('target').groups)

tagFile = open('lastfm_unique_tags.txt', 'r')
# make tag dictionary available across the cluster
tags = [tagstr[0] for tagstr in map(lambda ts: ts.split('\t'),
                                    [next(tagFile) for x in xrange(500)])]
tagDictionary = sc.broadcast(tags)
tagFile.close()


######## Functions for feature extraction #########

# make a "vector" with indices corresponding to values in
# tagDictionary
def getTagVector(track):
    return {tagDictionary.value[tag]: 1 for [tag, f] in track.tags
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json").cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json").cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id").cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")
        self.df_unique_users = self.sqlContext.sql(
            "SELECT DISTINCT user_id FROM userBusiness where user_id = \"SIfJLNMv7vBwo-fSipxNgg\"")
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)
        schema = StructType([
            StructField("latitude", FloatType()),
            StructField("longitude", FloatType())
        ])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)
        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"]))
        self.df_unique_users.registerTempTable("users")
        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")
        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"]))
        self.df_unique_users.registerTempTable("users")
        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146").cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))
print "finish to map input" def process_uinfo(line): if len(line) != 2: return Row(urlid=line, urlinfo="") return Row(urlid=line[0], urlinfo=line[1]) #combine_uinfo_dict = combine_uinfo.map(lambda p: Row(urlid=p[0], urlinfo=p[1])).collect() print "begin to map and collect uinfo" combine_uinfo_dict = combine_uinfo.map(process_uinfo).collect() print "finish map" combine_uinfo_b = sc.broadcast(combine_uinfo_dict) print "finish broadcast" def update(line, uinfo): line = (line, uinfo.filter(uinfo.urlid==line[1]).urlinfo) return line print "begin update" train_set_url = train_set.map(lambda x: update(x, combine_uinfo_b)) print "finish update" accum.add(1) print "finish mapping"
all_business.sort()
all_business_dic = dict()
all_user_dic = dict()
i = 0
for item in all_business:
    all_business_dic[item] = i
    i += 1
j = 0
for item in all_user:
    all_user_dic[item] = j
    j += 1

# broadcast the user -> index lookup so it can be read inside tasks
vu = sc.broadcast(all_user_dic)

# hash function parameter triples (list truncated in the original)
hashes_value = [[421, 167, 1610612741], [491, 397, 100663319], [659, 257, 3145739],
                [479, 193, 201326611], [167, 167, 402653189], [619, 139, 393241],
                [929, 137, 402653189], [389, 211, 393241], [443, 431, 805306457],
                [983, 211, 100663319], [109, 211, 805306457], [761, 389, 1572869],
                [661, 131, 1610612741], [241, 373, 25165843], [491, 163, 12582917],
                [257, 293, 786433], [317, 191, 402653189], [127, 389, 12582917],
                [467, 347, 3145739], [827, 191, 393241], [617, 211, 3145739],
                [127, 241, 25165843], [757, 233, 805306457], [641, 337, 196613],
                [547, 233, 1610612741], [233, 307, 1610612741], [457, 271, 100663319],
                [937, 173, 805306457],
from pyspark.sql.functions import udf, col
from pyspark.sql.types import *
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import SparseVector, DenseVector

sc = SparkContext(appName='sparking_your_interest')
SQLContext = HiveContext(sc)

speech_stopwords_list = list([line.strip() for line in open('speech_stopwords.txt', 'r')])
speech_stopwords_broadcasted = sc.broadcast(speech_stopwords_list)
nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords_broadcasted = sc.broadcast(nltk_stopwords)
more_stopwords = set([line.strip() for line in open('more_stopwords.txt', 'r')])
more_stopwords_broadcasted = sc.broadcast(more_stopwords)


def clean_up(s):
    text_removing_brackets = re.sub("[\(\[].*?[\)\]]", "", s)
    text_removing_double_quotes = re.sub('"', "", text_removing_brackets)
    speech_stopwords = speech_stopwords_broadcasted.value
    text_removing_stopwords = text_removing_double_quotes
    for token in speech_stopwords:
        text_removing_stopwords = re.sub(token, '', text_removing_stopwords)
    return text_removing_stopwords


def unicode_encode(s):
getfrnd = udf(get_f_list, StringType())

original_df = sqlc.read \
    .format("jdbc") \
    .option("url", "jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com") \
    .option("dbtable", "POC.CUSTOMER_DETAILS") \
    .option("user", "poc") \
    .option("password", "WElcome##123") \
    .option("driver", "oracle.jdbc.driver.OracleDriver") \
    .load()

sm_id_df = original_df.select(original_df['social_media_id'])
sm_id_df_rdd = sm_id_df.rdd.flatMap(lambda x: x).collect()
# broadcast the collected list of social media ids to the executors
b_rdd = sc.broadcast(sm_id_df_rdd)

b1 = original_df.select(
    original_df['CUSTOMER_NAME'],
    original_df['social_media_id'].alias('SOCIAL_MEDIA_ID'),
    original_df['customer_email'].alias('EMAIL'),
    original_df['customer_address'].alias('ADDRESS'),
    original_df['customer_number'].alias('CONTACT'),
    getfrnd(original_df['is_friends_with']).alias('FRIENDS_LIST'))

b1.write.format('jdbc').options(
    url='jdbc:oracle:thin:@150.136.138.197:1521/BIASDB_PDB1.subnet12011439.vcn12011439.oraclevcn.com',
    driver='oracle.jdbc.driver.OracleDriver',
    dbtable='POC.SOCIAL_MEDIA_DWH',
    user='******',