class MLlibTestCase(unittest.TestCase):
    def setUp(self):
        self.sc = SparkContext('local[4]', "MLlib tests")
        self.spark = SparkSession(self.sc)

    def tearDown(self):
        self.spark.stop()
def benchmark_spark(ratings, factors, iterations=5):
    conf = (SparkConf()
            .setAppName("implicit_benchmark")
            .setMaster('local[*]')
            .set('spark.driver.memory', '16G'))
    context = SparkContext(conf=conf)
    spark = SparkSession(context)

    times = {}
    try:
        ratings = convert_sparse_to_dataframe(spark, context, ratings)
        for rank in factors:
            als = ALS(rank=rank, maxIter=iterations, alpha=1, implicitPrefs=True,
                      userCol="row", itemCol="col", ratingCol="data")
            start = time.time()
            als.fit(ratings)
            elapsed = time.time() - start
            times[rank] = elapsed / iterations
            print("spark. factors=%i took %.3f" % (rank, elapsed / iterations))
    finally:
        spark.stop()
    return times
def test_get_active_session_when_no_active_session(self):
    active = SparkSession.getActiveSession()
    self.assertEqual(active, None)
    spark = SparkSession.builder \
        .master("local") \
        .getOrCreate()
    active = SparkSession.getActiveSession()
    self.assertEqual(active, spark)
    spark.stop()
    active = SparkSession.getActiveSession()
    self.assertEqual(active, None)
def _test():
    import doctest
    import os
    import tempfile
    import py4j
    from pyspark.context import SparkContext
    from pyspark.sql import SparkSession, Row
    import pyspark.sql.readwriter

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.readwriter.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    try:
        spark = SparkSession.withHiveSupport(sc)
    except py4j.protocol.Py4JError:
        spark = SparkSession(sc)

    globs['tempfile'] = tempfile
    globs['os'] = os
    globs['sc'] = sc
    globs['spark'] = spark
    globs['df'] = spark.read.parquet('python/test_support/sql/parquet_partitioned')
    globs['sdf'] = \
        spark.read.format('text').stream('python/test_support/sql/streaming')
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.readwriter, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE | doctest.REPORT_NDIFF)
    sc.stop()
    if failure_count:
        exit(-1)
def test_active_session(self):
    spark = SparkSession.builder \
        .master("local") \
        .getOrCreate()
    try:
        activeSession = SparkSession.getActiveSession()
        df = activeSession.createDataFrame([(1, 'Alice')], ['age', 'name'])
        self.assertEqual(df.collect(), [Row(age=1, name=u'Alice')])
    finally:
        spark.stop()
def test_get_active_session_after_create_dataframe(self):
    session2 = None
    try:
        activeSession1 = SparkSession.getActiveSession()
        session1 = self.spark
        self.assertEqual(session1, activeSession1)
        session2 = self.spark.newSession()
        activeSession2 = SparkSession.getActiveSession()
        self.assertEqual(session1, activeSession2)
        self.assertNotEqual(session2, activeSession2)
        session2.createDataFrame([(1, 'Alice')], ['age', 'name'])
        activeSession3 = SparkSession.getActiveSession()
        self.assertEqual(session2, activeSession3)
        session1.createDataFrame([(1, 'Alice')], ['age', 'name'])
        activeSession4 = SparkSession.getActiveSession()
        self.assertEqual(session1, activeSession4)
    finally:
        if session2 is not None:
            session2.stop()
def test_active_session_with_None_and_not_None_context(self):
    from pyspark.context import SparkContext
    from pyspark.conf import SparkConf
    sc = None
    session = None
    try:
        sc = SparkContext._active_spark_context
        self.assertEqual(sc, None)
        activeSession = SparkSession.getActiveSession()
        self.assertEqual(activeSession, None)
        sparkConf = SparkConf()
        sc = SparkContext.getOrCreate(sparkConf)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertFalse(activeSession.isDefined())
        session = SparkSession(sc)
        activeSession = sc._jvm.SparkSession.getActiveSession()
        self.assertTrue(activeSession.isDefined())
        activeSession2 = SparkSession.getActiveSession()
        self.assertNotEqual(activeSession2, None)
    finally:
        if session is not None:
            session.stop()
        if sc is not None:
            sc.stop()
def __init__(self, sc, moldb_name, isocalc):
    self._sc = sc
    self._moldb_name = moldb_name
    self._isocalc = isocalc
    self._sm_config = SMConfig.get_conf()
    self._parquet_chunks_n = 64
    self._iso_gen_part_n = 512
    self._spark_session = SparkSession(self._sc)
    self._ion_centroids_path = '{}/{}/{}/{}'.format(self._sm_config['isotope_storage']['path'],
                                                    self._moldb_name,
                                                    self._isocalc.sigma,
                                                    self._isocalc.charge)
    self.ion_df = None
    self.ion_centroids_df = None
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('Example').getOrCreate()

sales_df = spark.read \
    .option("inferSchema", "true") \
    .option("header", "true") \
    .csv("sales.csv")

result = sales_df.groupBy("COUNTRY_CODE") \
    .sum("AMOUNT") \
    .orderBy(desc("sum(AMOUNT)"))
result.show()
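# A hedged variant of the aggregation above (a sketch, not from the original):
# aliasing the aggregate avoids depending on the auto-generated column name
# "sum(AMOUNT)".
from pyspark.sql.functions import sum as sum_, desc

result = (sales_df.groupBy("COUNTRY_CODE")
          .agg(sum_("AMOUNT").alias("TOTAL_AMOUNT"))
          .orderBy(desc("TOTAL_AMOUNT")))
result.show()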
# Load Amazon metadata
path = '../data/Amazon/amazon_meta_p2.json'
records = []
for line in open(path, 'r'):
    item = json.loads(line.strip())
    record = (item['asin'], html.unescape(item['title']))
    records.append(record)
amazon_df = pd.DataFrame.from_records(records, columns=['asin', 'title'])

sc = SparkContext()
sc.setLogLevel('ERROR')
# Note: broadcasting and immediately unwrapping with .value on the driver is a
# no-op; the pandas frames are unchanged here.
imdb_df = sc.broadcast(imdb_df).value
amazon_df = sc.broadcast(amazon_df).value
spark = SparkSession(sc)
imdb_df = spark.createDataFrame(imdb_df)
amazon_df = spark.createDataFrame(amazon_df)

k = 1
print(f'Performing matching round {k}...')
src_encode = udf(lambda x: encode(x, lower=False, replace=False))
desc_encode = udf(
    lambda x: encode(x, end_words=['['],
                     skip_words=['vhs', 'dvd', 'anglais', 'italien'],
                     lower=False, replace=False))
matched, amazon_df = match(imdb_df, amazon_df, 'title', 'imdb_id',
                           src_encode, desc_encode)
import atexit
import os
import platform
import warnings

from pyspark.context import SparkContext
from pyspark.sql import SparkSession

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()  # type: ignore

try:
    spark = SparkSession._create_shell_session()  # type: ignore
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext
def test(spark: SparkSession):
    spark.sql("use aijiami")
    spark.sql("desc dw_opera_detail").show(100)
    except TypeError:
        pass
    return dictX


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("Usage: kafka_wordcount.py <file> <hdfs-files> <zk> <topic> ", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Kafka with DT")
    sc.setLogLevel("ERROR")
    spark = SparkSession(sc)

    # Create model
    a = 0
    orig = sys.argv[1]
    path = 'hdfs://master:9000/user/app/'
    file = orig.split('app/')[1]
    features = sc.textFile(path + 'features-des.txt').collect()
    feat = []
    for i in features:
        # feat.append(i.split('-')[0].split(' ')[0])
        feat.append(i.split(',')[1])
    [model, index] = getModel(path, file)
    if path_exist(
# -*- coding: UTF-8 -*-
from pyspark import SparkContext, SparkConf
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.classification import DecisionTreeClassificationModel, DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import time
from pyspark.ml import Pipeline

conf = SparkConf().setAppName("dt")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)

df = spark.read.csv('hdfs://192.168.100.6:9000/user/ubuntu/Dataset75.csv', header=True)
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(lambda p: (p[0], p[1]))

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])  # convert to a DataFrame
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
def convert(
    spark: SparkSession,
    dataset_root: str,
    limit: int = 0,
    asset_dir: Optional[str] = None,
) -> DataFrame:
    """Convert a Coco Dataset into Rikai dataset.

    This function expects the COCO datasets are stored in directory with the
    following structure:

    - dataset
        - annotations
            - captions_train2017.json
            - instances_train2017.json
            - ...
        - train2017
        - val2017
        - test2017

    Parameters
    ----------
    spark : SparkSession
        A live spark session
    dataset_root : str
        The directory of dataset
    limit : int, optional
        The number of images of each split to be converted.
    asset_dir : str, optional
        The asset directory to store images, can be a s3 directory.

    Return
    ------
    DataFrame
        Returns a Spark DataFrame
    """
    train_json = os.path.join(dataset_root, "annotations", "instances_train2017.json")
    val_json = os.path.join(dataset_root, "annotations", "instances_val2017.json")

    categories = load_categories(train_json)

    examples = []
    for split, anno_file in zip(["train", "val"], [train_json, val_json]):
        coco = COCO(annotation_file=anno_file)
        # Coco has native dependencies, so we do not distribute them
        # to the workers.
        image_ids = coco.imgs
        if limit > 0:
            image_ids = islice(image_ids, limit)
        for image_id in image_ids:
            ann_id = coco.getAnnIds(imgIds=image_id)
            annotations = coco.loadAnns(ann_id)
            annos = []
            for ann in annotations:
                bbox = Box2d(*ann["bbox"])
                annos.append({
                    "category_id": ann["category_id"],
                    "category_text": categories[ann["category_id"]]["name"],
                    "bbox": bbox,
                    "area": float(ann["area"]),
                })
            image_payload = coco.loadImgs(ids=image_id)[0]
            example = {
                "image_id": image_id,
                "annotations": annos,
                "image": Image(
                    os.path.abspath(
                        os.path.join(
                            os.curdir,
                            "dataset",
                            "{}2017".format(split),
                            image_payload["file_name"],
                        ))),
                "split": split,
            }
            examples.append(example)

    schema = StructType([
        StructField("image_id", LongType(), False),
        StructField(
            "annotations",
            ArrayType(
                StructType([
                    StructField("category_id", IntegerType()),
                    StructField("category_text", StringType()),
                    StructField("area", FloatType()),
                    StructField("bbox", Box2dType()),
                ])),
            False,
        ),
        StructField("image", ImageType(), False),
        StructField("split", StringType(), False),
    ])
    df = spark.createDataFrame(examples, schema=schema)

    if asset_dir:
        asset_dir = asset_dir if asset_dir.endswith("/") else asset_dir + "/"
        print("ASSET DIR: ", asset_dir)
        df = df.withColumn("image", image_copy(col("image"), lit(asset_dir)))

    return df
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark import SQLContext
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import col, udf, lag, date_add, explode, lit, concat, unix_timestamp, sum, abs
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import PipelineModel

sc = SparkContext(appName="MyFirstApp3_Task_task2")
spark = SparkSession(sc)

df_node16 = spark.read.format("parquet").load(
    path="hdfs://namenode:9000/example3/test.parquet")

model_node17 = PipelineModel.load("hdfs://namenode:9000/example3/model/")
df_node18 = model_node17.transform(df_node16)

evaluator_node19 = MulticlassClassificationEvaluator(
    labelCol="indexedSurvived", predictionCol="prediction", metricName="accuracy")
score_node19 = evaluator_node19.evaluate(df_node18)
df_node19 = spark.createDataFrame([(score_node19, )], ["score"])

df_node19.write.format("csv").save(
    path="hdfs://namenode:9000/example3/EvalResult3.csv",
    quote="\"", header=True, sep=",")
from pyspark.sql.functions import struct, array, lit, monotonically_increasing_id, col, expr, when, concat, udf, split, size, lag, count, isnull
from pyspark.sql import Window
from pyspark.ml.linalg import Vectors
from pyspark.ml.regression import GBTRegressor, LinearRegression, GeneralizedLinearRegression, RandomForestRegressor
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.feature import VectorIndexer, VectorAssembler, StringIndexer, IndexToString
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, RegressionEvaluator

# In[2]:

from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession

# sc = SparkContext()
sparkSession = SparkSession.builder.getOrCreate()

# # 1. Data Ingestion

# ### Peeking into data

# In[4]:

get_ipython().system(
    u'curl -i -L "http://edwdemo0.field.hortonworks.com:50070/webhdfs/v1/data/NFLPlaybyPlay2015.csv?op=OPEN" | tail -n 5'
)

# ### Load Data from Remote HDP Cluster (from HDFS)

# In[5]:
# chapter 12 - rdd, spark definitive guide
# refer chapter 3, programming with rdds, learning spark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd").master("local[3]").getOrCreate()
sc = spark.sparkContext

r1 = spark.range(10).rdd

# below statement would fail if file does not exist
r2 = sc.textFile(
    "/home/user/workarea/projects/learn-pyspark/data/sampledata/sample01.txt")

# to handle the situation, use below code:
from py4j.protocol import Py4JJavaError


def try_read(path):
    rdd = sc.textFile(path)
    try:
        rdd.first()
        return rdd
    except Py4JJavaError as e:
        print("file does not exist, returning empty rdd")
        return sc.emptyRDD()


# now, passing a non-existent file path
rdd = try_read("/home/user/workarea/projects/learn-pyspark/data/sample01.txt")
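# A hedged alternative to try_read (a sketch, assuming a live `sc`): ask the
# Hadoop FileSystem whether the path exists before building the RDD, instead
# of triggering an action and catching Py4JJavaError.
def path_exists(path):
    jvm = sc._jvm
    fs = jvm.org.apache.hadoop.fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    return fs.exists(jvm.org.apache.hadoop.fs.Path(path))


missing = "/home/user/workarea/projects/learn-pyspark/data/sample01.txt"
safe_rdd = sc.textFile(missing) if path_exists(missing) else sc.emptyRDD()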
def main():
    # Configure argparse
    parser = argparse.ArgumentParser(description='NetLytics Job')

    parser.add_argument('--connector', metavar='connector', type=str,
                        help='Connector class name')
    parser.add_argument('--input_path', metavar='input_path', type=str,
                        help='Base Log Files Input Path')
    parser.add_argument('--start_day', metavar='start_day', type=str,
                        help='Start day for analysis, format YYYY_MM_DD')
    parser.add_argument('--end_day', metavar='end_day', type=str,
                        help='End day for analysis, format YYYY_MM_DD')
    parser.add_argument('--output_path', metavar='output_path', type=str,
                        help='Path where to store resulting labeled Data Table')
    parser.add_argument('--algo', metavar='algo', type=str,
                        help='Clustering Algorithm to run')
    parser.add_argument('--params', metavar='params', type=str, default="{}",
                        help='Parameters to be given to the Clustering Algorithm, in Json')
    parser.add_argument('--query', metavar='query', type=str, default=None,
                        help='Eventual SQL query to execute to preprocess the dataset')
    parser.add_argument('--numerical_features', metavar='numerical_features',
                        type=str, default="",
                        help='Columns to use as numerical features, separated by comma')
    parser.add_argument('--categorical_features', metavar='categorical_features',
                        type=str, default="",
                        help='Columns to use as categorical features, separated by comma')
    parser.add_argument("--normalize", action="store_true",
                        help="Normalize data before clustering")

    # Get parameters
    args = vars(parser.parse_args())
    input_path = args["input_path"]
    output_path = args["output_path"]
    connector = args["connector"]
    algo = args["algo"]
    params = args["params"]
    start_day = args["start_day"]
    end_day = args["end_day"]
    query = args["query"]
    numerical_features = args["numerical_features"].split(",") \
        if args["numerical_features"] != "" else []
    categorical_features = args["categorical_features"].split(",") \
        if args["categorical_features"] != "" else []
    normalize = args["normalize"]

    # Get path of NetLytics
    base_path = os.path.dirname(os.path.realpath(__file__))

    # Create Spark Context
    conf = SparkConf().setAppName("NetLytics Job")
    sc = SparkContext(conf=conf)
    spark = SparkSession(sc)

    # Create the dataframe
    dataset = core.utils.get_dataset(sc, spark, base_path, connector,
                                     input_path, start_day, end_day)

    # Pre process the dataframe
    manipulated_dataset = core.utils.transform(dataset, spark,
                                               sql_query=query,
                                               numerical_features=numerical_features,
                                               categorical_features=categorical_features,
                                               normalize=normalize)

    # Run Clustering
    clustering_algo_module = my_import(algo, sc)
    clustering_algo_instance = clustering_algo_module(json.loads(params))
    prediction = clustering_algo_instance.run(manipulated_dataset)

    # Save Output in CSV
    rdd = prediction.rdd.map(RowToStr)
    rdd.saveAsTextFile(output_path)
def setUp(self):
    self.sc = SparkContext.getOrCreate(SparkConf())
    self.spark = SparkSession(self.sc)
    self.obj = MotelsHomeRecommendation('', '', '', '')
def test_sqlcontext_with_stopped_sparkcontext(self):
    # SPARK-30856: test initialization via SparkSession when only the SparkContext is stopped
    self.sc.stop()
    self.sc = SparkContext('local[4]', self.sc.appName)
    self.spark = SparkSession(self.sc)
    self.assertIs(SQLContext.getOrCreate(self.sc).sparkSession, self.spark)
print("Could not convert datatype to an Float.") # print(df) """ Checking datatype of each column """ # print('Close ',df['Close'].dtype) # print('Volume ',df['Volume'].dtype) # print('Low ',df['Low'].dtype) # print('Open ',df['Open'].dtype) # print('High ',df['High'].dtype) from pyspark import SparkContext from pyspark.sql import SparkSession sc = SparkContext() sparkSession = SparkSession(sc) stockData = sparkSession.createDataFrame(df) # print(stock_price_data) # print(stock_price_data.printSchema()) # print(stock_price_data.describe().toPandas().transpose()) from pyspark.ml.feature import VectorAssembler from pyspark.ml.regression import LinearRegression from pyspark.ml.evaluation import RegressionEvaluator print("Seperating the Open, High and Low:") featureAssembler = VectorAssembler(inputCols=["Open", "High", "Low"], outputCol="Independent Columns") output = featureAssembler.transform(stockData) # print(output.show())
def transform_data(self, sc, args):
    """This method gets datajet files as input and prepares them on a daily
    intermediary basis for Marreco's main algorithm DIMSUM.

    :type sc: spark context
    :param sc: spark context for running jobs.

    :param args:

      :type days_init: int
      :param days_init: How many days to scan through the files to be used in
       the transformation phase. If this value is say ``5`` then Marreco will
       take today's date and come back 5 days in time from where it will start
       reading input files.

      :type days_end: int
      :param days_end: Similar to ``days_init`` but tells where the end of
       scanning should be. If set say equals to ``3``, then scans back in time
       until 3 days ago counting from today.

      :type w_browse: float
      :param w_browse: Weight associated to browsing events on skus.

      :type w_purchase: float
      :param w_purchase: Weight associated to purchasing events on skus.

      :type force: str
      :param force: Either ``yes``, in which case forces recreation of files,
       or ``no``, in which case if files already exist then do nothing.

      :type source_uri: str
      :param source_uri: URI from where to read input data from.

      :type inter_uri: str
      :param inter_uri: URI to save intermediate results.

      :type neighbor_uri: str
      :param neighbor_uri: URI for where to save similarity matrix result.

      :type threshold: float
      :param threshold: This should be converted to float. It asserts how much
       quality we should sacrifice in order to gain performance.

      :type decay: float
      :param decay: How much less of an influence a score has given how long
       ago it happened. The further ago the more this ``decay`` factor
       diminishes the value.
    """
    spark = SparkSession(sc)
    for day in range(args.days_init, args.days_end - 1, -1):
        formatted_day = self.get_formatted_date(day)
        source_uri = args.source_uri.format(formatted_day)
        inter_uri = args.inter_uri.format(formatted_day)
        try:
            inter_data = spark.read.json(
                inter_uri, schema=self._load_users_matrix_schema()).first()
            if args.force == 'yes' or not inter_data:
                self._process_datajet_day(sc, source_uri, inter_uri, args,
                                          mode='overwrite')
        except (Py4JJavaError, AnalysisException):
            self._process_datajet_day(sc, source_uri, inter_uri, args)
class DeltaTableTests(PySparkTestCase):
    def setUp(self):
        super(DeltaTableTests, self).setUp()
        self.sqlContext = SQLContext(self.sc)
        self.spark = SparkSession(self.sc)
        self.tempPath = tempfile.mkdtemp()
        self.tempFile = os.path.join(self.tempPath, "tempFile")

    def tearDown(self):
        self.spark.stop()
        shutil.rmtree(self.tempPath)
        super(DeltaTableTests, self).tearDown()

    def test_forPath(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(dt, [('a', 1), ('b', 2), ('c', 3)])

    def test_alias_and_toDF(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile).toDF()
        self.__checkAnswer(
            dt.alias("myTable").select('myTable.key', 'myTable.value'),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_delete(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # delete with condition as str
        dt.delete("key = 'a'")
        self.__checkAnswer(dt.toDF(), [('b', 2), ('c', 3), ('d', 4)])

        # delete with condition as Column
        dt.delete(col("key") == lit("b"))
        self.__checkAnswer(dt.toDF(), [('c', 3), ('d', 4)])

        # delete without condition
        dt.delete()
        self.__checkAnswer(dt.toDF(), [])

        # bad args
        with self.assertRaises(TypeError):
            dt.delete(condition=1)

    def test_update(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # update with condition as str and with set exprs as str
        dt.update("key = 'a' or key = 'b'", {"value": "1"})
        self.__checkAnswer(dt.toDF(), [('a', 1), ('b', 1), ('c', 3), ('d', 4)])

        # update with condition as Column and with set exprs as Columns
        dt.update(expr("key = 'a' or key = 'b'"), {"value": expr("0")})
        self.__checkAnswer(dt.toDF(), [('a', 0), ('b', 0), ('c', 3), ('d', 4)])

        # update without condition
        dt.update(set={"value": "200"})
        self.__checkAnswer(dt.toDF(), [('a', 200), ('b', 200), ('c', 200), ('d', 200)])

        # bad args
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update({"value": "200"})
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.update(condition='a')
        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.update(set=1)
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.update(1, {})
        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.update(set={"value": 1})
        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.update(set={1: ""})
        with self.assertRaises(TypeError):
            dt.update(set=1)

    def test_merge(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])
        source = self.spark.createDataFrame([('a', -1), ('b', 0), ('e', -5), ('f', -6)], ["k", "v"])

        def reset_table():
            self.__overwriteDeltaTable([('a', 1), ('b', 2), ('c', 3), ('d', 4)])

        dt = DeltaTable.forPath(self.spark, self.tempFile)

        # ============== Test basic syntax ==============

        # String expressions in merge condition and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(set={"value": "v + 0"}) \
            .whenNotMatchedInsert(values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(),
                           ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # Column expressions in merge condition and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(set={"value": col("v") + 0}) \
            .whenNotMatchedInsert(values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(),
                           ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # ============== Test clause conditions ==============

        # String expressions in all conditions and dicts
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate(condition="k = 'a'", set={"value": "v + 0"}) \
            .whenMatchedDelete(condition="k = 'b'") \
            .whenNotMatchedInsert(condition="k = 'e'", values={"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # Column expressions in all conditions and dicts
        reset_table()
        dt.merge(source, expr("key = k")) \
            .whenMatchedUpdate(
                condition=expr("k = 'a'"),
                set={"value": col("v") + 0}) \
            .whenMatchedDelete(condition=expr("k = 'b'")) \
            .whenNotMatchedInsert(
                condition=expr("k = 'e'"),
                values={"key": "k", "value": col("v") + 0}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # Positional arguments
        reset_table()
        dt.merge(source, "key = k") \
            .whenMatchedUpdate("k = 'a'", {"value": "v + 0"}) \
            .whenMatchedDelete("k = 'b'") \
            .whenNotMatchedInsert("k = 'e'", {"key": "k", "value": "v + 0"}) \
            .execute()
        self.__checkAnswer(dt.toDF(), ([('a', -1), ('c', 3), ('d', 4), ('e', -5)]))

        # ============== Test updateAll/insertAll ==============

        # No clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll() \
            .whenNotMatchedInsertAll() \
            .execute()
        self.__checkAnswer(dt.toDF(),
                           ([('a', -1), ('b', 0), ('c', 3), ('d', 4), ('e', -5), ('f', -6)]))

        # String expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), "s.key = t.key") \
            .whenMatchedUpdateAll("s.key = 'a'") \
            .whenNotMatchedInsertAll("s.key = 'e'") \
            .execute()
        self.__checkAnswer(dt.toDF(),
                           ([('a', -1), ('b', 2), ('c', 3), ('d', 4), ('e', -5)]))

        # Column expressions in all clause conditions and insertAll/updateAll + aliases
        reset_table()
        dt.alias("t") \
            .merge(source.toDF("key", "value").alias("s"), expr("t.key = s.key")) \
            .whenMatchedUpdateAll(expr("s.key = 'a'")) \
            .whenNotMatchedInsertAll(expr("s.key = 'e'")) \
            .execute()
        self.__checkAnswer(dt.toDF(),
                           ([('a', -1), ('b', 2), ('c', 3), ('d', 4), ('e', -5)]))

        # ============== Test bad args ==============
        # ---- bad args in merge()
        with self.assertRaisesRegex(TypeError, "must be DataFrame"):
            dt.merge(1, "key = k")
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, 1)

        # ---- bad args in whenMatchedUpdate()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate({"value": "v"})
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(1)
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenMatchedUpdate(condition="key = 'a'")
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedUpdate(1, {"value": "v"})
        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenMatchedUpdate("k = 'a'", 1)
        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={"value": 1})
        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenMatchedUpdate(set={1: ""})
        with self.assertRaises(TypeError):
            dt.merge(source, "key = k").whenMatchedUpdate(set="k = 'a'", condition={"value": 1})

        # bad args in whenMatchedDelete()
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenMatchedDelete(1)

        # ---- bad args in whenNotMatchedInsert()
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert({"value": "v"})
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1)
        with self.assertRaisesRegex(ValueError, "cannot be None"):
            dt.merge(source, "key = k").whenNotMatchedInsert(condition="key = 'a'")
        with self.assertRaisesRegex(TypeError, "must be a Spark SQL Column or a string"):
            dt.merge(source, "key = k").whenNotMatchedInsert(1, {"value": "v"})
        with self.assertRaisesRegex(TypeError, "must be a dict"):
            dt.merge(source, "key = k").whenNotMatchedInsert("k = 'a'", 1)
        with self.assertRaisesRegex(TypeError, "Values of dict in .* must contain only"):
            dt.merge(source, "key = k").whenNotMatchedInsert(values={"value": 1})
        with self.assertRaisesRegex(TypeError, "Keys of dict in .* must contain only"):
            dt.merge(source, "key = k").whenNotMatchedInsert(values={1: "value"})
        with self.assertRaises(TypeError):
            dt.merge(source, "key = k").whenNotMatchedInsert(values="k = 'a'",
                                                             condition={"value": 1})

    def test_history(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        self.__overwriteDeltaTable([('a', 3), ('b', 2), ('c', 1)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        operations = dt.history().select('operation')
        self.__checkAnswer(
            operations,
            [Row("WRITE"), Row("WRITE")],
            StructType([StructField("operation", StringType(), True)]))

        lastMode = dt.history(1).select('operationParameters.mode')
        self.__checkAnswer(
            lastMode,
            [Row("Overwrite")],
            StructType([StructField("operationParameters.mode", StringType(), True)]))

    def test_vacuum(self):
        self.__writeDeltaTable([('a', 1), ('b', 2), ('c', 3)])
        dt = DeltaTable.forPath(self.spark, self.tempFile)
        self.__createFile('abc.txt', 'abcde')
        self.__createFile('bac.txt', 'abcdf')
        self.assertEqual(True, self.__checkFileExists('abc.txt'))
        dt.vacuum()  # will not delete files as default retention is used.
        dt.vacuum(1000)  # test whether integers work

        self.assertEqual(True, self.__checkFileExists('bac.txt'))

        retentionConf = "spark.databricks.delta.retentionDurationCheck.enabled"
        self.spark.conf.set(retentionConf, "false")
        dt.vacuum(0.0)
        self.spark.conf.set(retentionConf, "true")
        self.assertEqual(False, self.__checkFileExists('bac.txt'))
        self.assertEqual(False, self.__checkFileExists('abc.txt'))

    def test_convertToDelta(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % self.tempFile)
        self.__checkAnswer(
            self.spark.read.format("delta").load(self.tempFile),
            [('a', 1), ('b', 2), ('c', 3)])

        # test if convert to delta with partition columns work
        tempFile2 = self.tempFile + "_2"
        df.write.partitionBy("value").format("parquet").save(tempFile2)
        schema = StructType()
        schema.add("value", IntegerType(), True)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile2, schema)
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile2),
            [('a', 1), ('b', 2), ('c', 3)])

        # convert to delta with partition column provided as a string
        tempFile3 = self.tempFile + "_3"
        df.write.partitionBy("value").format("parquet").save(tempFile3)
        dt = DeltaTable.convertToDelta(self.spark, "parquet.`%s`" % tempFile3, "value int")
        self.__checkAnswer(
            self.spark.read.format("delta").load(tempFile3),
            [('a', 1), ('b', 2), ('c', 3)])

    def test_isDeltaTable(self):
        df = self.spark.createDataFrame([('a', 1), ('b', 2), ('c', 3)], ["key", "value"])
        df.write.format("parquet").save(self.tempFile)
        tempFile2 = self.tempFile + '_2'
        df.write.format("delta").save(tempFile2)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, self.tempFile), False)
        self.assertEqual(DeltaTable.isDeltaTable(self.spark, tempFile2), True)

    def __checkAnswer(self, df, expectedAnswer, schema=["key", "value"]):
        if not expectedAnswer:
            self.assertEqual(df.count(), 0)
            return
        expectedDF = self.spark.createDataFrame(expectedAnswer, schema)
        try:
            self.assertEqual(df.count(), expectedDF.count())
            self.assertEqual(len(df.columns), len(expectedDF.columns))
            self.assertEqual([], df.subtract(expectedDF).take(1))
            self.assertEqual([], expectedDF.subtract(df).take(1))
        except AssertionError:
            print("Expected:")
            expectedDF.show()
            print("Found:")
            df.show()
            raise

    def __writeDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").save(self.tempFile)

    def __overwriteDeltaTable(self, datalist):
        df = self.spark.createDataFrame(datalist, ["key", "value"])
        df.write.format("delta").mode("overwrite").save(self.tempFile)

    def __createFile(self, fileName, content):
        with open(os.path.join(self.tempFile, fileName), 'w') as f:
            f.write(content)

    def __checkFileExists(self, fileName):
        return os.path.exists(os.path.join(self.tempFile, fileName))
import platform
import warnings

import py4j

from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

SparkContext._ensure_initialized()

try:
    spark = SparkSession._create_shell_session()
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register(lambda: sc.stop())

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext
def setUp(self):
    super(DeltaTableTests, self).setUp()
    self.sqlContext = SQLContext(self.sc)
    self.spark = SparkSession(self.sc)
    self.tempPath = tempfile.mkdtemp()
    self.tempFile = os.path.join(self.tempPath, "tempFile")
from pyspark.sql import SQLContext

# start JVM gateway
client = GatewayClient(port=${JVM_GATEWAY_PORT})
gateway = JavaGateway(client, auto_convert=True)

java_import(gateway.jvm, "org.apache.spark.SparkEnv")
java_import(gateway.jvm, "org.apache.spark.SparkConf")
java_import(gateway.jvm, "org.apache.spark.api.java.*")
java_import(gateway.jvm, "org.apache.spark.api.python.*")
java_import(gateway.jvm, "org.apache.spark.mllib.api.python.*")

intp = gateway.entry_point
jsc = intp.getJavaSparkContext()

java_import(gateway.jvm, "org.apache.spark.sql.*")
java_import(gateway.jvm, "org.apache.spark.sql.hive.*")
java_import(gateway.jvm, "scala.Tuple2")

jconf = jsc.getConf()
conf = SparkConf(_jvm=gateway.jvm, _jconf=jconf)
sc = _zsc_ = SparkContext(jsc=jsc, gateway=gateway, conf=conf)

if intp.isSpark2():
    from pyspark.sql import SparkSession
    spark = __zSpark__ = SparkSession(sc, intp.getSparkSession())
    sqlContext = sqlc = __zSqlc__ = __zSpark__._wrapped
else:
    sqlContext = sqlc = __zSqlc__ = SQLContext(sparkContext=sc, sqlContext=intp.getSQLContext())
def setUpClass(cls):
    PySparkTestCase.setUpClass()
    cls.spark = SparkSession(cls.sc)
def main(spark) -> None:
    """ run example """
    PG_CONFIG = get_pg_config()

    print('TRY: create df')
    df = spark.range(1, 20, 1, 4).withColumn('mono_id', F.monotonically_increasing_id())
    print('OK: create df')
    df.show()
    print('')

    print('TRY: write_to_pg')
    write_to_pg(df=df, config=PG_CONFIG, table='test_table')
    print('OK: write_to_pg')
    print('')

    print('TRY: read_from_pg')
    df_loaded = read_from_pg(config=PG_CONFIG, sql='test_table', sc=sc)
    print('OK: read_from_pg')
    df_loaded.show()


if __name__ == '__main__':
    sc = init_spark_context('app')
    spark = SparkSession(sc)
    main(spark)
    spark.stop()
def run_spark(output_file):
    sc = SparkContext()
    spark = SparkSession(sc)

    violations_df = get_violations_df(violations, spark)
    streets_df = get_streets_df(streets, spark)

    streets_dict = streets_df.rdd.flatMap(mapper).reduceByKey(
        lambda x, y: x + y).collectAsMap()
    streets_dict_bc = sc.broadcast(streets_dict)

    def get_val(borocode, street, num0, num1):
        res = None
        housenum = (num0, num1)
        if num0 != 0 and num1 == 0:
            housenum = (num1, num0)
        candidates = streets_dict_bc.value.get((borocode, street))
        if candidates:
            res = search_candidates(candidates, housenum)
            if res is None and num0 > 1000 and num1 == 0:
                housenum = (int(num0 / 100), num0 % 100)
                res = search_candidates(candidates, housenum)
            if res is None and num0 != 0 and num1 != 0:
                housenum = (0, (num0 * 100) + num1)
                res = search_candidates(candidates, housenum)
            if res is None and num0 != 0 and num1 != 0:
                housenum = (0, num1)
                res = search_candidates(candidates, housenum)
        return res

    get_val_udf = udf(get_val)

    matched_violations = violations_df.withColumn(
        'PHYSICALID',
        get_val_udf(violations_df['v.COUNTY'], violations_df['v.STREETNAME'],
                    violations_df['v.NUM0'], violations_df['v.NUM1']))
    matched_violations = matched_violations.filter(
        matched_violations['PHYSICALID'].isNotNull())
    matched_violations = matched_violations.withColumn(
        "PHYSICALID", matched_violations["PHYSICALID"].cast("integer"))
    matched_violations = matched_violations.orderBy("PHYSICALID")
    matched_violations = matched_violations.groupBy("PHYSICALID", "YEAR").agg(
        count("*").alias("YEAR_COUNT"))
    matched_violations.createOrReplaceTempView("matched_violations")

    summaries = spark.sql(
        "select PHYSICALID, " +
        "MAX(CASE WHEN (YEAR = 2015) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2015, " +
        "MAX(CASE WHEN (YEAR = 2016) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2016, " +
        "MAX(CASE WHEN (YEAR = 2017) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2017, " +
        "MAX(CASE WHEN (YEAR = 2018) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2018, " +
        "MAX(CASE WHEN (YEAR = 2019) THEN YEAR_COUNT ELSE 0 END) AS COUNT_2019 " +
        "from matched_violations " +
        "group by PHYSICALID " +
        "order by PHYSICALID ")

    getOLS_udf = udf(getOLS)
    summaries = summaries.withColumn(
        'OLS_COEF',
        getOLS_udf(array('COUNT_2015', 'COUNT_2016', 'COUNT_2017',
                         'COUNT_2018', 'COUNT_2019')))

    streets_df = streets_df.select(col("s.PHYSICALID")) \
        .join(summaries, "PHYSICALID", how='left') \
        .distinct() \
        .orderBy("PHYSICALID")

    streets_df = streets_df.withColumn("COUNT_2015", coalesce("COUNT_2015", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2016", coalesce("COUNT_2016", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2017", coalesce("COUNT_2017", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2018", coalesce("COUNT_2018", lit(0)))
    streets_df = streets_df.withColumn("COUNT_2019", coalesce("COUNT_2019", lit(0)))
    streets_df = streets_df.withColumn("OLS_COEF", coalesce("OLS_COEF", lit(0.0)))

    streets_df.write.csv(output_file, header=False)
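# A hedged refinement of the UDF above (a sketch, assuming get_val returns an
# int or None): declaring the UDF's return type up front removes the need for
# the later cast("integer") on PHYSICALID.
from pyspark.sql.types import IntegerType

get_val_typed_udf = udf(get_val, IntegerType())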
local_data_directory = '/home/brendan/FastData/pubmed/gz/'

################################################
print('initializing spark')

# init spark
conf = SparkConf()
conf = (conf.setMaster('local[*]')
        .set('spark.driver.memory', '96G')  # 40
        .set('spark.driver.maxResultSize', '500M'))
# .set('spark.storage.memoryFraction', 0))  # this setting is now a legacy option
# .set('spark.executor.memory', '1G')  # 20
# .set('spark.python.worker.reuse', 'false')
# .set('spark.python.worker.memory', '512m')
# .set('spark.executor.cores', '1'))

sc = SparkContext(conf=conf)
spark = SparkSession(sc)  # don't need this for vanilla RDDs
print(sc._conf.getAll())

###############################################


def ftp_helper(namestr):
    # local save file for ftp'd xml
    target = target_brdcst.value + namestr.split('.')[0] + '.xml.gz'  # pubmed year chunk# .xml
    try:
        with FTP(ftp_dir_broadcast.value) as ftp:
            ftp.login()
            r = BytesIO()
def init():
    # Keep both the context and the session as module-level globals so callers
    # can use them after init() returns.
    global sc, spark
    sc = SparkContext()
    sc.setLogLevel("ALL")
    # sc.addPyFile("load.py")
    spark = SparkSession(sc)
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql.functions import *
from tableFunctions import *

sc = SparkContext()
spark = SparkSession(sc)

# start SparkSQL session
sqlContext = SQLContext(sc)

# read the file with its header
df = sqlContext.read.option("header", True).csv("*.csv")

# count the number of rows (it is 4820022)
df.count()

# count the unique values in recipient_name (it is 153449)
df.select('recipient_name').distinct().count()

# delete periods, commas, and spaces from recipient names
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ' AND ', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', '&', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', r'\.', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ',', ''))
df = df.withColumn('recipient_name', regexp_replace('recipient_name', ' ', ''))

# count again (it is now 149582)
df.select('recipient_name').distinct().count()
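# A hedged one-pass equivalent of the five regexp_replace calls above (a
# sketch): the alternation removes literal ' AND ' first, then any '&',
# period, comma, or space.
df = df.withColumn('recipient_name',
                   regexp_replace('recipient_name', r' AND |[&., ]', ''))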
os.environ["PYSPARK_SUBMIT_ARGS"] = ( "--packages graphframes:graphframes:0.8.1-spark3.0-s_2.12 pyspark-shell") import pyspark from pyspark.sql import SparkSession from App.utils import * from graphframes import * hdfs_host = "hdfs://localhost:9000" hdfs_root_path = "/SafeEntry_Analytics/" conf = pyspark.SparkConf().setAppName("Process Entry Record Graph").setMaster( "local[*]") sc = pyspark.SparkContext(conf=conf) spark = SparkSession(sc) resident_file_dest = "resident.parquet" safe_entry_daily_file_dest = "entry_record.parquet" contact_graph_edge_file_dest = "contact_graph_edge.parquet" contact_graph_vertex_file_dest = "contact_graph_vertex.parquet" # Step 1: read resident parquet file resident_df = read_parquet_file( spark, hdfs_host + hdfs_root_path + resident_file_dest) resident_df.cache() def lookup_nric(resident_id): return resident_df.filter( resident_df['resident_id'] == resident_id).collect()[0]['nric']
def setUpClass(cls):
    super(ReusedSQLTestCase, cls).setUpClass()
    cls.spark = SparkSession(cls.sc)
    cls.spark.conf.set('spark.sql.execution.arrow.enabled', True)
class SparkConfig:
    sc = pyspark.SparkContext(master=os.environ['SPARK_MASTER'],
                              appName='anomaly_detector')
    ss = SparkSession(sc)
class IonCentroidsGenerator(object):
    """ Generator of theoretical isotope peaks for all molecules in a database.

    Args
    ----------
    sc : pyspark.SparkContext
    moldb_name : str
    isocalc: IsocalcWrapper
    """

    def __init__(self, sc, moldb_name, isocalc):
        self._sc = sc
        self._moldb_name = moldb_name
        self._isocalc = isocalc
        self._sm_config = SMConfig.get_conf()
        self._parquet_chunks_n = 64
        self._iso_gen_part_n = 512
        self._spark_session = SparkSession(self._sc)
        self._ion_centroids_path = '{}/{}/{}/{}'.format(self._sm_config['isotope_storage']['path'],
                                                        self._moldb_name,
                                                        self._isocalc.sigma,
                                                        self._isocalc.charge)
        self.ion_df = None
        self.ion_centroids_df = None

    def exists(self):
        """ Check if ion centroids saved to parquet """
        if self._ion_centroids_path.startswith('s3a://'):
            cred_dict = dict(aws_access_key_id=self._sm_config['aws']['aws_access_key_id'],
                             aws_secret_access_key=self._sm_config['aws']['aws_secret_access_key'])
            bucket, key = split_s3_path(self._ion_centroids_path)
            s3 = boto3.client('s3', **cred_dict)
            try:
                s3.head_object(Bucket=bucket, Key=key + '/ions/_SUCCESS')
            except ClientError:
                return False
            else:
                return True
        else:
            return Path(self._ion_centroids_path + '/ions/_SUCCESS').exists()

    def generate(self, isocalc, sfs, adducts):
        """ Generate isotopic peaks

        Args
        ---
        isocalc: IsocalcWrapper
            Cannot be a class field as Spark doesn't allow to pass 'self' to functions
        adducts: list
        """
        logger.info('Generating molecular isotopic peaks')

        def calc_centroids(args):
            ion_i, sf, adduct = args
            mzs, ints = isocalc.ion_centroids(sf, adduct)
            if mzs is not None:
                return zip(repeat(ion_i),
                           range(0, len(mzs)),
                           map(float, mzs),
                           map(float, ints))
            else:
                return []

        ion_df = pd.DataFrame([(i, sf, adduct) for i, (sf, adduct)
                               in enumerate(sorted(product(sfs, adducts)))],
                              columns=['ion_i', 'sf', 'adduct']).set_index('ion_i')
        ion_centroids_rdd = (self._sc.parallelize(ion_df.reset_index().values,
                                                  numSlices=self._iso_gen_part_n)
                             .flatMap(calc_centroids))
        self.ion_centroids_df = (pd.DataFrame(data=ion_centroids_rdd.collect(),
                                              columns=['ion_i', 'peak_i', 'mz', 'int'])
                                 .sort_values(by='mz')
                                 .set_index('ion_i'))
        self.ion_df = ion_df.loc[self.ion_centroids_df.index.unique()]

        # Use when pandas DataFrames get way too big
        # ion_centroids_df = self._spark_session.createDataFrame(data=ion_centroids_rdd,
        #                                                        schema=self.ion_centroids_df_fields)
        # self.ion_centroids_df = (ion_centroids_df
        #                          .sort(ion_centroids_df.mz.asc())
        #                          .coalesce(self._parquet_chunks_n))

    def save(self):
        """ Save isotopic peaks """
        logger.info('Saving peaks')
        centr_spark_df = self._spark_session.createDataFrame(self.ion_centroids_df.reset_index())
        centr_spark_df.write.parquet(self._ion_centroids_path + '/ion_centroids', mode='overwrite')
        ion_spark_df = self._spark_session.createDataFrame(self.ion_df.reset_index())
        ion_spark_df.write.parquet(self._ion_centroids_path + '/ions', mode='overwrite')

    def restore(self):
        logger.info('Restoring peaks')
        self.ion_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ions').toPandas().set_index('ion_i')
        self.ion_centroids_df = self._spark_session.read.parquet(
            self._ion_centroids_path + '/ion_centroids').toPandas().set_index('ion_i')

    def sf_adduct_centroids_df(self):
        return self.ion_df.join(self.ion_centroids_df).set_index(['sf', 'adduct'])

    def centroids_subset(self, ions):
        """ Restore isotopic peaks dataframe only for the 'ions'

        Args
        ---
        ions: list of tuples

        Returns
        ---
        : pandas.DataFrame
        """
        assert self.ion_df is not None

        ion_map = self.ion_df.reset_index().set_index(['sf', 'adduct']).ion_i
        ion_ids = ion_map.loc[ions].values
        return self.ion_centroids_df.loc[ion_ids].sort_values(by='mz')

    def generate_if_not_exist(self, isocalc, sfs, adducts):
        if not self.exists():
            self.generate(isocalc=isocalc, sfs=sfs, adducts=adducts)
            self.save()
        else:
            self.restore()

    def ions(self, adducts):
        return (self.ion_df[self.ion_df.adduct.isin(adducts)]
                .sort_values(by=['sf', 'adduct'])
                .to_records(index=False))
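# A hypothetical usage sketch (the moldb name, `sfs`, and the adducts here are
# assumed, not from the original): generate-or-restore the peaks, then pull
# centroids for one ion.
gen = IonCentroidsGenerator(sc, moldb_name='HMDB', isocalc=isocalc)
gen.generate_if_not_exist(isocalc=isocalc, sfs=sfs, adducts=['+H', '+Na'])
centroids = gen.centroids_subset([('C6H12O6', '+H')])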
def setUp(self):
    self.sc = SparkContext('local[4]', "MLlib tests")
    self.spark = SparkSession(self.sc)
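# A matching tearDown (an assumption, mirroring the MLlibTestCase pattern
# above) so each test releases its local context:
def tearDown(self):
    self.spark.stop()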
        if data['BOT'] == 'False':
            return [(('Domain', data['Domain']), 1),
                    (('Bot', data['BOT']), 1),
                    (('User', data['User']), 1),
                    (('Class', data['Class']), 1)]
        else:
            return [(('Domain', data['Domain']), 1),
                    (('Bot', data['BOT']), 1),
                    (('Class', data['Class']), 1)]
    except:
        return []


if __name__ == '__main__':
    print('Starting Spark Context and Session')

    # Initialize Spark
    sc = SparkContext(appName='WikiStats')
    sparksql = SparkSession(sc)
    sc.setLogLevel("ERROR")

    # Streaming and window variables
    window_slide = 6
    window_duration = 40
    batch_time = 2

    # Read streams from Kafka Topic 'wikipedia' as dstreams
    ssc = StreamingContext(sc, batch_time)
    ssc.checkpoint("checkpoint_wiki")

    print('Building process pipeline')

    ## Stream Pipeline Design
    wikiKafkaStream = KafkaUtils.createDirectStream(
        ssc, [KAFKA_TOPIC_read], {"metadata.broker.list": KAFKA_BROKER_read})
def init_opaque_sql(testing=False):
    sc = SparkContext.getOrCreate()
    sc._jvm.edu.berkeley.cs.rise.opaque.Utils.initOpaqueSQL(
        SparkSession(sc)._jsparkSession, testing)
class SparkWithCustomGateway:

    def __init__(self):
        spark_conf = SparkConf()
        spark_conf.setAppName(spark_nlp_config.app_name)
        spark_conf.setMaster(spark_nlp_config.master)
        spark_conf.set("spark.driver.memory", memory)
        spark_conf.set("spark.serializer", spark_nlp_config.serializer)
        spark_conf.set("spark.kryoserializer.buffer.max", spark_nlp_config.serializer_max_buffer)
        spark_conf.set("spark.driver.maxResultSize", spark_nlp_config.driver_max_result_size)

        if gpu:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_gpu_spark)
        else:
            spark_conf.set("spark.jars.packages", spark_nlp_config.maven_spark)

        # Make the py4j JVM stdout and stderr available without buffering
        popen_kwargs = {
            'stdout': subprocess.PIPE,
            'stderr': subprocess.PIPE,
            'bufsize': 0
        }

        # Launch the gateway with our custom settings
        self.gateway = launch_gateway(conf=spark_conf, popen_kwargs=popen_kwargs)
        self.process = self.gateway.proc

        # Use the gateway we launched
        spark_context = SparkContext(gateway=self.gateway)
        self.spark_session = SparkSession(spark_context)

        self.out_thread = threading.Thread(target=self.output_reader)
        self.error_thread = threading.Thread(target=self.error_reader)
        self.std_background_listeners()

    def std_background_listeners(self):
        self.out_thread.start()
        self.error_thread.start()

    def output_reader(self):
        for line in iter(self.process.stdout.readline, b''):
            print('{0}'.format(line.decode('utf-8')), end='')

    def error_reader(self):
        RED = '\033[91m'
        RESET = '\033[0m'
        for line in iter(self.process.stderr.readline, b''):
            if output_level == 0:
                print(RED + '{0}'.format(line.decode('utf-8')) + RESET, end='')
            else:
                # output just info
                pass

    def shutdown(self):
        self.spark_session.stop()
        self.gateway.shutdown()
        self.process.communicate()
        self.out_thread.join()
        self.error_thread.join()
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.storagelevel import StorageLevel

if os.environ.get("SPARK_EXECUTOR_URI"):
    SparkContext.setSystemProperty("spark.executor.uri",
                                   os.environ["SPARK_EXECUTOR_URI"])

sc = SparkContext()
atexit.register(lambda: sc.stop())

try:
    # Try to access HiveConf, it will raise exception if Hive is not added
    sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    spark = SparkSession.withHiveSupport(sc)
except py4j.protocol.Py4JError:
    spark = SparkSession(sc)
except TypeError:
    spark = SparkSession(sc)

# for compatibility
sqlContext = spark._wrapped
sqlCtx = sqlContext

print("""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/ '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/