def test_group_transactions(spark):
    sql = SQLContext(spark)
    create_spark_views(spark, customers_location, products_location, transactions_location)
    group_transactions(spark)
    columns = spark.sql("""SELECT * FROM transactions_grouped""").columns
    assert "transactions_grouped" in sql.tableNames()
    assert columns == ["customer_id", "product_id", "count"]
def __init__(self, sparkContext, snappyContext=None):
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    snappySession = SnappySession(sparkContext)
    SQLContext.__init__(self, sparkContext, snappySession)
    if snappyContext:
        self._scala_SnappyContext = snappyContext
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    # conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
def main():
    sc = initializeSpark()
    spark = SparkSession(sc)
    directory, post_id = parse_pls()
    rdds = make_rdds_from_dir(directory, sc)
    post_rdd = rdds["posts_rdd"]
    string = make_stripped_string(post_rdd, post_id)
    print("\n Body from post_id: " + str(post_id) + ", stripped of whitespaces and special characters:\n")
    print("'" + string + "'\n")
    # Tokenize the string
    tokens = tokenize(string)
    # Remove duplicate entries
    tokens_unique = remove_dupes(tokens)
    # Assign an id to each unique token
    token_id_tuple = assign_id_to_list(tokens_unique)
    # Now assign these ids to the original token list
    token_id_all = assign_unique_ids(token_id_tuple, tokens)
    print("\nTokens retrieved from the body with their respective ids: \n")
    for i in token_id_all:
        print(i)
    print("\n\nEdges:\n")
    ids = []
    for i in token_id_all:
        ids.append(i[0])
    # Create edges on a window size of 5, using the ids of the tokens
    edges = create_edges(ids, 5)
    # Remove duplicate edges from the list
    edges = remove_dupe_tuples(edges)
    print(edges)
    print("\n\nPageRank:")
    sqlContext = SQLContext(sc)
    v = sqlContext.createDataFrame(token_id_tuple, ["id", "word"])
    e = sqlContext.createDataFrame(edges, ["src", "dst"])
    g = graphframes.GraphFrame(v, e)
    results = g.pageRank(resetProbability=0.15, tol=0.0001)
    results.vertices.select("word", "pagerank").show(truncate=False)
def __init__(self, sparkContext: SparkContext, jsparkSession: Optional[JavaObject] = None):
    from pyspark.sql.context import SQLContext

    self._sc = sparkContext
    self._jsc = self._sc._jsc  # type: ignore[attr-defined]
    self._jvm = self._sc._jvm  # type: ignore[attr-defined]
    if jsparkSession is None:
        if self._jvm.SparkSession.getDefaultSession().isDefined() \
                and not self._jvm.SparkSession.getDefaultSession().get() \
                .sparkContext().isStopped():
            jsparkSession = self._jvm.SparkSession.getDefaultSession().get()
        else:
            jsparkSession = self._jvm.SparkSession(self._jsc.sc())
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    # If we had an instantiated SparkSession attached with a SparkContext
    # which is stopped now, we need to renew the instantiated SparkSession.
    # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
    if SparkSession._instantiatedSession is None \
            or SparkSession._instantiatedSession._sc._jsc is None:  # type: ignore[attr-defined]
        SparkSession._instantiatedSession = self
        SparkSession._activeSession = self
        self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SparkSession.

    >>> from datetime import datetime
    >>> spark = SparkSession(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.createOrReplaceTempView("allTypes")
    >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    from pyspark.sql.context import SQLContext
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if jsparkSession is None:
        jsparkSession = self._jvm.SparkSession.builder().getOrCreate()
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    # If we had an instantiated SparkSession attached with a SparkContext
    # which is stopped now, we need to renew the instantiated SparkSession.
    # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
    if SparkSession._instantiatedSession is None \
            or SparkSession._instantiatedSession._sc._jsc is None:
        SparkSession._instantiatedSession = self
def test_estimator_graph_dataframe(self):
    tf.reset_default_graph()

    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(
        inputs=[model.user, model.item],
        labels=[model.label],
        outputs=[model.logits],
        loss=model.loss,
        optimizer=tf.train.AdamOptimizer(),
        metrics={"loss": model.loss})

    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            label_cols=['label'],
            validation_data=df)

    result = est.evaluate(df, batch_size=4, feature_cols=['user', 'item'], label_cols=['label'])
    print(result)

    prediction_df = est.predict(df, batch_size=4, feature_cols=['user', 'item'])
    assert 'prediction' in prediction_df.columns
    predictions = prediction_df.collect()
    assert len(predictions) == 48
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SparkSession.

    >>> from datetime import datetime
    >>> spark = SparkSession(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.registerTempTable("allTypes")
    >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    from pyspark.sql.context import SQLContext
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if jsparkSession is None:
        jsparkSession = self._jvm.SparkSession(self._jsc.sc())
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.wrapped()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    if SparkSession._instantiatedContext is None:
        SparkSession._instantiatedContext = self
def __init__(self, session: SparkSession, jsparkSession=None):
    """Creates a new SequilaSession."""
    ss = session._jvm.org.apache.spark.sql.SequilaSession(session._jsparkSession)
    session._jvm.org.biodatageeks.utils.SequilaRegister.register(ss)
    session._jvm.org.biodatageeks.utils.UDFRegister.register(ss)
    session._jvm.SequilaSession.setDefaultSession(ss)
    sequilaSession = SequilaSession._instantiatedSession
    from pyspark.sql.context import SQLContext
    self._sc = sequilaSession._sc
    self._jsc = self._sc._jsc
    self._jvm = session._jvm
    if jsparkSession is None:
        if self._jvm.SequilaSession.getDefaultSession().isDefined() \
                and not self._jvm.SequilaSession.getDefaultSession().get() \
                .sparkContext().isStopped():
            jsparkSession = self._jvm.SequilaSession.getDefaultSession().get()
        else:
            jsparkSession = self._jvm.SequilaSession(self._jsc.sc())
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    if SequilaSession._instantiatedSession is None \
            or SequilaSession._instantiatedSession._sc._jsc is None:
        SequilaSession._instantiatedSession = self
        self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local")
    conf.setAppName("My application")
    conf.set("spark.executor.memory", "2g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
    self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
    self.df_review = self.sqlContext.read.json("dataset/review.json").cache()
    self.df_business = self.sqlContext.read.json("dataset/business.json").cache()
    self.df_user.registerTempTable("user")
def __init__(self, sc, spark):
    self.sc = sc
    self.spark = spark
    self.sql_context = SQLContext(self.sc)
    self.schema = StructType([
        StructField("sensor_id", StringType(), False),
        StructField("currentTemperature", IntegerType(), False),
        StructField("status", StringType(), False)
    ])
def init(self):
    os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
    # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
    # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
    conf = SparkConf()
    conf.setMaster("local[10]")
    conf.setAppName("PySparkShell")
    conf.set("spark.executor.memory", "2g")
    conf.set("spark.driver.memory", "1g")
    self.sc = SparkContext(conf=conf)
    self.sqlContext = SQLContext(self.sc)
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local[10]")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        category_list = self.sc.textFile(
            "/Users/abhinavrungta/Desktop/uf-study/snc/github/SNC-WEB/src/yahoo/ydata-ymovies-user-movie-ratings-train-v1_0.txt"
        ).map(lambda line: (int(line.split(',')[0]),
                            int(line.split(',')[1]),
                            float(line.split(',')[2]),
                            long(line.split(',')[3])))
        category_schema = StructType([
            StructField("userid", IntegerType(), True),
            StructField("movieid", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("time", LongType(), True)
        ])
        category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list.registerTempTable("data")

        movie_list = self.sqlContext.sql("SELECT movieid, COUNT(movieid) AS ct FROM data GROUP BY movieid")
        movie_list.registerTempTable("movie")
        movieid = movie_list.sort(movie_list.ct.desc()).first().movieid
        # movieid = category_list.first().movieid

        category_list = self.sqlContext.sql("SELECT * FROM data WHERE movieid = {0}".format(movieid))
        category_list.registerTempTable("data")

        user_list = self.sqlContext.sql("SELECT DISTINCT userid FROM data LIMIT 50")
        print(user_list.count())
        user_list.show()
        user_list.registerTempTable("users")

        category_list = self.sqlContext.sql(
            "SELECT d.userid AS userid, d.movieid AS movieid, d.rating AS rating, d.time AS time FROM data d, users u WHERE d.userid = u.userid"
        ).repartition(1)
        # category_list = self.sqlContext.createDataFrame(category_list, category_schema)
        category_list = category_list.map(
            lambda line: str(line.userid) + "," + str(line.movieid) + "," + str(line.rating) + "," + str(line.time))
        category_list = category_list.repartition(1)
        category_list.saveAsTextFile("data.txt")
def Compute_Average(rinput, output):
    ## Ratings data: creating the ratings RDD
    my_RDD_strings = sc.textFile(rinput + '/' + 'ratings.csv')
    data = my_RDD_strings.map(lambda line: readline(line))
    ## Extracting the header row
    header_info = data.first()
    ### Select all the rows except the header row
    data_mr = data.filter(lambda ratings: ratings != header_info)
    data_mr = data_mr.map(lambda ratings: string_to_float(ratings))
    data_mr_sum_count = data_mr.aggregateByKey(
        (0, 0),
        lambda U, s: (U[0] + s, U[1] + 1),
        lambda U, V: (U[0] + V[0], U[1] + V[1]))  ### format: (tag, (sum, count))
    avg_ratings = data_mr_sum_count.map(lambda (a, (b, c)): (a, float(b) / c))  ### here a=movie, b=sum, c=count
    sorted_avg_ratings = avg_ratings.sortByKey()  ## sorting by movieID in ascending order

    ## Creating the output csv file based on the dataset folder
    if 'ml-20m' in rinput:
        result_csv = 'Prashanth_Manja_result_task1_big.csv'
    else:
        result_csv = 'Prashanth_Manja_result_task1_small.csv'

    sqlContext = SQLContext(sc)
    data_frame = sqlContext.createDataFrame(sorted_avg_ratings)
    panda_data_frame = data_frame.toPandas()
    ## Output as csv file
    panda_data_frame.to_csv(output + '/' + result_csv, encoding='utf-8',
                            header=['movieID', 'rating_avg'], index=False)
def pytest_sessionstart(session):
    print('Starting spark context')
    pysparkrpc.clear()

    sc = SparkContext()
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.getOrCreate()

    pytest.spark = spark
    pytest.sc = sc
    pytest.sqlcontext = sqlContext
def test_da_resampler_resample_dataframe_with_correct_number_of_rows(spark, df):
    uat = da.Resampler(SQLContext.getOrCreate(spark.sparkContext))
    result = uat.resample(df,
                          time_col='time',
                          timezone='Europe/Vienna',
                          step_size='500ms',
                          join_tolerance='180ms')
    df.show()
    result.show()
    assert result.count() == ((df.count()) * 2) - 1
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one,
    creates a new one based on the options set in this builder.
    """
    with self._lock:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.sql.context import SQLContext
        sparkConf = SparkConf()
        for key, value in self._options.items():
            sparkConf.set(key, value)
        sparkContext = SparkContext.getOrCreate(sparkConf)
        return SQLContext.getOrCreate(sparkContext).sparkSession
def read_existing_parquet():
    sc = get_configured_context()
    sql_context = SQLContext(sparkContext=sc)

    # Loads parquet file located in AWS S3 / minio into a DataFrame
    parquet_file = sql_context.read.parquet("s3a://testparquet/nation.parquet")
    parquet_file.registerTempTable("parquet_file")

    # Run standard SQL queries against the temporary table
    nations_all_sql = sql_context.sql("SELECT * FROM parquet_file")
    nations_all = nations_all_sql.rdd.map(
        lambda p: "Country: {0:15} Ipsum Comment: {1}".format(p.N_NAME, p.N_COMMENT))

    for idx, nation in enumerate(nations_all.collect()):
        if idx == 0:
            print("All Nations and Comments -- `SELECT * FROM parquet_file`")
            print_horizontal()
        print(nation)
    else:
        print_horizontal()

    # Use standard SQL to filter
    nations_filtered_sql = sql_context.sql(
        "SELECT N_NAME FROM parquet_file WHERE N_NAME LIKE '%IND%'")
    nations_filtered = nations_filtered_sql.rdd.map(
        lambda p: "Country: {0:20}".format(p.N_NAME))

    for idx, nation in enumerate(nations_filtered.collect()):
        if idx == 0:
            print("Nations Filtered -- `SELECT name FROM parquet_file WHERE name LIKE '%IND%'`")
            print_horizontal()
        print(nation)
    else:
        print_horizontal()
def test_estimator_graph_dataframe_exception(self):
    tf.reset_default_graph()

    model = SimpleModel()
    file_path = os.path.join(resource_path, "orca/learn/ncf.csv")

    sc = init_nncontext()
    sqlcontext = SQLContext(sc)
    df = sqlcontext.read.csv(file_path, header=True, inferSchema=True)

    est = Estimator.from_graph(inputs=[model.user, model.item],
                               labels=[model.label],
                               outputs=[model.logits],
                               loss=model.loss,
                               optimizer=tf.train.AdamOptimizer(),
                               metrics={"loss": model.loss})

    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                validation_data=df)
    self.assertTrue('label columns is None; it should not be None in training'
                    in str(context.exception))

    est.fit(data=df,
            batch_size=8,
            epochs=10,
            feature_cols=['user', 'item'],
            labels_cols=['label'])

    with self.assertRaises(Exception) as context:
        predictions = est.predict(df, batch_size=4).collect()
    self.assertTrue('feature columns is None; it should not be None in prediction'
                    in str(context.exception))

    with self.assertRaises(Exception) as context:
        est.fit(data=df,
                batch_size=8,
                epochs=10,
                feature_cols=['user', 'item'],
                labels_cols=['label'],
                validation_data=[1, 2, 3])
    self.assertTrue('train data and validation data should be both Spark DataFrame'
                    in str(context.exception))
def __init__(
    self,
    sparkContext: SparkContext,
    jsparkSession: Optional[JavaObject] = None,
    options: Dict[str, Any] = {},
):
    from pyspark.sql.context import SQLContext

    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm

    assert self._jvm is not None

    if jsparkSession is None:
        if (
            self._jvm.SparkSession.getDefaultSession().isDefined()
            and not self._jvm.SparkSession.getDefaultSession().get().sparkContext().isStopped()
        ):
            jsparkSession = self._jvm.SparkSession.getDefaultSession().get()
            getattr(getattr(self._jvm, "SparkSession$"), "MODULE$").applyModifiableSettings(
                jsparkSession, options
            )
        else:
            jsparkSession = self._jvm.SparkSession(self._jsc.sc(), options)
    else:
        getattr(getattr(self._jvm, "SparkSession$"), "MODULE$").applyModifiableSettings(
            jsparkSession, options
        )
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    # If we had an instantiated SparkSession attached with a SparkContext
    # which is stopped now, we need to renew the instantiated SparkSession.
    # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
    if (
        SparkSession._instantiatedSession is None
        or SparkSession._instantiatedSession._sc._jsc is None  # type: ignore[attr-defined]
    ):
        SparkSession._instantiatedSession = self
        SparkSession._activeSession = self
        assert self._jvm is not None
        self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
        self._jvm.SparkSession.setActiveSession(self._jsparkSession)
def main():
    # main function to execute code
    sqlContext = SQLContext(sc)
    zk_host = zk_ip + ":2181"
    consumer_group = "reading-consumer-group"
    kafka_partitions = {topic: 1}
    # create kafka stream
    kvs = KafkaUtils.createStream(ssc, zk_host, consumer_group, kafka_partitions,
                                  valueDecoder=decoder)
    lines = kvs.map(lambda x: x[1])
    readings = lines.map(lambda x: Row(device_id=x["device_id"],
                                       metric_time=datetime.datetime.fromtimestamp(int(x["metric_time"])),
                                       metric_name=x["metric_name"],
                                       metric_value=float(x["metric_value"])))
    readings.foreachRDD(process)
    ssc.start()
    ssc.awaitTermination()
def run(sc, args):
    min_tokens = 70
    min_length = 20
    output_path = 'text-reuse/pipeline/wiki_preprocessed'
    input_file = 'text-reuse/wiki_00'

    sqlC = SQLContext(sc)
    df = sqlC.read.format('com.databricks.spark.xml').options(
        charset="UTF-8", nullValue="", rowTag='doc', mode='DROPMALFORMED').load(input_file)
    df = df.selectExpr("_id as id", "_title as title", "content as content")
    df.show()

    # Preprocess Wikipedia
    # Generate paragraphs
    df = generate_text_paragraphs(df, 'content', 'paragraphs')
    df.show()

    # Filter out empty paragraphs
    df = df.filter(size(col('paragraphs')) > 0)

    # Normalize paragraphs
    df = normalize_paragraphs(df, 'paragraphs', 'paragraphs', min_tokens)
    df.show()

    # Flatten docs into paragraphs
    df = flatten_paragraphs(df, 'paragraphs', 'paragraph')
    df.show()

    # Clean up the text
    df = clean_up_text(df, 'paragraph', 'paragraph')
    df.show()

    # Keep only paragraphs that are longer than min_length
    df = filter_paras_by_length(df, 'paragraph', min_length)

    # Rename columns and save the df
    df = df.selectExpr('para_id as p_id', 'id as d_id', 'paragraph')
    df.rdd.saveAsPickleFile(output_path)
def getOrCreate(self):
    """Gets an existing :class:`SparkSession` or, if there is no existing one,
    creates a new one based on the options set in this builder.

    This method first checks whether there is a valid thread-local SparkSession,
    and if yes, return that one. It then checks whether there is a valid global
    default SparkSession, and if yes, return that one. If no valid global default
    SparkSession exists, the method creates a new SparkSession and assigns the
    newly created SparkSession as the global default.

    In case an existing SparkSession is returned, the config options specified
    in this builder will be applied to the existing SparkSession.
    """
    with self._lock:
        from pyspark.conf import SparkConf
        from pyspark.context import SparkContext
        from pyspark.sql.context import SQLContext
        sparkConf = SparkConf()
        for key, value in self._options.items():
            sparkConf.set(key, value)
        sparkContext = SparkContext.getOrCreate(sparkConf)
        return SQLContext.getOrCreate(sparkContext).sparkSession
def test_pyspark_gateway(self):
    pg = PysparkGateway()

    import pyspark
    from pyspark import SparkContext, SparkConf
    from pyspark.sql.context import SQLContext
    from pyspark.sql.functions import udf

    conf = SparkConf().set('spark.io.encryption.enabled', 'true')
    sc = SparkContext(gateway=pg.gateway, conf=conf)
    sqlContext = SQLContext.getOrCreate(sc)

    self.assertEqual(type(sc), SparkContext)

    df = sqlContext.createDataFrame([(1, 2, 'value 1')], ['id1', 'id2', 'val'])
    self.assertEqual(df.count(), 1)

    rows = df.collect()
    self.assertEqual(rows[0].id1, 1)

    pd = df.toPandas()
    self.assertEqual(type(pd), pandas.core.frame.DataFrame)

    data = [(1, 2, 'a'), (3, 4, 'b'), (5, 6, 'c')]
    df = sqlContext.createDataFrame(data, ['foo', 'bar', 'baz'])
    df.createOrReplaceTempView('foo_table')

    def squared(v):
        return v * v

    sqlContext.udf.register('squared', squared)
    squared_df = sqlContext.sql('select squared(foo) AS val from foo_table')
    rows = squared_df.collect()
    self.assertEqual(rows[2].val, '25')

    sc.stop()
    pg.gateway.shutdown()
def main(input_data_directory):
    # parser = argparse.ArgumentParser(description='Process html files')
    #
    # parser.add_argument('--input_location',
    #                     dest='input_location',
    #                     type=str,
    #                     default=os.path.join(__WORKDIR__, "data", "clean"),
    #                     help='Input location for the html files')
    #
    # parser.add_argument('--spark_context_name',
    #                     dest='spark_context_name',
    #                     type=str,
    #                     default="dudewhat",
    #                     help='Name of the Spark context')
    #
    # args = parser.parse_args()

    conf = SparkConf().setAppName("Reporter Review.")
    sc = SparkContext(conf=conf).getOrCreate()
    sqlContext = SQLContext(sc)

    df = clean_data(sc, sqlContext, input_data_directory)
    generate_insights(sc, sqlContext, df)
def isNull(column_value):
    if column_value is None:
        return ''
    else:
        return column_value


if __name__ == '__main__':
    conf = SparkConf().setAppName('Type2_Example').setMaster('local[*]')

    # create spark context and sql context
    sc = SparkContext(conf=conf)
    sql_context = SQLContext(sc)

    # read the input data file and create a Spark dataframe
    type_2_dataframe = sql_context.read.format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("inferschema", "true") \
        .option("delimiter", "|") \
        .option("mode", "DROPMALFORMED") \
        .load("/home/mandar/ProjectWorkspace/Example/com/spark/example/type2_data")

    type_2_dataframe.printSchema()
    # print("--------------------------Total number of records-------------------------")
    # print(type_2_dataframe.count())
    type_2_dataframe.show(10)
try:
    spark = SparkSession._create_shell_session()
except Exception:
    import sys
    import traceback

    warnings.warn("Failed to initialize Spark session.")
    traceback.print_exc(file=sys.stderr)
    sys.exit(1)

sc = spark.sparkContext
sql = spark.sql
atexit.register((lambda sc: lambda: sc.stop())(sc))

# for compatibility
sqlContext = SQLContext._get_or_create(sc)
sqlCtx = sqlContext

print(
    r"""Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version %s
      /_/
"""
    % sc.version
)
print(
    "Using Python version %s (%s, %s)"
    % (platform.python_version(), platform.python_build()[0], platform.python_build()[1])
)
from pyspark.context import SparkContext
from pyspark.sql.context import SQLContext

sc = SparkContext(master='local[*]')
sql_context = SQLContext(sc)
import json
import time
import csv
import cryptography
from cryptography.fernet import Fernet
from pyspark.sql.context import SQLContext
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import lit

# Create a local StreamingContext with two worker threads and a batch interval of 5 seconds
sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 5)
sqlContext = SQLContext(sc)

players_filepath = '/project/input/players.csv'
df_players = sqlContext.read.load(players_filepath,
                                  format='com.databricks.spark.csv',
                                  header='true',
                                  inferSchema='true')
df_players = df_players.withColumn('PlayerContribution', lit(0))
df_players = df_players.withColumn('PlayerPerformance', df_players.PlayerContribution)
df_players = df_players.withColumn('PlayerRanking', lit(0.5))
df_players = df_players.withColumn('Chemisty', lit(0.5))
df_players.show()
df_players.printSchema()

teams_filepath = '/project/input/teams.csv'
df_teams = sqlContext.read.load(teams_filepath,
                                format='com.databricks.spark.csv',
                                header='true',
                                inferSchema='true')
import sys
import os
import json

from pyspark.sql.types import StructField, StructType, IntegerType

ascontext = None
if len(sys.argv) < 2 or sys.argv[1] != "-test":
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlCtx = ascontext.getSparkSQLContext()
    df = ascontext.getSparkInputData()
    schema = ascontext.getSparkInputSchema()
else:
    from pyspark.context import SparkContext
    from pyspark.sql.context import SQLContext
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.read.json(sys.argv[2]).repartition(4)  # argv[2] of form file://DRUG1N.json
    schema = df.schema

modelpath_base = "/tmp/model1234"
modelpath = "file://" + modelpath_base + "/model"
metadatapath = modelpath_base + "/metadata"

model_metadata = json.loads(open(metadatapath, "r").read())

prediction_field = "$K-cluster"
prediction_type = IntegerType()

output_schema = StructType(schema.fields + [StructField(prediction_field, prediction_type, nullable=True)])
def convert_to_line(json_list):
    json_string = ""
    for line in json_list:
        json_string += json.dumps(line) + "\n"
    print(json_string)
    return json_string


def parse_json(json_data, sc):
    r = convert_to_line(json_data)
    mylist = []
    for line in r.splitlines():
        mylist.append(line)
    rdd = sc.parallelize(mylist, 8)
    df = sqlContext.read.json(rdd)
    return df


if __name__ == '__main__':
    sprk = Spark_Session()
    conn = sprk.Spark_Context()
    sql_conn = sprk.Spark_Connect()
    sqlContext = SQLContext(conn)

    # https://api.github.com/users?since=100
    with urllib.request.urlopen("https://api.github.com/users?since=100") as url:
        data = parse_json(parse(url.read().decode("utf-8")), conn)
        data.show()
__author__ = 'hanhanw'

import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

conf = SparkConf().setAppName("shortest path")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
assert sc.version >= '1.5.1'

inputs = sys.argv[1]
output = sys.argv[2]
source_node = sys.argv[3]
dest_node = sys.argv[4]

textinput = sc.textFile(inputs)


def get_graphedges(line):
    list1 = line.split(':')
    if list1[1] == '':
        return None
    list2 = list1[1].split(' ')
    list2 = filter(None, list2)
    results = []
    s = list1[0]
    for d in list2:
        results.append((s, d))
    return results
import os
import json

from pyspark.sql.types import StructField, StructType, StringType, DoubleType

ascontext = None
try:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    sqlCtx = ascontext.getSparkSQLContext()
    df = ascontext.getSparkInputData()
    target = '%%target_field%%'
    schema = ascontext.getSparkInputSchema()
except:
    from pyspark.context import SparkContext
    from pyspark.sql.context import SQLContext
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.load("file://" + wd + "/DRUG1N.json", "json").repartition(4)
    schema = df.schema

modelpath_base = "/tmp/model1234"
modelpath = "file://" + modelpath_base + "/model"
metadatapath = modelpath_base + "/metadata"

model_metadata = json.loads(open(metadatapath, "r").read())
target = model_metadata["target"]

prediction_field = "$T-" + target
prediction_type = StringType()

output_schema = StructType(schema.fields + [StructField(prediction_field, prediction_type, nullable=True)])
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("My application")
        conf.set("spark.executor.memory", "2g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)
        self.df_user = self.sqlContext.read.json("dataset/user.json").cache()
        self.df_review = self.sqlContext.read.json("dataset/review.json").cache()
        self.df_business = self.sqlContext.read.json("dataset/business.json").cache()
        self.df_user.registerTempTable("user")

    def getS3File(self, s3FilePath, destinationPathOnLocal):
        r = requests.get(s3FilePath)
        fileOb = open(destinationPathOnLocal, 'w')
        fileOb.write(r.text)
        fileOb.close()

    def writeToS3File(self, s3FilePath, sourcePathOnLocal):
        fileOb = open(sourcePathOnLocal, 'r')
        payload = fileOb.read()
        fileOb.close()
        headers = {"x-amz-acl": "public-read-write"}
        return requests.put(s3FilePath, headers=headers, data=payload)

    def reads3spark(self, path):
        # path = "s3n://b-datasets/flight_data/*"
        x = self.sc.textFile(path)  # we can just specify all the files
        return x

    def writes3spark(self, x, path):
        x.saveAsTextFile(path)

    def createFeatures(self):
        userData = self.sqlContext.sql(
            "SELECT user_id, name, review_count, votes, fans, yelping_since, elite FROM user")
        userData = userData.map(mapUsers).coalesce(1)
        res = self.sqlContext.createDataFrame(userData)

        review_user = self.df_review.select(self.df_review.business_id, self.df_review.user_id)
        business_loc = self.df_business.select(self.df_business.business_id,
                                               self.df_business.city,
                                               self.df_business.state)
        df_join_reviewAndBusiness = review_user.join(
            business_loc,
            review_user.business_id == business_loc.business_id).select("user_id", "city", "state")
        df_grouped = df_join_reviewAndBusiness.groupBy(["user_id", "city", "state"]).count()
        df_panda = res.toPandas()
        for name, group in df_grouped:
            if (group['city'] > 10):
                user_id = df_grouped.get_group(name)[0]['user_id']
                df_panda[user_id]['k'] = df_panda[user_id]['k'] + 1

        res = self.sqlContext.createDataFrame(df_panda)
        res.toJSON().saveAsTextFile('user_features.json')
def __init__(self, sparkContext, snappyContext=None):
    SQLContext.__init__(self, sparkContext)
    if snappyContext:
        self._scala_SnappyContext = snappyContext
# -*- coding: utf-8 -*-
'''
Created on January 5, 2018

@author: root
'''
from com.bjsxt.python.test import SparkUtil
from pyspark.sql.context import SQLContext

conf = SparkUtil.initSparkConf(True, "DataFrameOpsFromFile")
sc = SparkUtil.initSparkContext(conf)
sqlContext = SQLContext(sc)
df = sqlContext.read.json("../data/people.json")
df.registerTempTable("people")
sqlContext.sql("select * from people where age > 20").show()
sc.stop()
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark.context import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df = df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and descriptive statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
ascontext = None
try:
    import spss.pyspark.runtime
    ascontext = spss.pyspark.runtime.getContext()
    sc = ascontext.getSparkContext()
    df = ascontext.getSparkInputData()
    model_type = '%%model_type%%'
    target = '%%target_field%%'
    lambda_param = float('%%lambda%%')
    predictors = map(lambda x: x.strip(), "%%predictor_fields%%".split(","))
    modelpath = ascontext.createTemporaryFolder()
except:
    import os
    from pyspark.context import SparkContext
    from pyspark.sql.context import SQLContext
    sc = SparkContext('local')
    sqlCtx = SQLContext(sc)
    # get an input dataframe with sample data by looking in working directory for file DRUG1N.json
    wd = os.getcwd()
    df = sqlCtx.load("file://" + wd + "/DRUG1N.json", "json").repartition(4)
    # specify predictors and target
    predictors = ["Drug", "BP", "Sex", "Age"]
    target = "Cholesterol"
    lambda_param = 1.0
    modelpath_base = "/tmp/model1234"
    import shutil
    try:
        shutil.rmtree(modelpath_base)
    except:
        pass
    modelpath = "file://" + modelpath_base + "/model"
import sys
import re
import datetime
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext

inputs1 = sys.argv[1]
inputs2 = sys.argv[2]
output = sys.argv[3]

conf = SparkConf().setAppName("load logs")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
assert sc.version >= '1.5.1'

text = sc.textFile(inputs1) + sc.textFile(inputs2)


def parseline(line):
    linere = re.compile('^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$')
    match = re.search(linere, line)
    if match:
        m = re.match(linere, line)
        host = m.group(1)
        dt = datetime.datetime.strptime(m.group(2), '%d/%b/%Y:%H:%M:%S')
        path = m.group(3)
        bys = float(m.group(4))
        dct = {"host": host, "datetime": dt, "path": path, "bys": bys}
        return dct
class MainApp(object):
    def __init__(self):
        pass

    def init(self):
        os.environ["SPARK_HOME"] = "/Users/abhinavrungta/Desktop/setups/spark-1.5.2"
        # os.environ['AWS_ACCESS_KEY_ID'] = <YOURKEY>
        # os.environ['AWS_SECRET_ACCESS_KEY'] = <YOURKEY>
        conf = SparkConf()
        conf.setMaster("local")
        conf.setAppName("PySparkShell")
        conf.set("spark.executor.memory", "2g")
        # conf.set("spark.driver.memory", "1g")
        self.sc = SparkContext(conf=conf)
        self.sqlContext = SQLContext(self.sc)

    def loadData(self):
        self.df_review = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json").cache()
        # self.df_review = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_review.json").cache()
        self.df_business = self.sqlContext.read.json(
            "../yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_business.json").cache()
        # self.df_business = self.sqlContext.read.json("s3n://ds-emr-spark/data/yelp_academic_dataset_business.json").cache()
        self.df_review.registerTempTable("reviews")
        self.df_business.registerTempTable("business")

    def createCheckInDataPerUser(self):
        review_user = self.sqlContext.sql("SELECT business_id, user_id FROM reviews")
        business_loc = self.sqlContext.sql("SELECT business_id, latitude, longitude FROM business")
        review_user.registerTempTable("reviews_user")
        business_loc.registerTempTable("business_loc")

        self.df_join_reviewAndBusiness = self.sqlContext.sql(
            "SELECT r.user_id, b.latitude, b.longitude FROM reviews_user r JOIN business_loc b ON r.business_id = b.business_id"
        ).cache()
        self.df_join_reviewAndBusiness.registerTempTable("userBusiness")
        self.df_unique_users = self.sqlContext.sql(
            'SELECT DISTINCT user_id FROM userBusiness where user_id = "SIfJLNMv7vBwo-fSipxNgg"')
        self.df_unique_users.registerTempTable("users")

        pd = self.df_join_reviewAndBusiness.toPandas()
        global_db = self.sc.broadcast(pd)
        schema = StructType([
            StructField("latitude", FloatType()),
            StructField("longitude", FloatType())
        ])
        partialFunc = partial(getLocationsOfUser, business_db=global_db.value)
        self.get_locations = udf(partialFunc, ArrayType(schema))
        self.get_centers = udf(getCentersOfUser, ArrayType(schema))

        self.df_unique_users = self.df_unique_users.withColumn(
            "user_locations", self.get_locations(self.df_unique_users["user_id"]))
        self.df_unique_users.registerTempTable("users")
        self.df_unique_users.repartition(1).write.save("user.json", "json", "overwrite")

        print(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"]))
        self.df_unique_users = self.df_unique_users.withColumn(
            "user_centers", self.get_centers(self.df_unique_users["user_locations"]))
        self.df_unique_users.registerTempTable("users")
        self.df_unique_users.repartition(1).write.save("center.json", "json", "overwrite")
        self.df_unique_users.show()

    def distanceCalc(self):
        self.df_unique_users = self.sqlContext.read.json(
            "user.json/part-r-00000-23a1b514-f5fe-4f61-9a64-01ebbc88c146").cache()
        print(len(getCentersOfUser(self.df_unique_users.toPandas().iloc[0]["user_locations"])))