def task4():
    # Register the UDFs used to clean comment text and join the n-grams into one string
    spark.udf.register("sanitize", cleantext.sanitize)
    spark.udf.register("connect_all_string", connect_all_string)
    querytask4 = spark.sql(
        "SELECT Input_id, connect_all_string(sanitize(comment_body)) AS n_grams, labeldjt "
        "FROM task2_table"
    )
    querytask4.write.saveAsTable("task4_table")
def task10():
    # Register the list of states so it can be joined against in SQL
    spark.createDataFrame(states, StringType()).write.saveAsTable("states_table")
    comments.createOrReplaceTempView("comment_data")
    submissions.createOrReplaceTempView("submission_data")

    # Percentage of positive / negative comments across all posts
    all_posts = spark.sql(
        "SELECT SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) AS pos_perc, "
        "SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) AS neg_perc "
        "FROM task9_table"
    )
    # all_posts.show()
    all_posts.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("all_posts.csv")

    # Across all days
    all_days = spark.sql(
        "SELECT DATE(FROM_UNIXTIME(comment_timestamp)) AS `date`, "
        "SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Positive, "
        "SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Negative "
        "FROM task9_table GROUP BY DATE(FROM_UNIXTIME(comment_timestamp))"
    )
    # all_days.show()
    all_days.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("all_days.csv")

    # Across all states
    all_states = spark.sql(
        "SELECT task9_table.state AS state, "
        "SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Positive, "
        "SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Negative "
        "FROM task9_table JOIN states_table ON task9_table.state = states_table.value "
        "GROUP BY state"
    )
    # all_states.show()
    all_states.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("all_states.csv")

    # Across comment score
    all_comment_score = spark.sql(
        "SELECT comment_data.score AS comment_score, "
        "SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Positive, "
        "SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Negative "
        "FROM task9_table JOIN comment_data ON task9_table.id = comment_data.id "
        "GROUP BY comment_data.score"
    )
    # all_comment_score.show()
    all_comment_score.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("all_comment_score.csv")

    # Across story score (submission score)
    all_submission_score = spark.sql(
        "SELECT submission_data.score AS submission_score, "
        "SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Positive, "
        "SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) AS Negative "
        "FROM task9_table JOIN comment_data ON task9_table.id = comment_data.id "
        "JOIN submission_data ON REPLACE(comment_data.link_id, 't3_', '') = submission_data.id "
        "GROUP BY submission_data.score"
    )
    # all_submission_score.show()
    all_submission_score.repartition(1).write.format("com.databricks.spark.csv") \
        .option("header", "true").save("all_submission_score.csv")
def modelfit():
    # Initialize two logistic regression models.
    # labelCol is the column containing the label, featuresCol the column containing the features.
    pos = spark.sql("SELECT features, positive_djt AS label FROM task6_table")
    neg = spark.sql("SELECT features, negative_djt AS label FROM task6_table")
    poslr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label", featuresCol="features",
                               maxIter=10).setThreshold(0.25)

    # This is a binary classifier, so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()

    # There are a few parameters associated with logistic regression that we do not know a priori.
    # A grid search would find the best values; [1.0] can be replaced with a list of values to try.
    # Here the regularization parameter is fixed at 1.0, since grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()

    # Initialize 5-fold cross-validation pipelines.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)

    # Although cross-validation creates its own train/test sets for tuning,
    # we still need a labeled test set, because it is not accessible from the CrossValidator.
    # Split the data 50/50.
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])

    # Train the models.
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once the models are trained, save them so they can be loaded later instead of retrained.
    posModel.write().overwrite().save("www/pos.model")
    negModel.write().overwrite().save("www/neg.model")
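# A minimal sketch (not part of the original pipeline) of how the saved cross-validated
# models could be reloaded and scored on a held-out labeled DataFrame such as the
# posTest/negTest splits above. The helper name and default path are illustrative.
from pyspark.ml.tuning import CrossValidatorModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator


def evaluate_saved_model(test_df, path="www/pos.model"):
    model = CrossValidatorModel.load(path)              # reload the persisted model
    predictions = model.transform(test_df)              # adds rawPrediction/probability/prediction
    auc = BinaryClassificationEvaluator().evaluate(predictions)  # area under ROC by default
    return auc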
def task8():
    comments.createOrReplaceTempView("comment_data")
    submissions.createOrReplaceTempView("submission_data")
    # Earlier draft, kept for reference:
    # sqlDF = spark.sql("SELECT comment_data.created_utc as comment_timestamp, comment_data.id, comment_data.body FROM comment_data JOIN (SELECT title FROM comment_data JOIN submission_data ON (Replace(comment_data.link_id, 't3_', '')) = submission_data.id) t2 ON ")

    # Join each comment with its submission (link_id carries a 't3_' prefix)
    sqlDF = spark.sql(
        "SELECT comment_data.id, comment_data.created_utc AS comment_timestamp, "
        "comment_data.body AS comment_body, submission_data.title, "
        "submission_data.author_flair_text AS state "
        "FROM comment_data JOIN submission_data "
        "ON REPLACE(comment_data.link_id, 't3_', '') = submission_data.id"
    )
    # sqlDF.show()  # debugging purposes
    sqlDF.write.saveAsTable("task8_table")
def task2():
    comments.createOrReplaceTempView("comment_table")
    # comment_table = spark.sql("SELECT id, body FROM cmnt_table")
    labeled_data.createOrReplaceTempView("data_table")
    # csv_table = spark.sql("SELECT * FROM df_table")

    # Join the labeled data with the full comments on the comment id
    query = spark.sql(
        "SELECT data_table.Input_id, data_table.labeldem, data_table.labelgop, "
        "data_table.labeldjt, comment_table.body AS comment_body "
        "FROM data_table JOIN comment_table ON data_table.Input_id = comment_table.id"
    )
    query.write.saveAsTable("task2_table")
def task9(task6model):
    # Drop quoted comments (body starting with '>') and sarcastic comments (containing '/s')
    querytask9_0 = spark.sql(
        "SELECT id, comment_timestamp, title, state, comment_body FROM task8_table "
        "WHERE comment_body NOT LIKE '>%' AND comment_body NOT LIKE '%/s%'"
    )
    querytask9_0.write.saveAsTable("task9_table1")

    querytask9_1 = spark.sql(
        "SELECT id, connect_all_string(sanitize(comment_body)) AS n_grams, "
        "comment_timestamp, title, state, comment_body FROM task9_table1"
    )
    # Convert the comma-separated n-gram string into an actual array column
    querytask9_2 = querytask9_1.select(
        split(col("n_grams"), r",\s*").alias("n_grams"),
        col("id"), col("comment_timestamp"), col("title"),
        col("state"), col("comment_body"))

    # Re-use the CountVectorizer model fitted in task 6 to build feature vectors
    task9df = task6model.transform(querytask9_2)
    task9df.printSchema()
    task9df.write.saveAsTable("task9_table2")

    querytask9_3 = spark.sql(
        "SELECT id, n_grams, comment_timestamp, title, state, comment_body, "
        "features, features AS features_backup FROM task9_table2"
    )

    # Load the saved classifiers and score every comment
    model_pos = CrossValidatorModel.load("www/pos.model")
    model_neg = CrossValidatorModel.load("www/neg.model")

    model_pos.transform(querytask9_3).write.saveAsTable("pos_table")
    task9df_withPos = spark.sql(
        "SELECT id, comment_timestamp, title, state, comment_body, prediction AS pos, "
        "features_backup AS features, probability AS pos_probability FROM pos_table"
    )
    task9df_withPos.show()

    model_neg.transform(task9df_withPos).write.saveAsTable("neg_table")
    task9result = spark.sql(
        "SELECT id, comment_timestamp, title, state, comment_body, pos, prediction AS neg "
        "FROM neg_table"
    )

    task9result.write.parquet("task9result_parquet")  # store as parquet
    final_task9result = spark.read.parquet("task9result_parquet")
    final_task9result.write.saveAsTable("task9_table")
    spark.sql("SELECT * FROM task9_table").show()
def read_pos_from_db() -> pyspark.sql.DataFrame:
    """
    Reads POS data from ``d4sa_us_disc.bluesky_pos_data`` and aggregates out the channel.

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with pos_qty and pos_dollar
    """
    q = """
        SELECT
            week_ending_date,
            retailer,
            state,
            mdlz_business,
            mdlz_category,
            mdlz_brand,
            mdlz_ppg,
            sum(pos_qty) as pos_qty,
            sum(pos_dollar) as pos_dollar
        FROM d4sa_us_disc.bluesky_pos_data
        GROUP BY 1, 2, 3, 4, 5, 6, 7
    """
    # `spark` is the active SparkSession; pyspark.sql is a module and cannot be called directly
    return spark.sql(q)
def task6():
    querytask6 = spark.sql(
        "SELECT Input_id, n_grams, IF(labeldjt='1',1,0) AS positive_djt, "
        "IF(labeldjt='-1',1,0) AS negative_djt FROM task4_table"
    )
    # querytask6.show()

    # Convert the combined n_grams column from a comma-separated string into an actual array.
    # Reference: https://stackoverflow.com/questions/38189088/convert-comma-separated-string-to-array-in-pyspark-dataframe
    querytask6 = querytask6.select(
        split(col("n_grams"), r",\s*").alias("n_grams"),
        col("positive_djt"),
        col("negative_djt"))

    # Build binary bag-of-n-grams features.
    # Reference: http://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html
    cv = CountVectorizer(minDF=5.0,
                         vocabSize=1 << 18,
                         binary=True,
                         inputCol="n_grams",
                         outputCol="features")
    model = cv.fit(querytask6)
    task6Result = model.transform(querytask6)
    task6Result.printSchema()
    task6Result.write.saveAsTable("task6_table")
    return model
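# Illustrative follow-up (not part of the original tasks): the CountVectorizerModel
# returned by task6() exposes its learned vocabulary, which is handy for sanity-checking
# which n-grams survived the minDF=5.0 cutoff. `cv_model` is assumed to hold task6()'s
# return value.
cv_model = task6()
print(len(cv_model.vocabulary))    # number of n-grams kept (capped at 1 << 18)
print(cv_model.vocabulary[:20])    # vocabulary is ordered by descending frequency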
#%sql SET spark.sql.shuffle.partitions = 3

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()
    # .config("spark.some.config.option", "some-value")

# Read the edit stream from a socket source
lines = spark.readStream \
    .format("socket") \
    .option("host", "54.213.33.240") \
    .option("port", 9002) \
    .load()

lines.select(json_tuple('value', "channel", "timestamp", "isRobot", "isAnonymous")) \
    .selectExpr("c0 as channel", "cast(c1 as timestamp) as time", "c2 as page") \
    .createOrReplaceTempView("edits")

parquetData = spark.sql("select * from edits")
# display(parquetData)

# Count edits per channel over 10-second windows
editCounts = spark.sql(
    """SELECT count(*), channel,
              date_format(window(time, '10 seconds').start, 'HH:mm:ss') as time
       FROM edits
       GROUP BY channel, window(time, '10 seconds')
       ORDER BY time""")
# display(editCounts)
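# A minimal sketch (not in the original snippet) of actually starting the streaming
# aggregation. Outside Databricks, where display() would start the query for you,
# a writeStream sink is needed before any results appear; the console sink and
# "complete" output mode below are assumptions for local experimentation.
query = editCounts.writeStream \
    .outputMode("complete") \
    .format("console") \
    .option("truncate", "false") \
    .start()
# query.awaitTermination()  # block until the stream is stopped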
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# from pyspark.sql import HiveContext
# from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np

# `spark` must be a SparkSession; the pyspark module itself has no sql() callable
spark = SparkSession.builder.getOrCreate()

# Customers with at least 10 consumption records in weeks 1-8
focus_customer_id = spark.sql("""
    SELECT customer_id, count(1) as consumpt_cnt
    FROM btmp_cmd.nt85610_loc_poc
    WHERE week >= 1
      AND week <= 8
      AND loc_type_town IS NOT NULL
    GROUP BY customer_id
    HAVING consumpt_cnt >= 10
""").select('customer_id')

df_raw = spark.sql(
    """SELECT *
       FROM btmp_cmd.nt85610_loc_poc
       WHERE merchant_flag = 'Y'
         AND loc_type_town IS NOT NULL"""
)

# For each customer and day of week, keep the most frequent (and highest-spend)
# consumption category / town combination in the training window (weeks 0-8)
train_intent_df = df_raw.filter(F.col('week').between(0, 8)) \
    .groupby(['customer_id', 'dayofweek', 'consumption_category_desc', 'loc_type_town']) \
    .agg(F.count(F.lit(1)).alias('train_consum_cnt'),
         F.sum('txn_amt').alias('train_consum_tot_amt')) \
    .withColumn('train_consum_cnt_rk',
                F.row_number().over(Window.partitionBy(['customer_id', 'dayofweek'])
                                    .orderBy(F.desc('train_consum_cnt'),
                                             F.desc('train_consum_tot_amt')))) \
    .filter(F.col('train_consum_cnt_rk') == 1) \
    .select(['customer_id', 'dayofweek', 'consumption_category_desc', 'loc_type_town'])

train_merchant_df = df_raw.filter(F.col('week').between(0, 8)) \
from pyspark.sql import *

df = spark.read.csv("/home/kris/datasets/small/iris.csv")
df.show()

df = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .csv("/home/kris/datasets/small/iris.csv")
df.registerTempTable("mydf")
spark.sql("SELECT SUM(sepal_length) FROM mydf").show()
spark.sql("SELECT SUM(sepal_length) AS sum_sep_len FROM mydf").show()

df.schema
df.printSchema()

from operator import add

linesdf = spark.read.text("/home/kris/datasets/text/jane_austen.txt")
linesdf.show(10, False)

# Word count over the raw text, using the underlying RDD
lines = spark.read.text("/home/kris/datasets/text/jane_austen.txt").rdd.map(
    lambda r: r[0])
counts = lines.flatMap(lambda x: x.split(' ')) \
    .map(lambda x: (x, 1)) \
    .reduceByKey(add)
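# A small follow-on sketch (not in the original): materialize the word counts and
# print the ten most frequent tokens. `counts` is the (word, count) pair RDD built above.
top10 = counts.takeOrdered(10, key=lambda wc: -wc[1])
for word, cnt in top10:
    print(word, cnt)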
# ### View Tables ###
# We use the SparkSession catalog attribute to extract different
# pieces of information

# Print the tables in the catalog
print(spark.catalog.listTables())

#%%
# ### Make a query ###
query = "FROM flights SELECT * LIMIT 10"

# Get the first 10 rows of flights
flights10 = spark.sql(query)

# Show the results
flights10.show()

# ##### PySpark DataFrame to pandas #####
query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest"

# Run the query
flight_counts = spark.sql(query)

# Convert the results to a pandas DataFrame
pd_counts = flight_counts.toPandas()

# Print the head of pd_counts
print(pd_counts.head())
# Parse out the date only
df = df.withColumn(
    'date_only',
    F.regexp_replace(df.end_date, r' (\d+)[:](\d+)[:](\d+).*$', ''))

# Split a string and index a field
df = df.withColumn('city', F.split(df.location, '-')[1])

# Perform a date diff function
df = df.withColumn(
    'date_diff',
    F.datediff(F.to_date(df.end_date), F.to_date(df.start_date)))

# COMMAND ----------

df.registerTempTable("sample_df")
display(spark.sql("select * from sample_df"))

# COMMAND ----------

# DBTITLE 1,I want to convert the DataFrame back to JSON strings to send back to Kafka.
# There is an underlying toJSON() function that returns an RDD of JSON strings
# using the column names and schema to produce the JSON records.
rdd_json = df.toJSON()
rdd_json.take(2)

# COMMAND ----------

# DBTITLE 1,My UDF takes a parameter including the column to operate on. How do I pass this parameter?
# There is a function available called lit() that creates a constant column.
from pyspark.sql import functions as F
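# A minimal sketch of the lit() pattern described above. The UDF and column names here
# are illustrative, not from the original notebook: lit() wraps the constant parameter
# so it can be passed to the UDF alongside a real column.
from pyspark.sql.functions import udf, lit
from pyspark.sql.types import StringType


def add_suffix(value, suffix):
    return f"{value}{suffix}"


add_suffix_udf = udf(add_suffix, StringType())

# 'city' is assumed to exist on df; "_checked" is the constant passed via lit()
df = df.withColumn('city_tagged', add_suffix_udf(df.city, lit("_checked")))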