def task4():
    # Build the combined n-gram string for every labeled comment in task2_table.
    spark.udf.register("sanitize", cleantext.sanitize)  # UDF
    spark.udf.register("connect_all_string", connect_all_string)

    querytask4 = spark.sql(
        "SELECT Input_id, connect_all_string(sanitize(comment_body)) AS n_grams, labeldjt FROM task2_table"
    )
    querytask4.write.saveAsTable("task4_table")
def task10():
    # Compute positive/negative percentages across all posts, days, states, comment scores and submission scores.
    spark.createDataFrame(states, StringType()).write.saveAsTable("states_table")
    comments.createOrReplaceTempView("comment_data")
    submissions.createOrReplaceTempView("submission_data")
    # across all posts
    all_posts = spark.sql(
        "SELECT SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) as pos_perc, SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) as neg_perc FROM task9_table"
    )
    # all_posts.show()
    all_posts.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("all_posts.csv")

    # across all days
    all_days = spark.sql(
        "SELECT DATE(FROM_UNIXTIME(comment_timestamp)) as `date`, SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) as Positive, SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) as Negative FROM task9_table GROUP BY DATE(FROM_UNIXTIME(comment_timestamp))"
    )
    # all_days.show()
    all_days.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("all_days.csv")

    # across all states
    query_string = "SELECT task9_table.state as state, SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) as Positive, SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) as Negative FROM task9_table JOIN states_table ON task9_table.state = states_table.value GROUP BY state"
    all_states = spark.sql(query_string)
    # all_states.show()
    all_states.repartition(1).write.format("com.databricks.spark.csv").option(
        "header", "true").save("all_states.csv")

    # across comment score
    all_comment_score = spark.sql(
        "SELECT comment_data.score as comment_score, SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) as Positive, SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) as Negative FROM task9_table JOIN comment_data ON task9_table.id = comment_data.id GROUP BY comment_data.score"
    )
    # all_comment_score.show()
    all_comment_score.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("all_comment_score.csv")

    # across story score (submission score)
    all_submission_score = spark.sql(
        "SELECT submission_data.score as submission_score, SUM(CASE WHEN pos = 1 THEN 1 ELSE 0 END) / COUNT(*) as Positive, SUM(CASE WHEN neg = 1 THEN 1 ELSE 0 END) / COUNT(*) as Negative FROM task9_table JOIN comment_data ON task9_table.id = comment_data.id JOIN submission_data ON (Replace(comment_data.link_id, 't3_', '')) = submission_data.id GROUP BY submission_data.score"
    )
    # all_submission_score.show()
    all_submission_score.repartition(1).write.format(
        "com.databricks.spark.csv").option(
            "header", "true").save("all_submission_score.csv")
def modelfit():
    # Initialize two logistic regression models.
    # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
    pos = spark.sql("SELECT features, positive_djt AS label FROM task6_table")
    neg = spark.sql("SELECT features, negative_djt AS label FROM task6_table")
    poslr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.2)
    neglr = LogisticRegression(labelCol="label",
                               featuresCol="features",
                               maxIter=10).setThreshold(0.25)
    # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
    posEvaluator = BinaryClassificationEvaluator()
    negEvaluator = BinaryClassificationEvaluator()
    # There are a few parameters associated with logistic regression. We do not know what they are a priori.
    # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
    # We will assume the parameter is 0.3. Grid search takes forever.
    posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
    negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
    # We initialize a 5 fold cross-validation pipeline.
    posCrossval = CrossValidator(estimator=poslr,
                                 evaluator=posEvaluator,
                                 estimatorParamMaps=posParamGrid,
                                 numFolds=5)
    negCrossval = CrossValidator(estimator=neglr,
                                 evaluator=negEvaluator,
                                 estimatorParamMaps=negParamGrid,
                                 numFolds=5)
    # Although crossvalidation creates its own train/test sets for
    # tuning, we still need a labeled test set, because it is not
    # accessible from the crossvalidator (argh!)
    # Split the data 50/50
    posTrain, posTest = pos.randomSplit([0.5, 0.5])
    negTrain, negTest = neg.randomSplit([0.5, 0.5])
    # Train the models
    print("Training positive classifier...")
    posModel = posCrossval.fit(posTrain)
    print("Training negative classifier...")
    negModel = negCrossval.fit(negTrain)

    # Once we train the models, we don't want to do it again. We can save the models and load them again later.
    posModel.write().overwrite().save("www/pos.model")
    negModel.write().overwrite().save("www/neg.model")
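
    # The trained CrossValidatorModels can be reloaded later without retraining;
    # a minimal sketch of what task9() below does:
    # posModel = CrossValidatorModel.load("www/pos.model")
    # negModel = CrossValidatorModel.load("www/neg.model")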
def task8():
    # Join each comment with its submission (stripping the 't3_' prefix from link_id) to recover the title and the author's state flair.
    comments.createOrReplaceTempView("comment_data")
    submissions.createOrReplaceTempView("submission_data")
    #sqlDF = spark.sql("SELECT comment_data.created_utc as comment_timestamp, comment_data.id, comment_data.body FROM comment_data JOIN (SELECT title FROM comment_data JOIN submission_data ON (Replace(comment_data.link_id, 't3_', '')) = submission_data.id) t2 ON ")
    sqlDF = spark.sql(
        "SELECT comment_data.id, comment_data.created_utc as comment_timestamp, comment_data.body AS comment_body, submission_data.title, submission_data.author_flair_text as state FROM comment_data JOIN submission_data ON (Replace(comment_data.link_id, 't3_', '')) = submission_data.id"
    )
    # sqlDF.show() #debugging purpose
    sqlDF.write.saveAsTable("task8_table")
def task2():
    # Join the labeled CSV rows with the matching comment bodies.
    comments.createOrReplaceTempView("comment_table")
    #comment_table = spark.sql("SELECT id, body FROM cmnt_table")

    labeled_data.createOrReplaceTempView("data_table")

    # csv_table = spark.sql("SELECT * FROM df_table")

    query = spark.sql(
        "SELECT data_table.Input_id, data_table.labeldem, data_table.labelgop, data_table.labeldjt, comment_table.body as comment_body FROM data_table JOIN comment_table ON data_table.Input_id = comment_table.id"
    )
    query.write.saveAsTable("task2_table")
def task9(task6model):
    # Filter out quoted ('&gt') and sarcastic ('/s') comments, rebuild the n-grams,
    # then run the CountVectorizer model and both trained classifiers over every comment.
    querytask9_0 = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body FROM task8_table WHERE comment_body NOT LIKE '&gt%' AND comment_body NOT LIKE '%/s%'"
    )
    querytask9_0.write.saveAsTable("task9_table1")
    querytask9_1 = spark.sql(
        "SELECT id, connect_all_string(sanitize(comment_body)) AS n_grams, comment_timestamp,title,state,comment_body  FROM task9_table1"
    )
    querytask9_2 = querytask9_1.select(
        split(col("n_grams"), r",\s*").alias("n_grams"), col("id"),
        col("comment_timestamp"), col("title"), col("state"),
        col("comment_body"))

    task9df = task6model.transform(querytask9_2)
    task9df.printSchema()
    task9df.write.saveAsTable("task9_table2")
    querytask9_3 = spark.sql(
        "SELECT id,  n_grams, comment_timestamp,title,state,comment_body, features, features AS features_backup  FROM task9_table2"
    )

    model_pos = CrossValidatorModel.load("www/pos.model")
    model_neg = CrossValidatorModel.load("www/neg.model")
    model_pos.transform(querytask9_3).write.saveAsTable("pos_table")

    task9df_withPos = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body,prediction AS pos, features_backup AS features, probability AS pos_probability  FROM pos_table"
    )
    task9df_withPos.show()
    model_neg.transform(task9df_withPos).write.saveAsTable("neg_table")

    task9result = spark.sql(
        "SELECT id,comment_timestamp,title,state,comment_body, pos , prediction AS neg FROM neg_table"
    )

    task9result.write.parquet("task9result_parquet")  #store parquet

    final_task9result = spark.read.parquet("task9result_parquet")
    final_task9result.write.saveAsTable("task9_table")
    spark.sql("SELECT * FROM task9_table").show()
def read_pos_from_db() -> pyspark.sql.DataFrame:
    """
    Reads POS data from ``d4sa_us_disc.bluesky_pos_data`` and aggregates out the channel

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with pos_qty and pos_dollar
    """
    q = """
        SELECT
        week_ending_date,
        retailer,
        state,
        mdlz_business,
        mdlz_category,
        mdlz_brand,
        mdlz_ppg,
        sum(pos_qty) as pos_qty,
        sum(pos_dollar) as pos_dollar
        FROM d4sa_us_disc.bluesky_pos_data
        GROUP BY 1, 2, 3, 4, 5, 6, 7
        """
    return spark.sql(q)
def task6():
    # Turn the djt label into separate positive/negative binary labels and vectorize the n-grams with a CountVectorizer.
    querytask6 = spark.sql(
        "SELECT Input_id, n_grams, IF(labeldjt='1',1,0) AS positive_djt, IF(labeldjt='-1',1,0) AS negative_djt FROM task4_table"
    )
    #querytask6.show()
    #reference https://stackoverflow.com/questions/38189088/convert-comma-separated-string-to-array-in-pyspark-dataframe
    querytask6 = querytask6.select(
        split(col("n_grams"), r",\s*").alias("n_grams"), col("positive_djt"),
        col("negative_djt")
    )  # convert the combined n_grams column from a comma-separated string into an actual array

    #reference: http://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html
    cv = CountVectorizer(minDF=5.0,
                         vocabSize=1 << 18,
                         binary=True,
                         inputCol="n_grams",
                         outputCol="features")
    model = cv.fit(querytask6)
    task6Result = model.transform(querytask6)
    task6Result.printSchema()  # use task6Result.show(truncate=False) to inspect full rows
    task6Result.write.saveAsTable("task6_table")
    return model
#%sql SET spark.sql.shuffle.partitions = 3

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .getOrCreate()
#.config("spark.some.config.option", "some-value") \


lines = spark.readStream \
    .format("socket") \
    .option("host", "54.213.33.240") \
    .option("port", 9002) \
    .load()

lines.select(json_tuple('value', "channel", "timestamp", "isRobot", "isAnonymous")) \
  .selectExpr("c0 as channel", "cast(c1 as timestamp) as time", "c2 as page") \
  .createOrReplaceTempView("edits")

parquetData = spark.sql("select * from edits")
#display(parquetData)

editCounts = spark.sql(
    """SELECT count(*), channel, date_format(window(time, '10 seconds').start, 'HH:mm:ss') as time 
                              FROM edits 
                              GROUP BY channel, window(time, '10 seconds')
                              ORDER BY time""")

#display(editCounts)
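
# Nothing above actually starts the streaming query; a minimal sketch, assuming
# a console sink is acceptable for debugging (display() serves this role on
# Databricks). "complete" output mode is needed because of the aggregation with
# ORDER BY:
# streamingQuery = editCounts.writeStream \
#     .outputMode("complete") \
#     .format("console") \
#     .start()
# streamingQuery.awaitTermination()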
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window  # needed for the row_number() window below
# from pyspark.sql import HiveContext
# from pyspark import SparkConf, SparkContext
import pandas as pd
import numpy as np

spark = SparkSession.builder.getOrCreate()

focus_customer_id = spark.sql("""
    SELECT customer_id, count(1) as consumpt_cnt
    FROM btmp_cmd.nt85610_loc_poc
    WHERE week >= 1 AND week <= 8 AND loc_type_town IS NOT NULL
    GROUP BY customer_id
    HAVING consumpt_cnt >= 10
    """).select('customer_id')

df_raw = spark.sql(
    """SELECT * FROM btmp_cmd.nt85610_loc_poc WHERE merchant_flag = 'Y' AND loc_type_town IS NOT NULL"""
)
train_intent_df = df_raw.filter(F.col('week').between(0, 8)). \
    groupby(['customer_id', 'dayofweek', 'consumption_category_desc', 'loc_type_town']). \
    agg(F.count(F.lit(1)).alias('train_consum_cnt'),
        F.sum('txn_amt').alias('train_consum_tot_amt')). \
    withColumn('train_consum_cnt_rk',
               F.row_number().over(Window.partitionBy(['customer_id', 'dayofweek']). \
                                   orderBy(F.desc('train_consum_cnt'), F.desc('train_consum_tot_amt')))). \
    filter(F.col('train_consum_cnt_rk') == 1). \
    select(['customer_id', 'dayofweek', 'consumption_category_desc', 'loc_type_town'])


train_merchant_df = df_raw.filter(F.col('week').between(0, 8))
from pyspark.sql import *
df = spark.read.csv("/home/kris/datasets/small/iris.csv")
df.show()
df = spark.read.option("header", "true").option(
    "inferSchema", "true").csv("/home/kris/datasets/small/iris.csv")

df.registerTempTable("mydf")
sql("SELECT SUM(sepal_length) FROM mydf").show()
sql("SELECT SUM(sepal_length) AS sum_sep_len FROM mydf").show()
df.schema
df.printSchema()

from operator import add

linesdf = spark.read.text("/home/kris/datasets/text/jane_austen.txt")
linesdf.show(10, False)

lines = spark.read.text("/home/kris/datasets/text/jane_austen.txt").rdd.map(
    lambda r: r[0])

counts = lines.flatMap(lambda x: x.split(' ')) \
              .map(lambda x: (x, 1)) \
              .reduceByKey(add)
# ### View Tables ###

# We use the SparkSession catalog attribute to extract different
# pieces of information

# Print the tables in the catalog
print(spark.catalog.listTables())

#%%
# ### Make a query ###

query = "FROM flights SELECT * LIMIT 10"

# Get the first 10 rows of flights
flights10 = spark.sql(query)

# Show the results
flights10.show()

# ##### PySpark Dataframe to Pandas #####

query = "SELECT origin, dest, COUNT(*) as N FROM flights GROUP BY origin, dest"

# Run the query
flight_counts = spark.sql(query)

# Convert the results to a pandas DataFrame
pd_counts = flight_counts.toPandas()

# Print the head of pd_counts
print(pd_counts.head())

# Parse out the date only
df = df.withColumn(
    'date_only', F.regexp_replace(df.end_date, r' (\d+)[:](\d+)[:](\d+).*$',
                                  ''))

# Split a string and index a field
df = df.withColumn('city', F.split(df.location, '-')[1])

# Perform a date diff function
df = df.withColumn(
    'date_diff', F.datediff(F.to_date(df.end_date), F.to_date(df.start_date)))

# COMMAND ----------

df.registerTempTable("sample_df")
display(spark.sql("select * from sample_df"))

# COMMAND ----------

# DBTITLE 1,I want to convert the DataFrame back to JSON strings to send back to Kafka.
# There is an underlying toJSON() function that returns an RDD of JSON strings using the column names and schema to produce the JSON records.

rdd_json = df.toJSON()
rdd_json.take(2)
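
# To send these JSON records back to Kafka, Spark's Kafka batch sink can be used
# on the DataFrame directly; a minimal sketch with hypothetical broker and topic
# names:
# df.selectExpr("to_json(struct(*)) AS value") \
#     .write \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", "host1:9092") \
#     .option("topic", "output_topic") \
#     .save()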

# COMMAND ----------

# DBTITLE 1,My UDF takes a parameter including the column to operate on. How do I pass this parameter?
# There is a function available called lit() that creates a constant column.
from pyspark.sql import functions as F
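
# A minimal sketch (illustrative names, not from the original notebook): lit()
# wraps a plain Python value in a Column, so a constant parameter can be passed
# to a UDF alongside a real column.
from pyspark.sql.types import StringType

def tag_value(value, tag):
    return "{}:{}".format(tag, value)

tag_udf = F.udf(tag_value, StringType())

# df = df.withColumn("tagged_city", tag_udf(df.city, F.lit("city")))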