def test_udf_in_generate(self):
    from pyspark.sql.functions import udf, explode
    df = self.spark.range(5)
    f = udf(lambda x: list(range(x)), ArrayType(LongType()))
    row = df.select(explode(f(*df))).groupBy().sum().first()
    self.assertEqual(row[0], 10)

    df = self.spark.range(3)
    res = df.select("id", explode(f(df.id))).collect()
    self.assertEqual(res[0][0], 1)
    self.assertEqual(res[0][1], 0)
    self.assertEqual(res[1][0], 2)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 2)
    self.assertEqual(res[2][1], 1)

    range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
    res = df.select("id", explode(range_udf(df.id))).collect()
    self.assertEqual(res[0][0], 0)
    self.assertEqual(res[0][1], -1)
    self.assertEqual(res[1][0], 0)
    self.assertEqual(res[1][1], 0)
    self.assertEqual(res[2][0], 1)
    self.assertEqual(res[2][1], 0)
    self.assertEqual(res[3][0], 1)
    self.assertEqual(res[3][1], 1)
def process(rdd):
    print(">>>> BEGIN CASS")
    wonbids = getSqlContextInstance(rdd.context).createDataFrame(rdd)
    wonbids.registerTempTable("wonbids")
    wonbids.write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="text_bids", table="bidswon").\
        save(mode="append")
    # sqlContext.cacheTable('wonbids')
    # wonbids.show()

    symptoms = wonbids.select(wonbids.id, wonbids.created_utc,
                              explode(wonbids.symptomtags).alias('symptom'))
    symptoms.registerTempTable("symptoms")
    symptoms.write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="text_bids", table="symptoms").\
        save(mode="append")
    # symptoms.show()

    conditions = wonbids.select(wonbids.id, wonbids.created_utc,
                                explode(wonbids.conditiontags).alias('condition'))
    conditions.registerTempTable("conditions")
    conditions.write.format("org.apache.spark.sql.cassandra").\
        options(keyspace="text_bids", table="conditions").\
        save(mode="append")
    # conditions.show()

    # send back to master to process
    for w in wonbids.collect():
        event.Event('toES', {'id': w.id, 'pharmatag': w.pharmatag, 'price': w.price,
                             'created_utc': w.created_utc, 'symptomtags': w.symptomtags,
                             'conditiontags': w.conditiontags})
    print(">>>> END CASS")
def test_explode(self):
    from pyspark.sql.functions import explode, explode_outer, posexplode_outer

    d = [
        Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"}),
        Row(a=1, intlist=[], mapfield={}),
        Row(a=1, intlist=None, mapfield=None),
    ]
    rdd = self.sc.parallelize(d)
    data = self.spark.createDataFrame(rdd)

    result = data.select(explode(data.intlist).alias("a")).select("a").collect()
    self.assertEqual(result[0][0], 1)
    self.assertEqual(result[1][0], 2)
    self.assertEqual(result[2][0], 3)

    result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
    self.assertEqual(result[0][0], "a")
    self.assertEqual(result[0][1], "b")

    result = [tuple(x) for x in data.select(posexplode_outer("intlist")).collect()]
    self.assertEqual(result, [(0, 1), (1, 2), (2, 3), (None, None), (None, None)])

    result = [tuple(x) for x in data.select(posexplode_outer("mapfield")).collect()]
    self.assertEqual(result, [(0, 'a', 'b'), (None, None, None), (None, None, None)])

    result = [x[0] for x in data.select(explode_outer("intlist")).collect()]
    self.assertEqual(result, [1, 2, 3, None, None])

    result = [tuple(x) for x in data.select(explode_outer("mapfield")).collect()]
    self.assertEqual(result, [('a', 'b'), (None, None), (None, None)])
def test_explode(self):
    from pyspark.sql.functions import explode
    d = [Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})]
    rdd = self.sc.parallelize(d)
    data = self.sqlCtx.createDataFrame(rdd)

    result = data.select(explode(data.intlist).alias("a")).select("a").collect()
    self.assertEqual(result[0][0], 1)
    self.assertEqual(result[1][0], 2)
    self.assertEqual(result[2][0], 3)

    result = data.select(explode(data.mapfield).alias("a", "b")).select("a", "b").collect()
    self.assertEqual(result[0][0], "a")
    self.assertEqual(result[0][1], "b")
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))) \
        .drop('vs') \
        .withColumn('w', lit(1.0))
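# A hedged usage sketch (not part of the original test suite): the same fixture
# rebuilt on a standalone SparkSession, with a grouped aggregate to show the
# exploded shape -- each id contributes ten v values. Names are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, explode, col, lit

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = (spark.range(10).toDF('id')
      .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)]))
      .withColumn("v", explode(col('vs')))
      .drop('vs')
      .withColumn('w', lit(1.0)))
df.groupBy('id').avg('v').orderBy('id').show(3)  # 10 rows of v behind each id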
def create_one_hot_dict(self, input_df):
    """Creates a one-hot-encoder dictionary based on the input data.

    Args:
        input_df (DataFrame with 'features' column): A DataFrame where each row
            contains a list of (featureID, value) tuples.

    Returns:
        dict: A dictionary where the keys are (featureID, value) tuples and map
            to values that are unique integers.
    """
    distinct_feats = input_df.select(explode(input_df.features)).distinct()
    # print(distinct_feats.take(1)[0])
    return distinct_feats.rdd.map(lambda x: tuple(x[0])).zipWithIndex().collectAsMap()
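# A hedged usage sketch (illustrative, not from the original source): build a
# tiny DataFrame whose 'features' column holds (featureID, value) pairs and run
# the same explode/distinct/zipWithIndex pattern as a free function.
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.master("local[*]").getOrCreate()
sample_df = spark.createDataFrame(
    [([(0, 'mouse'), (1, 'black')],),
     ([(0, 'cat'), (1, 'tabby')],)],
    ['features'])

distinct_feats = sample_df.select(explode(sample_df.features)).distinct()
ohe_dict = distinct_feats.rdd.map(lambda x: tuple(x[0])).zipWithIndex().collectAsMap()
print(ohe_dict)  # e.g. {(0, 'mouse'): 0, (1, 'black'): 1, ...} (order not guaranteed)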
def get_average_expected_change(timeframe, partner, purchase):
    """
    returns the average expected change of day 1 to day 7
    input: a period (string), a partner (string), a spark dataframe
    output: a dictionary with key=day and value=average_expected_change
    """
    keys = [1, 2, 3, 4, 5, 6, 7]
    result = {key: 0 for key in keys}
    timeframe_is = get_date(timeframe)
    unlisted = purchase.select(explode(purchase.prediction.days).alias("test")).collect()
    # print 'length of unlisted:', len(unlisted)
    total = len(unlisted) / len(keys)
    # print total
    for i in range(len(unlisted)):
        result[i % 7 + 1] += unlisted[i][0].expected_change / total
    return result
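# A hedged alternative sketch (not from the original source): the same per-day
# averaging can stay inside Spark instead of collect()-ing every exploded row.
# Assumes each prediction.days array holds exactly 7 entries in day order, so
# the per-row position from posexplode maps directly onto day numbers 1..7.
from pyspark.sql.functions import posexplode, avg, col

day_avgs = (purchase
            .select(posexplode(purchase.prediction.days).alias("pos", "day"))
            .groupBy((col("pos") + 1).alias("day_number"))
            .agg(avg(col("day.expected_change")).alias("average_expected_change")))
day_avgs.orderBy("day_number").show()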
# linesOut.saveAsTextFile("hdfs:///tmp/fact_icd9_encounter_08242016_supP2_rdd_.csv")
# linesOut.saveAsTextFile('hdfs:///tmp/fact_icd9_encounter_08242016_supP2_rdd.csv')
linesOut.saveAsTextFile("/Users/jayurbain/Dropbox/MCW/fact_icd9_encounter_08242016_supP2_rdd.txt")
linesOut.reduce(lambda k, v: (k))

################
from pyspark.sql import Row
from pyspark.sql.functions import explode

df = sqlContext.createDataFrame([Row(a=1, b=[1, 2, 3], c=[7, 8, 9]),
                                 Row(a=2, b=[4, 5, 6], c=[10, 11, 12])])
df1 = df.select(df.a, explode(df.b).alias("b"), df.c)
df2 = df1.select(df1.a, df1.b, explode(df1.c).alias("c"))

###################################
# fpgrowth example
itemsets = parts.map(lambda p: (p[3].strip().split(',')))
itemsets.getNumPartitions()
model_fp = FPGrowth.train(itemsets, minSupport=0.005, numPartitions=10)
result = model_fp.freqItemsets().collect()
for i in sorted(result, key=operator.itemgetter(1), reverse=True):
    print '(', ', '.join(i.items), ')', 'freq=', str(i.freq)
from pyspark import SparkConf, SparkContext
from pyspark.ml.feature import Tokenizer, StopWordsRemover
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *

from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    fields = [StructField("subreddit", StringType(), True),
              StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))

    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)

    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(
        ['subreddit', explode(filteredDataFrame.filtered).alias("term")])

    # group by subreddit and term, then count occurrence of term in subreddits
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db = Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']) \
        .filter('count > {}'.format(THRESHOLD)) \
        .foreachPartition(db.saveSubredditWords)
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, explode, col

spark = SparkSession.builder.appName("wordcount").getOrCreate()

lines = spark.read.text("README.md")
words = lines.select(explode(split(lines.value, ",")).alias("words"))
words.withColumn('word', explode(split(col('words'), ' ')))\
    .groupBy('word')\
    .count()\
    .sort('count', ascending=False)\
    .show()
evaluator = MulticlassClassificationEvaluator()
acc = evaluator.evaluate(pred)

#####################################################################
# https://www.datacamp.com/courses/recommendation-engines-in-pyspark

############# Collaborative Filtering
mo.printSchema()
mo = mo.select(mo.UserId.cast('integer'), mo.MovieId.cast('integer'), mo.rating.cast('double'))
mo.show()

# converting data into row-based dataframe
from pyspark.sql.functions import array, col, explode, lit, struct

def to_long(df, by=["userId"]):
    # "by" is the column by which you want the final output dataframe to be grouped by
    cols = [c for c in df.columns if c not in by]
    kvs = explode(array([
        struct(lit(c).alias("movieId"), col(c).alias("rating")) for c in cols
    ])).alias("kvs")
    # Excluding null rating values since ALS in Pyspark doesn't want blank/null values
    long_df = df.select(by + [kvs]).select(by + ["kvs.movieId", "kvs.rating"]).filter("rating IS NOT NULL")
    return long_df

mo = to_long(mo)

# sparsity
n_rating = mo.select('rating').count()
n_users = mo.select('UserId').distinct().count()
n_movies = mo.select('MovieId').distinct().count()
sparsity = (1 - (n_rating * 1.0 / (n_users * n_movies))) * 100
print("The dataframe is ", "%.2f" % sparsity + " % empty")

# preprocessing
from pyspark.sql.functions import monotonically_increasing_id
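# A hedged illustration (not from the course material): to_long() is a melt --
# each wide rating column becomes (movieId, rating) rows and nulls drop out.
# The column names below are illustrative; assumes Spark 2.3+ for the DDL
# schema string.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
wide = spark.createDataFrame(
    [(1, 5.0, None), (2, 3.0, 4.0)],
    "userId int, m1 double, m2 double")
to_long(wide).show()
# +------+-------+------+
# |userId|movieId|rating|
# +------+-------+------+
# |     1|     m1|   5.0|
# |     2|     m1|   3.0|
# |     2|     m2|   4.0|
# +------+-------+------+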
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split

host = sys.argv[1]
port = int(sys.argv[2])

spark = SparkSession\
    .builder\
    .appName("StructuredNetworkWordCount")\
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to host:port
lines = spark\
    .readStream\
    .format('socket')\
    .option('host', host)\
    .option('port', port)\
    .load()

# Split the lines into words
words = lines.select(explode(split(lines.value, ' ')).alias('word'))

# Generate running word count
wordCounts = words.groupBy('word').count()

# Start running the query that prints the running counts to the console
query = wordCounts\
    .writeStream\
    .outputMode('complete')\
    .format('console')\
    .start()

query.awaitTermination()
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC + The first issue is that we need to split each line by its spaces.
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode

shakeWordsDF = (shakespeareDF
                .select(explode(split(shakespeareDF.value, ' ')).alias('word'))
                .filter("word <> ''"))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
casSession = cascluster.connect('test2')
for aggItem in agg:
    if aggItem[0] != "":
        casSession.execute('INSERT INTO author_pub_simu (author, pub) VALUES (%s, %s)',
                           (str(aggItem[0]), str(aggItem[1])))
casSession.shutdown()
cascluster.shutdown()

sc = SparkContext("spark://ip-172-31-2-40:7077", "2016_test")
sqlContext = SQLContext(sc)

# read in data from HDFS and select columns
df1 = sqlContext.read.json("hdfs://ec2-52-34-128-244.us-west-2.compute.amazonaws.com:9000//simulated/fake_data_p1*.json").dropna()
df_sel = df1.select('recid', 'authors', 'co-authors', 'references', 'creation_date') \
    .withColumnRenamed('co-authors', 'co_authors') \
    .persist(StorageLevel.MEMORY_AND_DISK)

# explode references list and group by citation id to calculate the number of times
# that one publication has been cited
df_references = df_sel.select('recid', explode('references')) \
    .withColumnRenamed('_c0', 'cited_id') \
    .groupBy('cited_id').count() \
    .withColumnRenamed('count', 'num_cited')

# combine author and co-author list to generate a total list of authors and convert rdd into dataframe
rdd_authors = df_sel.rdd.map(lambda x: {'recid': x.recid,
                                        'authors': append_author(x.authors, x.co_authors),
                                        'creation_date': fetch_year(x.creation_date)})
df_authors = sqlContext.createDataFrame(rdd_authors)

# join citation and author dataframes
df_join = df_references.join(df_authors, df_references.cited_id == df_authors.recid, 'inner') \
    .drop(df_authors.recid)

# explode author and save to Cassandra database
df_explode_author = df_join.select('cited_id', 'num_cited', explode('authors'), 'creation_date') \
    .withColumnRenamed('_c0', 'author')
df_explode_author.persist(StorageLevel.MEMORY_AND_DISK)
df_sel.unpersist()
df_explode_author.rdd.foreachPartition(aggToCassandra2)

# combine each author publication list, group by author and calculate H-index for each author
spark = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

# create dataframe representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# split the lines into words:
# word is the new column name created with alias
words = lines.select(explode(split(lines.value, " ")).alias("word"))

# generate running word count:
wordCounts = words.groupBy('word').count()

# we have now set up the query on the streaming data. All that is left is to actually
# start receiving data and computing counts.
# To do this, we set it up to print the complete set of counts (specified by
# outputMode('complete')) to the console every time they are updated, and then start
# the streaming computation using start().
# the lines dataframe is the input table, and the wordCounts dataframe is the result table

# Start running the query that prints the running counts to the console
# open a terminal and type: nc -lk 9999
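# The snippet above describes starting the query but never does so; a hedged
# completion, mirroring the standard structured streaming word count example:
query = wordCounts \
    .writeStream \
    .outputMode('complete') \
    .format('console') \
    .start()

query.awaitTermination()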
def main(self, sc, *args):
    spark = SparkSession(sc)
    observations_parquet_path = args[0]
    gene_parquet_path = args[1]
    stats_parquet_path = args[2]
    ontology_metadata_parquet_path = args[3]
    orthologe_parquet_path = args[4]
    output_path = args[5]

    stats_df = spark.read.parquet(stats_parquet_path)
    observations_df = spark.read.parquet(observations_parquet_path)
    gene_df = spark.read.parquet(gene_parquet_path)
    ontology_df = spark.read.parquet(ontology_metadata_parquet_path)
    orthologe_df = spark.read.parquet(orthologe_parquet_path)

    group_by_cols = [
        "gene_symbol",
        "gene_accession_id",
        "allele_symbol",
        "allele_accession_id",
        "life_stage_name",
        "zygosity",
        "strain_name",
        "strain_accession_id",
    ]
    grouped_stats_cols = [
        "mp_term_id",
        "top_level_mp_term_id",
    ]

    stats_df = stats_df.withColumnRenamed("marker_symbol", "gene_symbol")
    stats_df = stats_df.withColumnRenamed(
        "marker_accession_id", "gene_accession_id"
    )
    stats_df = stats_df.withColumn("life_stage_name", explode("life_stage_name"))
    batch_query_df = (
        stats_df.where(col("significant"))
        .groupBy(*group_by_cols)
        .agg(
            *[
                collect_set(col_name).alias(col_name)
                for col_name in grouped_stats_cols
            ]
        )
    )

    grouped_obs_cols = [
        "procedure_stable_id",
        "procedure_name",
        "parameter_stable_id",
        "parameter_name",
    ]
    experiment_data = observations_df.groupBy(*group_by_cols).agg(
        *[collect_set(col_name).alias(col_name) for col_name in grouped_obs_cols]
    )
    batch_query_df = batch_query_df.join(
        experiment_data, group_by_cols, "left_outer"
    )

    gene_df = gene_df.select(
        col("mgi_accession_id").alias("gene_accession_id"),
        "ensembl_gene_id",
        "assignment_status",
        "conditional_allele_production_status",
        "es_cell_production_status",
        "mouse_production_status",
        "phenotype_status",
    )
    batch_query_df = batch_query_df.join(gene_df, "gene_accession_id", "left_outer")

    grouped_orth_cols = ["hg_hgnc_acc_id", "hg_symbol"]
    orthologe_df = orthologe_df.withColumnRenamed(
        "mg_mgi_gene_acc_id", "gene_accession_id"
    )
    orthologe_df = orthologe_df.where(
        (col("o_is_max_human_to_mouse") == "max")
        & (col("o_is_max_mouse_to_human") == "max")
        & (col("mmf_category_for_threshold") == "one-to-one")
        & (col("hmf_category_for_threshold") == "one-to-one")
        & (col("o_support_count") >= 5)
    ).select("gene_accession_id", "hg_hgnc_acc_id", "hg_symbol")
    # Remove the ones that have more than one orthologue
    # (mmf_category_for_threshold=one-to-one, hmf_category_for_threshold=one-to-one)
    orthologe_df = orthologe_df.groupBy("gene_accession_id").agg(
        *[collect_set(col_name).alias(col_name) for col_name in grouped_orth_cols]
    )
    batch_query_df = batch_query_df.join(
        orthologe_df, "gene_accession_id", "left_outer"
    )
    batch_query_df.write.parquet(output_path)
import sys

import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.types import StructType, StructField, StringType

sc = SparkContext()
spark = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

path = sys.argv[1]
df = spark.read.option("multiline", "true").json(path)

name = df.select("meta.view.name").collect()
table_id = df.select("meta.view.id").collect()
descr = df.select("meta.view.description").collect()
category = df.select("meta.view.category").collect()
data = df.select("data").collect()
table_size = len(data)

t = pd.DataFrame({'ID': table_id, 'Name': name, 'Description': descr,
                  'Category': category, 'Size': table_size})

schema_list = df.select(explode("meta.view.columns").alias("col")).select("col.name").collect()
schema_l = list()
for i in schema_list:
    schema_l.append(sc.parallelize(i).take(1)[0])

fields = [StructField(field_name, StringType(), True) for field_name in schema_l]
schema = StructType(fields)
raw_data = spark.createDataFrame(data[0][0], schema)
# print(len(data[0][0][0]))
raw_data.show()
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pprint import pprint

if __name__ == "__main__":
    session = SparkSession.builder.appName("Payload").getOrCreate()
    dataFrameReader = session.read

    responses = dataFrameReader \
        .option("header", "true") \
        .option("inferSchema", value=True) \
        .json("payload/payload500.json")

    print("=== Print out schema ===")
    responses.printSchema()
    pprint(responses.columns)

    df = responses.select(explode('events'))
    df.show()
    # pprint(df.collect())
    # responses.show()

    session.stop()
# Convert text_entry to lower case
columnName = "text_entry"
df2 = df2.withColumn(columnName, lower(col(columnName)))

# Remove punctuation from text_entry
df2 = df2.withColumn(columnName, regexp_replace(col(columnName), r'[^\sa-zA-Z0-9]', ''))

# Drop extra columns from df2
df2 = df2.drop('line_id', 'line_number', 'play_name', 'speaker', 'speech_number', 'type')

# Split the text_entry column into words using the split function
df2 = df2.withColumn("text_entry", split("text_entry", " "))

# Explode each text_entry value into multiple rows to get _id with each word of text_entry
df2 = df2.withColumn("token", explode(col("text_entry")))

# Calculate term frequency by grouping on '_id' and 'token' and counting how many times
# each token occurs in each document
df_tf = df2.groupby("_id", "token").agg(F.count("text_entry").alias("tf"))

# Calculate document frequency by grouping on each token and counting the number of
# documents it occurs in
df_idf = df2.groupby("token").agg(F.countDistinct("_id").alias("df"))

# Convert the 'df' column to DoubleType for easier calculation later on
df_idf = df_idf.withColumn("df", df_idf["df"].cast(DoubleType()))

# Calculate IDF values
df_idf = df_idf.withColumn("idf", F.log10(N / df_idf["df"]))

# Join df_tf and df_idf on the token columns
tokensWithTfIdf = df_tf.join(df_idf, df_tf["token"] == df_idf["token"], how='left') \
    .drop(df_idf["token"])
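# A hedged final step (not in the original excerpt): with tf and idf joined,
# the usual TF-IDF score is simply their product per (_id, token) pair.
tokensWithTfIdf = tokensWithTfIdf.withColumn("tf_idf", col("tf") * col("idf"))
tokensWithTfIdf.orderBy(col("tf_idf").desc()).show(10)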
# MAGIC %md
# MAGIC ### Comments

# COMMAND ----------

# MAGIC %md
# MAGIC count:long
# MAGIC items:array
# MAGIC total_count:long

# COMMAND ----------

from pyspark.sql.functions import explode, when

df_comments = df.select(df.comments.count.alias('comments_count'),
                        explode(df.comments.items),
                        df.comments.total_count.alias('total_count'),
                        df.checkin_id)
print(df_comments.columns)

# COMMAND ----------

for col in df_comments_flattened.columns:
    splits = col.split('col_')
    name = splits[len(splits) - 1]
    df_comments_flattened = df_comments_flattened.withColumnRenamed(col, name)

display(df_comments_flattened)

# COMMAND ----------

create_register_delta_table(df_comments_flattened, 'comments',
userSchema = StructType().add("ID", "string").add("Lang", "string").add(
    "Date", "string").add("Source", "string").add("len", "integer").add(
    "likes", "integer").add("RT", "integer").add("Hash", "string").add(
    "UserN", "string").add("UserID", "string").add("Names", "string").add(
    "Place", "string").add("Follow", "integer").add("Friend", "integer")

csvdf = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("hdfs://localhost:9000/stream")
# print(type(csvdf))

tags = csvdf.select("Hash")  # .rdd.flatMap(lambda x: (x.split(','), 1))
# print(type(tags))
words = tags.select(explode(split("Hash", ",")).alias("splithash"))
wordcount = words.groupby("splithash").count()
most = wordcount.select("splithash", "count").orderBy("count", ascending=False).limit(5)
# query1 = tags.collect()

query = most.writeStream.outputMode("complete").format("console").start()
query.awaitTermination(60)
query.stop()
def with_explode_column(df):
    import pyspark.sql.functions as F
    df2 = df.withColumn('values', F.array(F.lit(1), F.lit(2)))
    df2 = df2.withColumn('value', F.explode(df2.values))
    return df2
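# A hedged usage sketch (illustrative): each input row is duplicated, once per
# element of the two-element 'values' array.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
with_explode_column(spark.range(2)).show()
# +---+------+-----+
# | id|values|value|
# +---+------+-----+
# |  0|[1, 2]|    1|
# |  0|[1, 2]|    2|
# |  1|[1, 2]|    1|
# |  1|[1, 2]|    2|
# +---+------+-----+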
import base64
import re

# ********************************CONFIGURATION**********************************
end = cfg.end_time
start = cfg.start_time
# HDFS path to the impalaQueries_<date>.json file created in step 1
hdfsPath = cfg.hdfs_path + "jsons_" + str(start.month) + "-" + str(start.day) + \
    "_to_" + str(end.month) + "-" + str(end.day) + "/"
# *******************************************************************************

# Read in the JSON data:
spark = SparkSession.builder.getOrCreate()
df = spark.read.json(hdfsPath).withColumn("queries", explode("queries")).select("queries.*")

# Add any missing top-level columns and then select them:
raw_columns = ['coordinator', 'database', 'detailsAvailable', 'durationMillis', 'endTime',
               'queryId', 'querystate', 'queryType', 'rowsProduced', 'startTime',
               'statement', 'user']
for i in range(len(raw_columns)):
    if raw_columns[i] not in df.columns:
        df = df.withColumn(raw_columns[i], lit(None).cast(StringType()))
df = df.selectExpr('attributes.*', *raw_columns)

# Add any missing attribute-level columns and then select them:
attribute_columns = [
    'admission_result', 'admission_wait', 'bytes_streamed', 'client_fetch_wait_time',
    'client_fetch_wait_time_percentage', 'connected_user', 'ddl_type', 'delegated_user',
    'estimated_per_node_peak_memory', 'file_formats', 'hdfs_average_scan_range',
    'hdfs_bytes_read', 'hdfs_bytes_read_from_cache', 'hdfs_bytes_read_from_cache_percentage',
    'hdfs_bytes_read_local', 'hdfs_bytes_read_local_percentage', 'hdfs_bytes_read_remote',
    'hdfs_bytes_read_remote_percentage', 'hdfs_bytes_read_short_circuit',
    'hdfs_bytes_read_short_circuit_percentage', 'hdfs_bytes_skipped', 'hdfs_bytes_written',
    'hdfs_scanner_average_bytes_read_per_second', 'impala_version', 'memory_accrual',
    'memory_aggregate_peak', 'memory_per_node_peak', 'memory_per_node_peak_node',
    'memory_spilled', 'network_address', 'oom', 'original_user', 'planning_wait_time',
    'planning_wait_time_percentage', 'pool', 'query_status', 'rows_inserted', 'session_id',
    'session_type', 'stats_corrupt', 'stats_missing', 'thread_cpu_time',
    'thread_cpu_time_percentage', 'thread_network_receive_wait_time',
    'thread_network_receive_wait_time_percentage', 'thread_network_send_wait_time',
    'thread_network_send_wait_time_percentage', 'thread_storage_wait_time',
    'thread_storage_wait_time_percentage', 'thread_total_time']
for i in range(len(attribute_columns)):
    if attribute_columns[i] not in df.columns:
        df = df.withColumn(attribute_columns[i], lit(None).cast(StringType()))

df = df.withColumn("statement", regexp_replace("statement", r"\s+", " "))
df = df.selectExpr(
    "admission_result", "cast(admission_wait as int)", "cast(bytes_streamed as bigint)",
    "cast(client_fetch_wait_time as int)", "cast(client_fetch_wait_time_percentage as tinyint)",
    "connected_user", "ddl_type", "delegated_user",
    "cast(estimated_per_node_peak_memory as bigint)", "file_formats",
    "cast(hdfs_average_scan_range as float)", "cast(hdfs_bytes_read as bigint)",
    "cast(hdfs_bytes_read_from_cache as tinyint)",
    "cast(hdfs_bytes_read_from_cache_percentage as tinyint)",
    "cast(hdfs_bytes_read_local as bigint)", "cast(hdfs_bytes_read_local_percentage as tinyint)",
    "cast(hdfs_bytes_read_remote as bigint)",
    "cast(hdfs_bytes_read_remote_percentage as tinyint)",
    "cast(hdfs_bytes_read_short_circuit as bigint)",
    "cast(hdfs_bytes_read_short_circuit_percentage as tinyint)",
    "cast(hdfs_bytes_skipped as int)", "cast(hdfs_bytes_written as int)",
    "cast(hdfs_scanner_average_bytes_read_per_second as float)",
    "cast(memory_accrual as float)", "cast(memory_aggregate_peak as float)",
    "cast(memory_per_node_peak as float)", "memory_per_node_peak_node",
    "cast(memory_spilled as bigint)", "network_address", "cast(oom as boolean)",
    "original_user", "cast(planning_wait_time as smallint)",
    "cast(planning_wait_time_percentage as smallint)", "pool", "query_status",
    "cast(rows_inserted as int)", "session_id", "session_type",
    "cast(stats_corrupt as boolean)", "cast(stats_missing as boolean)",
    "cast(thread_cpu_time as int)", "cast(thread_cpu_time_percentage as tinyint)",
    "cast(thread_network_receive_wait_time as int)",
    "cast(thread_network_receive_wait_time_percentage as tinyint)",
    "cast(thread_network_send_wait_time as int)",
    "cast(thread_network_send_wait_time_percentage as tinyint)",
    "cast(thread_storage_wait_time as int)",
    "cast(thread_storage_wait_time_percentage as tinyint)",
    "cast(thread_total_time as int)", "coordinator.hostid as hostid", "database",
    "cast(durationmillis as bigint)", "endtime", "queryid", "querystate", "querytype",
    "cast(rowsproduced as bigint)", "starttime", "statement", "user")
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName("WordCount").getOrCreate()

dataFrame = spark.read.text("file:///SparkCourse/book.txt")
words = dataFrame.select(func.explode(func.split(dataFrame.value, "\\W+")).alias("word"))
words = words.filter(words.word != "")  # reassign; filter alone doesn't modify words
words = words.select(func.lower(words.word).alias("word"))
words = words.groupBy("word").count()
wordsSource = words.sort("count")
wordsSource.show(wordsSource.count())
def recognize(binary):
    # the snippet begins inside this function; the signature is reconstructed
    # from the udf below, which passes the raw audio bytes
    s = io.BytesIO(binary)
    r = sr.Recognizer()
    with sr.AudioFile(s) as source:
        audio = r.record(source)
    try:
        print("Transcribing...")
        text = r.recognize_sphinx(audio)
        print("Done!")
        return text
    except Exception:
        msg = "no_transcription_available"
        print("Darn! Could not transcribe audio.")
        return msg

sttudf = udf(lambda z: recognize(z), StringType())
splitudf = udf(lambda x: splitWav(x), ArrayType(BinaryType()))
convertudf = udf(lambda x: convertToWav(x), BinaryType())

df = spark.read.format("binaryFile") \
    .option("pathGlobFilter", "DTNS*.mp3") \
    .option("recursiveFileLookup", "true") \
    .load("s3a://jordan-podcast-s3/")
df = df.withColumn("WAVAudio", convertudf(df.content)).drop("modificationTime", "length", "content")
df = df.withColumn("splitwavs", splitudf(df.WAVAudio)).drop("WAVAudio")
df = df.withColumn("splitwavs", explode(df.splitwavs))
df = df.repartition(36)
df = df.withColumn("transcriptions", sttudf(df.splitwavs)).drop("splitwavs")
df = df.groupby("path").agg(collect_list('transcriptions').alias("transcriptions"))
df = df.withColumn("transcriptions", concat_ws(" ", "transcriptions"))

df.write.format('org.elasticsearch.spark.sql')\
    .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
    .option('es.port', 9200)\
    .option('es.resource', "podcast2/test")\
    .save()
StructField("Len", StringType(), True), StructField("Likes", StringType(), True), StructField("RTs", StringType(), True), StructField("Hashtags", StringType(), True), StructField("UserMentionNames", StringType(), True), StructField("UserMentionID", StringType(), True), StructField("name", StringType(), True), StructField("Place", StringType(), True), StructField("Followers", StringType(), True), StructField("Friends", StringType(), True)]) inputDF = spark.readStream.schema(schema).option("delimiter",";").option("maxFilesPerTrigger",1).csv(inputPath) query1 =inputDF.select(explode(split("Hashtags", ",")).alias("Hashtags")).groupBy("Hashtags").count().orderBy('count', ascending=False) #query2 = inputDF.withColumn("Ratio",inputDF.Followers/inputDF.Friends).groupBy('name').agg(max('Ratio').alias('FRRatio')).orderBy('FRRatio', ascending=False) query1.writeStream.outputMode("complete").format("console").option("numRows",5).start().awaitTermination(100) #query2.writeStream.outputMode("complete").format("console").option("numRows",1).start().awaitTermination(100)
def data(self):
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i) for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))).drop('vs')
def sample(df, seed=None, samples_per_wiki=1000000):
    """Choose a representative sample of queries from input dataframe.

    Takes in the unsampled query click logs and filters it down into a smaller
    representative sample that can be run through the machine learning pipeline.
    Note that when using data in the `discovery.query_clicks_daily` table the
    query needs to be post-processed to normalize the queries for grouping.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Input dataframe with columns wikiid, query, and session_id.
    seed : int or None, optional
        The random seed used when sampling. If None a seed will be chosen
        randomly. (Default: None)
    samples_per_wiki : int, optional
        The desired number of distinct (query, hit_page_id) pairs in the output.
        This constraint is approximate and the returned number of queries may
        vary per wiki. (Default: 1000000)

    Returns
    -------
    pyspark.sql.DataFrame
        The input DataFrame with all columns it originally had sampled down
        based on the provided constraints.
    """
    mjolnir.spark.assert_columns(df, ['wikiid', 'query', 'hit_page_ids', 'norm_query_id',
                                      'session_id'])

    # We need this df twice, and by default the df coming in here is from
    # mjolnir.norm_query which is quite expensive.
    df.cache()

    # Figure out the percentage of each wiki's norm_query_id's we need to approximately
    # have samples_per_wiki final training samples.
    hit_page_id_counts = (
        df
        .select('wikiid', 'query', F.explode('hit_page_ids').alias('hit_page_id'))
        # We could groupBy('wikiid').agg(F.countDistinct('query', 'hit_page_id'))
        # directly, but this causes spark to blow out memory limits by
        # collecting too much data on too few executors.
        .groupBy('wikiid', 'query')
        .agg(F.countDistinct('hit_page_id').alias('num_hit_page_ids'))
        .groupBy('wikiid')
        .agg(F.sum('num_hit_page_ids').alias('num_hit_page_ids'))
        .collect())
    hit_page_id_counts = {row.wikiid: row.num_hit_page_ids for row in hit_page_id_counts}

    wiki_percents = {}
    needs_sampling = False
    for wikiid, num_hit_page_ids in hit_page_id_counts.items():
        wiki_percents[wikiid] = min(1., float(samples_per_wiki) / num_hit_page_ids)
        if wiki_percents[wikiid] < 1.:
            needs_sampling = True

    if not needs_sampling:
        return hit_page_id_counts, df

    # Aggregate down into a unique set of (wikiid, norm_query_id) and add in a
    # count of the number of unique sessions per pair. We will sample per-strata
    # based on percentiles of num_sessions.
    df_queries_unique = (
        df
        .groupBy('wikiid', 'norm_query_id')
        .agg(F.countDistinct('session_id').alias('num_sessions'))
        # This rdd will be used multiple times through strata generation and
        # sampling. Cache to not duplicate the filtering and aggregation work.
        .cache())

    df_queries_sampled = _sample_queries(df_queries_unique, wiki_percents, seed=seed)

    # Select the rows chosen by sampling from the input df
    df_sampled = (
        df
        .join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id'])
        .cache())

    return hit_page_id_counts, df_sampled
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.appName('WordCount').getOrCreate()

inputDF = spark.read.text('book.txt')
words = inputDF.select(func.explode(func.split(inputDF.value, '\\W+')).alias('word'))
words = words.filter(words.word != '')  # reassign; filter alone doesn't modify words
lowerCaseWords = words.select(func.lower(words.word).alias('word'))
wordCount = lowerCaseWords.groupBy('word').count()
wordCountSorted = wordCount.sort('count')
wordCountSorted.show(wordCountSorted.count())

spark.stop()
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import explode, split, regexp_replace
import re

spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
sc = spark.sparkContext

df = spark.createDataFrame([("<#spark><java>", 1),
                            ("<ab.c><p++ython><p.hp>", 2),
                            ("<a>", 3)],
                           ["Tags", "UserId"])
df.show()

df = df.withColumn("Tags", explode(split("Tags", "(?=<)|(?<=>)")))
df = df.withColumn("Tags", regexp_replace("Tags", r'\<|\>', ""))
df.filter(df.Tags != "").show()

def split_tags(row):
    owner_user_id = row.UserId
    tags = row.Tags
    result = re.findall("<(.*?)>", tags)
    pairs = []
    for res in result:
        pairs.append([owner_user_id, res])
    print(pairs)
    # note: RDDs are created with SparkContext.parallelize, not SparkSession,
    # and parallelize cannot be called from inside an RDD transformation anyway
    return pairs

# df.rdd.map(split_tags).toDF().show()
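# A hedged working alternative (not in the original source): instead of trying
# to build an RDD inside a map function, flatMap each row to (UserId, tag)
# pairs on the executors. Assumes df still holds the raw "<...>" strings, i.e.
# run this before the explode/regexp_replace transformations above.
tag_pairs = df.rdd.flatMap(
    lambda row: [(row.UserId, tag) for tag in re.findall("<(.*?)>", row.Tags)])
tag_pairs.toDF(["UserId", "Tag"]).show()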
sentenceDF.show(truncate=False)
(sentenceDF
    .select(removePunctuation(col('sentence')))
    .show(truncate=False))

# loading text file
fileName = ""
shakespeareDF = sqlContext.read.text(fileName).select(removePunctuation(col('value')))
shakespeareDF.show(15, truncate=False)

# sample for splitting lines to words and making words separate rows in data frames
from pyspark.sql.functions import split, explode
shakeWordsDF1 = shakespeareDF.select(explode(split(shakespeareDF.sentence, ' ')).alias('word'))
shakeWordsDF = shakeWordsDF1.where(shakeWordsDF1.word != "")
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc("count"))
topWordsAndCountsDF.show()

# sample for creating collection of few records
from faker import Factory
fake = Factory.create()
fake.seed(4321)

from pyspark.sql import Row

def fake_entry():
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to host:port
data_schema = [
    StructField("ID", StringType(), True),
    StructField("Language", StringType(), True),
    StructField("Date", StringType(), True),
    StructField("Source", StringType(), True),
    StructField("Length", IntegerType(), True),
    StructField("Likes", IntegerType(), True),
    StructField("Retweets", IntegerType(), True),
    StructField("Hashtags", StringType(), True),
    StructField("UserMentionNames", StringType(), True),
    StructField("UserMentionID", StringType(), True),
    StructField("Name", StringType(), True),
    StructField("Place", StringType(), True),
    StructField("Followers", IntegerType(), True),
    StructField("Friends", IntegerType(), True)
]
finalschema = StructType(fields=data_schema)

v = spark \
    .readStream \
    .format("csv") \
    .option("sep", ";") \
    .schema(finalschema) \
    .load("hdfs://localhost:9000/stream")

v = v.select((explode(split("Hashtags", ","))).alias("Hashtags"))
v = v.groupby("Hashtags").count()
v.createOrReplaceTempView("twitter")
top = spark.sql("SELECT Hashtags, count FROM twitter ORDER BY 2 DESC LIMIT 5")
finalquery(top)
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode

# shakeWordsDF = (shakespeareDF
#                 .select(shakespeareDF[0].alias('sentence'))
#                 )
shakeWordsDF = (shakespeareDF
                .select(split(shakespeareDF[0], " ").alias('wordLst'))
                )
shakeWordsDF = (shakeWordsDF
                .select(explode(shakeWordsDF.wordLst).alias('word'))
                .where("word != ''")
                )

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--docs_path', default='data/wiki-sample/AA')
    parser.add_argument('-p', '--prepro_path', default='data/prepro')
    parser.add_argument('-q', '--queries_path', default='data/queries/sample.json')
    parser.add_argument('-o', '--output_path', default='data/output')
    parser.add_argument('-m', '--mode', choices=['prepro', 'fit', 'query'], default='prepro')
    parser.add_argument('-dl', '--docs_limit', type=int)
    parser.add_argument('-ql', '--queries_limit', type=int)
    parser.add_argument('-il', '--inverted_index_limit', type=int, default=5000)
    args = parser.parse_args()
    print('Running BigramPipeline with args: {}'.format(args))

    spark = SparkSession.builder.appName('BigramModel').getOrCreate()
    tokenIdsUdf = udf(lambda x: x.indices.tolist(), ArrayType(IntegerType()))

    tfIdfModelPath = os.path.join(args.prepro_path, 'tf_idf_model')
    docsTfIdfPath = os.path.join(args.prepro_path, 'docs_tf_idf')
    docsTokenIdsPath = os.path.join(args.prepro_path, 'docs_token_ids')
    docsBigramsPath = os.path.join(args.prepro_path, 'docs_bigrams')

    parser = WikiParser(inputCol='text', outputCol='text_parsed', minParagraphs=1,
                        minCharacters=500)
    tokenizer = Tokenizer(inputCol='text_parsed', outputCol='unigrams')
    ngrams = NGram(inputCol='unigrams', outputCol='bigrams', n=2)
    concat = Concat(inputCols=['unigrams', 'bigrams'], outputCol='tokens')

    if args.mode == 'prepro':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docs = spark.read.json(args.docs_path)
        if args.docs_limit is not None:
            docs = docs.limit(args.docs_limit)

        spark.sparkContext.setJobGroup('parse_docs', 'Parse wiki documents')
        docsParsed = parser.transform(docs)
        docsParsed = checkpoint(spark, docsParsed, os.path.join(args.prepro_path, 'docs_parsed'))

        spark.sparkContext.setJobGroup('tokenize', 'Tokenize documents')
        docsTokenized = tokenizer.transform(docsParsed)
        docsTokenized = checkpoint(
            spark, docsTokenized, os.path.join(args.prepro_path, 'docs_tokenized'))

        spark.sparkContext.setJobGroup('ngrams', 'Compute bigrams')
        docsBigrams = ngrams.transform(docsTokenized)
        docsBigrams = concat.transform(docsBigrams)
        docsBigrams.write.parquet(docsBigramsPath)

    elif args.mode == 'fit':
        spark.sparkContext.setJobGroup('input', 'Read input data')
        docsBigrams = spark.read.parquet(docsBigramsPath).select('id', 'tokens')
        tf = CountVectorizer(inputCol='tokens', outputCol='tf', vocabSize=10000000,
                             minDF=2.0, minTF=3.0)
        idf = IDF(inputCol='tf', outputCol='idf')

        spark.sparkContext.setJobGroup('tf', 'Fit TF model')
        tfModel = tf.fit(docsBigrams)
        docsTf = tfModel.transform(docsBigrams)
        docsTf = checkpoint(spark, docsTf, os.path.join(args.prepro_path, 'docs_tf'))

        spark.sparkContext.setJobGroup('idf', 'Fit IDF model')
        idfModel = idf.fit(docsTf)
        docsTfIdf = idfModel.transform(docsTf)
        docsTfIdf = docsTfIdf.select(docsTfIdf.id.alias('doc_id'),
                                     docsTfIdf.idf.alias('doc_idf'))
        docsTfIdf = checkpoint(spark, docsTfIdf, docsTfIdfPath)
        tfIdfModel = PipelineModel(stages=[tokenizer, ngrams, concat, tfModel, idfModel])
        tfIdfModel.save(tfIdfModelPath)

        spark.sparkContext.setJobGroup('docs_token_ids', 'Compute inverted index')
        docsTokenIds = docsTfIdf.select(
            docsTfIdf.doc_id,
            explode(tokenIdsUdf(docsTfIdf.doc_idf)).alias('token_id'))
        docsTokenIds.write.parquet(docsTokenIdsPath)

    elif args.mode == 'query':
        assert args.queries_path is not None
        spark.sparkContext.setJobGroup('input', 'Read input data')
        tfIdfModel = PipelineModel.load(tfIdfModelPath)
        docsTfIdf = spark.read.parquet(docsTfIdfPath)
        docsTokenIds = spark.read.parquet(docsTokenIdsPath)
        queries = spark.read.json(args.queries_path)
        if args.queries_limit is not None:
            queries = queries.limit(args.queries_limit)
        queries = queries.select(queries._id.alias('query_id'),
                                 queries.question.alias('text_parsed'))

        spark.sparkContext.setJobGroup('queries_tf_idf', 'Apply TF-IDF to queries')
        queriesTfIdf = tfIdfModel.transform(queries)
        queriesTfIdf = queriesTfIdf.select(queriesTfIdf.query_id,
                                           queriesTfIdf.tf.alias('query_tf'))
        queriesTfIdf = checkpoint(
            spark, queriesTfIdf, os.path.join(args.output_path, 'queries_tf_idf'))
        print('Finished query TF IDF')

        spark.sparkContext.setJobGroup('queries_token_ids', 'Compute query token IDs')
        queriesTokenIds = queriesTfIdf.select(
            queriesTfIdf.query_id,
            explode(tokenIdsUdf(queriesTfIdf.query_tf)).alias('token_id'))
        queriesTokenIds = checkpoint(
            spark, queriesTokenIds, os.path.join(args.output_path, 'queries_token_ids'))
        print('Finished query token IDs')

        spark.sparkContext.setJobGroup('doc_queries', 'Perform inverted index filtering')
        docQueries = docsTokenIds.join(queriesTokenIds, on='token_id') \
            .groupby('query_id', 'doc_id').count()
        window = Window.partitionBy(docQueries.query_id).orderBy(col('count').desc())
        docQueries = docQueries.withColumn('rank', row_number().over(window)) \
            .filter(col('rank') <= args.inverted_index_limit) \
            .select('query_id', 'doc_id')
        docQueries = checkpoint(spark, docQueries,
                                os.path.join(args.output_path, 'doc_queries'))
        print('Finished inverted index filter')

        spark.sparkContext.setJobGroup('score', 'Perform scoring')
        docQueries = docQueries.join(docsTfIdf, on='doc_id').join(queriesTfIdf, on='query_id') \
            .select('query_id', 'doc_id', 'query_tf', 'doc_idf')
        docQueries = Dot(inputCols=['doc_idf', 'query_tf'], outputCol='score') \
            .transform(docQueries)
        queryResults = docQueries.select('query_id', 'doc_id', 'score')
        queryResults.write.parquet(os.path.join(args.output_path, 'query_results'))
        print('Wrote output to {}'.format(args.output_path))

    spark.stop()
spark = SparkSession \
    .builder \
    .appName("StructuredNetworkWordCount") \
    .getOrCreate()

# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .load()

# Split the lines into words
words = lines.select(
    explode(
        split(lines.value, " ")
    ).alias("word")
)

# Generate running word count
wordCounts = words.groupBy("word").count()

output_mode = "complete"
# output_mode = "append"

# Start running the query that prints the running counts to the console
query = wordCounts \
    .writeStream \
    .outputMode(output_mode) \
    .format("console") \
    .start()
data = data.withColumn("subject", regexp_replace("subject", "\s+", "")) # data = data.withColumn("subject", regexp_replace("subject"," ","")) # Question 1 Answer # Get the count of the emails data.select("location").distinct().count() text_file = open("Q1.txt", "w") text_file.write("%s" % int(data.select("location").distinct().count())) text_file.close() # Verified by calculating the number of files from os directory data = data.select( "*", (explode(split(col("to"), ",")).alias("to_clean"))).where('to_clean != ""') data = data.withColumn("to_clean", regexp_replace("to_clean", "\s+", "")) data = data.withColumn("to_clean", regexp_replace("to_clean", " ", "")) # Question 2 # Here we explode to to column as it is a comma separated list and then perform data_q2 = data.groupBy('to_clean').agg( count("message").alias("total_messages")).orderBy("total_messages", ascending=False) data_q2 = data_q2.toPandas() data_q2.to_csv("Q2.txt", index=False, sep="\t") # Verfied by looking at the volume of message for a person they seem to be inline # Question 3: # Here we do a self join and try to match the from and to variable and then match the subject with "Re:" Removed to denote response
# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode

shakeWordsDF = (shakespeareDF
                <FILL IN>)

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import col, split, size, explode

shakeWordsDF = (shakespeareDF
                .select(split('sentence', r'\s+').alias('words'))
                .select(explode('words').alias('word'))
                .where(col('word') != ''))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# PRIVATE_TEST Remove empty elements (4d)
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC + The first issue is that we need to split each line by its spaces.
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode

shakeWordsDF = (
    shakespeareDF.select(explode(split(shakespeareDF.sentence, " ")))
    .where("col != ''")
    .selectExpr("col as word"))

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC + The first issue is that we need to split each line by its spaces.
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode

shakeWordsDF = (shakespeareDF
                .select(explode(split(shakespeareDF.sentence, ' '))
                        .alias("word"))
                .where("word != ''"))

shakeWordsDF.show(truncate=False)
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
def rdd_to_recordstore(rdd_transform_context_rdd):
    if rdd_transform_context_rdd.isEmpty():
        MonMetricsKafkaProcessor.log_debug(
            "rdd_to_recordstore: nothing to process...")
    else:
        sql_context = SQLContext(rdd_transform_context_rdd.context)
        data_driven_specs_repo = DataDrivenSpecsRepoFactory.\
            get_data_driven_specs_repo()
        pre_transform_specs_df = data_driven_specs_repo.\
            get_data_driven_specs(
                sql_context=sql_context,
                data_driven_spec_type=DataDrivenSpecsRepo.
                pre_transform_specs_type)

        #
        # extract second column containing raw metric data
        #
        raw_mon_metrics = rdd_transform_context_rdd.map(
            lambda nt: nt.rdd_info[1])

        #
        # convert raw metric data rdd to dataframe rdd
        #
        raw_mon_metrics_df = \
            MonMetricUtils.create_mon_metrics_df_from_json_rdd(
                sql_context,
                raw_mon_metrics)

        #
        # filter out unwanted metrics and keep metrics we are interested in
        #
        cond = [
            raw_mon_metrics_df.metric.name ==
            pre_transform_specs_df.event_type]
        filtered_metrics_df = raw_mon_metrics_df.join(
            pre_transform_specs_df, cond)

        #
        # validate filtered metrics to check if required fields
        # are present and not empty.
        # In order to be able to apply the filter function we had to convert
        # the dataframe rdd to a normal rdd. After validation the rdd is
        # converted back to a dataframe rdd
        #
        # FIXME: find a way to apply filter function on dataframe rdd data
        validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter(
            MonMetricsKafkaProcessor._validate_raw_mon_metrics)
        validated_mon_metrics_df = sql_context.createDataFrame(
            validated_mon_metrics_rdd, filtered_metrics_df.schema)

        #
        # record generator
        # generate a new intermediate metric record if a given metric's
        # metric_id_list, in the pre_transform_specs table, has several
        # intermediate metrics defined.
        # intermediate metrics are used as a convenient way to
        # process an (aggregated) metric in multiple ways by making a copy
        # of the source data for each processing
        #
        gen_mon_metrics_df = validated_mon_metrics_df.select(
            validated_mon_metrics_df.meta,
            validated_mon_metrics_df.metric,
            validated_mon_metrics_df.event_processing_params,
            validated_mon_metrics_df.event_type,
            explode(validated_mon_metrics_df.metric_id_list).alias(
                "this_metric_id"),
            validated_mon_metrics_df.service_id)

        #
        # transform metrics data to record_store format.
        # record store format is the common format which will serve as
        # source to aggregation processing.
        # converting the metric to a common standard format helps in writing
        # generic aggregation routines driven by configuration parameters
        # that can be reused
        #
        record_store_df = gen_mon_metrics_df.select(
            (gen_mon_metrics_df.metric.timestamp / 1000).alias(
                "event_timestamp_unix"),
            from_unixtime(
                gen_mon_metrics_df.metric.timestamp / 1000).alias(
                "event_timestamp_string"),
            gen_mon_metrics_df.event_type.alias("event_type"),
            gen_mon_metrics_df.event_type.alias("event_quantity_name"),
            (gen_mon_metrics_df.metric.value / 1.0).alias(
                "event_quantity"),
            when(gen_mon_metrics_df.metric.dimensions.state != '',
                 gen_mon_metrics_df.metric.dimensions.state).otherwise(
                'NA').alias("event_status"),
            lit('1.0').alias('event_version'),
            lit('metrics').alias("record_type"),

            # resource_uuid
            when(gen_mon_metrics_df.metric.dimensions.instanceId != '',
                 gen_mon_metrics_df.metric.dimensions.instanceId).when(
                gen_mon_metrics_df.metric.dimensions.resource_id != '',
                gen_mon_metrics_df.metric.dimensions.resource_id).otherwise(
                'NA').alias("resource_uuid"),

            when(gen_mon_metrics_df.metric.dimensions.tenantId != '',
                 gen_mon_metrics_df.metric.dimensions.tenantId).when(
                gen_mon_metrics_df.metric.dimensions.tenant_id != '',
                gen_mon_metrics_df.metric.dimensions.tenant_id).when(
                gen_mon_metrics_df.metric.dimensions.project_id != '',
                gen_mon_metrics_df.metric.dimensions.project_id).otherwise(
                'NA').alias("tenant_id"),

            when(gen_mon_metrics_df.metric.dimensions.mount != '',
                 gen_mon_metrics_df.metric.dimensions.mount).otherwise(
                'NA').alias("mount"),

            when(gen_mon_metrics_df.metric.dimensions.device != '',
                 gen_mon_metrics_df.metric.dimensions.device).otherwise(
                'NA').alias("device"),

            when(gen_mon_metrics_df.meta.userId != '',
                 gen_mon_metrics_df.meta.userId).otherwise('NA').alias(
                "user_id"),

            when(gen_mon_metrics_df.meta.region != '',
                 gen_mon_metrics_df.meta.region).when(
                gen_mon_metrics_df.event_processing_params
                .set_default_region_to != '',
                gen_mon_metrics_df.event_processing_params
                .set_default_region_to).otherwise(
                'NA').alias("region"),

            when(gen_mon_metrics_df.meta.zone != '',
                 gen_mon_metrics_df.meta.zone).when(
                gen_mon_metrics_df.event_processing_params
                .set_default_zone_to != '',
                gen_mon_metrics_df.event_processing_params
                .set_default_zone_to).otherwise(
                'NA').alias("zone"),

            when(gen_mon_metrics_df.metric.dimensions.hostname != '',
                 gen_mon_metrics_df.metric.dimensions.hostname).when(
                gen_mon_metrics_df.metric.value_meta.host != '',
                gen_mon_metrics_df.metric.value_meta.host).otherwise(
                'NA').alias("host"),

            when(gen_mon_metrics_df.service_id != '',
                 gen_mon_metrics_df.service_id).otherwise(
                'NA').alias("service_group"),
            when(gen_mon_metrics_df.service_id != '',
                 gen_mon_metrics_df.service_id).otherwise(
                'NA').alias("service_id"),

            from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                          'yyyy-MM-dd').alias("event_date"),
            from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                          'HH').alias("event_hour"),
            from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                          'mm').alias("event_minute"),
            from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000,
                          'ss').alias("event_second"),
            gen_mon_metrics_df.this_metric_id.alias("metric_group"),
            gen_mon_metrics_df.this_metric_id.alias("metric_id"))

        #
        # get transform context
        #
        rdd_transform_context = rdd_transform_context_rdd.first()
        transform_context = rdd_transform_context.transform_context_info

        #
        # cache record store rdd
        #
        if cfg.CONF.service.enable_record_store_df_cache:
            storage_level_prop = \
                cfg.CONF.service.record_store_df_cache_storage_level
            storage_level = StorageUtils.get_storage_level(
                storage_level_prop)
            record_store_df.persist(storage_level)

        #
        # start processing metrics available in record_store data
        #
        MonMetricsKafkaProcessor.process_metrics(transform_context,
                                                 record_store_df)

        # remove df from cache
        if cfg.CONF.service.enable_record_store_df_cache:
            record_store_df.unpersist()

        #
        # extract kafka offsets and batch processing time
        # stored in transform_context and save offsets
        #
        offsets = transform_context.offset_info

        # batch time
        batch_time_info = \
            transform_context.batch_time_info

        MonMetricsKafkaProcessor.save_kafka_offsets(
            offsets, rdd_transform_context_rdd.context.appName,
            batch_time_info)

        # call pre hourly processor, if it's time to run
        if (cfg.CONF.stage_processors.pre_hourly_processor_enabled
                is True and PreHourlyProcessor.is_time_to_run(
                batch_time_info)):
            PreHourlyProcessor.run_processor(
                record_store_df.rdd.context,
                batch_time_info)
def gapply(grouped_data, func, schema, *cols): """Applies the function ``func`` to data grouped by key. In particular, given a dataframe grouped by some set of key columns key1, key2, ..., keyn, this method groups all the values for each row with the same key columns into a single Pandas dataframe and by default invokes ``func((key1, key2, ..., keyn), values)`` where the number and order of the key arguments is determined by columns on which this instance's parent :class:`DataFrame` was grouped and ``values`` is a ``pandas.DataFrame`` of columns selected by ``cols``, in that order. If there is only one key then the key tuple is automatically unpacked, with ``func(key, values)`` called. ``func`` is expected to return a ``pandas.DataFrame`` of the specified schema ``schema``, which should be of type :class:`StructType` (output columns are of this name and order). If ``spark.conf.get("spark.sql.retainGroupColumns")`` is not ``u'true'``, then ``func`` is called with an empty key tuple (note it is set to ``u'true'`` by default). If no ``cols`` are specified, then all grouped columns will be offered, in the order of the columns in the original dataframe. In either case, the Pandas columns will be named according to the DataFrame column names. The order of the rows passed in as Pandas rows is not guaranteed to be stable relative to the original row order. :note: Users must ensure that the grouped values for every group must fit entirely in memory. :note: This method is only available if Pandas is installed. :param grouped_data: data grouped by key :param func: a two argument function, which may be either a lambda or named function :param schema: the return schema for ``func``, a :class:`StructType` :param cols: list of column names (string only) :raise ValueError: if ``"*"`` is in ``cols`` :raise ValueError: if ``cols`` contains duplicates :raise ValueError: if ``schema`` is not a :class:`StructType` :raise ImportError: if ``pandas`` module is not installed :raise ImportError: if ``pandas`` version is too old (less than 0.7.1) :return: the new :class:`DataFrame` with the original key columns replicated for each returned value in each group's resulting pandas dataframe, the schema being the original key schema prepended to ``schema``, where all the resulting groups' rows are concatenated. Of course, if retaining group columns is disabled, then the output will exactly match ``schema`` since no keys can be prepended. >>> import pandas as pd >>> from pyspark.sql import SparkSession >>> from spark_sklearn.group_apply import gapply >>> from spark_sklearn.util import createLocalSparkSession >>> spark = createLocalSparkSession() >>> df = (spark ... .createDataFrame([Row(course="dotNET", year=2012, earnings=10000), ... Row(course="Java", year=2012, earnings=20000), ... Row(course="dotNET", year=2012, earnings=5000), ... Row(course="dotNET", year=2013, earnings=48000), ... Row(course="Java", year=2013, earnings=30000)]) ... .select("course", "year", "earnings")) >>> def yearlyMedian(_, vals): ... all_years = set(vals['year']) ... # Note that interpolation is performed, so we need to cast back to int. ... yearly_median = [(year, int(vals['earnings'][vals['year'] == year].median())) ... for year in all_years] ... 
    ...     return pd.DataFrame.from_records(yearly_median)
    >>> newSchema = StructType().add("year", LongType()).add("median_earnings", LongType())
    >>> gapply(df.groupBy("course"), yearlyMedian, newSchema).orderBy("median_earnings").show()
    +------+----+---------------+
    |course|year|median_earnings|
    +------+----+---------------+
    |dotNET|2012|           7500|
    |  Java|2012|          20000|
    |  Java|2013|          30000|
    |dotNET|2013|          48000|
    +------+----+---------------+
    <BLANKLINE>
    >>> def twoKeyYearlyMedian(_, vals):
    ...     return pd.DataFrame.from_records([(int(vals["earnings"].median()),)])
    >>> newSchema = StructType([df.schema["earnings"]])
    >>> gapply(df.groupBy("course", "year"), twoKeyYearlyMedian, newSchema, "earnings").orderBy(
    ...     "earnings").show()
    +------+----+--------+
    |course|year|earnings|
    +------+----+--------+
    |dotNET|2012|    7500|
    |  Java|2012|   20000|
    |  Java|2013|   30000|
    |dotNET|2013|   48000|
    +------+----+--------+
    <BLANKLINE>
    >>> spark.stop(); SparkSession._instantiatedContext = None
    """
    import pandas as pd
    minPandasVersion = '0.7.1'
    if LooseVersion(pd.__version__) < LooseVersion(minPandasVersion):
        raise ImportError('Pandas installed but version is {}, {} required'
                          .format(pd.__version__, minPandasVersion))

    # Do a null aggregation to retrieve the keys first (should be no computation)
    # Also consistent with spark.sql.retainGroupColumns
    keySchema = grouped_data.agg({}).schema
    keyCols = grouped_data.agg({}).columns

    if not cols:
        # Extract the full column list with the parent df
        javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df"
        parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName)
        allCols = DataFrame(parentDF, None).columns
        keyColsSet = set(keyCols)
        cols = [col for col in allCols if col not in keyColsSet]

    if "*" in cols:
        raise ValueError("cols expected to contain only singular columns")

    if len(set(cols)) < len(cols):
        raise ValueError("cols expected not to contain duplicate columns")

    if not isinstance(schema, StructType):
        raise ValueError("output schema should be a StructType")

    inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols})
    # Recover canonical order (aggregation may change column order)
    canonicalOrder = chain(keyCols, [inputAggDF['collect_list(' + col + ')'] for col in cols])
    inputAggDF = inputAggDF.select(*canonicalOrder)

    # Wraps the user-provided function with another python function, which prepares the
    # input in the form specified by the documentation. Then, once the function completes,
    # this wrapper prepends the keys to the output values and converts from pandas.
    def pandasWrappedFunc(*args):
        nvals = len(cols)
        keys, collectedCols = args[:-nvals], args[-nvals:]
        paramKeys = tuple(keys)
        if len(paramKeys) == 1:
            paramKeys = paramKeys[0]
        valuesDF = pd.DataFrame.from_dict(dict(zip(cols, collectedCols)))
        valuesDF = valuesDF[list(cols)]  # reorder to canonical
        outputDF = func(paramKeys, valuesDF)
        valCols = outputDF.columns.tolist()
        for key, keyName in zip(keys, keyCols):
            outputDF[keyName] = key
        outputDF = outputDF[keyCols + valCols]  # reorder to canonical
        # To recover native python types for serialization, we need
        # to convert the pandas dataframe to a numpy array, then to a
        # native list (can't go straight to native, since pandas will
        # attempt to preserve the numpy type).
return outputDF.values.tolist() keyPrependedSchema = StructType(list(chain(keySchema, schema))) outputAggSchema = ArrayType(keyPrependedSchema, containsNull=False) pandasUDF = udf(pandasWrappedFunc, outputAggSchema) outputAggDF = inputAggDF.select(pandasUDF(*inputAggDF)) explodedDF = outputAggDF.select(explode(*outputAggDF).alias("gapply")) # automatically retrieves nested schema column names return explodedDF.select("gapply.*")
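As a side note, PySpark 3.x ships this grouped-map pattern natively via GroupedData.applyInPandas. A minimal sketch of the first doctest above rewritten with it (assuming the same df with course/year/earnings columns); this is for comparison only and is not part of gapply itself:

# Sketch: built-in PySpark 3.x equivalent of the gapply pattern.
# applyInPandas hands each group to the function as a pandas.DataFrame
# and expects a pandas.DataFrame matching the declared schema back.
import pandas as pd
from pyspark.sql.types import StructType, StructField, StringType, LongType

out_schema = StructType([
    StructField("course", StringType()),
    StructField("year", LongType()),
    StructField("median_earnings", LongType()),
])

def yearly_median(pdf: pd.DataFrame) -> pd.DataFrame:
    # one median per (course, year) within the group
    med = pdf.groupby(["course", "year"], as_index=False)["earnings"].median()
    med["median_earnings"] = med.pop("earnings").astype("int64")
    return med

result = df.groupBy("course").applyInPandas(yearly_median, out_schema)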
# drop() is like the opposite of select(): instead of selecting specific columns from a DataFrame, it drops a specified column from a DataFrame
dataDF.drop('occupation').drop('age').show()

# the sample() transformation returns a new DataFrame with a random sample
sampledDF = dataDF.sample(withReplacement=False, fraction=0.10)
print sampledDF.count()
sampledDF.show()

# split() and explode() transformations
from pyspark.sql.functions import split, explode

# split(col, ' ') splits each sentence at spaces and returns an array of words per row
shakeWordsSplit = (shakespeareDF
                   .select(split(shakespeareDF.word, ' ').alias('word')))

# explode() returns a new row for each element in the given array
shakeWordsExplode = (shakeWordsSplit
                     .select(explode(shakeWordsSplit.word).alias('word')))

# remove all the blanks
shakeWordsDF = shakeWordsExplode.filter(shakeWordsExplode.word != '')
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

###############################################################
###                                                         ###
###                        GROUP BY                         ###
###                                                         ###
###############################################################
from pyspark.sql import SparkSession from pyspark.sql import functions as F from pyspark.sql.types import * from pyspark.sql import Window spark = SparkSession.builder.appName("product desc").master( "local").getOrCreate() schemaprod = StructType([ StructField("Id", StringType(), True), StructField("Productdesc", StringType(), True) ]) product = spark.read.schema(schemaprod).csv("D://productdesc.csv") productexpl = product.withColumn("tshrt", F.split("Productdesc", " ")).select( "Id", "Productdesc", F.explode("tshrt").alias("words")) productfilter = productexpl.filter( F.col("words") == "Tshirt").drop("Productdesc") win = Window.partitionBy("Id") priducttshrtt = productfilter.select( "Id", F.count("Id").over(win).alias("Cnt")).filter(F.col("Cnt") == 2).distinct() #priducttshrt=productfilter.groupBy("Id").count().filter(F.col("count") == 2) priducttshrtt.show()
.add("Date", "string").add("Source", "string").add("Len", "string") \ .add("Likes", "string").add("RTs", "string").add("Hashtags", "string") \ .add("UserMentionName", "string").add("UserMentionID", "string").add("name", "string") \ .add("Place", "string").add("Followers", "float").add("Friends", "float") lines = ssc \ .readStream \ .format("csv") \ .option("header", True) \ .schema(schema) \ .option("sep", ";") \ .csv('hdfs://localhost:9000/user/chaitra/stream/') words = lines.select( explode( split(lines.Hashtags, ",") ).alias("Hashtags"), "ID" ) # lines.createOrReplaceTempView("frr") hash1 = words.groupBy("Hashtags").count().sort(desc("count"),asc("Hashtags")) hash1.createOrReplaceTempView("hashtable") hash1 = ssc.sql("select * from hashtable limit 5") # frr = ssc.sql("select * from frr ") # frr = lines.withColumn('FRRatio', frr.Followers/frr.Friends) # frr=frr.groupBy((['name', frr.Followers, frr.Friends,frr.FRRatio])).count().sort(col("FRRatio").desc()) # frr = frr.drop("count") # frr = frr.drop("Followers") # frr = frr.drop("Friends")
print wordCount # COMMAND ---------- # TEST Test.assertEquals(wordCount, 1903220, 'incorrect word count') # COMMAND ---------- # MAGIC %md # MAGIC Next, we'll compute the word count using `select`, the function `func.explode()`, and then taking a `count()` on the `DataFrame`. Make sure to name the column returned by the `explode` function 'word'. # COMMAND ---------- # ANSWER wordList = noStopWords.select(func.explode('words').alias('word')) # Note that we have one word per Row now print wordList.take(3) wordListCount = wordList.count() print wordListCount # COMMAND ---------- # TEST Test.assertEquals(wordListCount, 1903220, 'incorrect value for wordListCount') # COMMAND ---------- # MAGIC %md # MAGIC For your final task, you'll group by word and count the number of times each word occurs. Make sure to return the counts in descending order and to call them `counts`.
def get_weather_station_weather_df(spark, stations_id):
    ''' Download hourly weather station data for the given station ids
    over the years 2012-2017 and return a dataframe
    '''
    cache_file = workdir + 'data/weather_stations.parquet'
    if isdir(cache_file):
        print('Skip downloading weather station: already done')
        return spark.read.parquet(cache_file)

    get_station_weather_month_udf = \
        udf(get_station_weather_month,
            ArrayType(StructType([
                StructField('day', IntegerType()),
                StructField('hour', IntegerType()),
                StructField('dew_point_temp', FloatType()),
                StructField('rel_hum', FloatType()),
                StructField('wind_dir', FloatType()),
                StructField('wind_spd', FloatType()),
                StructField('visibility', FloatType()),
                StructField('stn_press', FloatType()),
                StructField('hmdx', FloatType()),
                StructField('wind_chill', FloatType()),
                StructField('temp', FloatType()),
                StructField('risky_weather', FloatType())
            ])))

    month_per_year_df = spark.createDataFrame(zip(range(1, 13), ), ['month'])
    years_df = spark.createDataFrame(zip(range(2012, 2018), ), ['year'])
    months_df = years_df.crossJoin(month_per_year_df)
    stations_months_df = stations_id.crossJoin(months_df)
    c = col('col')

    def create_date(year, month, day):
        return datetime.datetime.strptime(f'{year}-{month}-{day}', "%Y-%m-%d")

    create_date_udf = udf(create_date, DateType())

    df = (stations_months_df.repartition(200, 'year', 'month').withColumn(
        'weather',
        get_station_weather_month_udf('station_id', 'year', 'month')).select(
            'station_id', 'year', 'month', explode('weather')).select(
                'station_id',
                create_date_udf('year', 'month', c['day']).alias('date'),
                c['hour'].alias('hour'),
                c['dew_point_temp'].alias('dew_point_temp'),
                c['rel_hum'].alias('rel_hum'),
                c['wind_dir'].alias('wind_dir'),
                c['wind_spd'].alias('wind_spd'),
                c['visibility'].alias('visibility'),
                c['stn_press'].alias('stn_press'),
                c['hmdx'].alias('hmdx'),
                c['wind_chill'].alias('wind_chill'),
                c['temp'].alias('temp'),
                c['risky_weather'].alias('risky_weather')))

    # We apply a moving average to risky_weather since the effect of risky
    # weather is spread over the following hours
    def weighted_average(c, window, offsets, weights):
        def value(i):
            return lag(c, -i).over(window)

        values = [
            coalesce(value(i) * w, lit(0)) for i, w in zip(offsets, weights)
        ]
        return sum(values, lit(0))

    window = (Window.partitionBy('station_id').orderBy('date'))
    offsets = range(-23, 1)
    weights = [exp(0.5 * t) for t in offsets]
    weights = [w / sum(weights) for w in weights]
    df = df.withColumn(
        'risky_weather',
        weighted_average('risky_weather', window, offsets, weights))

    df.write.parquet(cache_file)
    return df
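To make the lag-based weighted average above concrete, a minimal toy sketch of the same construction (a 3-hour window instead of 24; spark is assumed to be an active SparkSession):

# Toy sketch of the exponentially weighted moving average above.
# Each row's smoothed value mixes the current and previous two values,
# with weights decaying exponentially into the past.
from math import exp
from pyspark.sql import Window
from pyspark.sql.functions import coalesce, lag, lit

toy = spark.createDataFrame(
    [(1, 1, 0.0), (1, 2, 1.0), (1, 3, 0.0), (1, 4, 0.0)],
    ['station_id', 'hour', 'risky_weather'])

w = Window.partitionBy('station_id').orderBy('hour')
offsets = range(-2, 1)                       # two hours back up to now
raw = [exp(0.5 * t) for t in offsets]
weights = [x / sum(raw) for x in raw]        # normalize to sum to 1

# lag(col, -t) with negative t looks t rows back; missing rows count as 0
smoothed = sum(
    (coalesce(lag('risky_weather', -t).over(w) * wt, lit(0))
     for t, wt in zip(offsets, weights)),
    lit(0))
toy.withColumn('risky_weather_smoothed', smoothed).show()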
from lib.pos_tags import PosTags from lib.chunks import Chunks t = Tokens() p = PosTags() c = Chunks() c.train(c.load_training_data("../data/chunker_training_50_fixed.json")) def pipeline(s): """ Given a string, return a list of relations """ return c.assemble(c.tag(p.tag(t.tokenize(s)))) pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType()))) phrases = ( notes.withColumn("phrases", pipeline_udf(notes["document"])) .select(sql.explode(sql.col("phrases")).alias("text")) .filter(sql.col("text")["tag"] == "NP") .select(sql.lower(sql.col("text")["phrase"]).alias("phrase")) .groupBy(sql.col("phrase")) .count() ) phrases.write.parquet("../data/idigbio_phrases.parquet")
    .add("2", "string") \
    .add("3", "string") \
    .add("4", "string") \
    .add("5", "string") \
    .add("6", "string") \
    .add("7", "string") \
    .add("c8", "string") \
    .add("9", "string") \
    .add("10", "string") \
    .add("11", "string") \
    .add("12", "string") \
    .add("c13", "string") \
    .add("c14", "string")

csvDF = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("hdfs://localhost:9000/stream/")

hashtags = csvDF.select("c8")
words = hashtags.select(explode(split(hashtags.c8, ",")))
words = words.withColumnRenamed("col", "Hashtags")

# keep a handle on the streaming query itself; previously the return value
# of awaitTermination() was assigned instead, which made word.stop() impossible
word = words.groupBy("Hashtags").count() \
    .orderBy("count", ascending=False).limit(5) \
    .writeStream.outputMode("complete").format("console").start()
word.awaitTermination(60)
# word.stop()

spark.stop()
# MAGIC
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC + The first issue is that we need to split each line by its spaces.
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF.select(explode(split(shakespeareDF[0], "\s+")).alias("word"))).where("length(word) > 0")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
def test_count(self, input_df, default_params): expected_count = input_df.select( sql_funcs.explode( input_df[default_params["path_to_array"]])).count() actual_count = Exploder(**default_params).transform(input_df).count() assert expected_count == actual_count
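For context, a minimal hypothetical implementation of a transformer with the interface this test exercises; only the path_to_array parameter appears in the test itself, so the class body and the exploded_elem_name parameter are assumptions, not the real Exploder:

# Hypothetical minimal Exploder matching the interface used in the test:
# it explodes the array at `path_to_array` into one row per element.
from pyspark.sql import functions as sql_funcs

class Exploder(object):
    def __init__(self, path_to_array, exploded_elem_name="elem", **kwargs):
        # exploded_elem_name is an assumed parameter name
        self.path_to_array = path_to_array
        self.exploded_elem_name = exploded_elem_name

    def transform(self, input_df):
        # one output row per array element, so counts match the test's
        # explode-based expectation
        return input_df.withColumn(
            self.exploded_elem_name,
            sql_funcs.explode(input_df[self.path_to_array]))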
@f.pandas_udf(schema, f.PandasUDFType.GROUPED_MAP) def composite_udf(df): rows = [] all_sum = float(df['count'].sum()) / float(df['composite_time_count'].sum()) rows.append({"fraction": all_sum}) data = pd.DataFrame(rows).assign(_dummy=1) grouping_variables = df[grouping_fields].iloc[:1].assign(_dummy=1) result = grouping_variables.merge(data, on="_dummy").drop("_dummy", axis=1) return result composite_times = (subsessions.filter("app_build_id = '20181228093007'") .filter("normalized_channel == 'nightly'") .select('client_id','branch', 'composite_time_count', 'composite_time_sum', f.explode('composite_time').alias("bucket", "count") ) .groupBy('client_id', 'branch', 'bucket') .apply(composite_udf) ) composite_times.createOrReplaceGlobalTempView("composite_times") composite_times.take(3) # COMMAND ---------- # MAGIC %r # MAGIC metrics = tbl(sc, "global_temp.composite_times") # MAGIC per_user_build = metrics %>% # MAGIC select(branch, client_id, ends_with("_count"), ends_with("_sum")) %>% # MAGIC select(-starts_with("device_reset_reason")) %>% # MAGIC group_by(branch, client_id) %>%
# MAGIC
# MAGIC Before we can use the `wordcount()` function, we have to address two issues with the format of the DataFrame:
# MAGIC + The first issue is that we need to split each line by its spaces.
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode, col
shakeWordsDF = (shakespeareDF.select(explode(split(col('sentence'), ' ')).alias('word')))
shakeWordsDF = shakeWordsDF.filter(col('word') != '')
shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
def data(self): return self.spark.range(10).toDF('id') \ .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \ .withColumn("v", explode(col('vs'))) \ .drop('vs') \ .withColumn('w', lit(1.0))
import random
from pyspark.sql.types import *

# we randomly select at most 10 points within the same H3 polygon at
# resolution 11 (~30 m)
def sample(latitudes, longitudes):
    l = list(zip(latitudes, longitudes))
    return random.sample(l, min(len(l), 10))

sample_schema = ArrayType(StructType([StructField("latitude", DoubleType()),
                                      StructField("longitude", DoubleType())]))
sample_udf = udf(sample, sample_schema)

sample_df = (
    points_df
    .groupBy(to_h3(F.col("latitude"), F.col("longitude"), F.lit(11)))
    .agg(F.collect_list(F.col("latitude")).alias("latitudes"),
         F.collect_list(F.col("longitude")).alias("longitudes"))
    .withColumn('sample', F.explode(sample_udf(F.col('latitudes'), F.col('longitudes'))))
    .select('sample.latitude', 'sample.longitude')
)

# repartition() returns a new DataFrame, so assign the result (the original
# code discarded it, leaving only the cache() in effect)
sample_df = sample_df.repartition(sc.defaultParallelism * 20).cache()

sample_count = sample_df.count()
print("num_points:  ", num_points)
print("sample_count:", sample_count)
print("sample %:    ", (sample_count / num_points) * 100)

#display(
#  sample_df
#  .groupBy(to_h3(F.col("latitude"), F.col("longitude"), F.lit(9)).alias("h3"))
#  .count()
#  .orderBy(F.desc("count"))
#)
from pyspark.sql.functions import size
df.select(size(split(col("Description"), " "))).show(2) # shows 5 and 3

# COMMAND ----------

from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

# COMMAND ----------

from pyspark.sql.functions import split, explode

df.withColumn("splitted", split(col("Description"), " "))\
  .withColumn("exploded", explode(col("splitted")))\
  .select("Description", "InvoiceNo", "exploded").show(2)

# COMMAND ----------

from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .show(2)

# COMMAND ----------

# use create_map here as well (not the Python builtin map)
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .selectExpr("complex_map['WHITE METAL LANTERN']").show(2)
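Since Spark 2.3, map columns can also be unpacked with map_keys and map_values from pyspark.sql.functions; a brief sketch on the same complex_map column built above:

# Sketch: inspect a map column with map_keys / map_values (Spark >= 2.3)
from pyspark.sql.functions import col, create_map, map_keys, map_values

df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
  .select(map_keys(col("complex_map")), map_values(col("complex_map"))).show(2)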
spark = SparkSession\ .builder\ .appName("StructuredKafkaWordCount")\ .getOrCreate() # Create DataSet representing the stream of input lines from kafka lines = spark\ .readStream\ .format("kafka")\ .option("kafka.bootstrap.servers", bootstrapServers)\ .option(subscribeType, topics)\ .load()\ .selectExpr("CAST(value AS STRING)") # Split the lines into words words = lines.select( # explode turns each item in an array into a separate row explode(split(lines.value, ' ')).alias('word')) # Generate running word count wordCounts = words.groupBy('word').count() # Start running the query that prints the running counts to the console query = wordCounts\ .writeStream\ .outputMode('complete')\ .format('console')\ .start() query.awaitTermination()
.getOrCreate() userSchema = StructType().add("value", "string") # Create DataFrame representing the stream of input lines from connection to host:port lines = spark\ .readStream\ .format('csv')\ .schema(userSchema)\ .load('history') # Split the lines into words words = lines.select( # explode turns each item in an array into a separate row explode( split(lines.value, ' ') ).alias('word') ) # Generate running word count wordCounts = words.groupBy('word').count() # Start running the query that prints the running counts to the console query = wordCounts\ .writeStream\ .outputMode('complete')\ .format('memory')\ .queryName('table')\ .start() # TODO
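Because the memory sink registers its output under the name given to queryName, the running counts can be polled from the same SparkSession while the query runs; a minimal sketch:

# Sketch: poll the in-memory table registered by queryName('table')
import time

time.sleep(5)  # give the first micro-batch a moment to complete
spark.table('table').orderBy('count', ascending=False).show()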
def create_subsample(spark, path): merge_nambiguous, em_nambiguous, merge_ambiguous, em_ambiguous = filter_df( spark, path ) w_pattern = Window.partitionBy("pattern") w_entity = Window.partitionBy("nb_entities") w_domain = Window.partitionBy("domain") w_pattern_entity = Window.partitionBy("pattern", "nb_entities") # UNAMBIGUOUS DATA merge_w = merge_nambiguous.select( "*", F.count("*").over(w_pattern).alias("pattern_count"), F.count("*").over(w_domain).alias("domain_count"), ).select( "articleOffset", "articleUID", "full_text", "masked_text", "quotation", "entities", "speaker", "targets", "rand", F.when(F.col("domain_count") >= 100, F.col("domain")) .otherwise("others") .alias("domain"), F.when(F.col("nb_entities") <= 20, F.col("nb_entities")) .otherwise(21) .alias("nb_entities"), F.when(F.col("pattern_count") >= 500, F.col("pattern")) .otherwise("others") .alias("pattern"), ) @F.udf(returnType=FloatType()) def get_proba(nb_samples, max_samples=400): return min(1.0, max_samples / nb_samples) subsample = ( merge_w.select("*", F.count("*").over(w_pattern_entity).alias("pe_count")) .withColumn("proba", get_proba("pe_count")) .filter("rand <= proba") .drop("rand", "pe_count", "proba") ) subsample_pos, subsample_neg = subsample.randomSplit([0.8, 0.2], seed=SEED) subsample_pos.coalesce(32).write.parquet( join(path, "sampling/quootstrap_subsample_lower"), "overwrite", compression="gzip", ) subsample_neg.rdd.map(create_neg_example).filter( lambda x: x is not None ).toDF().write.parquet( join(path, "sampling/quootstrap_subsample_neg_lower"), "overwrite", compression="gzip", ) neg_examples = ( em_nambiguous.select("*", F.explode("targets").alias("target")) .filter(F.col("target") == 0) .drop("target") ) neg_examples.write.parquet( join(path, "sampling/neg_examples_lower"), "overwrite", compression="gzip", ) em_nambiguous_target = em_nambiguous.join( neg_examples, on=["articleUID", "articleOffset"], how="leftanti" ) em_w = em_nambiguous_target.select( "*", F.count("*").over(w_entity).alias("entities_count") ).select( "articleOffset", "articleUID", "full_text", "masked_text", "quotation", "entities", "speaker", "targets", "rand", "entities_count", F.when(F.col("nb_entities") <= 20, F.col("nb_entities")) .otherwise(21) .alias("nb_entities"), ) @F.udf(returnType=FloatType()) def get_proba_bis(nb_samples, max_samples=220_000): return min(1.0, max_samples / nb_samples) em_subsample = ( em_w.withColumn("proba", get_proba_bis("entities_count")) .filter("rand <= proba") .drop("rand", "entities_count", "proba") ) em_subsample_pos, em_subsample_neg = em_subsample.randomSplit([0.8, 0.2], seed=SEED) em_subsample_pos.write.parquet( join(path, "sampling/em_subsample_lower"), "overwrite", compression="gzip", ) em_subsample_neg.rdd.map(create_neg_example).filter( lambda x: x is not None ).toDF().write.parquet( join(path, "sampling/em_subsample_neg_lower"), "overwrite", compression="gzip", )
# MAGIC + The second issue is that we need to filter out empty lines or words.
# MAGIC
# MAGIC Apply a transformation that will split each 'sentence' in the DataFrame by its spaces, and then transform from a DataFrame that contains lists of words into a DataFrame with each word in its own row. To accomplish these two tasks you can use the `split` and `explode` functions found in [pyspark.sql.functions](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions).
# MAGIC
# MAGIC Once you have a DataFrame with one word per row you can apply the [DataFrame operation `where`](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where) to remove the rows that contain ''.
# MAGIC
# MAGIC > Note that `shakeWordsDF` should be a DataFrame with one column named `word`.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import split, explode
shakeWordsDF = (shakespeareDF
                .select(
                    explode(
                        split(shakespeareDF.sentence, '\s')
                    ).alias('word')
                )
               ).where("word != ''")

shakeWordsDF.show()
shakeWordsDFCount = shakeWordsDF.count()
print shakeWordsDFCount

# COMMAND ----------

# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------
def funnel_statistic_report_(cms_df):
    """Parse the log data for appstore_onclick / game_onclick (detail page),
    download_onclick (download click), download_fin (download finished) and
    install_fin (install finished)."""
    ## path to the JSON log files
    json_path = "hdfs://master:9000/data/{0}/*/*.gz".format(str_dt_0)
    ## command to count the .gz log files under the path
    cmd = "hadoop fs -ls -R /data/{0} | egrep '.gz$' | wc -l".format(str_dt_0)
    if_zero = subprocess.check_output(cmd, shell=True).strip().split('\n')[0]
    ## check whether any log files exist at the path
    if int(if_zero) == 0:
        print("the logs do not exist!")
        raise SystemExit(123)
    else:
        # read and parse the JSON log data
        ## join condition with cms_df
        condition_0_1 = (F.coalesce(F.col("t_0.package_id"), F.lit("123")) == F.coalesce(
            F.col("t_1.fsk_pid"), F.lit("123")))
        ## left join with cms_df to get the app category
        df_download_onclick = spark.read.json(json_path).select(
            'custom_uuid', 'rectime',
            F.when(F.col('site') == 'ALI', 'youku').when(F.col('site') == 'IQIYI', 'iqiyi').when(
                F.col('site') == 'BESTV', 'bestv').otherwise('others').alias('site'),
            F.explode('data.download_onclick').alias('download_onclick')
        ).filter(F.col("download_onclick.time").isNotNull()).select([
            'custom_uuid',
            F.regexp_replace(F.lit(str_dt_0), "-", "").cast("int").alias("date"),
            F.lit('download_onclick').alias('featureName'),
            F.col("site").alias("site"),
            F.col("download_onclick.package_id").alias("package_id")
        ]).alias("t_0").join(cms_df.alias("t_1"), condition_0_1, "left_outer").select(
            F.col("t_0.custom_uuid").alias("custom_uuid"),
            F.col("t_0.date").alias("date"),
            F.col("t_0.featureName").alias("featureName"),
            F.col("t_0.site").alias("site"),
            F.col("t_0.package_id").alias("package_id"),
            F.col("t_1.fsk_title").alias("title"),
            F.col("t_1.fsk_catalog").alias("fsk_catalog"))

        sql_download_fin = """ select custom_uuid,cast(regexp_replace(date,'-','') as int) as date,site,package_id from sharp.download_fin where dt='{date_0}' """.format(
            date_0=str_dt_0)
        sql_install_fin = """ select custom_uuid,cast(regexp_replace(date,'-','') as int) as date,site,package_id from sharp.install_fin where dt='{date_0}' """.format(
            date_0=str_dt_0)

        spark.sql("show databases")
        spark.sql("use sharp")
        df_download_fin = spark.sql(sql_download_fin).alias("t_0").join(
            cms_df.alias("t_1"), condition_0_1, "left_outer").select(
                F.col("t_0.custom_uuid").alias("custom_uuid"),
                F.col("t_0.date").alias("date"),
                F.lit('download_fin').alias('featureName'),
                F.col("t_0.site").alias("site"),
                F.col("t_0.package_id").alias("package_id"),
                F.col("t_1.fsk_title").alias("title"),
                F.col("t_1.fsk_catalog").alias("fsk_catalog"))
        df_install_fin = spark.sql(sql_install_fin).alias("t_0").join(
            cms_df.alias("t_1"), condition_0_1, "left_outer").select(
                F.col("t_0.custom_uuid").alias("custom_uuid"),
                F.col("t_0.date").alias("date"),
                F.lit('install_fin').alias('featureName'),
                F.col("t_0.site").alias("site"),
                F.col("t_0.package_id").alias("package_id"),
                F.col("t_1.fsk_title").alias("title"),
                F.col("t_1.fsk_catalog").alias("fsk_catalog"))

        ## union and aggregate
        df = df_download_onclick.unionAll(df_download_fin).unionAll(
            df_install_fin)
        df.persist(pyspark.StorageLevel.MEMORY_AND_DISK)
        df.createOrReplaceTempView("v_df")
        sql_df_1 = """select date,featureName,site channelName,fsk_catalog typeName,grouping_id() id_1,count(custom_uuid) totalPlayCount from v_df group by date,featureName,site,fsk_catalog grouping sets((date,featureName,site,fsk_catalog),(date,featureName,site),(date,featureName,fsk_catalog),(date,featureName)) """
        sql_df_2 = """ select date,package_id appId,title appName,featureName,count(custom_uuid) totalPlayCount from v_df group by date,package_id,title,featureName """
        spark.sql("show databases")
        spark.sql("use sharp")
        funnel_report_1 = \
            spark.sql(sql_df_1)
        funnel_report_2 = spark.sql(sql_df_2)
        df.unpersist()
        ## final reports
        return funnel_report_1, funnel_report_2
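As a reference for the grouping sets aggregation used in sql_df_1 above, a toy sketch showing how GROUPING SETS emits one row per requested column combination and how grouping_id() distinguishes them (spark assumed to be an active SparkSession):

# Toy sketch: GROUPING SETS produces one output row per requested grouping,
# and grouping_id() encodes which grouping columns were aggregated away.
spark.createDataFrame(
    [(20200101, 'download_onclick', 'youku'),
     (20200101, 'download_onclick', 'iqiyi'),
     (20200101, 'install_fin', 'youku')],
    ['date', 'featureName', 'site']
).createOrReplaceTempView('v_toy')

spark.sql("""
    select date, featureName, site, grouping_id() id_1, count(*) cnt
    from v_toy
    group by date, featureName, site
    grouping sets ((date, featureName, site), (date, featureName))
""").show()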