def create_tag_frequencies(self, dataframe):
    """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.

    The frequency of tags is determined by their proportion of the total number
    of tags in the dataframe.

    :param dataframe: the PySpark dataframe (expects columns tag1..tag5 and tags_split)
    :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
    """
    # Stack the five per-record tag columns into one 'tag' column, one row per
    # tag occurrence.
    df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")).union(dataframe.selectExpr("tag3 AS tag")) \
        .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
    # Records with fewer than five tags leave nulls behind; drop them before counting.
    df_tags = df_tags.na.drop(subset=["tag"])
    tags_total_count = df_tags.count()
    print("Total number of tags used, including duplicates:",tags_total_count)
    # Occurrence count per distinct tag, most used first.
    df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
    # frequency = this tag's share of all tag occurrences.
    df_tag_freq = df_tag_freq.withColumn("frequency", col("count")/tags_total_count)
    df_tag_freq.orderBy(desc("frequency")).show(10)

    def one_hot_encode_top_n_tags(dataframe,n):
        """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

        :param dataframe: the PySpark dataframe
        :param n: the number of the top ranked tags to return as tag fields
        :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
        """
        top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
        for tag in top_n:
            # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
            tag_column_name = ("tag_"+tag).replace(".","dot")
            dataframe = dataframe.withColumn(tag_column_name, array_contains(dataframe.tags_split, tag).cast("int"))
        return dataframe

    dataframe = one_hot_encode_top_n_tags(dataframe,20)
    # Note: startswith('tag') also matches the original tag1..tag5 columns,
    # not only the new one-hot tag_* columns.
    tag_columns = [col for col in dataframe.columns if col.startswith('tag')]
    print("Tag-related columns")
    dataframe.select(tag_columns).show(10,False)
    dataframe.createOrReplaceTempView('df')
    df_tag_freq.createOrReplaceTempView('df_tag_freq')
    # Attach each record's per-slot tag frequency via a LEFT JOIN; slots whose
    # tag is null or unknown get frequency 0 via na.fill.
    for n in range(1,6):
        dataframe = self.sqlContext.sql("SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag".format(n,n))
        dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
        dataframe.createOrReplaceTempView('df')
    # Total tag frequency for the record = sum of the five per-slot frequencies.
    dataframe = dataframe.withColumn("frequency_sum", col("frequency_tag1")+col("frequency_tag2")+col("frequency_tag3")+col("frequency_tag4")+col("frequency_tag5"))
    # Remove temporary columns
    dataframe = dataframe.select([c for c in dataframe.columns if c not in {"tags_split","tag1","tag2","tag3","tag4","tag5","frequency_tag1","frequency_tag2", \
        "frequency_tag3","frequency_tag4","frequency_tag5"}])
    return(dataframe)
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file.

    Parses every ``medline*.xml.gz`` archive in ``download_dir`` and writes
    three parquet datasets into ``save_dir``: the raw records, the
    de-duplicated "last view" records (newest non-deleted row per pmid), and
    the grant data.

    :param date_update: datetime whose date stamps the output file names
    """
    print("Process MEDLINE file to parquet")
    # Remove stale output from a previous run.  The original code passed the
    # glob pattern to `rm` without a shell, so the pattern was never expanded
    # (and was relative to the cwd, not save_dir) — expand it in Python and
    # delete the real paths instead.
    stale_outputs = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if stale_outputs:
        subprocess.call(['rm', '-rf'] + stale_outputs)
    date_update_str = date_update.strftime("%Y_%m_%d")
    # One task per small slice so the CPU-heavy XML parsing spreads across the cluster.
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')
    # Rank each pmid's rows newest-file-first and keep only the newest,
    # non-deleted version of each record.
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')
    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def getProxyIntelHits(self, fromdate, todate):
    '''
    Top-20 proxy destinations that match threat-intelligence indicators.

    Joins proxy logs against the OTX IP list and the C2 host list and builds
    a gviz JSON table of (host, hit count), ordered by count.

    :param fromdate: start of the parquet date range passed to buildParquetFileList
    :param todate: end of the parquet date range
    :return: JSON string for a Google Visualization table
    '''
    self.proxyDF = self.buildParquetFileList('proxysg', fromdate, todate)
    # Run dashboard queries in their own scheduler pool so they do not starve other jobs.
    self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")
    self.proxyDF.createOrReplaceTempView('proxysg')
    self.session.read.parquet('/data/srm/dbs/dw_srm.db/otx').createOrReplaceTempView('otx')
    self.session.read.parquet('/data/srm/dbs/dw_srm.db/c2').createOrReplaceTempView('c2')
    #self.proxyDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
    # Proxy hosts that appear in either intel source (OTX by IP, C2 by hostname).
    sgotx = self.session.sql('select proxysg.host from proxysg join otx on otx.ip=proxysg.host')
    sgc2 = self.session.sql('select proxysg.host from proxysg join c2 on c2.host=proxysg.host')
    sgall = sgotx.unionAll(sgc2)
    # This breaks the Kryo serializer - unknown class
    entries = sgall.groupBy(sgall.host).count().orderBy(desc('count')).limit(20).collect()
    # Build json object for the table
    data = []
    descriptionTable = {
        "host": ("string", "Malware host"),
        "count": ("number", "Hits")
    }
    for entry in entries:
        data.append({"host": entry.host, "count": int(entry[1])})
    data_table = gviz_api.DataTable(descriptionTable)
    data_table.LoadData(data)
    # Creating a JSon string
    jsonTable = data_table.ToJSon(columns_order=("host", "count"), order_by="count")
    return jsonTable
def makeWordCloud(row, df):
    """Build and save a word cloud for one subreddit.

    :param row: a Row carrying the 'subreddit' to render
    :param df: PySpark dataframe with 'subreddit', 'term' and 'score' columns
    """
    # All terms for this subreddit, highest-scoring first.
    rows = df.filter(df['subreddit'] == row['subreddit']).orderBy(desc('score'))
    count = rows.count()
    # Only render subreddits with enough terms for a meaningful cloud.
    if count > 50:
        # map to term frequency tuples.  `take` already returns a local list,
        # so the original `.take(...).map(...).collect()` raised
        # AttributeError (lists have no .map), and `max(count, 500)` kept
        # every row instead of capping the cloud at the top 500 terms.
        top_rows = rows.select(['term', 'score']).take(min(count, 500))
        frequencies = [(x['term'], x['score']) for x in top_rows]
        save_word_cloud(row['subreddit'], frequencies)
def pagerank_topk(ranks_in, k=10):
    '''retrieve the top 'k' pagerank vertices and associated with metadata

    :param ranks_in: a GraphFrame whose vertices carry id, department, type
        and pagerank columns
    :param k: number of top-ranked vertices to return (default 10)
    :returns: pandas DataFrame sorted by pagerank, highest first
    '''
    df_rank_slice = ranks_in.vertices.select('id', 'department', 'type', 'pagerank')\
        .orderBy(desc("pagerank"))\
        .limit(k)
    df_ranks = df_rank_slice.toPandas().sort_values(by='pagerank', ascending=False)
    # Use the 'int64' dtype string: the bare `long` builtin only exists in
    # Python 2 and raised NameError under Python 3; 'int64' behaves the same
    # on both.
    df_ranks.id = df_ranks.id.astype('int64')
    return df_ranks
def compute(day):
    """Compute trending-page scores for `day` from the previous 30 days of
    Wikipedia view data and write the top 10 pages (with their daily views)
    to a CSV file.

    :param day: integer day index; days day-30 .. day-1 are used
    """
    # We want days day-30 through day-1
    sums = wikipediadata.where(
        (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))
    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))
    # Sum the hours of each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for reuse further down
    sums.cache()
    # Define a window := the previous day
    window_spec = Window.partitionBy(sums.projectcode, sums.page) \
        .orderBy(sums.day.asc()).rowsBetween(-1, -1)
    # Compute the difference views(d) - views(d-1)
    # NOTE(review): relies on the aggregated column still being addressable
    # as `sums.views` after the groupby-sum — confirm against the Spark
    # version this ran on.
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
        .over(window_spec))
    # Compute the coefficient applied to each day (recent days weigh more)
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)
    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')
    # Compute each day's score contribution
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)
    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # Normalise by the square root of the total view count
    totals = totals.withColumn('score', totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
        .orderBy(F.desc('score')) \
        .withColumnRenamed('SUM(views)', 'total_views') \
        .limit(10)
    views = sums.select('projectcode', 'page', 'day', 'views') \
        .join(totals.select('projectcode', 'page', 'total_views', 'score'),
              (totals.projectcode == sums.projectcode) & (totals.page == sums.page),
              'right_outer')
    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    # Drop the duplicated join-key columns, then pivot to one column per day
    # (missing days filled with 0)
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    # Empty the cache
    hc.clearCache()
def one_hot_encode_top_n_tags(dataframe,n):
    """Append one indicator column per top-ranked tag to a PySpark dataframe.

    :param dataframe: the PySpark dataframe (must carry a 'tags_split' array column)
    :param n: the number of the top ranked tags to turn into indicator fields
    :returns: the PySpark dataframe with the n indicator fields added
    """
    # The n most frequent tags, read from the globally prepared df_tag_freq.
    ranked_rows = df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()
    for row in ranked_rows:
        current_tag = row.tag
        # Periods are awkward in column names, so e.g. ".net" becomes "dotnet".
        safe_column = ("tag_"+current_tag).replace(".","dot")
        indicator = array_contains(dataframe.tags_split, current_tag).cast("int")
        dataframe = dataframe.withColumn(safe_column, indicator)
    return dataframe
def getVPNUnusualActivity(self):
    """Report VPN logins made from a country other than the user's office.

    Geolocates each VPN remote IP, joins against the AD export (whose 'c'
    column holds the user's office country), keeps logins whose source
    country differs, and returns a gviz JSON table of
    (user, office, remote IP, remote IP country, count).

    :return: JSON string for a Google Visualization table
    """
    self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")
    # AD export mapping each user's email address to office metadata.
    adlocation = self.session.read.csv(header='true', inferSchema='true', path='/user/jleaniz/ad.csv')
    adlocation.cache()
    vpn = self.session.read.parquet('/data/srm/dbs/dw_srm.db/ciscovpn').rdd
    vpn.cache()
    def func(x):
        # GeoIP handle is opened inside the map function so it is created on
        # the executor rather than serialized from the driver.
        gi = GeoIP.open("GeoIP.dat",GeoIP.GEOIP_MEMORY_CACHE)
        cc = gi.country_code_by_addr(x.remoteip)
        return Row(bytesrcv=x.bytesrcv, bytesxmt=x.bytesxmt, duration=x.duration, localip=x.localip, reason=x.reason, remoteip=x.remoteip, source=x.source, time=x.time, user=x.user, date=x.date, remoteipcc=cc)
    vpnDF = vpn.map(func).toDF()
    joinDF = vpnDF.join(adlocation, vpnDF.user == adlocation.EmailAddress)
    joinDF.cache()
    # Keep logins whose geolocated country differs from the office country column 'c'.
    fromOtherLocations = joinDF.filter("remoteipcc <> c")
    groupDF = fromOtherLocations.groupBy(fromOtherLocations.user, fromOtherLocations.remoteip, fromOtherLocations.remoteipcc, fromOtherLocations.c)\
        .count()\
        .orderBy(desc('count'))
    entries = groupDF.collect()
    # Build json object for the table
    dataChart = []
    descriptionChart = {
        "user": ("string", "User"),
        "c": ("string", "Office"),
        "remoteip": ("string", "Remote IP"),
        "remoteipcc": ("string", "Remote IP CC"),
        "count": ("number", "Count")
    }
    for entry in entries:
        dataChart.append({
            "user": entry.user,
            "c": entry.c,
            "remoteip": entry.remoteip,
            "remoteipcc": entry.remoteipcc,
            "count": int(entry[4])})
    data_tableChart = gviz_api.DataTable(descriptionChart)
    data_tableChart.LoadData(dataChart)
    # Creating a JSon string
    vpn_logins = data_tableChart.ToJSon(
        columns_order=("user", "c", "remoteip","remoteipcc","count"),
        order_by="count"
    )
    return vpn_logins
def runOtherFunctions(spark, personDf): df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]); # array df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False) # desc, asc personDf.show() personDf.sort(functions.desc("age"), functions.asc("name")).show() # pyspark 2.1.0 버전은 desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last 지원하지 않음 # split, length (pyspark에서 컬럼은 df["col"] 또는 df.col 형태로 사용 가능) df2 = spark.createDataFrame([("Splits str around pattern",)], ['value']) df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False) # rownum, rank f1 = StructField("date", StringType(), True) f2 = StructField("product", StringType(), True) f3 = StructField("amount", IntegerType(), True) schema = StructType([f1, f2, f3]) p1 = ("2017-12-25 12:01:00", "note", 1000) p2 = ("2017-12-25 12:01:10", "pencil", 3500) p3 = ("2017-12-25 12:03:20", "pencil", 23000) p4 = ("2017-12-25 12:05:00", "note", 1500) p5 = ("2017-12-25 12:05:07", "note", 2000) p6 = ("2017-12-25 12:06:25", "note", 1000) p7 = ("2017-12-25 12:08:00", "pencil", 500) p8 = ("2017-12-25 12:09:45", "note", 30000) dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema) w1 = Window.partitionBy("product").orderBy("amount") w2 = Window.orderBy("amount") dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"), functions.rank().over(w2).alias("rank")).show()
def getfwMalwareConns(self, fromdate, todate):
    """Firewall connections to known-malicious destinations.

    Joins firewall logs against the OTX IP list and the C2 host list and
    returns a gviz JSON table of (destination IP, hit count).

    :param fromdate: start of the parquet date range passed to buildParquetFileList
    :param todate: end of the parquet date range
    :return: JSON string for a Google Visualization table
    """
    self.fwDF = self.buildParquetFileList('fw', fromdate, todate)
    self.fwDF.createOrReplaceTempView('fw')
    #self.fwDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
    self.session.read.parquet('/data/srm/dbs/dw_srm.db/otx').createOrReplaceTempView('otx')
    self.session.read.parquet('/data/srm/dbs/dw_srm.db/c2').createOrReplaceTempView('c2')
    # Run dashboard queries in their own scheduler pool.
    self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")
    # Destinations appearing in either intel source (OTX by IP, C2 by hostname).
    fwotx = self.session.sql('select fw.dstip from fw join otx on otx.ip=fw.dstip')
    fwc2 = self.session.sql('select fw.dstip from fw join c2 on c2.host=fw.dstip')
    fwall = fwotx.unionAll(fwc2)
    groupcnt = fwall.groupBy(fwall.dstip).count().orderBy(desc('count'))
    entries = groupcnt.collect()
    # Build json object for the table
    dataChart = []
    descriptionChart = {
        "dstip": ("string", "Malicious host"),
        "count": ("number", "Hits")
    }
    for entry in entries:
        dataChart.append({"dstip": entry.dstip, "count": entry[1]})
    data_tableChart = gviz_api.DataTable(descriptionChart)
    data_tableChart.LoadData(dataChart)
    # Creating a JSon string
    fw_mal_conns = data_tableChart.ToJSon(
        columns_order=("dstip", "count"),
        order_by="count"
    )
    return fw_mal_conns
.master("local") \ .appName("cassandra Word Count") \ .getOrCreate() spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), \ ('spark.executor.cores', '4'), \ ('spark.cores.max', '4'), \ ('spark.driver.memory','4g')]) # print(spark.sparkContext._conf.getAll()) spRDD = spark.createDataFrame(pdDF).cache() # spRDD.printSchema() # Counts people by age countsByhashtags = spRDD.groupBy("hashtags").count().sort(desc("count")) countsByhashtags.show(10) onlyText = spRDD.select("tweet_text") onlyText.show(5) sparkRDD = onlyText.rdd splitted = sparkRDD.flatMap(lambda line: str(line).split(' ')) # .reduce(lambda word : (word , 1)) reduced = splitted.map(lambda word : (str(word), 1)) \ .reduceByKey(lambda a, b: a + b) \ .sortBy(lambda a: -a[1]) topWords = reduced.collect()[:20]
csvDF = spark \ .readStream \ .option("sep", ";") \ .schema(userSchema) \ .csv("hdfs://localhost:9000/stream/") '''words=csvDF.select(explode(split(csvDF.Hashtags,",")).alias("hashtag")) wordcounts=words.groupBy("hashtag").count().select("hashtag","count") wc=wordcounts.orderBy(desc("count")).select("hashtag") #ws=wordcounts..show(10) ''' ratio = csvDF.select("name", (col("Followers") / col("Friends")).alias("FRRatio")) ratio = ratio.groupBy("name", "FRRatio").count() r = ratio.orderBy(desc("FRRatio")).select(col("name"), col("FRRatio")).limit(1) #ratio = csvDF.select("ratio") #print("sorted bruh") query=r \ .writeStream \ .outputMode ("append") \ .format("console") \ .start() query.awaitTermination(60) query.stop() ''' s=csvDF.groupBy("Hashtags").count() \ .writeStream \ .outputMode("complete") \ .format("console") \
# Demo script: build a small DataFrame via SQLContext and exercise
# select / filter / orderBy / groupBy.  The printed strings are Chinese
# step labels (runtime output, left untouched).
print("生成数据list")
rows =[Row(name="Jack", age="37", group="qe"),
       Row(name="John", age="25", group="qe"),
       Row(name="Tom", age="34", group="qa"),
       Row(name="Phoenix", age="29", group="qe"),
       Row(name="Rose", age="23", group="qa")]
sc = SparkContext()
sqlContext = SQLContext(sc)
print("####生成DataFrame####")
data = sqlContext.createDataFrame(rows)
print("####展示数据####")
data.show()
print("####打印schema模式名称####")
data.printSchema()
print("####选择某一列查看####")
data.select("name").show()
data.select("name","age",col("group").alias("Group")).show()
print("####根据条件筛选显示####")
# NOTE(review): 'age' was created from string values, so the >= 32
# comparison relies on Spark's implicit casting — confirm intended.
data.filter(col("age") >= 32).orderBy("name").show()
data.select("name","age").where(col("age") >= 32).orderBy("name").show()
data.select("name","age").where(col("age") >= 32).orderBy(desc("name"),"age").show()
print("####对DataFrame进行分组和函数计算####")
data.groupBy("group").count().show()
data.groupBy("group").agg(avg("age")).show()
# Count reviews marked useful at least once (two equivalent filter styles).
yelp_df.filter(yelp_df.useful >= 1).count()
yelp_df.filter('useful >= 1').count()
# select - subset of data createDataFrame
yelp_df.select("useful")
yelp_df.select("useful").agg({"useful" : "max"}).collect()
# scale — /28*100 rescales 'useful' to a percentage (presumably 28 is the
# observed maximum useful count; verify against the dataset)
yelp_df.select("id", yelp_df.useful/28*100).show(5)
yelp_df.select("id", (yelp_df.useful/28*100).cast('int')).show(5)
# rename
yelp_df.select("id", (yelp_df.useful/28*100).cast('int').alias('useful_pct')).show(5)
# order
useful_perc_data = yelp_df.select(yelp_df["id"].alias('uid'), (yelp_df.useful/28*100).cast('int').alias('useful_pct')).orderBy(desc('useful_pct'))
# join + select
useful_perc_data.join(
    yelp_df, yelp_df.id == useful_perc_data.uid,
    "inner").select(useful_perc_data.uid, "useful_pct", "review_count").show(5)
# cache - after caching, second run became much faster
useful_perc_data.join(
    yelp_df, yelp_df.id == useful_perc_data.uid,
    "inner").cache().select(useful_perc_data.uid, "useful_pct", "review_count").show(5)
# logs
# set delimiter to windows line end
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import count
import os

# Rank MovieLens 100k movies by how many ratings each received.
spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()

# Explicit schema for the tab-separated ratings file.
ratings_schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)])

# Read file as dataframe
working_dir = os.getcwd()
ratings_df = spark.read.option(
    "sep", "\t").schema(ratings_schema).csv(f"file:///{working_dir}/datasets/ml-100k/u.data")

print("printing schema: ")
ratings_df.printSchema()

# Count ratings per movie and order with the most-rated movies first.
# topMoviesIDs = df.groupBy("movieID").count().orderBy(func.desc("count"))
ranked_movies = ratings_df.groupBy("movieID").agg(
    count("timestamp").alias("countzer")).orderBy(func.desc("countzer"))
ranked_movies.show(10)

# kill session
spark.stop()
def getPieColInfo(self, numerical):
    """Pick a column suitable for a pie chart.

    :param numerical: when truthy, only consider numeric columns
    :returns: the user-selected keyFields if present; otherwise the first
        eligible column whose most common value occurs more than 10 times;
        otherwise the first eligible column (or None when nothing qualifies)
    """
    # If user selects a column in dialog box, give it to them
    keyFields = self.options.get("keyFields")
    if keyFields is not None:
        return keyFields
    schema = self.entity.schema
    default=None
    # Hoisted out of the loop: count() and sample() trigger Spark jobs and do
    # not depend on the field under inspection, so compute them once instead
    # of once per column.  (Division only happens when count > 200, exactly
    # as before.)
    count = self.entity.count()
    sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
    for field in schema.fields:
        # Ignore unique ids
        if field.name.lower() != 'id' and ( not numerical or isNum(field.dataType.__class__.__name__) ):
            # Find a good column to display in pie ChartDisplay
            default = default or field.name
            orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg")).orderBy(F.desc("agg")).select("agg")
            # Guard against an empty sample before indexing the first row.
            top = orderedSample.take(1)
            if top and top[0]["agg"] > 10:
                return field.name
    # Otherwise, return first non-id column
    return default
# TEST Remove empty elements (4d) Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount') Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'") # COMMAND ---------- # MAGIC %md # MAGIC ** (4e) Count the words ** # MAGIC # MAGIC We now have a DataFrame that is only words. Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`. # MAGIC # MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import desc topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc('count')) topWordsAndCountsDF.show() # COMMAND ---------- # TEST Count the words (4e) Test.assertEquals(topWordsAndCountsDF.take(15), [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463), (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890), (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)], 'incorrect value for top15WordsAndCountsDF')
# Count transactions using the 'Discover' payment method.
print('Number of Method=\'Discover\': ', df.filter(df.Method == 'Discover').count())
'''
Can be written like this as well,
>>> from pyspark.sql.functions import sum
>>> for row in df.groupBy(df.Method).agg(sum(df.Amount))\
        .withColumnRenamed('sum(Amount)', 'Total')\
        .orderBy('Total', ascending=False)\
        .take(3):
        print(row)
'''
print('>>> Top 3 Methods using Dataframe API >>>')
# sum() aggregates every numeric column; only the Amount total is renamed
# and used for the ordering.
for row in df.groupBy(df.Method).sum()\
        .withColumnRenamed('sum(Amount)', 'Total')\
        .orderBy(desc('Total'))\
        .take(3):
    print(row)
print('>>> Top 3 Methods using SQL >>>')
# Same query expressed through a temp view and Spark SQL.
df.createOrReplaceTempView('temp')
sql_str = 'select Method, sum(Amount) as Total from temp group by Method order by Total desc'
for row in spark.sql(sql_str).take(3):
    print(row)
# code for cleaning up memory...
spark.catalog.dropTempView("temp")
df.unpersist()
# COMMAND ---------- # TODO FILL_IN # COMMAND ---------- # TEST - Run this cell to test your solution from pyspark.sql.functions import desc ipCountDF2 = (spark .read .parquet("/tmp/ipCount.parquet") .orderBy(desc("count")) ) ip1, count1 = ipCountDF2.first() cols = set(ipCountDF2.columns) dbTest("ET1-P-02-02-01", "213.152.28.bhe", ip1) dbTest("ET1-P-02-02-02", True, count1 > 500000 and count1 < 550000) dbTest("ET1-P-02-02-03", {'count', 'ip'}, cols) print("Tests passed!") # COMMAND ---------- # MAGIC %md # MAGIC Check the load worked by using `%fs ls <path>`. Parquet divides your data into a number of files. If successful, you see a `_SUCCESS` file as well as the data split across a number of parts.
# MAGIC %md # MAGIC ### (5c) Exercise: Listing the Top Twenty 404 Response Code paths # MAGIC # MAGIC Using the DataFrame containing only log records with a 404 response code that you cached in part (5a), print out a list of the top twenty paths that generate the most 404 errors. # MAGIC # MAGIC *Remember, top paths should be in sorted order* # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code top_20_not_found_df = (not_found_paths_df .groupBy('path') .count() .sort(desc('count'))) print 'Top Twenty 404 URLs:\n' top_20_not_found_df.show(n=20, truncate=False) # COMMAND ---------- # TEST Top twenty 404 URLs (5c) top_20_not_found = [(row[0], row[1]) for row in top_20_not_found_df.take(20)] top_20_expected = [ (u'/pub/winvn/readme.txt', 633), (u'/pub/winvn/release.txt', 494), (u'/shuttle/missions/STS-69/mission-STS-69.html', 430), (u'/images/nasa-logo.gif', 319), (u'/elv/DELTA/uncons.htm', 178),
# Extract the integer timestamp (4th "::"-separated field), then drop the
# intermediate split column.
ratings_df = ratings_df.withColumn(
    'Timestamp', F.col('SplitValues').getItem(3).cast(IntegerType()))
ratings_df = ratings_df.drop(F.col('SplitValues'))
'''
# Import users.dat
users_df = spark.read.format("csv").load("data/users.dat")
users_df = users_df.select(F.split(users_df.columns[0], "::").alias('SplitValues'))
users_df = users_df.withColumn('UserID', F.col('SplitValues').getItem(0).cast(IntegerType()))
users_df = users_df.withColumn('Gender', F.col('SplitValues').getItem(1))
users_df = users_df.withColumn('Age', F.col('SplitValues').getItem(2).cast(ByteType()))
users_df = users_df.withColumn('Occupation', F.col('SplitValues').getItem(3).cast(ByteType()))
users_df = users_df.withColumn('Zip-code', F.col('SplitValues').getItem(4).cast(IntegerType()))
users_df = users_df.drop(F.col('SplitValues'))
'''
# Calculate average rating
avgratings_df = ratings_df.select(F.col('MovieID'), F.col('Rating'))
avgratings_df = avgratings_df.groupBy(F.col('MovieID')).agg(
    F.mean(F.col('Rating')).alias('Average Rating'))
# Attach movie titles to the per-movie averages.
avgratings_df = avgratings_df.join(movies_df, avgratings_df.MovieID == movies_df.MovieID)
# Write the output to output/exercise1.csv
output_df = avgratings_df.select('Title', 'Average Rating').sort(
    F.desc("Average Rating"))
output_df.write.mode("overwrite").csv('output/exercise1/avgrating')
# Spark writes a directory of part files; concatenate them into one CSV.
os.system(f'rm output/exercise1/avgrating.csv')
os.system(
    f'cat output/exercise1/avgrating/p* > output/exercise1/avgrating.csv')
# COMMAND ---------- from pyspark.sql.functions import col, to_date dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm")) dfWithDate.createOrReplaceTempView("dfWithDate") # COMMAND ---------- from pyspark.sql.window import Window from pyspark.sql.functions import desc windowSpec = Window\ .partitionBy("CustomerId", "date")\ .orderBy(desc("Quantity"))\ .rowsBetween(Window.unboundedPreceding, Window.currentRow) # COMMAND ---------- from pyspark.sql.functions import max maxPurchaseQuantity = max(col("Quantity")).over(windowSpec) # COMMAND ---------- from pyspark.sql.functions import dense_rank, rank purchaseDenseRank = dense_rank().over(windowSpec) purchaseRank = rank().over(windowSpec)
# MAGIC Add a single DataFrame transformation (in place of `<FILL_IN>`, below) to limit the results to movies with ratings from at least 500 people. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code movies_with_500_ratings_or_more = movie_names_with_avg_ratings_df.filter(movie_names_with_avg_ratings_df['count'] >= 500) print 'Movies with highest ratings:' movies_with_500_ratings_or_more.show(20, truncate=False) # COMMAND ---------- # TEST Movies with Highest Average Ratings and at least 500 Reviews (1b) Test.assertEquals(movies_with_500_ratings_or_more.count(), 4489, 'incorrect movies_with_500_ratings_or_more.count(). Expected 4489.') top_20_results = [(r['average'], r['title'], r['count']) for r in movies_with_500_ratings_or_more.orderBy(F.desc('average')).take(20)] Test.assertEquals(top_20_results, [(4.446990499637029, u'Shawshank Redemption, The (1994)', 63366), (4.364732196832306, u'Godfather, The (1972)', 41355), (4.334372207803259, u'Usual Suspects, The (1995)', 47006), (4.310175010988133, u"Schindler's List (1993)", 50054), (4.275640557704942, u'Godfather: Part II, The (1974)', 27398), (4.2741796572216, u'Seven Samurai (Shichinin no samurai) (1954)', 11611), (4.271333600779414, u'Rear Window (1954)', 17449), (4.263182346109176, u'Band of Brothers (2001)', 4305), (4.258326830670664, u'Casablanca (1942)', 24349), (4.256934865900383, u'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 6525), (4.24807897901911, u"One Flew Over the Cuckoo's Nest (1975)", 29932), (4.247286821705426, u'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 23220), (4.246001523229246, u'Third Man, The (1949)', 6565),
display(g.outDegrees) # COMMAND ---------- display(g.degrees) # COMMAND ---------- display(g.edges.filter("dst = '4' and HelpfulnessDenominator > 60")) # COMMAND ---------- result = g.stronglyConnectedComponents(maxIter = 10) display(result.select("id", "Component")) # COMMAND ---------- ranks = g.pageRank(resetProbability= 0.15, maxIter = 5) display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(100)) # display(ranks.vertices.orderBy(desc("pagerank"))) # COMMAND ---------- from pyspark.sql.functions import desc display(g.edges.filter("Score = '4'").groupBy("src", "dst").avg("HelpfulnessDenominator").sort(desc("avg(HelpfulnessDenominator)")).limit(100)) # COMMAND ----------
StructField("movieID", IntegerType(), True), \ StructField("rating", IntegerType(), True), \ StructField("timestamp", LongType(), True)]) # Load up movie data as dataframe moviesDF = spark.read.option("sep", "\t").schema(schema).csv( "file:///opt/bitnami/spark/datasets/ml-100k/u.data") movieCounts = moviesDF.groupBy("movieID").count() # Create a user-defined function to look up movie names from our broadcasted dictionary def lookupName(movieID): return nameDict.value[movieID] lookupNameUDF = func.udf(lookupName) # Add a movieTitle column using our new udf moviesWithNames = movieCounts.withColumn("movieTitle", lookupNameUDF(func.col("movieID"))) # Sort the results sortedMoviesWithNames = moviesWithNames.orderBy(func.desc("count")) # Grab the top 10 sortedMoviesWithNames.show(10, False) # Stop the session spark.stop()
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.functions import desc

# Total NBA points per position, exported as a CSV summary.
spark = (SparkSession
         .builder
         .appName("Python NBA Salaries")
         .getOrCreate())

stats_df = spark.read.csv("NBACleanData/StatsClean.csv", header=True)
stats_df.createOrReplaceTempView("stats")

# Aggregate with SQL, then order positions by total points, highest first.
per_position = spark.sql("SELECT Pos, SUM(PTS) AS total_points FROM stats GROUP BY Pos")
per_position = per_position.sort(desc("total_points"))
per_position.show()

# Hand off to pandas for the rename and the CSV write.
summary = per_position.toPandas()
summary.rename(columns={'total_points': 'Points'}, inplace=True)
summary.to_csv('NBACleanData/AlltimeScoringPos.csv', index=False)
def get_recommendations_by_cluster_app(cluster, purchased_quantities):
    """Recommend popular cluster products the customer has not bought yet.

    :param cluster: cluster id whose popular products seed the recommendations
    :param purchased_quantities: per-product purchase counts aligned with product_cols
    :returns: PySpark dataframe of (PRODUCT, COUNT), most popular first
    """
    # Existing customer products
    customer_products = []
    for i in range(0, len(product_cols)):
        if purchased_quantities[i] > 0:
            customer_products.append((product_cols[i], purchased_quantities[i]))
    df_customer_products = sc.parallelize(customer_products).toDF(["PRODUCT","COUNT"])
    # Get popular products in the cluster
    cluster_products = get_popular_products_in_cluster(cluster)
    df_cluster_products = sc.parallelize(cluster_products).toDF(["PRODUCT","COUNT"])
    # (Removed a stray duplicate line that rebound df_cluster_products to the
    # unbound `toDF` method object — missing the call parentheses — which
    # broke the join below with an AttributeError.)
    # Filter out products the user has already purchased
    df_recommended_products = df_cluster_products.alias('cl').join(df_customer_products.alias('cu'), df_cluster_products['PRODUCT'] == df_customer_products['PRODUCT'], 'leftouter')
    df_recommended_products = df_recommended_products.filter('cu.PRODUCT IS NULL').select('cl.PRODUCT','cl.COUNT').sort(desc('cl.COUNT'))
    return df_recommended_products
def question_two(self, spark):
    """Show the ten companies with the highest total revenue.

    Revenue is the per-company sum of purchase_cost, rounded to two
    decimal places, displayed in descending order.

    :param spark: active SparkSession, forwarded to question_one
    """
    base_df = self.question_one(spark, data_only=True)
    revenue = round(sum("purchase_cost"), 2).alias("revenue")
    ranked = base_df.groupBy("company").agg(revenue).orderBy(desc("revenue"))
    ranked.show(10)
# Create KM model and fit using up to date data kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction", maxIter=10, initSteps=3) kmodel = kmeans.fit(df) #test = kmodel.transform(featuresOut) ''' ########## DEMO ######### ''' df.groupBy(df.prediction).count().orderBy(asc('count')).show(50) groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(asc('count')).filter('count < 40') df.join(groups, groups.prediction2==df.prediction).select('command','prediction').distinct().show() df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False) groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).filter('count > 100000') df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False) groups = sc.parallelize(df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).head(10)).toDF() df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(50,truncate=False) # Create a new DF with some weird commands test1 = ctx.createDataFrame([ ], ["command"]) test2 = ctx.createDataFrame([ ("gcc hack.c -o hack;./hack".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw-o01","sn-discovery-unix",), ("wget http://wwww.my.com/rootkit.gz".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw","sn-discovery-unix", ), ("echo, $?".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw-o01.domain.org","sn-discovery-unix", ), ("asdjgiuarsjhgiurewhgjui asdadfsadf sdf".split(" "),"2015-12-07","root","msr-telemetry-cass06","","sn-discovery-unix", ),
DataFrame[emp_id: bigint, emp_name: string, emp_city: string, emp_salary: double] >>> old_df.unionAll(new_df).show() +------+--------+---------+----------+ |emp_id|emp_name| emp_city|emp_salary| +------+--------+---------+----------+ | 1| John| Sydney| 35000.0| | 2| Peter|Melbourne| 45000.0| | 3| Sam| Sydney| 55000.0| | 2| Peter|Melbourne| 55000.0| | 5| Jessie| Brisbane| 42000.0| +------+--------+---------+----------+ >>> union_df=old_df.unionAll(new_df) >>> from pyspark.sql import Window window_agg=Window.partitionBy("emp_id").orderBy(F.desc("emp_salary")) >>> union_df.select(F.row_number().over(window_agg).alias("rn"),"*").filter("rn=1").select(new_df.columns).orderBy("emp_id").show() +------+--------+---------+----------+ |emp_id|emp_name| emp_city|emp_salary| +------+--------+---------+----------+ | 1| John| Sydney| 35000.0| | 2| Peter|Melbourne| 55000.0| | 3| Sam| Sydney| 55000.0| | 5| Jessie| Brisbane| 42000.0| +------+--------+---------+----------+ ===================================================================================================================================
def words_relevance(words_specific_df, sqlContext, high_type='all'):
    """Compute contextual relevance between target words and effective-POS
    words across all comments, and persist the results to Hive tables.

    :param words_specific_df: pandas DataFrame of target words (must have a
        ``word`` column) whose relevance should be computed; ignored
        (treated as ``None``) when ``high_type == 'all'``.
    :param sqlContext: active Spark SQLContext/HiveContext used to read the
        cut-word table and write the result tables.
    :param high_type: ``'all'`` computes relevance between every pair of
        words in each comment; any other value restricts target words to
        ``words_specific_df``.
    :returns: the literal string ``'effect_words_relevance run over'``.
    """
    if high_type == 'all':
        words_specific_df = None
        coef_col_name = ''
        # NOTE(review): coef_col_name is only assigned in this branch, yet it
        # is referenced below only when high_type != 'all'. Confirm which
        # coefficient column of words_specific_df is meant to be selected.
    storage_table1 = '{0}.effect_words_relevance_product_name'.format(
        database_name)
    storage_table2 = '{0}.effect_words_relevance_in_each_product_name'.format(
        database_name)

    # 1 extract word column
    # BUG FIX: the original used ``words_specific_df != None``; on a pandas
    # DataFrame that is an element-wise comparison whose truth value raises
    # ValueError. Use an identity check instead.
    if words_specific_df is not None:
        target_word_df = words_specific_df[['word']].copy()
    else:
        target_word_df = None

    # 2 get word-comments_ cut result
    idx_name = 'product_id'
    doc_table = params['shared']['doc_cut_word_table']
    comments_words_new = sqlContext.sql(
        "select product_id as {0},cut_word_flag as comments_words_new from {1}.{2}"
        .format(idx_name, database_name, doc_table))

    # 3 calculate relevance in each comment
    def words_relevance_one_comment(comment_split_words,
                                    target_word_df=target_word_df):
        """Compute, inside one comment, the contextual relevance between
        target words and words carrying an effective POS tag.

        When ``target_word_df`` is not supplied, relevance is computed
        between every pair of words in the sentence (current default).

        Example input::

            comment_split_words = [{'word': '...', 'flag': 'v'},
                                   {'word': '...', 'flag': 'n'}, ...]

        :returns: list of rows ``[flag_word, target_word, each_relevance,
            flag_flag, target_flag, product_id]`` for this comment.
        """
        # When called from flatMap the argument is a Row carrying the index
        # column plus the word list; unwrap it first.
        if type(comment_split_words) == type(sp_row()):
            idx = comment_split_words[idx_name]
            comment_split_words = comment_split_words['comments_words_new']
        # POS tags considered "effective" and their relative weights.
        effect_flag_weight = pds.DataFrame(
            [('n', 1), ('ns', 1), ('nt', 1), ('nr', 0.5), ('nz', 0.67),
             ('nv', 0.67), ('vn', 0.67), ('an', 0.67)],
            columns=['flag', 'weight'])
        if len(comment_split_words) > 0:
            comment_words_df = pds.DataFrame(comment_split_words)
            # BUG FIX: ``target_word_df != None`` on a pandas DataFrame is
            # element-wise and raises on truth-testing; use identity check.
            if target_word_df is not None:
                comment_words_df = comment_words_df.loc[
                    (comment_words_df.word.isin(target_word_df.word)) |
                    (comment_words_df.flag.isin(effect_flag_weight.flag)
                     ), :].reset_index().rename(columns={'index': 'position'})
                flag_words = pds.merge(comment_words_df,
                                       effect_flag_weight,
                                       on='flag',
                                       how='inner',
                                       sort=False).rename(
                                           columns={
                                               'position': 'flag_pos',
                                               'flag': 'flag_flag',
                                               'word': 'flag_word'
                                           })
                target_words = pds.merge(comment_words_df,
                                         target_word_df,
                                         on='word',
                                         how='inner',
                                         sort=False).rename(
                                             columns={
                                                 'position': 'target_pos',
                                                 'flag': 'target_flag',
                                                 'word': 'target_word'
                                             })
            else:
                # No target list: every word is both a target and an
                # effective word with weight 1.
                comment_words_df = comment_words_df.reset_index().rename(
                    columns={'index': 'position'})
                flag_words = comment_words_df.copy()
                flag_words['weight'] = 1.0
                flag_words.rename(columns={
                    'position': 'flag_pos',
                    'flag': 'flag_flag',
                    'word': 'flag_word'
                }, inplace=True)
                target_words = comment_words_df.copy()
                target_words.rename(columns={
                    'position': 'target_pos',
                    'flag': 'target_flag',
                    'word': 'target_word'
                }, inplace=True)
            if flag_words.shape[0] > 0 and target_words.shape[0] > 0:
                # Cartesian product of the effective-word and target-word
                # frames via a constant join key.
                flag_words['dikar_id'] = 0
                target_words['dikar_id'] = 0
                merge_df = pds.merge(flag_words, target_words, on='dikar_id')
                # Tip: when a word pairs with itself, offset its position by
                # 0.9 first so the power below cannot produce inf.
                merge_df.loc[merge_df.flag_pos == merge_df.target_pos,
                             'target_pos'] = merge_df.loc[
                                 merge_df.flag_pos == merge_df.target_pos,
                                 'target_pos'] + 0.9
                merge_df['pos_diff'] = npy.abs(merge_df.flag_pos -
                                               merge_df.target_pos)
                merge_df['each_relevance'] = merge_df.weight / (npy.power(
                    merge_df.pos_diff, merge_df.pos_diff /
                    2.0))  # decay accelerates as context distance grows
                # Within one sentence a (target, effective) word pair keeps
                # only a single relevance: take the maximum over duplicates.
                merge_df_group = merge_df.groupby(
                    ['flag_word',
                     'target_word'])['each_relevance'].max().reset_index()
                merge_df = merge_df.drop_duplicates(
                    subset=['flag_word', 'target_word'])[[
                        'flag_word', 'target_word', 'flag_flag', 'target_flag'
                    ]]
                merge_df_group = pds.merge(merge_df_group.copy(),
                                           merge_df,
                                           on=['flag_word', 'target_word'])
                merge_df_group[idx_name] = idx
                relevance_in_each_comment = merge_df_group.values.tolist()
            else:
                relevance_in_each_comment = []
        else:
            relevance_in_each_comment = []
        return relevance_in_each_comment

    relevance_in_each_comment = comments_words_new.rdd.flatMap(
        words_relevance_one_comment)
    relevance_in_each_comment_df = sqlContext.createDataFrame(
        relevance_in_each_comment, [
            'flag_word', 'target_word', 'each_relevance', 'flag_flag',
            'target_flag', idx_name
        ])
    relevance_in_each_comment_df.write.saveAsTable(
        '{0}'.format(storage_table2),
        mode='overwrite')  # fast with ~80 executors
    relevance_in_each_comment_df = sqlContext.sql(
        'select * from {0}'.format(storage_table2))

    # 4 calculate summary_relevance
    effect_words_relevance_tem = relevance_in_each_comment_df.groupBy(
        ['flag_word', 'target_word']).agg(F.sum('each_relevance'),
                                          F.count('each_relevance'),
                                          F.avg('each_relevance'))
    effect_words_relevance_tem = effect_words_relevance_tem.withColumnRenamed(
        'sum(each_relevance)', 'sum_relevance').withColumnRenamed(
            'count(each_relevance)',
            'total_comment_num').withColumnRenamed('avg(each_relevance)',
                                                   'avg_relevance')
    effect_words_relevance = effect_words_relevance_tem.withColumn(
        'final_relevance_coef',
        F.pow(effect_words_relevance_tem.sum_relevance, 2) /
        effect_words_relevance_tem.total_comment_num)
    # 4.2 rank in partition
    windw = Window.partitionBy('flag_word').orderBy(
        F.desc('final_relevance_coef'))
    effect_words_relevance = effect_words_relevance.select(
        '*',
        F.rank().over(windw).alias('rank_in_flag_word'))
    # 4.3 append word_flag
    words_flag_tem = relevance_in_each_comment_df.drop_duplicates(
        ['flag_word', 'target_word']).select('flag_word', 'target_word',
                                             'flag_flag', 'target_flag')
    effect_words_relevance = effect_words_relevance.join(
        words_flag_tem, ['flag_word', 'target_word'])
    # 4.4 append target word emo_promote_coef
    if high_type != 'all':
        words_specific_df = sqlContext.createDataFrame(words_specific_df)
        effect_words_relevance = effect_words_relevance.join(
            words_specific_df,
            effect_words_relevance.target_word ==
            words_specific_df.word).select(
                'flag_word', 'target_word', 'flag_flag', 'target_flag',
                'sum_relevance', 'total_comment_num', 'avg_relevance',
                'final_relevance_coef', 'rank_in_flag_word', coef_col_name)
    else:
        effect_words_relevance = effect_words_relevance.select(
            'flag_word', 'target_word', 'flag_flag', 'target_flag',
            'sum_relevance', 'total_comment_num', 'avg_relevance',
            'final_relevance_coef', 'rank_in_flag_word')
    effect_words_relevance.write.saveAsTable('{0}'.format(storage_table1),
                                             mode='overwrite')
    return 'effect_words_relevance run over'
# <h1>Process Data using pyspark.sql</h1> # <p>Set the Hadoop configuration.</p> # In[8]: # Python expressions in a code cell will be outputted after computation expenditures_df.printSchema() # In[9]: # Sorting the data using spark sql from pyspark.sql.functions import desc, asc factor = expenditures_df.sort(desc('(% OF GDP)')).limit(10).toPandas() factor_re = expenditures_df.sort(asc('(% OF GDP)')).limit(10).toPandas() # In[10]: print factor # In[11]: life = life_expectancy_df.sort(desc('(YEARS)')).limit(10).toPandas() life_re = life_expectancy_df.sort(asc('(YEARS)')).limit(10).toPandas() # In[12]:
def question_one(self):
    """Print the ten sweets that raised the most funds, highest first."""
    # Normalise the money column to integers before aggregating.
    funds_df = self.rawDF.withColumn(
        "raised_funds", self.money_to_int("raised_funds"))
    totals_df = funds_df.groupBy("sweets").agg(
        f.sum("raised_funds").alias("total_funds"))
    totals_df.orderBy(f.desc("total_funds")).show(10, False)
df = spark.createDataFrame([], original_schema) text = row.ctext newRow = spark.createDataFrame([(text, text)], cols) df = df.union(newRow) #df = df.withColumnRenamed("ctext\r","ctext") sentencesDF = df.select(explode(split(df.ctext, "\.")).alias("sentences")) sentencesDF = sentencesDF.na.drop() tokenized = tokenize(sentencesDF) edited = stopwords_removal(tokenized) edgelist = createEdges(edited) edgeData = sc.parallelize(edgelist) schema = StructType([ StructField("src", StringType(), True), StructField("dst", StringType(), True), StructField("score", FloatType(), True) ]) edgeDF = spark.createDataFrame(edgeData, schema) vertices = edited.withColumnRenamed("sentences", "id") gFrame = GraphFrame(vertices, edgeDF) ranks = gFrame.pageRank(resetProbability=0.5, maxIter=20) sorted_ranks = ranks.vertices.orderBy(desc("pagerank")).limit(5) sentence_final = "" for srow in sorted_ranks.collect(): sentence_final = sentence_final + srow.id + "." final_df_row = spark.createDataFrame([(text, sentence_final)], cols) final_df = final_df.union(final_df_row) final_df.repartition(1).write.csv( "s3://project.summary/output.prob.05.max.20/.csv") print("End of Summarization")
postgres_user).option("password", postgres_pwd).load()) df_ratings = (spark.read.format("jdbc").option("url", postgres_db).option( "dbtable", "public.ratings").option("user", postgres_user).option("password", postgres_pwd).load()) #################################### # Tpo 10 movies with more ratings #################################### df_movies = df_movies.alias("m") df_ratings = df_ratings.alias("r") df_join = df_ratings.join(df_movies, df_ratings.movieId == df_movies.movieId).select( "r.*", "m.title") df_result = (df_join.groupBy("title").agg( F.count("timestamp").alias("qty_ratings"), F.mean("rating").alias("avg_rating")).sort( F.desc("qty_ratings")).limit(10)) print("######################################") print("EXECUTING QUERY AND SAVING RESULTS") print("######################################") # Save result to a CSV file df_result.coalesce(1).write.format("csv").mode("overwrite").save( "/usr/local/spark/resources/data/output_postgres", header=True)
(unix_timestamp(max("data_timestamp").over(window_sessions)) - unix_timestamp(min("data_timestamp").over(window_sessions))) / 60) df.groupBy("client_ip").count().show() # This is the sample IP address that I used to creating the queries. test = df.orderBy("client_ip", "data_timestamp").where("client_ip = '156.101.9.1'") test2 = test.withColumn("prev_time", lag(test.data_timestamp).over(window_clients)) test3 = test2.withColumn( "session", sum((coalesce( (unix_timestamp("data_timestamp") - unix_timestamp("prev_time")) / 60, lit(0)) > 15).cast("int")).over(window_clients)) test3.withColumn( "total_session_time", (unix_timestamp(max("data_timestamp").over(window_sessions)) - unix_timestamp(min("data_timestamp").over(window_sessions))) / 60) test3.select("client_ip", "session", "request_url").distinct().groupBy("request_url").count().show() # This is the query to get the average session time. df.select(avg("total_session_time")).show() # This is the query to get the most engaged users. df.select("client_ip", "total_session_time").distinct().orderBy( desc("total_session_time")).show() # This is the query to get the unique URL visits per session df.select("client_ip", "session", "request_url").distinct().groupBy("request_url").count().show()
df.groupBy("state").sum("salary").show() dfGroup=df.groupBy("state") \ .agg(sum("salary").alias("sum_salary")) dfGroup.show(truncate=False) dfFilter = dfGroup.filter(dfGroup.sum_salary > 100000) dfFilter.show() from pyspark.sql.functions import asc dfFilter.sort("sum_salary").show() from pyspark.sql.functions import desc dfFilter.sort(desc("sum_salary")).show() df.groupBy("state") \ .agg(sum("salary").alias("sum_salary")) \ .filter(col("sum_salary") > 100000) \ .sort(desc("sum_salary")) \ .show() df.createOrReplaceTempView("EMP") spark.sql("select state, sum(salary) as sum_salary from EMP " + "group by state having sum_salary > 100000 " + "order by sum_salary desc").show() df.groupBy("state") \ .sum("salary") \ .withColumnRenamed("sum(salary)", "sum_salary") \
# Load the retail dataset with inferred schema, reduce to 5 partitions, and
# cache it since it is queried repeatedly below.
aggdf = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("dbfs:/FileStore/tables/retail_data_all-db128.txt").coalesce(5)
aggdf.cache()
aggdf.createOrReplaceTempView("aggtable")
#aggdf.show(10)
#from pyspark.sql.functions import distinct
# Exact distinct invoice count (triggers a full shuffle).
aggdf.select("InvoiceNo").distinct().count()
from pyspark.sql.functions import countDistinct, approx_count_distinct, col
# countDistinct as an aggregate expression rather than .distinct().count().
aggdf.select(countDistinct(col("StockCode"))).show()
from pyspark.sql.functions import first, last, desc
# Rows ordered by unit price, most expensive first.
aggdf.orderBy(desc("UnitPrice")).show()
from pyspark.sql.functions import sum, count, avg, expr
# Three equivalent ways to get the mean quantity: manual sum/count,
# avg(), and the SQL mean() expression.
aggdf.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
    .selectExpr(
        "total_purchases/total_transactions",
        "avg_purchases",
        "mean_purchases").show()
from pyspark.sql.functions import collect_set, collect_list
# collect_set deduplicates while collect_list keeps every occurrence.
aggdf.agg(collect_set("Description"), collect_list("Description")).show()
from pyspark.sql.window import Window
from pyspark.context import SparkContext # In[2]: from config import datalake_features_path, datalake_staged_path, daily_feature_countries # In[3]: spark = SparkSession.builder.appName('covid_daily_feature').getOrCreate() # In[4]: df = (spark.read.option("header", "true").option( "inferSchema", "true").load(datalake_staged_path + "/full")) # In[5]: df = ( df.filter((col("country_region").isin(daily_feature_countries))).filter( col("province_state") == "n/a").orderBy(desc("date")) # Two days for 2 countries .limit(2 * 2)) # In[6]: window = Window.partitionBy("country_region").orderBy("date") df_lag = (df.withColumn('prev_confirmed', lag(col('confirmed')).over(window)).withColumn( 'prev_deaths', lag(col('deaths')).over(window)).withColumn( 'prev_recovered', lag(col('recovered')).over(window))) result = (df_lag.withColumn( 'new_confirmed',
partial(histogram, bins=np.logspace(0, 32, 33, base=2)), ArrayType(LongType()), F.PandasUDFType.GROUPED_AGG, ) hist_l_count = F.pandas_udf( partial(histogram, bins=np.logspace(0, 25, 26, base=2)), ArrayType(LongType()), F.PandasUDFType.GROUPED_AGG, ) hist_d_count = F.pandas_udf( partial(histogram, bins=np.logspace(0, 25, 26, base=2)), ArrayType(LongType()), F.PandasUDFType.GROUPED_AGG, ) w = Window().partitionBy("date", "categories").orderBy(F.desc("view_count")) top_v = ( scaled.withColumn("date", F.date_trunc("week", "upload_date")).select( "date", "upload_date", "categories", "display_id", "view_count", "like_count", "dislike_count", "duration", ((F.col("like_count") + F.col("dislike_count")) / F.col("view_count")).alias("engagement_score"), F.rank().over(w).alias("rank"), ).filter("rank <= 20").drop("rank") # .groupBy("date", "categories")
for center in centers: print(center) # Predict the label of each hacking attempt Final_Model_LocationCoded.transform(FinalData_LocationCoded).select('prediction').show(10) # In[6]: #formingClusters #Without Location clusters_NoLocation = Final_Model_NoLocation.transform(FinalData_NoLocation).select('*') clusters_NoLocation.groupBy("prediction").count().orderBy(F.desc("count")).show() clusters_NoLocation.show() clusters_NoLocation_pd = clusters_NoLocation.toPandas() clusters_NoLocation_pd.to_csv("Clusters_NoLocation.csv") #With LocationIndex clusters_LocationIndex = Final_Model_LocationIndex.transform(FinalData_LocationIndex).select('*') clusters_LocationIndex.groupBy("prediction").count().orderBy(F.desc("count")).show() clusters_LocationIndex.show() clusters_LocationIndex_pd = clusters_LocationIndex.toPandas() clusters_LocationIndex_pd.to_csv("clusters_LocationIndex.csv") #With LocationCoded clusters_LocationCoded = Final_Model_LocationCoded.transform(FinalData_LocationCoded).select('*')
def bar_plot(df_in,
             top_n=20,
             rotation=True,
             output_dir=None,
             display=False,
             tracking=False):
    """ Bar plot for the categorical features in the rdd data frame. :param df_in: the input rdd data frame :param top_n: the number of the most frequent feature to show in the bar plot :param rotation: the flag for rotating the xticks in the plot, the default value is True :param output_dir: the out put directory, the default value is the current working directory :param display: the flag for displaying the figures, the default value is False :param tracking: the flag for displaying CPU time, the default value is False """
    # Categorical and date columns are plotted together; everything else is
    # dropped from the working frame.
    _, _, cat_fields, date_fields, _ = dtypes_class(df_in)
    cat_fields = cat_fields + date_fields
    if cat_fields:
        df_in = df_in.select(cat_fields)
        # All figures go into <out_path>/03-Bar_plots.pdf.
        if output_dir is None:
            out_path = os.getcwd() + '/Audited'
        else:
            out_path = output_dir + '/Audited'
        mkdir(out_path)
        print(
            '================================================================')
        print('The Bar plot Bar_plots.pdf was located at:')
        print(out_path)
        if tracking:
            start = time.time()
        pdf = PdfPages(out_path + '/03-Bar_plots.pdf')
        # One page per column: top_n most frequent values by count.
        for col in df_in.columns:
            p_data = df_in.select(col).na.drop().groupBy(col).count().sort(
                F.desc('count')).limit(top_n).toPandas()
            if tracking:
                print('Plotting barplot of {}.... Please be patient!'.format(
                    col))
            # Interactive mode off so figures are only shown when requested.
            plt.ioff()
            fig = plt.figure(figsize=(20, 15))
            sns.barplot(x=col, y="count", data=p_data)
            plt.title('Barplot of {}'.format(col), fontsize=20)
            plt.xlabel('{}'.format(col), fontsize=20)
            plt.ylabel('number of counts', fontsize=20)
            if rotation:
                plt.xticks(rotation=90)
            pdf.savefig(fig)
            if display:
                plt.show()
            # Close each figure to keep matplotlib memory bounded.
            plt.close(fig)
        if tracking:
            print('Bar plots are DONE!!!')
        pdf.close()
        if tracking:
            end = time.time()
            print('Generate bar plots took = ' + str(end - start) + ' s')
    else:
        print('Caution: no categorical features in the dataset!!!')
# How many users are female? # TODO: write your code to answer question 3 females = user_log.filter(user_log.gender == 'F') \ .select('userId', 'gender') \ .dropDuplicates() \ .count() print(f'Famale users: {females}\n') ########################## Question 4 ########################## # How many songs were played from the most played artist? # TODO: write your code to answer question 4 user_log.filter(user_log.page == 'NextSong') \ .select('Artist') \ .groupBy('Artist') \ .agg({'Artist' : 'count'}) \ .sort(desc('count(Artist)')) \ .withColumnRenamed('count(Artist)', 'Play_count') \ .show(1) ########################## Question 5 (challenge) ########################## # How many songs do users listen to on average between visint our home page? Round to closest integer function = udf(lambda ishome: int(ishome == 'Home'), IntegerType()) user_window = Window \ .partitionBy('userID') \ .orderBy(desc('ts')) \ .rangeBetween(Window.unboundedPreceding, 0) cusum = user_log.filter((user_log.page == 'NextSong') | (user_log.page == 'Home')) \ .select('userID', 'page', 'ts') \ .withColumn('homevisit', function('page')) \
p_temp = part.filter("p_name like '%dim%'") s_l = lineitem.join(supplier,lineitem.L_SUPPKEY == supplier.S_SUPPKEY) s_l_ps = s_l.join((partsupp,s_l.L_SUPPKEY == partsupp.PS_SUPPKEY) & (s_l.L_PARTKEY == partsupp.PS_PARTKEY )) s_l_ps_p = s_l_ps.join(part,s_l_ps.PS_PARTKEY == part.P_PARTKEY) s_l_ps_p_o = s_l_ps_p.join(orders, s_l_ps_p.L_ORDERKEY == orders.O_ORDERKEY) s_l_ps_p_o_n = s_l_ps_p_o.join(nation,s_l_ps_p_o.S_NATIONKEY == nation.N_NATIONKEY) profit = s_l_ps_p_o_n .select(s_l_ps_p_o_n.N_NAME.alias("NATION"), F.year(s_l_ps_p_o_n.O_ORDERDATE).alias("O_YEAR"),(s_l_ps_p_o_n.L_EXTENDEDPRICE * (1 - s_l_ps_p_o_n.L_DISCOUNT) - s_l_ps_p_o_n.PS_SUPPLYCOST * s_l_ps_p_o_n.L_QUANTITY).alias("AMOUNT")) res = profit .select(profit.NATION,profit.AMOUNT,profit.O_YEAR) .groupBy(profit.NATION,profit.O_YEAR) .agg(F.sum(profit.AMOUNT).alias("SUM_PROFIT")) .orderBy(profit.NATION,F.desc("O_YEAR")) p_temp = part.filter("p_name like '%dim%'") l_p = p_temp.join(lineitem, p_temp.P_PARTKEY == lineitem.L_PARTKEY) n_s = nation.join(supplier,nation.N_NATIONKEY == supplier.S_NATIONKEY) l_p_s = l_p.join(n_s,l_p.L_SUPPKEY == n_s.S_SUPPKEY) l_p_s_ps = l_p_s.join(partsupp,l_p_s.L_SUPPKEY == partsupp.PS_SUPPKEY) l_p_s_ps_o = l_p_s_ps.join(orders,l_p_s_ps.L_ORDERKEY == orders.O_ORDERKEY) profit = l_p_s_ps_o.select(l_p_s_ps_o.N_NAME,F.year(l_p_s_ps_o.O_ORDERDATE).alias("O_YEAR"),(l_p_s_ps_o.L_EXTENDEDPRICE * (1 - l_p_s_ps_o.L_DISCOUNT) - l_p_s_ps_o.PS_SUPPLYCOST * l_p_s_ps_o.L_QUANTITY).alias("AMOUNT")) res = profit .groupBy(profit.N_NAME,profit.O_YEAR) .agg(F.sum(profit.AMOUNT).alias("SUM_PROFIT")) .orderBy(l_p_s_ps_o.N_NAME,F.desc("O_YEAR"))
# MAGIC %md ## Window functions ## # COMMAND ---------- #Add a new date column spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY") from pyspark.sql.functions import col, to_date, desc, max, dense_rank from pyspark.sql.window import Window dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm")) #1 define a window - use all preceding rows up and until current row windowSpec = Window\ .partitionBy("CustomerID","date")\ .orderBy(desc("Quantity"))\ .rowsBetween(Window.unboundedPreceding, Window.currentRow) #2 define aggregation maxPurchaseQuantity maxPurchaseQuantity = max(col("Quantity")).over(windowSpec) print(maxPurchaseQuantity) #2.1 purchaseRank purchaseDenseRank = dense_rank().over(windowSpec) print(purchaseDenseRank) # COMMAND ---------- # 3 Perform a Select dfWithDate.where("CustomerId IS NOT NULL AND ").orderBy("CustomerId")\ .select( col("CustomerId"),
adlocation.printSchema() adlocation.cache() vpn = ctx.read.load('/user/cloudera/ciscovpn') vpn.printSchema() vpn.cache() def func(x): gi = GeoIP.open("GeoIP.dat",GeoIP.GEOIP_MEMORY_CACHE) cc = gi.country_code_by_addr(x.remoteip) return Row(bytesrcv=x.bytesrcv, bytesxmt=x.bytesxmt, duration=x.duration, localip=x.localip, reason=x.reason, remoteip=x.remoteip, source=x.source, time=x.time, user=x.user, date=x.date, remoteipcc=cc) vpnDF = vpn.map(func).toDF() joinDF = vpnDF.join(adlocation, vpnDF.user == adlocation.EmailAddress) joinDF.cache() fromOtherLocations = joinDF.filter("remoteipcc <> c") cntLoginExtLocation = fromOtherLocations.count() groupDF = fromOtherLocations.groupBy(fromOtherLocations.user, fromOtherLocations.remoteip, fromOtherLocations.remoteipcc, fromOtherLocations.c)\ .count()\ .orderBy(desc('count')) groupDF.cache() countbyCountry = fromOtherLocations.groupBy(groupDF.user, groupDF.remoteipcc).count().orderBy(desc('count'))
# MAGIC %md # MAGIC For your final task, you'll group by word and count the number of times each word occurs. Make sure to return the counts in descending order and to call them `counts`. # MAGIC # MAGIC For this task, you can use: # MAGIC * `DataFrame` operations `groupBy`, `agg`, and `sort` # MAGIC * the `Column` operation `alias` # MAGIC * functions `func.count` and `func.desc`. # COMMAND ---------- # ANSWER wordGroupCount = (wordList .groupBy('word') # group .agg(func.count('word').alias('counts')) # aggregate .sort(func.desc('counts'))) #sort wordGroupCount.take(5) # COMMAND ---------- # TEST Test.assertEquals(tuple(wordGroupCount.first()), (u'ref', 29263), 'incorrect counts.') # COMMAND ---------- # MAGIC %md # MAGIC We could also use SQL to accomplish this counting. # COMMAND ----------
#!/usr/bin/python
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import asc, desc

if __name__ == "__main__":
    # Re-sort the newspaper parquet data by per-(series, date) page counts
    # and write the result out as JSON.
    sc = SparkContext(appName='resort data')
    sqlContext = SQLContext(sc)

    pages = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160919-a50-o08/pretty.parquet')
    pages.registerTempTable("newspaper")

    # Count records per (series, date), most frequent first.
    counts = sqlContext.sql("select series, date, count(*) as cnt from newspaper group by series, date order by cnt desc")

    # Attach the count to every page row, then sort by count and span.
    enriched = pages.join(counts, ['series', 'date'])
    resorted = enriched.sort(desc("cnt"), asc("begin"), asc("end"))
    resorted.write.json('/gss_gpfs_scratch/xu.shao/network/resorted-pretty.json')
############################################################### ### ### ### ### ### SORT and ORDER ### ### ### ### ### ############################################################### # Get the five oldest people in the list. To do that, sort by age in descending order using orderBy transformation orderdataDF = dataDF.orderBy(dataDF.age.desc()) display(orderdataDF.take(5)) # desc() order correct/alternate format from pyspark.sql.functions import desc WordsAndCountsDF = wordCount(shakeWordsDF) topWordsAndCountsDF = WordsAndCountsDF.orderBy(desc("count")) topWordsAndCountsDF.show() # for ascending order orderdataDF = dataDF.orderBy(dataDF.age) display(orderdataDF.take(5)) # SORT operation new_sorted_df = (original_df.groupBy('somecolumn').count().sort('somecolumn',ascending=False).cache()) Sorted_df = OriginalDF.select('A_Column').groupBy('A_Column').count().sort('count', ascending=False) # Sorting by 'A_Column' ############################################################### ### ### ### ### ### CACHING AND STORAGE ### ### ###
# MAGIC %md # MAGIC ** (4e) Count the words ** # MAGIC # MAGIC We now have a DataFrame that is only words. Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`. # MAGIC # MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import desc topWordsAndCountsDF = (shakeWordsDF .groupBy(shakeWordsDF.word) .count() .orderBy(desc('count')) ) topWordsAndCountsDF.show() # COMMAND ---------- # TEST Count the words (4e) Test.assertEquals(topWordsAndCountsDF.take(15), [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463), (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890), (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)], 'incorrect value for top15WordsAndCountsDF') # COMMAND ---------- # MAGIC %md
def _calculate_rate(instance_usage_df):
    """Derive a percentage rate-of-change metric from the oldest and latest
    records of an instance-usage DataFrame.

    :param instance_usage_df: Spark DataFrame whose ``processing_meta``
        struct carries oldest/latest timestamp strings and quantities.
        Metadata fields (tenant_id, region, etc.) are copied from the
        latest record, defaulting to ``"all"`` when absent.
    :returns: an RDD containing a single JSON-encoded instance-usage dict.
    :raises PreHourlyCalculateRateException: if any step of the sort /
        collect / percentage computation fails (e.g. an empty frame, or a
        zero oldest quantity causing ZeroDivisionError).
    """
    instance_usage_data_json_list = []
    try:
        # Sort two ways so that collect()[0] yields the oldest record in one
        # frame and the latest record in the other.
        sorted_oldest_ascending_df = instance_usage_df.sort(
            functions.asc("processing_meta.oldest_timestamp_string"))
        sorted_latest_descending_df = instance_usage_df.sort(
            functions.desc("processing_meta.latest_timestamp_string"))

        # Calculate the rate change by percentage
        oldest_dict = sorted_oldest_ascending_df.collect()[0].asDict()
        oldest_quantity = float(oldest_dict[
            "processing_meta"]["oldest_quantity"])

        latest_dict = sorted_latest_descending_df.collect()[0].asDict()
        latest_quantity = float(latest_dict[
            "processing_meta"]["latest_quantity"])

        # Positive when the quantity decreased over the window.
        rate_percentage = 100 * (
            (oldest_quantity - latest_quantity) / oldest_quantity)

        # get any extra data
        extra_data_map = getattr(sorted_oldest_ascending_df.collect()[0],
                                 "extra_data_map", {})
    except Exception as e:
        # Wrap every failure mode in the module-specific exception type.
        raise PreHourlyCalculateRateException(
            "Exception occurred in pre-hourly rate calculation. Error: %s"
            % str(e))

    # create a new instance usage dict
    # Identity/location fields come from the latest record; timestamps span
    # oldest-to-latest; record_count sums both endpoint records.
    instance_usage_dict = {"tenant_id":
                           latest_dict.get("tenant_id", "all"),
                           "user_id":
                           latest_dict.get("user_id", "all"),
                           "resource_uuid":
                           latest_dict.get("resource_uuid", "all"),
                           "geolocation":
                           latest_dict.get("geolocation", "all"),
                           "region":
                           latest_dict.get("region", "all"),
                           "zone":
                           latest_dict.get("zone", "all"),
                           "host":
                           latest_dict.get("host", "all"),
                           "project_id":
                           latest_dict.get("project_id", "all"),
                           "aggregated_metric_name":
                           latest_dict["aggregated_metric_name"],
                           "quantity":
                           rate_percentage,
                           "firstrecord_timestamp_unix":
                           oldest_dict["firstrecord_timestamp_unix"],
                           "firstrecord_timestamp_string":
                           oldest_dict["firstrecord_timestamp_string"],
                           "lastrecord_timestamp_unix":
                           latest_dict["lastrecord_timestamp_unix"],
                           "lastrecord_timestamp_string":
                           latest_dict["lastrecord_timestamp_string"],
                           "record_count":
                           oldest_dict["record_count"] +
                           latest_dict["record_count"],
                           "usage_date":
                           latest_dict["usage_date"],
                           "usage_hour":
                           latest_dict["usage_hour"],
                           "usage_minute":
                           latest_dict["usage_minute"],
                           "aggregation_period":
                           latest_dict["aggregation_period"],
                           "extra_data_map":
                           extra_data_map
                           }
    instance_usage_data_json = json.dumps(instance_usage_dict)
    instance_usage_data_json_list.append(instance_usage_data_json)

    # convert to rdd
    spark_context = instance_usage_df.rdd.context
    return spark_context.parallelize(instance_usage_data_json_list)
comments = spark_read.parquet(path=data_dir.make_interim_path('comments')) case_status_history = spark_read.parquet( path=data_dir.make_interim_path('case_status_history')) ref_ids_escalated = (case_status_history.filter( F.col("inverse_time_to_next_escalation") > 0).select( 'reference_id').distinct()) ref_ids_escalated.count() comments_with_cutoff_times = spark_read.parquet( path=data_dir.make_processed_path('comments_with_cutoff_times')) comments_with_cutoff_times.show() comments_with_cutoff_times.groupby('comment_type').count().orderBy( F.desc('count')).show(n=100) comment_types = [ 'general', 'programming', 'email', 'explanation', 'reproduction', 'workaround', 'configuration', 'solution', 'symptom', 'problem', 'educreferral' ] encoded_comments_with_cutoff, one_hot_encoded_comment_columns = ( one_hot_encode_categorical(df=comments_with_cutoff_times, categorical_column='comment_type', values_to_one_hot_encode=comment_types)) columns_for_label_encoding = ['comment_type', 'notes'] for col in columns_for_label_encoding: encoded_comments_with_cutoff = (label_encode_categorical_inplace(
def run():
    """Build Getty keyword-frequency files and a final image/keyword CSV.

    Pipeline (all paths on HDFS under ``topFolder``):
      1. load raw image metadata (imageId, kwIds, vcgImageId),
      2. explode comma-separated kwIds and count usage per keyword,
      3. keep keywords used more than ``topNum`` times and assign each a
         0-based label index,
      4. re-aggregate surviving imageId/kwId pairs back into
         imageId -> "kw1,kw2,..." and join with the per-keyword top-image
         file to produce the final labelled CSV with a download URL.

    Relies on module-level ``spark`` and pyspark imports; writes several
    CSV files as a side effect and returns nothing.
    """
    topFolder = 'hdfs://172.16.241.100:9000/data/stuff/getty/'
    gettyImagesMetaFile = '{}allGettyMeta_1000000.csv'.format(topFolder)  # imageId kwIds vcgImageId
    print(gettyImagesMetaFile)
    gettyKwIdCountFile = '{}gettyKwIdCount.csv'.format(topFolder)
    # All three columns read as strings; kwIds is a comma-joined list.
    fields = [
        StructField("imageId", StringType()),
        StructField("kwIds", StringType()),
        StructField("vcgImageId", StringType())
    ]
    schema = StructType(fields)
    gettyImagesMeta_df = spark.read.format("csv").option(
        "header", "false").schema(schema).option(
            "delimiter", '\t').load(gettyImagesMetaFile)
    # print('gettyImagesMeta_df: %s' % gettyImagesMeta_df.count())
    # Drop rows with no keywords (checked at DataFrame and RDD level).
    gettyImagesMeta_df = gettyImagesMeta_df.filter(
        gettyImagesMeta_df.kwIds.isNotNull()).rdd.filter(
            lambda row: row.kwIds is not None).toDF()
    # print('gettyImagesMeta_df kwIds not null count: %s' % gettyImagesMeta_df.count())

    # compute kwId count, generate kwIdsCount.csv
    def flatMap1(row):
        # Explode one (imageId, "kw1,kw2,...") row into one Row per keyword.
        imageId = row.imageId
        kwIds_ = row.kwIds.split(',')
        rows = []
        for kwId in kwIds_:
            row = Row(imageId=imageId, kwId=kwId)
            rows.append(row)
        return rows

    gettyImagesMeta_df = gettyImagesMeta_df.rdd.filter(
        lambda row: ((row.kwIds is not None))).flatMap(
            lambda row: flatMap1(row)).toDF().cache()
    gettyImagesMeta_df.show(100, False)
    # print('total imageId-kwId count:%d' % gettyImagesMeta_df.count())
    # Usage count per keyword, most used first.
    gettyKwIdCount_df = gettyImagesMeta_df.groupBy("kwId").agg({
        '*': 'count'
    }).withColumnRenamed('count(1)', 'count')
    gettyKwIdCount_df = gettyKwIdCount_df.orderBy(desc("count"))
    gettyKwIdCount_df.show(100, False)
    gettyKwIdCount_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter", '\t').mode("overwrite").save(gettyKwIdCountFile)

    # analysis kwIdsCount.csv then sort by count desc select count > topNum kwIds as labels generate related files
    topNum = 300
    # NOTE(review): the literal 300 below duplicates topNum — keep in sync.
    gettyKwIdCountFilteredFile = '{}gettyKwIdCountAbove{}.csv'.format(
        topFolder, 300)
    labelsIndexMappingFile = '{}labelsIndexMappingAbove{}.csv'.format(
        topFolder, 300)
    # Re-read the counts from disk (schema-less read: columns are strings).
    gettyKwIdCount_df = spark.read.format("csv").option(
        "header", "true").option("delimiter", '\t').load(gettyKwIdCountFile)
    gettyKwIdCount_df.show(10, False)
    # NOTE(review): 'count' is a string column here, so `> topNum` relies on
    # implicit casting — verify the comparison behaves numerically.
    gettyKwIdCount_df = gettyKwIdCount_df.filter(
        gettyKwIdCount_df['count'] > topNum)
    # Assign a stable 0-based label index in current row order.
    gettyKwIdCount_df = gettyKwIdCount_df.withColumn(
        "index",
        F.row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    gettyKwIdCount_df.show(100, False)
    gettyKwIdCount_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter",
            '\t').mode("overwrite").save(gettyKwIdCountFilteredFile)
    # Label index -> keyword mapping, no header.
    gettyKwIdCount_df.select('index', 'kwId').repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "False") \
        .option("delimiter", '\t').mode("overwrite").save(
            labelsIndexMappingFile)

    # Collect the surviving keyword ids and broadcast them for filtering.
    kwIdsSet = set()
    kwIds = gettyKwIdCount_df.select('index', 'kwId').rdd.collect()
    for row in kwIds:
        kwIdsSet.add(row.kwId)
    print('filterd kwIds size: %d' % len(kwIdsSet))
    kwIdsSet_broadcast = spark.sparkContext.broadcast(kwIdsSet)

    gettytopNumImagesOfKwIdFile = '{}kwsTopNumImages.csv'.format(
        topFolder)  # kwId,topNumImages
    finalImageKwIdsFile = '{}finalImageKwIds.csv'.format(topFolder)
    gettytopNumImagesOfKwId_df = spark.read.format("csv").option(
        "header", "false").option(
            "delimiter",
            '\t').load(gettytopNumImagesOfKwIdFile).withColumnRenamed(
                '_c0', 'kwId').withColumnRenamed('_c1', 'imageIds')
    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.filter(
        gettytopNumImagesOfKwId_df.imageIds.isNotNull())

    def filterKwIds(row):
        # True when the row's keyword survived the frequency cut.
        kwId = row.kwId
        if kwId in kwIdsSet_broadcast.value:
            return True
        else:
            return False

    def flatMaps(row):
        # Explode one (kwId, "img1,img2,...") row into one Row per image.
        kwId = row.kwId
        imageIds = row.imageIds.split(',')
        rows = []
        for imageId in imageIds:
            row = Row(kwId=kwId, imageId=imageId)
            rows.append(row)
        return rows

    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.rdd.filter(
        lambda row: filterKwIds(row)).flatMap(
            lambda row: flatMaps(row)).toDF()
    # Deduplicate image ids (the count column is dropped immediately).
    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.groupBy(
        "imageId").agg({
            '*': 'count'
        }).withColumnRenamed('count(1)', 'count').drop('count')
    gettyImagesMeta_df = gettyImagesMeta_df.withColumnRenamed(
        'imageId', 'gettyImageId')

    # Filter gettyImagesMeta_df, then merge imageId-kwId pairs back into
    # imageId-kwIds (aggregate operation).
    zero_value_2 = None

    def seqFunc_2(accumulator, element):
        # Append one keyword to the comma-joined per-image accumulator.
        if accumulator is None:
            return element
        else:
            element = accumulator + "," + element
            return element

    def combFunc_2(accumulator1, accumulator2):
        # Merge two partition-level accumulators.
        if accumulator1 is None:
            return accumulator2
        else:
            accumulator2 = accumulator1 + "," + accumulator2
            return accumulator2

    gettyImagesMeta_df = gettyImagesMeta_df.rdd.filter(lambda row: filterKwIds(
        row)).map(lambda row: (row.gettyImageId, row.kwId)).aggregateByKey(
            zero_value_2, seqFunc_2, combFunc_2).toDF().withColumnRenamed(
                '_2', 'kwIds').withColumnRenamed('_1', 'gettyImageId')
    gettyImagesMeta_df.show(100, False)
    finalImageKwIds_df = gettytopNumImagesOfKwId_df.join(
        gettyImagesMeta_df,
        gettyImagesMeta_df.gettyImageId == gettytopNumImagesOfKwId_df.imageId,
        how='inner').drop('imageId')

    # add new column url
    gettyImageUrlPrefix = 'https://elephant-data-backup.oss-cn-beijing.aliyuncs.com/elephant-data-backup/gettyimage/'

    def setUrl(gettyImageId):
        # Map an image id to its backup download URL.
        return '{}{}.jpg'.format(gettyImageUrlPrefix, gettyImageId)

    setUrlUdf = udf(setUrl, StringType())
    finalImageKwIds_df = finalImageKwIds_df.withColumn(
        'url', setUrlUdf('gettyImageId'))
    finalImageKwIds_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter", '\t').mode("overwrite").save(finalImageKwIdsFile)
(u'/history/apollo/a-001/a-001-patch-small.gif', 97),
(u'/images/Nasa-logo.gif', 85),
(u'', 76),
(u'/shuttle/resources/orbiters/atlantis.gif', 63),
(u'/history/apollo/images/little-joe.jpg', 62),
(u'/images/lf-logo.gif', 59),
(u'/shuttle/resources/orbiters/discovery.gif', 56),
(u'/shuttle/resources/orbiters/challenger.gif', 54),
(u'/robots.txt', 53),
(u'/history/apollo/pad-abort-test-2/pad-abort-test-2-patch-small.gif', 38)
]
Test.assertEquals(top_20_not_found, top_20_expected,
                  'incorrect top_20_not_found')

# (5d) Exercise: Listing the Top Twenty-five 404 Response Code Hosts
# Count 404 responses per host, worst offenders first.
hosts_404_count_df = not_found_df.groupBy('host').count().sort(desc('count'))
print 'Top 25 hosts that generated errors:\n'
hosts_404_count_df.show(n=25, truncate=False)
top_25_404 = [(row[0], row[1]) for row in hosts_404_count_df.take(25)]
Test.assertEquals(len(top_25_404), 25, 'length of errHostsTop25 is not 25')
# Expected hosts — the trailing spaces in hostnames are part of the data;
# this list is truncated at the chunk boundary.
expected = set([
(u'maz3.maz.net ', 39),
(u'piweba3y.prodigy.com ', 39),
(u'gate.barr.com ', 38),
(u'nexus.mlckew.edu.au ', 37),
(u'ts8-1.westwood.ts.ucla.edu ', 37),
(u'm38-370-9.mit.edu ', 37),
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'") # COMMAND ---------- # MAGIC %md # MAGIC ** (4e) Count the words ** # MAGIC # MAGIC We now have a DataFrame that is only words. Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`. # MAGIC # MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results. # COMMAND ---------- # TODO: Replace <FILL IN> with appropriate code from pyspark.sql.functions import desc topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc("count"),"word") topWordsAndCountsDF.show() # COMMAND ---------- # TEST Count the words (4e) Test.assertEquals(topWordsAndCountsDF.take(15), [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463), (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890), (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)], 'incorrect value for top15WordsAndCountsDF') # COMMAND ---------- # MAGIC %md # MAGIC #### ** Prepare to the course autograder **
.withColumn('rate', split(events['value'],',')[5].cast(FloatType()) ) \
.withColumn('action', split(events['value'],',')[6].cast(StringType()) )
#parsed_events.show(10,False)

###################################################################################################
# Displaying user count. 60 second window with 15 sec sliding duration...
###################################################################################################
# http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.window
# pyspark.sql.functions.window(timeColumn, windowDuration, slideDuration=None, startTime=None)
# Count events per (1-minute window, user), sliding every 15 seconds.
windowedCounts = parsed_events.groupBy(
    window(parsed_events.datetime, "1 minutes", "15 seconds"),
    parsed_events.user) \
    .count() \
    .sort(desc("count"))

# Stream the windowed counts to the console; complete mode re-emits the
# full result table on every trigger.
query1 = windowedCounts \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

###################################################################################################
# Displaying average duration by user. 60 second window with 15 sec sliding duration...
###################################################################################################
# Mean 'duration' per (window, user); this statement continues past the
# chunk boundary (trailing line-continuation).
windowedAvg = parsed_events.groupBy(
    window(parsed_events.datetime, "1 minutes", "15 seconds"),
    parsed_events.user) \
    .agg({'duration': 'mean'}) \
# Lab template: student fills in the non-200 filter and the aggregation.
not200DF = logs_df.<FILL IN>
not200DF.show(10)
# Sorted DataFrame containing all endpoints and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.<FILL IN>
print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import desc
# Keep only requests whose status differs from 200.
not200DF = logs_df.filter(logs_df['status'] != 200)
not200DF.show(10)
# Sorted DataFrame containing all endpoints and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.groupBy('path').count().orderBy(desc('count'))
print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)

# COMMAND ----------

# TEST Top ten error endpoints (4a)
top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)]
# Expected (path, count) pairs; the list is truncated at the chunk boundary.
top_10_err_expected = [
    (u'/images/NASA-logosmall.gif', 8761),
    (u'/images/KSC-logosmall.gif', 7236),
    (u'/images/MOSAIC-logosmall.gif', 5197),
    (u'/images/USA-logosmall.gif', 5157),
    (u'/images/WORLD-logosmall.gif', 5020),
    (u'/images/ksclogo-medium.gif', 4728),
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

# Spark v3.0.1
spark = SparkSession.builder.master("local").appName(
    "PopularMovie").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# MovieLens 100k u.data layout: user, movie, rating, epoch timestamp
# (tab-separated, no header).
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True),
])

moviesDF = spark.read.option("sep", "\t").schema(schema).csv("./ml-100k/u.data")

# Rank movies by number of ratings, most-rated first.
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(F.desc("count"))

# BUG FIX: the original called topMovieIDs.sort("movieID").show(10), which
# re-sorted by movie id and displayed the 10 lowest ids instead of the 10
# most-rated movies. Show the count-descending ranking directly.
topMovieIDs.show(10)

spark.stop()
# NOTE(review): this print/exit pair looks like the body of an argv-count
# check whose `if` header precedes this chunk — confirm indentation on merge.
print("Usage: pretty-cluster.py <metadata> <input> <output> [<query>]",
      file=sys.stderr)
exit(-1)

sc = SparkContext(appName="Prettyprint Clusters")
sqlContext = SQLContext(sc)

outpath = sys.argv[3]
# Choose an output format for the result path, defaulting to json.
(outputFormat, outputOptions) = guessFormat(outpath, "json")

## Should do more field renaming in meta to avoid clashing with fields in raw.
# Series-level metadata, deduplicated to one row per series.
meta = sqlContext.read.json(sys.argv[1])\
       .dropDuplicates(['series'])

# UDF building a page URL from access info, corpus, id and regions.
constructURL = udf(lambda url, corpus, id, regions: formatURL(url, corpus, id, regions))

# Cluster data: rename fields that would clash with meta, attach URLs,
# drop bulky layout columns, then join in the series metadata.
df = sqlContext.read.load(sys.argv[2]) \
     .withColumnRenamed('title', 'doc_title')\
     .withColumnRenamed('lang', 'doc_lang')\
     .withColumn('url', constructURL(col('page_access'), col('corpus'), col('id'), col('regions')))\
     .drop('locs').drop('pages').drop('regions')\
     .join(meta, 'series', 'left_outer')

# Optional 4th argument: keep only clusters containing a row matching the
# given filter expression.
filtered = df.join(df.filter(sys.argv[4]).select('cluster').distinct(), 'cluster') \
    if len(sys.argv) >= 5 else df

# Flatten the lang column and write clusters ordered by size then position.
filtered.withColumn('lang', concat_ws(',', col('lang'))) \
        .orderBy(desc('size'), 'cluster', 'date', 'id', 'begin')\
        .write.format(outputFormat).options(**outputOptions).save(outpath)

sc.stop()
# Build a TF-IDF index over the small article subset and rank every
# document by its relevance to the term "Gettysburg".
spark = SparkSession(sc)

rawData = sc.textFile("../data/subset-small.tsv")
fields = rawData.map(lambda line: line.split("\t"))

# Field 3 holds the body text (space-tokenized); field 1 the document name.
documents = fields.map(lambda parts: parts[3].split(" "))
documentNames = fields.map(lambda parts: parts[1])

# Hash each term into a 100k-dimensional sparse vector, then weight by
# inverse document frequency (terms must appear in >= 2 documents).
hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)
tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

# Locate the hash bucket for the query term, then read that component out
# of every document's TF-IDF vector as its relevance score.
gettysburgTF = hashingTF.transform(['Gettysburg'])
gettysburgHashValue = int(gettysburgTF.indices[0])
gettsburgRelevance = tfidf.map(lambda vec: float(vec[gettysburgHashValue]))

# Pair each score with its document name and lift into a typed DataFrame.
zippedResults = gettsburgRelevance.zip(documentNames)
score_field = StructField("score", FloatType(), True)
document_field = StructField("document", StringType(), True)
schema = StructType([score_field, document_field])
resultSchema = spark.createDataFrame(zippedResults, schema)
resultSchema.createOrReplaceTempView('Results')

# show() prints the table itself and returns None (the outer print echoes
# that None, matching the original script's output).
print("Result: ")
print(resultSchema.sort(desc('score')).show())