def test_drop_columns(self, spark):
    data = [('Alice', 20, 'London'), ('Alice', 33, 'Paris'),
            ('Alice', 20, 'Paris'), ('Nina', 40, None)]
    columns = ['name', 'age', 'city']
    df = spark.createDataFrame(data, columns)
    res_df = Count(['name', 'age'],
                   'count').transform(df).sort(f.asc('name'), f.asc('age'))
    rows = res_df.collect()
    assert rows[0]['count'] == 2
    assert rows[1]['count'] == 1
    assert rows[2]['count'] == 1
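The Count transformer exercised by this test is not shown in the listing; as a minimal sketch (an assumption for illustration, not the actual class under test), it could simply group by the given columns and rename the count column:

class Count:
    """Hypothetical groupBy-count transformer (assumed for illustration)."""

    def __init__(self, group_cols, count_col):
        self.group_cols = group_cols
        self.count_col = count_col

    def transform(self, df):
        # Count rows per group and expose the result under the requested name.
        return (df.groupBy(*self.group_cols)
                  .count()
                  .withColumnRenamed("count", self.count_col))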
Example #2
def transform(df: DataFrame) -> DataFrame:
    """Weekly top five visitors."""
    grouped = df.where(df.event == "view").groupby("week",
                                                   "visitorid").count().select(
                                                       "week", "visitorid",
                                                       col('count').alias('n'))
    grouped_ranked = grouped.withColumn(
        "rank",
        dense_rank().over(Window.partitionBy("week").orderBy(desc("n"))))
    top_five_customers = grouped_ranked.where(col("rank") < 6).orderBy(
        asc("week"), asc("rank"))
    return top_five_customers
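A hedged usage sketch for the transform above; the SparkSession, the tiny event log, and the import list are assumptions made only for illustration:

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, dense_rank, asc, desc
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").appName("weekly-top-five").getOrCreate()

# Tiny event log: (week, visitorid, event)
events = spark.createDataFrame(
    [(1, "a", "view"), (1, "a", "view"), (1, "b", "view"), (1, "c", "addtocart")],
    ["week", "visitorid", "event"])

transform(events).show()
# Visitor "a" ranks first for week 1 with n=2; "c" is excluded because it is not a "view" event.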
Example #3
def get_df_change_level_init(df_student_level):
    df_student_level_new = df_student_level.select(
        'contact_id', df_student_level.level_current.alias('level'),
        f.lit(MIN_DATE).alias('time_level_created'))

    df_student_level_new = df_student_level_new.orderBy(
        f.asc('contact_id'), f.asc('time_level_created'))
    df_student_level_first = df_student_level_new.groupBy('contact_id').agg(
        f.first('level').alias('level'),
        f.first('time_level_created').alias('time_level_created'))

    return df_student_level_first
def get_df_change_advisor_init(df_student_advisor):
    df_student_advisor_new = df_student_advisor.select(
        'contact_id', df_student_advisor.advisor_id_old.alias('advisor_id'))

    df_student_advisor_new = df_student_advisor_new.orderBy(
        f.asc('contact_id'), f.asc('created_at'))
    df_student_advisor_first = df_student_advisor_new.groupBy(
        'contact_id').agg(
            f.first('advisor_id').alias('advisor_id'),
            f.lit(MIN_DATE).alias('created_at'))

    df_student_advisor_first = df_student_advisor_first\
        .filter(df_student_advisor_first.advisor_id.isNotNull())

    return df_student_advisor_first
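A usage sketch for get_df_change_level_init above; the SparkSession, the MIN_DATE sentinel, and the sample columns are assumptions for illustration:

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
MIN_DATE = "1970-01-01"  # hypothetical sentinel meaning "since the beginning"

df_student_level = spark.createDataFrame(
    [("c1", "A1"), ("c2", "B2")], ["contact_id", "level_current"])

# One row per contact with its initial level and the sentinel creation time.
get_df_change_level_init(df_student_level).show()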
def run_whole_dataset():
    """
    Run analyses over the entire dataset
    """
    import pyspark.sql.functions as sqlf
    from datetime import datetime as dt

    stations = mkstations('data/stations.csv')

    # Hottest and coldest day and corresponding weather stations in the
    # entire dataset
    print("\nEntire dataset (2000-2016)\n==========================\n")
    print('  * Loading all datasets into a single DataFrame...')
    df = mkdf('data/20??.csv')

    print('  * Computing coldest station for entire dataset...\n')
    coldest = df.filter(df.meas=='TMIN').groupBy('sta', 'date').min('degc') \
                .sort(sqlf.asc('min(degc)')).first()

    date = dt.strptime(coldest.date, '%Y%m%d').strftime('%d %b %Y')
    city = getcity(stations, coldest.sta)

    print('Coldest station was %s (%s) on %s: %0.1f deg C'
          % (coldest.sta, city, date, float(coldest['min(degc)']) / 10.0))

    # and now the hottest
    print('\n  * Computing hottest station for entire dataset...\n')
    hottest = df.filter(df.meas=='TMAX').groupBy('sta', 'date').max('degc') \
                .sort(sqlf.desc('max(degc)')).first()

    date = dt.strptime(hottest.date, '%Y%m%d').strftime('%d %b %Y')
    city = getcity(stations, hottest.sta)

    print('Hottest station was %s (%s) on %s: %0.1f deg C'
          % (hottest.sta, city, date, float(hottest['max(degc)']) / 10.0))
Example #6
def passed_temperature_analyse(filename):
    print("begin to analyse passed temperature")
    spark = SparkSession.builder.master("local").appName("passed_temperature_analyse").getOrCreate()
    df = spark.read.csv(filename, header=True)
    df_temperature = df.select(  # select only the columns we need
        df['province'],
        df['city_name'],
        df['city_code'],
        df['temperature'].cast(DecimalType(scale=1)),
        F.date_format(df['time'], "yyyy-MM-dd").alias("date"),  # extract the date
        F.hour(df['time']).alias("hour")  # extract the hour
    )
    # keep only the four observation hours (02, 08, 12, 20)
    df_4point_temperature = df_temperature.filter(df_temperature['hour'].isin([2, 8, 12, 20]))
    # df_4point_temperature.printSchema()
    df_avg_temperature = df_4point_temperature.groupBy("province", "city_name", "city_code", "date") \
        .agg(F.count("temperature"), F.avg("temperature").alias("avg_temperature")) \
        .filter("count(temperature) = 4") \
        .sort(F.asc("avg_temperature")) \
        .select("province", "city_name", "city_code", "date",
                F.format_number('avg_temperature', 1).alias("avg_temperature"))
    df_avg_temperature.cache()
    avg_temperature_list = df_avg_temperature.collect()
    df_avg_temperature.coalesce(1).write.json("file:///F:/Code_All/Jupyter_Code/spark_test/result_data/bigData/passed_rain_temperature.json")
    print("end analysing passed temperature")
    return avg_temperature_list[0:10]
Example #7
    def test_sorting_functions_with_column(self):
        from pyspark.sql import functions
        from pyspark.sql.column import Column

        funs = [
            functions.asc_nulls_first, functions.asc_nulls_last,
            functions.desc_nulls_first, functions.desc_nulls_last
        ]
        exprs = [col("x"), "x"]

        for fun in funs:
            for expr in exprs:
                res = fun(expr)
                self.assertIsInstance(res, Column)
                self.assertIn(
                    f"""'x {fun.__name__.replace("_", " ").upper()}'""",
                    str(res))

        for expr in exprs:
            res = functions.asc(expr)
            self.assertIsInstance(res, Column)
            self.assertIn("""'x ASC NULLS FIRST'""", str(res))

        for expr in exprs:
            res = functions.desc(expr)
            self.assertIsInstance(res, Column)
            self.assertIn("""'x DESC NULLS LAST'""", str(res))
Example #8
def analyse_entire_dataset():
    """
    Analyse the entire dataset (2000-2019)
    """
    import pyspark.sql.functions as sqlfunc
    # from datetime import datetime as dt

    # Hottest and coldest day and corresponding weather stations in the entire dataset
    # Loading all datasets into a single DataFrame.

    print("\n-------------------------\n")
    df = mkdataframe('/user/tatavag/weather/20??.csv')

 
    # Coldest station
    coldest = df.filter(df.minormax=='TMIN').groupBy('station', 'date').min('degrees') \
                .sort(sqlfunc.asc('min(degrees)')).first()
    # date = dt.strptime(coldest.date, '%Y%m%d').strftime('%d %b %Y')
    print('Coldest station was %s on %s: %f'
          % (coldest.station, coldest.date, float(coldest['min(degrees)'])))

    # Hottest station
    hottest = df.filter(df.minormax=='TMAX').groupBy('station', 'date').max('degrees') \
                .sort(sqlfunc.desc('max(degrees)')).first()
    # date = dt.strptime(hottest.date, '%Y%m%d').strftime('%d %b %Y')
    print('Hottest station was %s on %s: %f'
          % (hottest.station, hottest.date, float(hottest['max(degrees)'])))

    # Median TMIN
    TMINmed = df.filter(df.minormax=='TMIN').approxQuantile('degrees',[0.5], 0.25)
    print('Median TMIN for the entire dataset: %f' % (TMINmed[0]))

    # Median TMAX
    TMAXmed = df.filter(df.minormax=='TMAX').approxQuantile('degrees',[0.5], 0.25)
    print('Median TMAX for the entire dataset: %f' % (TMAXmed[0]))
def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])
    #reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))
    #filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en')
                                & (wikiData['title'] != 'Main_Page') &
                                (wikiData['title'] != 'Special:Page')].cache()
    #finding max views per hour.
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))
    #joining data to obtain hour and title.
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])
    #sorting data based on hour and storing it in json file.
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')
Example #10
def with_row_number(output_col: str,
                    order_by: list,
                    df: DataFrame,
                    sort="asc",
                    zero_indexed=True) -> DataFrame:
    """Assign a sequential row number to each member of a dataframe"""

    is_desc = sort.lower() in ["desc", "descending"]
    if isinstance(order_by, str) or isinstance(order_by, Column):
        order_by = [order_by]
    elif not isinstance(order_by, list):
        msg = "Ordering criteria must be a string column name or a list of string column names"
        raise Exception(msg)

    # create a window function depending on the sort order
    if is_desc:
        window = Window.orderBy(*[F.desc(i) for i in order_by])
    else:
        window = Window.orderBy(*[F.asc(i) for i in order_by])

    # if the client wants to start from row 1 then that's fine
    if not zero_indexed:
        return df.withColumn(output_col, F.row_number().over(window))

    # otherwise start from row number 0
    return df.withColumn(output_col, F.row_number().over(window) - 1)
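A usage sketch for with_row_number, assuming a SparkSession named spark and the imports the helper relies on (F, Window, Column, DataFrame from pyspark.sql):

people = spark.createDataFrame(
    [("Alice", 34), ("Bob", 29), ("Cara", 41)], ["name", "age"])

# Zero-indexed row numbers, youngest first (default ascending sort).
with_row_number("row_idx", ["age"], people).show()

# One-indexed row numbers, oldest first.
with_row_number("row_idx", ["age"], people, sort="desc", zero_indexed=False).show()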
Example #11
    def hist(columns, min_value, max_value, buckets=10):
        """
         Get the histogram column in json format
        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return:
        """

        columns = parse_columns(self, columns)
        for col_name in columns:
            # Create splits
            splits = create_buckets(min_value, max_value, buckets)

            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            counts = (df.groupBy(col_name + "_buckets").agg(
                F.count(col_name + "_buckets").alias("count")).cols.rename(
                    col_name + "_buckets",
                    "value").sort(F.asc("value")).to_json())

            hist = []
            for x, y in zip(counts, splits):
                # if x["value"] is not None and x["count"] != 0:
                hist.append({
                    "lower": y["lower"],
                    "upper": y["upper"],
                    "count": x["count"]
                })

        return hist
Example #12
def main(in_dir, out_dir):
    # data = spark.read.text(data1)
    # data = data.filter(data['value'] != '')
    # data.show()
    # wordbreak = r'[%s\s]+' % (re.escape(string.punctuation),)
    # data = data.withColumn('words', functions.explode(functions.split(functions.col('value'),wordbreak)))
    # data = data.withColumn('words', functions.lower(data['words']))
    # data = data.filter(data['words'] != '')

    # data = data.groupBy('words').agg(functions.count(data['words']))
    # data = data.sort(functions.col('words').asc())
    # data = data.sort(functions.col('count(words)').desc())
    # # data = data[data['words'] != '']

    # data.write.csv(data2, mode = 'overwrite')

    data = spark.read.text(in_dir)
    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation), )
    data = data.withColumn(
        'words', functions.explode(functions.split('value', wordbreak)))
    data = data.withColumn('words', functions.lower(data['words']))
    data = data.filter(data['words'] != '')
    data = data.select('words')

    data = data.groupBy('words').agg(
        functions.count(data['words']).alias('count'))

    data = data.sort(asc('words'))
    data = data.sort(desc('count'))

    data.write.csv(out_dir, mode='overwrite')
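Note that the two consecutive sort calls above do not compose: the second sort fully re-orders the data, so the earlier ordering by words is not guaranteed to survive. If the intent is count-descending with words as a tie-breaker, a single sort with both keys (a sketch of a possible replacement for those two lines) expresses it directly:

# Primary key 'count' descending, 'words' ascending as tie-breaker.
data = data.sort(desc('count'), asc('words'))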
Example #13
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        #types.StructField('year', types.IntegerType(), False),
        #types.StructField('month', types.IntegerType(), False),
    ])
    comments = spark.read.json(inputs, schema=comments_schema)
    find_avg = comments.groupBy((comments.subreddit).alias("Subreddit")).agg(
        avg(comments.score).alias("Average"))
    averages = find_avg.orderBy(asc("Subreddit")).coalesce(1)
    averages.write.csv(output, mode='overwrite')
Example #14
    def occCalc(self, channelID, testing=False):
        """ Calculates occupancy for the user defined month
		"""
        if type(channelID) != list:
            raise TypeError('ChannelID is required to be a list')

        conf = SparkConf()\
          .setAppName("Occupancy Calc")\
          .set("spark.master", "local[*]")\
          .set("spark.driver.maxResultSize", "15G")
        sc = SparkContext(conf=conf)
        sql = SQLContext(sc)
        path = 'AZURE PATH' + self.month +\
          '/*/*/' + self.sensor + '*'
        data = sql.read.parquet(path)

        timeCount = data.select('scan_time').distinct().count()
        timeCount = sc.broadcast(timeCount)
        subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
            data.channel_id.isin(channelID))
        subData = subData.groupBy('channel_id').agg(
            (count(column('power_dbm')) / timeCount.value).alias('freq'),
            stddev(column('power_dbm')).alias('sd')).sort(
                asc('freq'), desc('sd'))

        if testing:
            subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
            sc.stop()
        else:
            sc.stop()
            return (subData.toPandas())
def main():
    logs = read_data()
    
    df = create_dataframe(logs)
    
    answer = ''
    
    # 1. Total number of unique hosts

    unique_hosts = df.select('host').drop_duplicates().count()
    
    answer += '1. Número total de hosts únicos\n'
    answer += 'Answer: {0} hosts únicos\n'.format(unique_hosts)
    
    # 2. Total number of 404 errors
    
    total_404_errors = df.where(df.status_code == 404).count()
    
    answer += '2. O total de erros 404\n'
    answer += 'Answer: {0} erros\n'.format(total_404_errors)
    
    
    # 3. The 5 URLs that caused the most 404 errors
    
    urls_with_most_404_errors = df.where(df.status_code == 404)\
        .groupBy('host')\
        .agg(F.count('status_code').alias('count_errors_404'))\
        .orderBy(F.desc('count_errors_404'))\
        .limit(5)\
        .select('host')\
        .collect()
        
    urls_with_most_404_errors = [row['host'] for row in urls_with_most_404_errors]
        
    answer += '3. As 5 URLs que mais causaram erro 404\n'
    answer += 'Answer: {0}\n'.format(', '.join(urls_with_most_404_errors))
    
    # 4. Number of 404 errors per day
    
    errors_per_day = df.where(df.status_code == 404)\
        .groupBy(F.dayofmonth('timestamp').alias('dia'))\
        .agg(F.count('status_code').alias('count_errors_404'))\
        .orderBy(F.asc('dia'))\
        .collect()
        
    errors_per_day = ['dia: {0}: {1} erros 404'.format(row['dia'], row['count_errors_404']) for row in errors_per_day]
    
    answer += '4. Quantidade de erros 404 por dia\n'
    answer += 'Answer: {0}\n'.format('\n'.join(errors_per_day))
    
    # 5. Total number of bytes returned
    
    total_bytes = df.select(F.sum(df.total_bytes).alias('total_bytes')).collect()
    total_bytes = total_bytes[0]['total_bytes']
    
    answer += '5. O total de bytes retornados\n'
    answer += 'Answer: {0} bytes'.format(total_bytes)
    
    export_answer(answer)
Example #16
def countUserRegTime(accountdf, df):
    """统计某一天的用户注册时长
    从全部注册用户里筛选出当天上线的用户,算出2个日期差,
    然后统计不同日期差里的用户数量并进行排序
    """
    return df.join(accountdf, df.uid == accountdf.uid, 'inner').select(
        F.datediff(df.day, accountdf.regtime).alias('daydiff')).groupBy(
            'daydiff').count().sort(F.asc('daydiff'))
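A hedged usage sketch for countUserRegTime; the SparkSession and the sample uid/regtime/day columns are assumptions for illustration:

accountdf = spark.createDataFrame(
    [(1, "2023-01-01"), (2, "2023-01-05")], ["uid", "regtime"])
online_day = spark.createDataFrame(
    [(1, "2023-01-10"), (2, "2023-01-10")], ["uid", "day"])

# Yields daydiff 5 and 9, one user each, sorted ascending by daydiff.
countUserRegTime(accountdf, online_day).show()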
Example #17
def counting_of_404_by_day():
    result = logs.filter("http_code like '%404%'")
    result = result.withColumn("date", F.regexp_extract("timestamp", date_regex, 1))
    result = result.groupby(["date", "http_code"]).count()
    result = result.sort(F.asc("date"))
    
    for row in result.collect():
        print("DATA: {} - ERROS: {}".format(row["date"], row["count"]))
Example #18
def main(society_1, society_2, society_3, society_4, society_5, tag):
    df_society_1 = spark.read.format("csv").option("header", "true").load(society_1 + '_tags.csv')
    df_society_1.show()
    df_society_2 = spark.read.format("csv").option("header", "true").load(society_2 + '_tags.csv')
    df_society_2.show()
    df_society_3 = spark.read.format("csv").option("header", "true").load(society_3 + '_tags.csv')
    df_society_3.show()
    df_society_4 = spark.read.format("csv").option("header", "true").load(society_4 + '_tags.csv')
    df_society_4.show()
    df_society_5 = spark.read.format("csv").option("header", "true").load(society_5 + '_tags.csv')
    df_society_5.show()

    df_result_1 = relation(society_1, society_2, tag, df_society_1, df_society_2)
    df_result_2 = relation(society_1, society_3, tag, df_society_1, df_society_3)
    df_result_3 = relation(society_1, society_4, tag, df_society_1, df_society_4)
    df_result_4 = relation(society_1, society_5, tag, df_society_1, df_society_5)

    # df_result_1 = spark.read.format("csv").option("header", "true").load(society_2 + '_' + tag + '.csv').sort(functions.desc(str(society_2 + "_count")))
    # df_result_1.show()
    # df_result_2 = spark.read.format("csv").option("header", "true").load(society_3 + '_' + tag + '.csv').sort(functions.desc(str(society_3 + "_count")))
    # df_result_3 = spark.read.format("csv").option("header", "true").load(society_4 + '_' + tag + '.csv').sort(functions.desc(str(society_4 + "_count")))
    # df_result_4 = spark.read.format("csv").option("header", "true").load(society_5 + '_' + tag + '.csv').sort(functions.desc(str(society_5 + "_count")))

    window = Window.orderBy(functions.col(str(society_2 + "_count")).desc())
    df_result_1 = df_result_1.withColumn('id', functions.row_number().over(window))
    df_result_1.show()
    window = Window.orderBy(functions.col(str(society_3 + "_count")).desc())
    df_result_2 = df_result_2.withColumn('id', functions.row_number().over(window))
    df_result_2.show()
    window = Window.orderBy(functions.col(str(society_4 + "_count")).desc())
    df_result_3 = df_result_3.withColumn('id', functions.row_number().over(window))
    df_result_3.show()
    window = Window.orderBy(functions.col(str(society_5 + "_count")).desc())
    df_result_4 = df_result_4.withColumn('id', functions.row_number().over(window))
    df_result_4.show()

    df_join = df_result_1.join(df_result_2, on = ['id'], how = 'outer').sort(functions.asc("id"))
    df_join.show()
    df_join = df_join.join(df_result_3, on = ['id'], how = 'outer').sort(functions.asc("id"))
    df_join.show()
    df_join = df_join.join(df_result_4, on = ['id'], how = 'outer').sort(functions.asc("id"))

    df_join.show()

    df_join.write.csv('tag_' + tag + '.csv',header = 'true')
Example #19
    def hist(columns, min_value, max_value, buckets=10):
        """
         Get the histogram column in json format
        :param columns: Columns to be processed
        :param min_value: Min value used to calculate the buckets
        :param max_value: Max value used to calculate the buckets
        :param buckets: Number of buckets
        :return:
        """

        columns = parse_columns(self, columns)
        for col_name in columns:
            # Create splits
            splits = create_buckets(min_value, max_value, buckets)

            # Create buckets in the dataFrame
            df = bucketizer(self, col_name, splits=splits)

            col_bucket = col_name + "_buckets"

            counts = (df.h_repartition(
                col_name=col_bucket).groupBy(col_bucket).agg(
                    F.count(col_bucket).alias("count")).cols.rename(
                        col_bucket, "value").sort(F.asc("value")).to_json())

            # Fill the gaps in the bucket values: e.g. 1, 5, 7, 8, 9 becomes 1, 2, ..., 9, with missing buckets given a count of 0
            new_array = []
            for i in builtins.range(buckets):
                flag = False
                for c in counts:
                    value = c["value"]
                    count = c["count"]
                    if value == i:
                        new_array.append({"value": value, "count": count})
                        flag = True
                if flag is False:
                    new_array.append({"value": i, "count": 0})

            counts = new_array

            hist_data = []
            for i in list(itertools.zip_longest(counts, splits)):
                if i[0] is None:
                    hist_data.append({
                        "count": 0,
                        "lower": i[1]["lower"],
                        "upper": i[1]["upper"]
                    })
                elif "count" in i[0]:
                    hist_data.append({
                        "count": i[0]["count"],
                        "lower": i[1]["lower"],
                        "upper": i[1]["upper"]
                    })

        return hist_data
Example #20
def Predict(i, df1, df2, timeSeriesCol, predictionCol, joinCol):
    
    # this converts differenced predictions to raw predictions
    dZCol = 'DeltaZ'+str(i) 
    f_strCol = 'forecast_'+str(i)+'day'
    df = df1.join(df2, [joinCol], how="inner")\
                            .orderBy(asc("Date"))
    df = df.withColumnRenamed(predictionCol, dZCol)
    df = df.withColumn(f_strCol, col(dZCol)+col(timeSeriesCol))
    return df
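A worked mini-example of the delta-to-raw conversion above, with hypothetical column names and values (Close as the time-series column, pred as the differenced prediction); a SparkSession named spark is assumed:

from pyspark.sql.functions import asc, col

prices = spark.createDataFrame(
    [("AAPL", "2024-01-02", 100.0)], ["Ticker", "Date", "Close"])
preds = spark.createDataFrame(
    [("AAPL", 2.5)], ["Ticker", "pred"])

# forecast_1day = DeltaZ1 + Close = 2.5 + 100.0 = 102.5
Predict(1, prices, preds, "Close", "pred", "Ticker").show()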
Example #21
def writeToS3(dataFrameFinal, targetDate, destinationPath):

    dataFrameFinal\
    .withColumn("event_date", lit(targetDate).cast("date"))\
    .sort(asc("htl_city_code"))\
    .write\
    .partitionBy("browsing_date", "meta_fnnl_step")\
    .option("mapreduce.fileoutputcommitter.algorithm.version", "2")\
    .mode('append')\
    .parquet(destinationPath)
Example #22
def get_most_popular_hashtag_by_time():
    df.withColumn("date",
                  to_timestamp(unix_timestamp('date', "EEE MMM dd HH:mm:ss +0000 yyyy").cast("timestamp"))).withColumn(
        'time', date_format('date', "HH:mm:ss"))

    count_hashtags = df.groupBy('place', 'hashtag').agg(functions.count('hashtag').alias('hashtag_count'))
    most_popular_hashtag = count_hashtags.groupBy('place').agg(functions.max('hashtag_count').alias('max'))

    count_hashtags.join(most_popular_hashtag, ((count_hashtags.hashtag_count == most_popular_hashtag.max) &
                                               (count_hashtags.place == most_popular_hashtag.place))) \
        .select(count_hashtags.place, count_hashtags.hashtag).orderBy('max', ascending=False).show(10)

    w = Window.partitionBy("place", "hashtag", "date", "hour")
    per_hour_frequency = most_popular_hashtag.withColumn("date", to_date("created_at")) \
        .withColumn("tag_count", f.count('id').over(w)). \
        select('place', 'date', 'hour', 'hashtag', 'tag_count'). \
        distinct(). \
        sort(functions.asc('place'), functions.asc('hashtag'), functions.asc('date'), functions.asc('hour'),
             functions.asc('tag_count'))
def run(years=available_years):
    """
    Run analyses on individual years in sequence
    """
    import pyspark.sql.functions as sqlf

    stations = mkstations('data/stations.csv')

    # allow passing a single year or a list of them
    if not isinstance(years, list):
        years = [years]

    for year in years:
        if year not in available_years:
            raise RuntimeError('Sorry, %s is not available in the dataset.' % year)

        df = mkdf('data/%s.csv' % year)

        print("\n%s\n====\n" % year)

        # Average minimum temperature
        r = df.filter(df.meas=='TMIN').groupBy().avg('degc').first()
        print('Avg min temp = %0.1f deg C' % (r['avg(degc)'] / 10.0))

        # Average maximum temperature
        r = df.filter(df.meas=='TMAX').groupBy().avg('degc').first()
        print('Avg max temp = %0.1f deg C' % (r['avg(degc)'] / 10.0))

        # Five hottest stations (on average)
        fivehot = df.filter(df.meas=='TMAX') \
                    .groupBy(df.sta) \
                    .agg(sqlf.avg('degc')) \
                    .sort(sqlf.desc('avg(degc)')) \
                    .limit(5).collect()
        print()
        i = 1
        for s in fivehot:
            t = float(s['avg(degc)']) / 10.0
            print('Hottest station #%s: %s (%s) - %0.1f deg C'
                  % (i, s.sta, getcity(stations, s.sta), t))
            i = i + 1

        # Five coldest stations (on average)
        fivecold = df.filter(df.meas=='TMIN') \
                     .groupBy(df.sta) \
                     .agg(sqlf.avg('degc')) \
                     .sort(sqlf.asc('avg(degc)')) \
                     .limit(5).collect()
        print()
        i = 1
        for s in fivecold:
            t = float(s['avg(degc)']) / 10.0
            print('Coldest station #%s: %s (%s) - %0.1f deg C'
                  % (i, s.sta, getcity(stations, s.sta), t))
            i = i + 1
Example #24
    def get_history_product(self, old_dataframe: DataFrame,
                            new_dataframe: DataFrame):
        inserted = self.__join_safe_null(new_dataframe, old_dataframe, how='anti') \
            .withColumn('meta', lit('inserted')).withColumn('priority', lit(1))

        deleted = self.__join_safe_null(old_dataframe, new_dataframe, how='anti') \
            .withColumn('meta', lit('deleted')).withColumn('priority', lit(0))

        not_changed = self.__join_safe_null(new_dataframe, old_dataframe, how='semi', keys=['id', 'name', 'score']) \
            .withColumn('meta', lit('not_changed')).withColumn('priority', lit(1))

        pre_changed = self.__join_safe_null(new_dataframe,
                                            inserted,
                                            how='anti')
        changed = self.__join_safe_null(pre_changed, not_changed, how='anti') \
            .withColumn('meta', lit('changed')).withColumn('priority', lit(1))

        return inserted.union(deleted).union(not_changed).union(changed) \
            .sort(asc('id'), asc('priority')) \
            .drop('priority')
def calcula_promedio(tabla_referencia, tabla_datos):
    '''Compute the average salary grouped by year.'''
    # query the database and read the result with pandas
    query = 'SELECT ' + tabla_referencia + '.' + col_join + ',' + tabla_datos + '.salary,' + tabla_datos  + '.yearID FROM '  +\
    tabla_referencia + ' INNER JOIN '  + tabla_datos + ' ON ' + tabla_referencia + '.' + col_join + '=' + tabla_datos + '.' + col_join
    salarios = pd.read_sql(query, mydb)
    # build the Spark DataFrame from the SQL context
    data_frame = sqlContext.createDataFrame(salarios)
    # compute the average salary, group by year and sort
    media = data_frame.distinct().groupBy('yearID').mean('salary')
    media = media.sort(asc("yearID"))
    return media
Example #26
    def recommend_n_movies_for_users(self, n, users, implicit=False):
        model = self.model_implicit if implicit else self.model_explicit
        users = self.ratings.where(self.ratings.userId.isin(users)).distinct()

        subset = model.recommendForUserSubset(users, n)

        formatted_subset = subset.withColumn('recs_exp', explode('recommendations')) \
            .select('userId', col('recs_exp.movieId'), col('recs_exp.rating').alias('rating')) \
            .join(self.movies, 'movieId') \
            .select('userId', 'title', 'rating') \
            .orderBy(asc('userId'), desc('rating')) \
            .select('userId', 'title')

        return formatted_subset
    def read(self, iDF):
        plain_df_idx = iDF.rdd\
                    .zipWithIndex().toDF(["row","idx"])\
                    .orderBy(asc("idx"))\
                    .coalesce(10)

        Windowspec = Window.orderBy("idx")
        oDF = plain_df_idx\
                .withColumn("seq", F.lead("row",count=1).over(Windowspec))\
                .withColumn("seqID", F.lead("row",count=0).over(Windowspec))

        parsedDF = oDF.filter(F.col("idx") % 2 == 0).select(
            "seqID", "seq", "+", "quality")
        return parsedDF
Example #28
def _sort_by(ds_table, sortby):
    """
        Sort by clause:
            parses a sort by clause and applies it over dataset
    """
    if sortby:
        sortBy_columns = []
        for sb in sortby:
            [(k, v)] = sb.items()
            if v == 'desc':
                sortBy_columns.append(func.desc(k))
            else:
                sortBy_columns.append(func.asc(k))
        ds_table = ds_table.sort(*sortBy_columns)
    return ds_table
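A usage sketch: judging from how the clause is unpacked above, sortby is expected to be a list of single-key dicts mapping a column name to 'asc' or 'desc'; ds_table is a hypothetical DataFrame:

# Sort by score descending, then by name ascending.
sorted_ds = _sort_by(ds_table, [{"score": "desc"}, {"name": "asc"}])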
Example #29
def compute_avg_temperature():
    df_temperature = df.select(
        df["province"], df["city_name"], df["city_code"],
        df["temperature"].cast(DecimalType(scale=2)),
        F.date_format(df["time"], "yyyy-MM-dd").alias("date"),
        F.hour(df["time"]).alias("hour"))
    # we only need data for the four observation hours
    df_4_point_temperature = df_temperature.filter(df_temperature["hour"].isin(
        2, 8, 12, 20))

    df_avg_temperature = df_4_point_temperature.groupby("province","city_name","city_code","date").\
                        agg(F.count("temperature"),F.avg("temperature").alias("avg_temperature")).\
                        filter("count(temperature)=4").\
                        sort(F.asc("avg_temperature")).select("province", "city_name", "city_code", "date",
                                     F.format_number('avg_temperature', 2).alias("avg_temperature"))
    df_avg_temperature.show()
Example #30
    def _discrete_read_data(
        self, custom_reward_expression=None, gamma=None, multi_steps=None
    ):
        ts = TableSpec(table_name=self.table_name)
        dataset: Dataset = query_data(
            input_table_spec=ts,
            discrete_action=True,
            actions=["L", "R", "U", "D"],
            custom_reward_expression=custom_reward_expression,
            multi_steps=multi_steps,
            gamma=gamma,
        )
        df = self.sqlCtx.read.parquet(dataset.parquet_url)
        df = df.orderBy(asc("sequence_number"))
        logger.info("Read parquet dataframe: ")
        df.show()
        return df
Example #31
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"]);

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark, a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount, functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
# Process Data using pyspark.sql
# Set the Hadoop configuration.

# In[8]:

# Python expressions in a code cell will be outputted after computation
expenditures_df.printSchema()


# In[9]:

# Sorting the data using spark sql
from pyspark.sql.functions import desc, asc

factor = expenditures_df.sort(desc('(% OF GDP)')).limit(10).toPandas()
factor_re = expenditures_df.sort(asc('(% OF GDP)')).limit(10).toPandas()


# In[10]:

print(factor)


# In[11]:

life = life_expectancy_df.sort(desc('(YEARS)')).limit(10).toPandas()
life_re = life_expectancy_df.sort(asc('(YEARS)')).limit(10).toPandas()


# In[12]:
#!/usr/bin/python

from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import asc, desc

if __name__ == "__main__":
  sc = SparkContext(appName='resort data')
  sqlContext = SQLContext(sc)

  df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160919-a50-o08/pretty.parquet')
  #df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160402-a50-o08/out.parquet')
  df.registerTempTable("newspaper")
  df2 = sqlContext.sql("select series, date, count(*) as cnt from newspaper group by series, date order by cnt desc")
  df3 = df.join(df2, ['series', 'date'])
  df3.sort(desc("cnt"), asc("begin"), asc("end"))\
     .write.json('/gss_gpfs_scratch/xu.shao/network/resorted-pretty.json')
Example #34
# COMMAND ----------

# MAGIC %md **Use ``filter()`` to return only the rows that match the given predicate.**

# COMMAND ----------

from pyspark.sql.functions import col, asc

filterDF = explodeDF.filter(col("firstName") == "chris").sort(col("lastName"))
display(filterDF)

# COMMAND ----------

from pyspark.sql.functions import col, asc

filterDF = explodeDF.filter((col("firstName") == "chris") | (col("firstName") == "michael")).sort(asc("lastName"))
display(filterDF)

# COMMAND ----------

# MAGIC %md 
# MAGIC **The ``where()`` clause is equivalent to ``filter()``.**

# COMMAND ----------

whereDF = explodeDF.where((col("firstName") == "chris") | (col("firstName") == "michael")).sort(asc("lastName"))
display(whereDF)

# COMMAND ----------

    def _calculate_rate(instance_usage_df):
        instance_usage_data_json_list = []

        try:
            sorted_oldest_ascending_df = instance_usage_df.sort(
                functions.asc("processing_meta.oldest_timestamp_string"))

            sorted_latest_descending_df = instance_usage_df.sort(
                functions.desc("processing_meta.latest_timestamp_string"))

            # Calculate the rate change by percentage
            oldest_dict = sorted_oldest_ascending_df.collect()[0].asDict()
            oldest_quantity = float(oldest_dict[
                                    "processing_meta"]["oldest_quantity"])

            latest_dict = sorted_latest_descending_df.collect()[0].asDict()
            latest_quantity = float(latest_dict[
                                    "processing_meta"]["latest_quantity"])

            rate_percentage = 100 * (
                (oldest_quantity - latest_quantity) / oldest_quantity)

            # get any extra data
            extra_data_map = getattr(sorted_oldest_ascending_df.collect()[0],
                                     "extra_data_map", {})
        except Exception as e:
            raise PreHourlyCalculateRateException(
                "Exception occurred in pre-hourly rate calculation. Error: %s"
                % str(e))
        #  create a new instance usage dict
        instance_usage_dict = {"tenant_id":
                               latest_dict.get("tenant_id", "all"),
                               "user_id":
                               latest_dict.get("user_id", "all"),
                               "resource_uuid":
                               latest_dict.get("resource_uuid", "all"),
                               "geolocation":
                               latest_dict.get("geolocation", "all"),
                               "region":
                               latest_dict.get("region", "all"),
                               "zone":
                               latest_dict.get("zone", "all"),
                               "host":
                               latest_dict.get("host", "all"),
                               "project_id":
                               latest_dict.get("project_id", "all"),
                               "aggregated_metric_name":
                               latest_dict["aggregated_metric_name"],
                               "quantity": rate_percentage,
                               "firstrecord_timestamp_unix":
                               oldest_dict["firstrecord_timestamp_unix"],
                               "firstrecord_timestamp_string":
                               oldest_dict["firstrecord_timestamp_string"],
                               "lastrecord_timestamp_unix":
                               latest_dict["lastrecord_timestamp_unix"],
                               "lastrecord_timestamp_string":
                               latest_dict["lastrecord_timestamp_string"],
                               "record_count": oldest_dict["record_count"] +
                               latest_dict["record_count"],
                               "usage_date": latest_dict["usage_date"],
                               "usage_hour": latest_dict["usage_hour"],
                               "usage_minute": latest_dict["usage_minute"],
                               "aggregation_period":
                               latest_dict["aggregation_period"],
                               "extra_data_map": extra_data_map
                               }

        instance_usage_data_json = json.dumps(instance_usage_dict)
        instance_usage_data_json_list.append(instance_usage_data_json)

        # convert to rdd
        spark_context = instance_usage_df.rdd.context
        return spark_context.parallelize(instance_usage_data_json_list)
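A worked example of the rate formula above, with made-up quantities; the result is the percentage drop from the oldest to the latest quantity (growth comes out negative):

# Hypothetical quantities taken from the oldest and latest records.
oldest_quantity = 4.0
latest_quantity = 3.0
rate_percentage = 100 * ((oldest_quantity - latest_quantity) / oldest_quantity)
# -> 25.0, i.e. the quantity dropped by 25% between the oldest and latest records.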
Example #36
#featuresOut = df.select(df.command,df.date,df.exec_as,df.source,df.srcip,df.username,df.features)

# Create a DF with training data
#kmtraindata = featuresOut.sample(False, 0.5, 42)

# Create KM model and fit using up to date data
kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction", maxIter=10, initSteps=3)
kmodel = kmeans.fit(df)

#test = kmodel.transform(featuresOut)

'''
########## DEMO #########
'''
df.groupBy(df.prediction).count().orderBy(asc('count')).show(50)
groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(asc('count')).filter('count < 40')
df.join(groups, groups.prediction2==df.prediction).select('command','prediction').distinct().show()
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False)

groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).filter('count > 100000')
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(500,truncate=False)


groups = sc.parallelize(df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).head(10)).toDF()
df.join(groups, groups.prediction2==df.prediction).select('command').distinct().show(50,truncate=False)

# Create a new DF with some weird commands
test1 = ctx.createDataFrame([
], ["command"])