Example #1
def save_table(df, table_name, partition_keys=None):
    print(f"Saving table: {table_name}")
    output_path = f"s3://{bucket_name}/{output_dir}/{table_name}"
    spark.sql(f"drop table if exists {database_name}.{table_name}")

    df = df\
        .withColumn('dataset_name',
                    f.split(f.split(f.input_file_name(), '/').getItem(f.size(f.split(f.input_file_name(), '/'))-1), r'\.').getItem(0))

    if partition_keys is not None:
        df\
            .repartition(*partition_keys)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .partitionBy(*partition_keys)\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    else:
        df\
            .coalesce(1)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    print(f"Table: {table_name} saved")
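Example #1 depends on several names defined outside the function: a spark session, f as an alias for pyspark.sql.functions, and the globals bucket_name, output_dir, and database_name. A minimal sketch of how that surrounding setup might look before calling save_table; every value below is a placeholder assumption, not taken from the original source.

# Hypothetical setup for Example #1; bucket, prefix and database names are placeholders.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.appName("save_table_example").getOrCreate()
bucket_name = "my-bucket"        # assumed S3 bucket
output_dir = "curated"           # assumed output prefix
database_name = "analytics"      # assumed Hive/Glue database

df = spark.read.parquet(f"s3://{bucket_name}/raw/events")  # assumed input data
save_table(df, "events", partition_keys=["dataset_name"])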
Example #2
    def test_input_file_name_reset_for_rdd(self):
        rdd = self.sc.textFile('python/test_support/hello/hello.txt').map(lambda x: {'data': x})
        df = self.spark.createDataFrame(rdd, "data STRING")
        df.select(input_file_name().alias('file')).collect()

        non_file_df = self.spark.range(100).select(input_file_name())

        results = non_file_df.collect()
        self.assertTrue(len(results) == 100)

        # [SPARK-24605]: if everything was properly reset after the last job, this should return
        # empty string rather than the file read in the last job.
        for result in results:
            self.assertEqual(result[0], '')
Example #4
def main(sc, inputfilename):
    files_df = spark.read.text(inputfilename).select(
        input_file_name().alias("filename"), "value")
    #partition by files
    files_part_df = files_df.repartition("filename")
    #convert to rdd
    files_part_df_rdd = files_part_df.rdd
    words = files_part_df_rdd.flatMap(
        lambda word_file: word_file[1].lower().split(" "))
    distinct_words_rdd = words.flatMap(explode).distinct()
    distinct_word_rdd_index = distinct_words_rdd.zipWithIndex()
    #map each word to the filename
    word_document = files_part_df_rdd.map(lambda word_file_pair: (
        (word_file_pair[0], word_file_pair[1].lower().split()))).flatMap(
            lambda x: ([(y, x[0]) for y in x[1]]))
    #select the distinct (word,filename) pairs
    word_document_distinct_rdd = word_document.distinct()
    word_id_file_id_pairs = distinct_word_rdd_index.join(
        word_document_distinct_rdd).map(lambda x: x[1])
    #switch the ordering to (wid,fid)
    wid_fid_pairRDD = word_id_file_id_pairs.map(lambda x: (x[1], x[0]))
    fid_wid_part_rdd_wid_key = wid_fid_pairRDD.map(
        lambda x: (x[1], int(x[0].split('/')[-1])))
    words_fids_groups_rdd = fid_wid_part_rdd_wid_key.groupByKey().map(
        lambda x: (x[0], sorted(list(x[1]))))
    words_fids_groups_sorted_rdd = words_fids_groups_rdd.sortByKey()
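Example #4 relies on helpers defined elsewhere in its original script (notably explode and the spark session) and stops after sorting the inverted index without materializing it. A hedged sketch of how the final RDD might be written out; the output path is a placeholder:

# Hypothetical continuation of Example #4: persist the (word_id, [file_ids])
# pairs as tab-separated text. "output/inverted_index" is a placeholder path.
words_fids_groups_sorted_rdd \
    .map(lambda pair: "{}\t{}".format(pair[0], ",".join(map(str, pair[1])))) \
    .saveAsTextFile("output/inverted_index")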
Example #5
def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")
    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)
    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")    
    # dbs_files.show(5, False)
    rucio_df = (rucio_info.withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
                .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
                .withColumn('create_day', fn.date_format(fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                                                    .cast(types.LongType())
                                                                    .cast(types.TimestampType())),
                                                         'yyyyMMdd')
                            )
                .withColumn('tally_day', fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"), 'yyyyMMdd'))
                .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day")
                )
    # rucio_df.show(5, False)
    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)
    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")
    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec.".format(min=(end - start) // 60, sec=(end - start) % 60))
Example #6
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
    # in_directory = "pagecounts-1"
    # out_directory = "output"
    pagecounts = spark.read.csv(in_directory, sep=" ",
                                schema=schema).withColumn(
                                    'filename', functions.input_file_name())
    pagecounts = pagecounts.drop("data_size")
    pagecounts = pagecounts.filter(
        (pagecounts["lang"] == "en") & (pagecounts["title"] != "Main_Page")
        & ((pagecounts["title"].startswith("Special:") == False)))
    # pagecounts.show()
    path_to_hour = functions.udf(pathToTime, returnType=types.StringType())
    pagecounts = pagecounts.withColumn("time",
                                       path_to_hour("filename")).cache()

    max_visit_counts = pagecounts.groupby("time").agg(
        functions.max("visit_count"))

    # adapted from https://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html
    result = pagecounts.join(max_visit_counts,
                             (pagecounts["time"] == max_visit_counts["time"]))
    result = result.filter(result["visit_count"] == result["max(visit_count)"])
    result = result.select(pagecounts["time"], "title",
                           "visit_count").sort("time")
    # result.explain()

    # We know groups has <=1000 rows, since the data is grouped by time combinations, so it can safely be moved into 1 partition.
    result.coalesce(1).write.csv(out_directory, mode='overwrite')
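Example #6 (and several of the later pagecounts examples) pass input_file_name() through a UDF such as pathToTime or path_to_hour that is defined elsewhere in the original scripts. A hedged sketch of what such a helper might look like, assuming the usual pagecounts-YYYYMMDD-HHMMSS filename convention:

# Hypothetical helper in the spirit of pathToTime / path_to_hour: extract
# "YYYYMMDD-HH" from a pagecounts filename. The naming convention is an
# assumption, not taken from the original source.
import re

def pathToTime(path):
    m = re.search(r'pagecounts-(\d{8})-(\d{2})', path)
    return '{}-{}'.format(m.group(1), m.group(2)) if m else None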
Example #7
def main(inputs, output):
    log_schema = types.StructType([
        types.StructField('project', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('views', types.IntegerType(), False),
        types.StructField('size', types.IntegerType(), False)
    ])

    in_df = spark.read.csv(inputs, schema=log_schema,
                           sep=" ").withColumn('filename',
                                               functions.input_file_name())

    hour_df = in_df.withColumn("hour", path_to_hour(in_df.filename)).drop(
        in_df.filename)

    get_en = hour_df.filter(hour_df.project == "en")

    fil_1 = get_en.filter(get_en.title != "Main_Page")

    fil_2 = fil_1.filter(~fil_1.title.startswith("Special:")).cache()

    hour_max = fil_2.groupBy(fil_2.hour).agg(
        functions.max(fil_2['views']).alias('max_count'))

    count_join = fil_2.join(broadcast(hour_max),
                            (hour_max.max_count == fil_2.views) &
                            (hour_max.hour == fil_2.hour)).select(
                                fil_2.hour, fil_2.title, fil_2.views)

    res_sort = count_join.sort(count_join.hour, count_join.title)

    res_sort.write.json(output)
Example #8
def main(in_directory, out_directory):
	df = spark.read.csv(in_directory, sep = ' ').withColumn('filename', functions.input_file_name())
	
	df = df.withColumnRenamed('_c0', 'language')
	df = df.withColumnRenamed('_c1', 'title')
	df = df.withColumnRenamed('_c2', 'view')
	df = df.withColumnRenamed('_c3', 'bytes')

	udf_datetime_filter = functions.udf(datetime_filter, returnType = types.StringType())
	df = df.withColumn('datetime', udf_datetime_filter(df['filename']))
	df = df.withColumn('view', df['view'].cast(types.IntegerType()))
	

	df = df.filter( (df['language'].startswith('en')) & (df['title'] != 'Main_Page') & (df['title'].startswith('Special:') == False) )
	
	df = df.cache()

	max_data = df.groupBy('datetime').max('view')
	max_data = max_data.withColumnRenamed('max(view)', 'view')
	

	joined_data = df.join(max_data, (df.datetime == max_data.datetime) & (df.view == max_data.view), 'inner' ).drop(df.view).drop(df.datetime)
	joined_data = joined_data.drop('language', 'bytes', 'filename')
	joined_data = joined_data.sort('datetime')
	joined_data = joined_data.select('datetime', 'title', 'view')

	# joined_data.show()

	joined_data.write.csv(out_directory, mode='overwrite')
Example #9
def main(in_directory, out_directory):
    wiki_data = spark.read.csv(in_directory, sep=" ",
                               schema=wiki_schema).withColumn(
                                   'filename', functions.input_file_name())

    wiki_data = wiki_data.withColumn('date', udf(wiki_data.filename))
    wiki_data = wiki_data.filter(wiki_data.lang.eqNullSafe('en'))
    wiki_data = wiki_data.filter(~wiki_data.page.eqNullSafe("Main_Page"))
    wiki_data = wiki_data.filter(~wiki_data.page.startswith("Special:"))
    # wiki_data = wiki_data.cache()

    wiki_data_grouped = wiki_data.groupBy(wiki_data['date']).agg(
        functions.max(wiki_data['times_requested']).alias('max'),
        functions.first("page").alias('page'))
    # wiki_data_grouped =wiki_data_grouped.cache()
    wiki_final = wiki_data_grouped.join(
        wiki_data, (wiki_data.times_requested == wiki_data_grouped.max) &
        (wiki_data.date == wiki_data_grouped.date)).drop(wiki_data.date).drop(
            wiki_data.page)
    wiki_final = wiki_final.cache()
    wiki_final = wiki_final.select(
        wiki_final['date'], wiki_final['page'],
        wiki_final['max']).orderBy('date',
                                   'page').write.csv(out_directory + '-wiki',
                                                     mode='overwrite')
Example #10
def main(in_directory, out_directory):
    df = spark.read.csv(in_directory, schema=comments_schema,
                        sep=' ').withColumn('filename',
                                            functions.input_file_name())
    path_to_hour = functions.udf(getFileName, returnType=types.StringType())
    df = df.select(df['language'], df['title'], df['number_of_requests'],
                   path_to_hour(df['filename']).alias('filename'))

    df = df.filter((df['language'] == 'en')
                   & (df['title'] != 'Main Page')
                   & (functions.substring(df['title'], 0, 8) != 'Special:'))

    grouped = df.groupBy('filename', 'title')
    count = grouped.agg(functions.sum(df['number_of_requests']).alias('count'))

    grouped2 = count.groupBy('filename')
    maxi = grouped2.agg(functions.max(count['count']))

    joined = count.join(maxi, on='filename')
    joined = joined.filter(joined['count'] == joined['max(count)'])\
    .select(joined['filename'], joined['title'], joined['count'])
    joined = joined.cache()
    sorted_ = joined.sort('filename', 'title')

    sorted_.write.csv(out_directory, mode='overwrite')
Example #11
def main(in_directory, out_directory):
    #we're interested in "en"only.
    df = spark.read.csv(in_directory, schema=schema,
                        sep=' ').withColumn('path',
                                            functions.input_file_name())
    #df.show(); return
    #df.printSchema(); return
    df = df.withColumn('filename', python_logic(df['path']))
    df = df.drop('path')
    #df.show(); return
    df = df.withColumn('time', complicated_udf2(df['filename']))
    #df.show(); return
    df = df.drop('filename')
    #df.show(); return
    df = df.drop('num_of_bytes')
    #df.show(); return
    df = df.filter(df['language'] == 'en')
    #df.show(); return
    df = df.drop('language')
    df = df.filter(df['titles'] != ('Main_Page'))
    #df.show(); return
    df = df.filter(~df['titles'].contains('Special:'))
    df = df.cache()
    df_max = df.groupBy('time').agg(
        functions.max(df['times_requested']).alias('times_requested'))
    #df.show(); return
    #df_max.show(); return
    df_joined = df_max.join(df, ['time', 'times_requested'])
    df_joined = df_joined.sort('time', ascending=True)
    df_joined = df_joined.select('time', 'titles', 'times_requested')
    #df_joined = df_joined.count();
    # df_joined.show()
    df_joined.write.csv(out_directory, mode='overwrite')
Example #12
def main(inputs, output):
    # main logic starts here
    wikipedia_schema = types.StructType([
        types.StructField('type', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('views', types.IntegerType(), False),
        types.StructField('size', types.LongType(), False),
    ])
    wikipedia = spark.read.csv(inputs,
                               schema=wikipedia_schema, sep=' ').withColumn(
                                   'filename', functions.input_file_name()
                               )  #read wikipedia input file separated by space
    filtered_wiki_records = wikipedia.where(
        (wikipedia.type == "en") & (~(wikipedia.title.startswith('Special:')))
        & (wikipedia.title != "Main_Page")).cache()
    #registering the udf
    date_time_udf = functions.udf(path_to_hour, types.StringType())
    wiki_records_per_hour = filtered_wiki_records.withColumn(
        'hour', date_time_udf(filtered_wiki_records.filename)).cache()
    #find the max page views per hour
    max_hit_counts = wiki_records_per_hour.groupby('hour').max('views')
    #broadcast the small dataframe
    max_hit_counts = functions.broadcast(max_hit_counts)
    #forming my conditions for join
    condition = [
        wiki_records_per_hour.views == max_hit_counts['max(views)'],
        (wiki_records_per_hour.hour == max_hit_counts.hour)
    ]
    joined_df = max_hit_counts.join(wiki_records_per_hour, condition).select(
        wiki_records_per_hour.hour, wiki_records_per_hour.title,
        wiki_records_per_hour.views).sort(
            wiki_records_per_hour.hour).coalesce(1)
    joined_df.write.json(output, mode='overwrite')
Example #13
def test_source_file_name(sql_context):
    expected = books_as_row(sql_context).withColumn(
        'convergdb_source_file_name', input_file_name()).collect()

    test = convergdb.source_file_name(books_as_row(sql_context)).collect()

    assert test == expected
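The test in Example #13 compares convergdb.source_file_name(...) against a frame that simply adds a convergdb_source_file_name column via input_file_name(), so the function is presumably a thin wrapper around that expression. A minimal sketch consistent with the test's expectation, not necessarily the real convergdb implementation:

# Sketch of a source_file_name consistent with Example #13's assertion;
# this is an assumption, not the actual convergdb code.
from pyspark.sql.functions import input_file_name

def source_file_name(df):
    return df.withColumn('convergdb_source_file_name', input_file_name())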
Example #14
def main():

    schema = StructType([
        StructField('LANG', StringType(), False),
        StructField('TITLE', StringType(), False),
        StructField('VIEWS', LongType(), False),
        StructField('BYTES', LongType(), False)
    ])

    df = spark.read.csv(inputs, schema=schema,
                        sep=" ").withColumn('filename',
                                            functions.input_file_name())
    # df.show()
    hour_udf = udf(extract_hour_from_filename, StringType())
    df = df.select('LANG', 'TITLE', 'VIEWS', 'BYTES',
                   hour_udf("filename").alias("HOUR"))

    df = df.where(df.LANG == 'en')
    df = df.where((~df.TITLE.startswith('Main_Page')))
    df = df.where((~df.TITLE.startswith('Special:')))
    # df.show()

    df_max_views = df.groupby('HOUR').agg(max("VIEWS").alias('MAX_VIEW'))

    cond = [
        df['HOUR'] == df_max_views['HOUR'],
        df['VIEWS'] == df_max_views['MAX_VIEW']
    ]
    result = df.join(df_max_views, cond,
                     'inner').select(df['HOUR'], df['TITLE'],
                                     df['VIEWS']).sort(df['HOUR'])

    # result.show()

    result.write.csv(output)
Example #15
def main(in_dir, out_dir):

    # Read the csv file
    wiki_page = spark.read.csv(in_dir, schema=schema,
                               sep=' ').withColumn('filename',
                                                   functions.input_file_name())

    # We need to find the most-viewed page each hour
    data = wiki_page.filter(wiki_page['language'] == 'en')
    data = data.filter(data['page_name'] != 'Main_Page')
    data = data.filter(data.page_name.startswith('Special:') == False)

    path_to_hour = functions.udf(lambda path: getdate(path),
                                 returnType=types.StringType())

    data = data.withColumn('date', path_to_hour(data.filename))
    data = data.drop('language', 'bytes', 'filename')

    groups = data.groupBy('date')

    most_viewed = groups.agg(functions.max(data['views']).alias('views'))
    most_viewed.cache()

    cond = ['views', 'date']
    data_join = most_viewed.join(data, cond)

    output = data_join.sort('date', 'page_name')

    output.show()
Example #16
def read_pg_dump(spark, input_dir):
    """Read gzipped pg_dump data.

    -RECORD 0-----------------------------------------------------------------
    table_name     | submission_date_nightly_43_20191201
    aggregate_type | submission_date
    ds_nodash      | 20191201
    dimension      | {"os": "Windows_NT", "child": "false", "label": ""...
    aggregate      | {0,2,0,2,2}
    """

    # parse the table of contents
    toc_file = f"{input_dir}/toc.dat"
    with open(toc_file, "rb") as f:
        data = f.read()
    toc_df = spark.createDataFrame([Row(**d) for d in parse_toc(data)])

    data_df = (spark.read.csv(
        f"{input_dir}/*.dat.gz",
        sep="\t",
        schema="dimension string, aggregate string",
    ).withColumn("file_name", parse_filename(F.input_file_name())).select(
        "dimension", "aggregate", "file_name.*"))

    return data_df.join(toc_df, on="table_id", how="inner")
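Example #16 selects "file_name.*" and later joins on table_id, so parse_filename must return a struct column carrying at least a table_id field. A hedged sketch of one way such an expression could be built, under the assumption that each .dat.gz file is named after its numeric table id:

# Hypothetical parse_filename for Example #16. The "<digits>.dat.gz" naming
# convention is an assumption, not taken from the original source.
from pyspark.sql import functions as F

def parse_filename(path_col):
    return F.struct(
        F.regexp_extract(path_col, r"/(\d+)\.dat\.gz$", 1).alias("table_id")
    )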
Example #17
def main(inputs, output):

    pathfunction = functions.udf(path_to_hour, returnType=types.StringType())

    comments_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.LongType()),
        types.StructField('bytes', types.LongType()),
    ])

    wikipage = spark.read.csv(inputs, schema=comments_schema,
                              sep=' ').withColumn(
                                  'hour',
                                  pathfunction(functions.input_file_name()))
    filtered_page = wikipage.filter(
        (wikipage.language == 'en') & (wikipage.title != 'Main Page')
        & (~wikipage.title.startswith('Special:'))).cache()

    max_view = filtered_page.groupBy(wikipage.hour).agg(
        functions.max(wikipage.views).alias('total_views'))
    conditions = [
        filtered_page.views == max_view.total_views,
        filtered_page.hour == max_view.hour
    ]

    # regular join: join_page = filtered_page.join(max_view, conditions).select(filtered_page.hour, 'title', 'views')
    # broadcast join as follows:
    join_page = filtered_page.join(functions.broadcast(max_view),
                                   conditions).select(filtered_page.hour,
                                                      'title', 'views')
    join_page.sort('hour', 'title').write.json(output, mode='overwrite')
    join_page.explain()
Example #18
def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])
    #reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))
    #filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en')
                                & (wikiData['title'] != 'Main_Page') &
                                (wikiData['title'] != 'Special:Page')].cache()
    #finding max views per hour.
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))
    #joining data to obtain hour and title.
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])
    #sorting data based on hour and storing it in json file.
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')
Example #19
def main(in_directory, out_directory):
    path_to_hour = functions.udf(path_to_hour_non_udf,
                                 returnType=types.StringType())

    stats = spark.read.csv(in_directory, schema=wiki_data_schema,
                           sep=' ').withColumn(
                               'filename',
                               path_to_hour(functions.input_file_name()))
    stats = stats.filter((stats['lan'] == 'en')
                         & (stats['name'] != 'Main_Page')
                         & ~(stats['name'].startswith('Special:'))).cache(
                         )  # <------ Caching here!

    max_stats = stats.groupBy('filename').agg(
        functions.max('count').alias('max(count)'))

    stats = max_stats.join(stats, on='filename')

    # stats.explain()
    stats = stats.filter(stats['max(count)'] == stats['count'])

    stats_sorted = stats.sort('filename', 'name')
    cleaned_data = stats_sorted.select(stats_sorted['filename'],
                                       stats_sorted['name'],
                                       stats_sorted['count'])
    cleaned_data.write.csv(out_directory, mode='overwrite')
Example #20
def main(input, output):
    # Read in CSV data and hold onto filename
    fname = '*/*/*/*/*/Play by Play - All (Parsed).csv'
    df = spark.read.csv(os.path.join(input, fname), header='true', schema=play_by_play_schema) \
        .withColumn('filename', functions.input_file_name()) \
        .withColumn('split', functions.split('filename', '/')) \
        .withColumn('TimeLeft_split', functions.split('TimeLeft', ':')) \
        .withColumn('Score_split', functions.split('Score', '-'))

    # Parse the file name into columns
    df = df \
        .where((df['Action'] != 'Enters Game') & (df['Action'] != 'Leaves Game') ) \
        .withColumn('Gender', df['split'].getItem(4)) \
        .withColumn('Year', df['split'].getItem(5)) \
        .withColumn('Division', df['split'].getItem(6)) \
        .withColumn('FileTeam', df['split'].getItem(7)) \
        .withColumn('Seconds_Left', df['TimeLeft_split'].getItem(0).cast(types.IntegerType())*60 \
                    +df['TimeLeft_split'].getItem(1).cast(types.IntegerType()) \
                    +period_mins_left(df['Period'])*60) \
        .withColumn('Away_Score', df['Score_split'].getItem(0)) \
        .withColumn('Home_Score', df['Score_split'].getItem(1)) \
        .withColumn('File_Team', functions.regexp_replace('FileTeam', '%20', ' '))

    # Add Home_Margin column
    df = df.withColumn('Home_Margin',
                       (df['Home_Score'] - df['Away_Score']).cast(
                           types.IntegerType()))

    # Specify final columns and write data
    final_columns = ['Gender','Year','Division', 'Date', 'Time', \
        'Score','Team', 'Player','Status', 'Action','Shot_Clock',\
        'Seconds_Left','Away_Score','Home_Score', 'Home_Margin', 'File_Team']

    df = df.select(final_columns).drop_duplicates() \
        .write.parquet(output, mode='append', compression='gzip')
Example #21
    def process(self):

        # file types have certain names, and we want to identify which file type the data belongs to
        def find_brand(s3_path):
            if s3_path.count('xxx') > 0:
                return 'type1'
            elif s3_path.count('yyy') > 0:
                return 'type2'
            elif s3_path.count('zzz') > 0:
                return 'type3'
            else:
                return 'invalid type'

        udf_find_brand = udf(find_brand, StringType())

        df_stg_user_profile = self.sc.read.json( self.user_profile_input_location, schema=user_tbl_json_ingest_schema) \
            .withColumn("s3_path", input_file_name()) \
            .withColumn("brand", udf_find_brand('s3_path')).drop('s3_path')

        df_stg_user_profile.printSchema()

        write_and_partition(self.sc, df_stg_user_profile, self.user_profile_stg_location,
                            stg_user_table, number_of_files=200,keep_latest_n=2,
                            table_create_statements=[
                                create_stg_user_tbl_template.format(location=self.user_profile_stg_location)])
Example #22
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    def split_filename(filename):
        file = filename.rsplit('/', 1)[1]
        date = file.split("-", 1)[1]
        name = date[:-7]
        return name

    path_to_hour = functions.udf(split_filename, returnType=types.StringType())
    data = spark.read.csv(in_directory, sep=" ", schema=schema).withColumn(
        'filename', path_to_hour(functions.input_file_name()))

    filter_data = data.filter(
        (data['language'] == "en") & (data["page"] != "Main_Page")
        & (~data["page"].startswith("Special:"))).drop_duplicates()

    grouped = filter_data.groupby("filename").max("views")

    joined_data = filter_data.join(
        grouped,
        on=((filter_data['views'] == grouped['max(views)'])) &
        (filter_data['filename'] == grouped['filename'])).drop(
            filter_data['filename'])
    joined_data = joined_data.sort(joined_data['filename'])

    max_view = joined_data.select(joined_data['filename'], joined_data['page'],
                                  joined_data['views'])
    max_view.show()

    max_view.write.csv(out_directory + '-max_view', mode='overwrite')
Example #23
def main():
    schema = types.StructType([
        types.StructField('lan', types.StringType(), True),
        types.StructField('title', types.StringType(), True),
        types.StructField('total_req', types.IntegerType(), True),
        types.StructField('byte_trans', types.DoubleType(), True),
    ])
    pagecount_df = spark.read.csv(inputs, schema, " ").withColumn(
        'time', get_access_time_udf(functions.input_file_name()))
    pagecount_df = pagecount_df.where(
        (pagecount_df.lan == 'en') & (pagecount_df.title != 'Main_Page')
        & (pagecount_df.title.startswith('Special:') == False))
    max_hit_page_df = pagecount_df.groupBy('time').max('total_req').select(
        'time',
        functions.col('max(total_req)').alias('total_req'))
    pagecount_max_df = pagecount_df.join(
        max_hit_page_df,
        ['time', 'total_req']).select('time', 'title',
                                      'total_req').sort(['time', 'title'],
                                                        ascending=True)
    #pagecount_max_df.show()

    if not os.path.exists(output):
        os.makedirs(output)
    pagecount_max_df.write.csv(output, sep=',', mode='overwrite')
Example #24
def fill_days(year: str, month: str, day: str) -> None:
    S3_FULL_PATH = f"{S3_PATH}/{year}/{month}/{day}"
    try:
        # read all parquet files from specific yyyy/mm/dd
        df = spark.read.format("parquet").load(f"{S3_FULL_PATH}/*.parquet")

        # creates extracted_at column from date path
        df = df.withColumn(
            "extracted_at",
            from_unixtime(  # convert unixtime to timestamp
                unix_timestamp(
                    substring(
                        regexp_replace(input_file_name(), f"{S3_PATH}/", ""), 1, 10
                    ),
                    "yyyy/MM/dd",
                )
            ),
        )
        # overwrite dataframe with the new column
        df.write.mode("overwrite").parquet(f"{S3_FULL_PATH}/")
        print(f"files updated from {S3_FULL_PATH}/")

    # some dates don't have paths.
    except AnalysisException as err:
        print(err)
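Example #24 assumes module-level imports, a spark session, and an S3_PATH prefix that are defined outside the function. A hedged sketch of that surrounding setup; the bucket and prefix are placeholders:

# Hypothetical module-level setup for Example #24; S3_PATH is a placeholder.
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    from_unixtime, unix_timestamp, substring, regexp_replace, input_file_name,
)
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.appName("fill_days").getOrCreate()
S3_PATH = "s3://my-bucket/raw/events"   # assumed bucket/prefix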
Example #25
def load_data(spark, s_path2s3):

    # define schema
    print('loading {}...'.format(s_path2s3))
    this_schema = StructType([
        StructField("ticker", StringType(), True),
        StructField("open", StringType(), True),
        StructField("settle", StringType(), True),
        StructField("lclose", StringType(), True),
        StructField("max", StringType(), True),
        StructField("min", StringType(), True),
        StructField("bid", StringType(), True),
        StructField("ask", StringType(), True),
        StructField("qbid", StringType(), True),
        StructField("qask", StringType(), True),
        StructField("qlast", IntegerType(), True),
        StructField("last", StringType(), True),
        StructField("group", StringType(), True),
        StructField("ntrades", StringType(), True),
        StructField("name", StringType(), True),
        StructField("qtotal", StringType(), True),
        StructField("strike", StringType(), True),
        StructField("expdate", StringType(), True),
        StructField("totalvol", StringType(), True)
    ])

    # read csv
    df = (spark.read.option(
        "header", "true").schema(this_schema).csv(s_path2s3).withColumn(
            "input_file", input_file_name()))

    # drop rows where ticker is null
    df = df.na.drop(subset=["ticker"])
    return df
Example #26
    def test_input_file_name_udf(self):
        df = self.spark.read.text('python/test_support/hello/hello.txt')
        df = df.select(
            udf(lambda x: x)("value"),
            input_file_name().alias('file'))
        file_name = df.collect()[0].file
        self.assertTrue("python/test_support/hello/hello.txt" in file_name)
Example #27
    def test_udf_with_input_file_name(self):
        from pyspark.sql.functions import input_file_name
        sourceFile = udf(lambda path: path, StringType())
        filePath = "python/test_support/sql/people1.json"
        row = self.spark.read.json(filePath).select(
            sourceFile(input_file_name())).first()
        self.assertTrue(row[0].find("people1.json") != -1)
Example #28
def main(inputs, outputs):
    pages = spark.read.csv(inputs, schema=pages_schema, sep=' ')
    #extract filename from input_file_name()
    filename_to_time = functions.udf(getfilename,
                                     returnType=types.StringType())
    pages = pages.withColumn('filename',
                             filename_to_time(functions.input_file_name()))
    pages.show()
    #apply all filters
    filtered_pages = pages.filter(
        (pages['language'] == 'en') & (pages['page_name'] != 'Main_Page')
        & (pages['page_name'].startswith('Special:') == False))
    filtered_pages.cache()
    #find max viewed pages
    grouped_pages = filtered_pages.groupBy('filename')
    max = grouped_pages.agg(functions.max('count').alias('max_views'))
    #To allow tie
    joined_pages = filtered_pages.join(
        max,
        on=((filtered_pages['count'] == max['max_views']) &
            (filtered_pages['filename'] == max['filename'])))
    #filename to int type to sort
    inttime = functions.udf(makeint, returnType=types.FloatType())
    new_pages = joined_pages.withColumn('date',
                                        inttime(filtered_pages['filename']))

    output = new_pages.select(filtered_pages['filename'], 'page_name',
                              'max_views')
    output.explain()
    output.sort('date').write.csv(outputs, mode='overwrite')
    output.sort('date').show()
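Example #28 calls two helpers, getfilename and makeint, that are defined elsewhere in the original script. Hedged sketches of what they might do, assuming the pagecounts-YYYYMMDD-HHMMSS filename convention:

# Hypothetical helpers for Example #28; the filename convention is assumed.
import os
import re

def getfilename(path):
    # strip the directory and extension,
    # e.g. ".../pagecounts-20160801-120000.gz" -> "pagecounts-20160801-120000"
    return os.path.splitext(os.path.basename(path))[0]

def makeint(filename):
    # turn the digits of the timestamp into a sortable float, matching the
    # FloatType() declared for the inttime UDF
    return float(re.sub(r'\D', '', filename))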
Example #29
def main(inputs, output):
    # main logic starts here

    observation_schema = types.StructType([
        types.StructField('language', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('page_count', types.IntegerType(), False),
        types.StructField('bytes', types.IntegerType(), False),
    ])
    wiki_data = spark.read.csv(inputs, sep=' ', schema=observation_schema).withColumn('hour',\
     path_to_hour(functions.input_file_name()))
    wiki_data_filter = wiki_data\
    .filter((wiki_data.language == 'en') & ~(wiki_data.title == 'Main_Page') & ~(wiki_data.title.startswith('Special:')))
    wiki_data_filter.cache()

    wiki_max = wiki_data_filter.select('hour','page_count').groupBy('hour').max('page_count')\
      .orderBy(wiki_data_filter.hour.asc())
    wiki_max_rename = wiki_max.select(functions.col("hour").alias("hours"), functions.col("max(page_count)")\
      .alias("views"))
    cond = [
        wiki_data_filter.page_count == wiki_max_rename.views,
        wiki_data_filter.hour == wiki_max_rename.hours
    ]
    wiki_join = wiki_data_filter.join(functions.broadcast(wiki_max_rename),
                                      cond, 'inner')
    wiki_projection = wiki_join.select('hour', 'title', 'views')
    wiki_projected_sort = wiki_projection.orderBy(wiki_projection.hour.asc(),
                                                  wiki_projection.title.asc())
    wiki_projected_sort.explain()
    # coalesce is used as the question requires an output in a newline-delimited JSON
    wiki_projected_sort.coalesce(1).write.json(output, mode='overwrite')
Example #30
def main(inputs, output):
	pages=spark.read.csv(inputs,page_schema,' ').withColumn('filename', functions.input_file_name())
	pages=pages.withColumn('hour',path_to_hour(pages.filename))
	pages=pages.where(pages.language=='en').where(pages.title!='Main_Page').where(~pages.title.startswith('Special')).cache()
	maxviews = pages.groupby(pages.hour).max('views').orderBy(pages.hour).withColumnRenamed('hour','hour1')
	maxviews = maxviews.join(pages,(pages.hour==maxviews.hour1) & (pages.views==maxviews['max(views)'])).select(pages.hour,pages.title,pages.views).orderBy(pages.hour)
	maxviews.write.json(output,mode='overwrite')
	maxviews.explain()
Example #31
    def add_year(self, df):
        df2 = (
            df.withColumn('file_name', slice(split(input_file_name(), '/'),
                                             -1, 1)[0])
              .withColumn('flight_year',
                          col('file_name').substr(1, 4).cast(IntegerType()))
        )
        return df2
Example #32
    def test_udf_with_input_file_name_for_hadooprdd(self):
        from pyspark.sql.functions import udf, input_file_name

        def filename(path):
            return path

        sameText = udf(filename, StringType())

        rdd = self.sc.textFile('python/test_support/sql/people.json')
        df = self.spark.read.json(rdd).select(input_file_name().alias('file'))
        row = df.select(sameText(df['file'])).first()
        self.assertTrue(row[0].find("people.json") != -1)

        rdd2 = self.sc.newAPIHadoopFile(
            'python/test_support/sql/people.json',
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.Text')

        df2 = self.spark.read.json(rdd2).select(input_file_name().alias('file'))
        row2 = df2.select(sameText(df2['file'])).first()
        self.assertTrue(row2[0].find("people.json") != -1)
Example #33
    def test_udf_with_input_file_name(self):
        from pyspark.sql.functions import udf, input_file_name
        sourceFile = udf(lambda path: path, StringType())
        filePath = "python/test_support/sql/people1.json"
        row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first()
        self.assertTrue(row[0].find("people1.json") != -1)