def save_table(df, table_name, partition_keys=None):
    print(f"Saving table: {table_name}")
    output_path = f"s3://{bucket_name}/{output_dir}/{table_name}"
    spark.sql(f"drop table if exists {database_name}.{table_name}")
    df = df\
        .withColumn('dataset_name',
                    f.split(f.split(f.input_file_name(), '/')
                            .getItem(f.size(f.split(f.input_file_name(), '/')) - 1), '\.')
                    .getItem(0))
    if partition_keys is not None:
        df\
            .repartition(*partition_keys)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .partitionBy(*partition_keys)\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    else:
        df\
            .coalesce(1)\
            .write\
            .mode("overwrite")\
            .format("parquet")\
            .option("path", output_path)\
            .saveAsTable(f"{database_name}.{table_name}")
    print(f"Table: {table_name} saved")
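# A minimal, self-contained sketch (not part of the function above) showing what the
# nested splits derive: the source file's base name without its extension. The bucket
# path, column names, and session setup here are hypothetical assumptions.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()
df = spark.read.parquet("s3://my-bucket/input/sales_2021.parquet")  # hypothetical path
parts = f.split(f.input_file_name(), '/')
base_name = parts.getItem(f.size(parts) - 1)          # e.g. "sales_2021.parquet"
dataset_name = f.split(base_name, r'\.').getItem(0)   # e.g. "sales_2021"
df.withColumn('dataset_name', dataset_name).select('dataset_name').show(1, truncate=False)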
def test_input_file_name_reset_for_rdd(self):
    rdd = self.sc.textFile('python/test_support/hello/hello.txt').map(lambda x: {'data': x})
    df = self.spark.createDataFrame(rdd, "data STRING")
    df.select(input_file_name().alias('file')).collect()

    non_file_df = self.spark.range(100).select(input_file_name())
    results = non_file_df.collect()
    self.assertTrue(len(results) == 100)
    # [SPARK-24605]: if everything was properly reset after the last job, this should return
    # empty string rather than the file read in the last job.
    for result in results:
        self.assertEqual(result[0], '')
def main(sc, inputfilename):
    files_df = spark.read.text(inputfilename).select(
        input_file_name().alias("filename"), "value")
    # partition by files
    files_part_df = files_df.repartition("filename")
    # convert to rdd
    files_part_df_rdd = files_part_df.rdd
    words = files_part_df_rdd.flatMap(
        lambda word_file: word_file[1].lower().split(" "))
    distinct_words_rdd = words.flatMap(explode).distinct()
    distinct_word_rdd_index = distinct_words_rdd.zipWithIndex()
    # map each word to the filename
    word_document = files_part_df_rdd.map(lambda word_file_pair: (
        (word_file_pair[0], word_file_pair[1].lower().split()))).flatMap(
            lambda x: ([(y, x[0]) for y in x[1]]))
    # select the distinct (word, filename) pairs
    word_document_distinct_rdd = word_document.distinct()
    word_id_file_id_pairs = distinct_word_rdd_index.join(
        word_document_distinct_rdd).map(lambda x: x[1])
    # switch the ordering to (wid, fid)
    wid_fid_pairRDD = word_id_file_id_pairs.map(lambda x: (x[1], x[0]))
    fid_wid_part_rdd_wid_key = wid_fid_pairRDD.map(
        lambda x: (x[1], int(x[0].split('/')[-1])))
    words_fids_groups_rdd = fid_wid_part_rdd_wid_key.groupByKey().map(
        lambda x: (x[0], sorted(list(x[1]))))
    words_fids_groups_sorted_rdd = words_fids_groups_rdd.sortByKey()
def run(rucio_path, dbs_path, output, verbose):
    start = time.time()
    spark = SparkSession.builder.appName("rucio_dumps_test").getOrCreate()
    csvreader = spark.read.format("csv") \
        .option("nullValue", "null") \
        .option("mode", "FAILFAST")
    avroreader = spark.read.format("avro")
    rucio_info = avroreader.load(rucio_path) \
        .withColumn("filename", fn.input_file_name())
    logger.debug("Rucio data types")
    logger.debug(rucio_info.dtypes)
    # rucio_info.show(5, False)
    dbs_files = csvreader.schema(schemas.schema_files()) \
        .load(dbs_path) \
        .select("f_logical_file_name", "f_dataset_id")
    # dbs_files.show(5, False)
    rucio_df = (rucio_info.withColumn("tmp1", fn.substring_index("filename", "/rucio/", -1))
                .withColumn("tally_date", fn.substring_index("tmp1", "/", 1))
                .withColumn('create_day',
                            fn.date_format(fn.to_date((rucio_info.CREATED_AT / fn.lit(1000))
                                                      .cast(types.LongType())
                                                      .cast(types.TimestampType())),
                                           'yyyyMMdd'))
                .withColumn('tally_day',
                            fn.date_format(fn.to_date("tally_date", "yyyy-MM-dd"), 'yyyyMMdd'))
                .select("RSE_ID", "BYTES", "NAME", "SCOPE", "tally_day", "create_day")
                )
    # rucio_df.show(5, False)
    rucio_df = rucio_df \
        .join(dbs_files, dbs_files.f_logical_file_name == rucio_df.NAME) \
        .groupBy("RSE_ID", "f_dataset_id", "SCOPE", "tally_day", "create_day") \
        .agg(fn.sum("BYTES").alias("rep_size"))
    # rucio_df.show(5, False)
    rucio_df.write.option("compression", "snappy").parquet(output, mode="overwrite")
    end = time.time()
    logger.info("Elapsed Time: {min} min, {sec} sec."
                .format(min=(end - start) // 60, sec=(end - start) % 60))
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]
    # in_directory = "pagecounts-1"
    # out_directory = "output"

    pagecounts = spark.read.csv(in_directory, sep=" ", schema=schema).withColumn(
        'filename', functions.input_file_name())
    pagecounts = pagecounts.drop("data_size")
    pagecounts = pagecounts.filter(
        (pagecounts["lang"] == "en") &
        (pagecounts["title"] != "Main_Page") &
        ((pagecounts["title"].startswith("Special:") == False)))
    # pagecounts.show()

    path_to_hour = functions.udf(pathToTime, returnType=types.StringType())
    pagecounts = pagecounts.withColumn("time", path_to_hour("filename")).cache()

    max_visit_counts = pagecounts.groupby("time").agg(
        functions.max("visit_count"))
    # adapted from https://spark.apache.org/docs/2.0.0/api/python/pyspark.sql.html
    result = pagecounts.join(max_visit_counts,
                             (pagecounts["time"] == max_visit_counts["time"]))
    result = result.filter(result["visit_count"] == result["max(visit_count)"])
    result = result.select(pagecounts["time"], "title", "visit_count").sort("time")
    # result.explain()

    # We know groups has <=1000 rows, since the data is grouped by time combinations,
    # so it can safely be moved into 1 partition.
    result.coalesce(1).write.csv(out_directory, mode='overwrite')
def main(inputs, output):
    log_schema = types.StructType([
        types.StructField('project', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('views', types.IntegerType(), False),
        types.StructField('size', types.IntegerType(), False)
    ])
    in_df = spark.read.csv(inputs, schema=log_schema, sep=" ").withColumn(
        'filename', functions.input_file_name())
    hour_df = in_df.withColumn("hour", path_to_hour(in_df.filename)).drop(in_df.filename)
    get_en = hour_df.filter(hour_df.project == "en")
    fil_1 = get_en.filter(get_en.title != "Main_Page")
    fil_2 = fil_1.filter(~fil_1.title.startswith("Special:")).cache()
    hour_max = fil_2.groupBy(fil_2.hour).agg(
        functions.max(fil_2['views']).alias('max_count'))
    count_join = fil_2.join(
        broadcast(hour_max),
        (hour_max.max_count == fil_2.views) & (hour_max.hour == fil_2.hour)).select(
            fil_2.hour, fil_2.title, fil_2.views)
    res_sort = count_join.sort(count_join.hour, count_join.title)
    res_sort.write.json(output)
def main(in_directory, out_directory):
    df = spark.read.csv(in_directory, sep=' ').withColumn('filename', functions.input_file_name())
    df = df.withColumnRenamed('_c0', 'language')
    df = df.withColumnRenamed('_c1', 'title')
    df = df.withColumnRenamed('_c2', 'view')
    df = df.withColumnRenamed('_c3', 'bytes')

    udf_datetime_filter = functions.udf(datetime_filter, returnType=types.StringType())
    df = df.withColumn('datetime', udf_datetime_filter(df['filename']))
    df = df.withColumn('view', df['view'].cast(types.IntegerType()))
    df = df.filter(
        (df['language'].startswith('en')) &
        (df['title'] != 'Main_Page') &
        (df['title'].startswith('Special:') == False))
    df = df.cache()

    max_data = df.groupBy('datetime').max('view')
    max_data = max_data.withColumnRenamed('max(view)', 'view')

    joined_data = df.join(max_data,
                          (df.datetime == max_data.datetime) & (df.view == max_data.view),
                          'inner').drop(df.view).drop(df.datetime)
    joined_data = joined_data.drop('language', 'bytes', 'filename')
    joined_data = joined_data.sort('datetime')
    joined_data = joined_data.select('datetime', 'title', 'view')
    # joined_data.show()

    joined_data.write.csv(out_directory, mode='overwrite')
def main(in_directory, out_directory):
    wiki_data = spark.read.csv(in_directory, sep=" ", schema=wiki_schema).withColumn(
        'filename', functions.input_file_name())
    wiki_data = wiki_data.withColumn('date', udf(wiki_data.filename))
    wiki_data = wiki_data.filter(wiki_data.lang.eqNullSafe('en'))
    wiki_data = wiki_data.filter(~wiki_data.page.eqNullSafe("Main_Page"))
    wiki_data = wiki_data.filter(~wiki_data.page.startswith("Special:"))
    # wiki_data = wiki_data.cache()

    wiki_data_grouped = wiki_data.groupBy(wiki_data['date']).agg(
        functions.max(wiki_data['times_requested']).alias('max'),
        functions.first("page").alias('page'))
    # wiki_data_grouped = wiki_data_grouped.cache()

    wiki_final = wiki_data_grouped.join(
        wiki_data,
        (wiki_data.times_requested == wiki_data_grouped.max) &
        (wiki_data.date == wiki_data_grouped.date)).drop(wiki_data.date).drop(wiki_data.page)
    wiki_final = wiki_final.cache()
    wiki_final = wiki_final.select(
        wiki_final['date'], wiki_final['page'],
        wiki_final['max']).orderBy('date', 'page').write.csv(out_directory + '-wiki',
                                                             mode='overwrite')
def main(in_directory, out_directory):
    df = spark.read.csv(in_directory, schema=comments_schema,
                        sep=' ').withColumn('filename', functions.input_file_name())
    path_to_hour = functions.udf(getFileName, returnType=types.StringType())
    df = df.select(df['language'], df['title'], df['number_of_requests'],
                   path_to_hour(df['filename']).alias('filename'))
    df = df.filter((df['language'] == 'en') &
                   (df['title'] != 'Main Page') &
                   (functions.substring(df['title'], 0, 8) != 'Special:'))

    grouped = df.groupBy('filename', 'title')
    count = grouped.agg(functions.sum(df['number_of_requests']).alias('count'))
    grouped2 = count.groupBy('filename')
    maxi = grouped2.agg(functions.max(count['count']))

    joined = count.join(maxi, on='filename')
    joined = joined.filter(joined['count'] == joined['max(count)'])\
        .select(joined['filename'], joined['title'], joined['count'])
    joined = joined.cache()

    sorted_ = joined.sort('filename', 'title')
    sorted_.write.csv(out_directory, mode='overwrite')
def main(in_directory, out_directory):
    # we're interested in "en" only.
    df = spark.read.csv(in_directory, schema=schema, sep=' ').withColumn('path', functions.input_file_name())
    #df.show(); return
    #df.printSchema(); return
    df = df.withColumn('filename', python_logic(df['path']))
    df = df.drop('path')
    #df.show(); return
    df = df.withColumn('time', complicated_udf2(df['filename']))
    #df.show(); return
    df = df.drop('filename')
    #df.show(); return
    df = df.drop('num_of_bytes')
    #df.show(); return
    df = df.filter(df['language'] == 'en')
    #df.show(); return
    df = df.drop('language')
    df = df.filter(df['titles'] != ('Main_Page'))
    #df.show(); return
    df = df.filter(~df['titles'].contains('Special:'))
    df = df.cache()

    df_max = df.groupBy('time').agg(
        functions.max(df['times_requested']).alias('times_requested'))
    #df.show(); return
    #df_max.show(); return

    df_joined = df_max.join(df, ['time', 'times_requested'])
    df_joined = df_joined.sort('time', ascending=True)
    df_joined = df_joined.select('time', 'titles', 'times_requested')
    #df_joined = df_joined.count(); df_joined.show(); return

    df_joined.write.csv(out_directory, mode='overwrite')
def main(inputs, output):
    # main logic starts here
    wikipedia_schema = types.StructType([
        types.StructField('type', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('views', types.IntegerType(), False),
        types.StructField('size', types.LongType(), False),
    ])

    # read wikipedia input file separated by space
    wikipedia = spark.read.csv(inputs, schema=wikipedia_schema, sep=' ').withColumn(
        'filename', functions.input_file_name())
    filtered_wiki_records = wikipedia.where(
        (wikipedia.type == "en") &
        (~(wikipedia.title.startswith('Special:'))) &
        (wikipedia.title != "Main_Page")).cache()

    # registering the udf
    date_time_udf = functions.udf(path_to_hour, types.StringType())
    wiki_records_per_hour = filtered_wiki_records.withColumn(
        'hour', date_time_udf(filtered_wiki_records.filename)).cache()

    # find the max page views per hour
    max_hit_counts = wiki_records_per_hour.groupby('hour').max('views')
    # broadcast the small dataframe
    max_hit_counts = functions.broadcast(max_hit_counts)

    # forming my conditions for join
    condition = [
        wiki_records_per_hour.views == max_hit_counts['max(views)'],
        (wiki_records_per_hour.hour == max_hit_counts.hour)
    ]
    joined_df = max_hit_counts.join(wiki_records_per_hour, condition).select(
        wiki_records_per_hour.hour, wiki_records_per_hour.title,
        wiki_records_per_hour.views).sort(wiki_records_per_hour.hour).coalesce(1)
    joined_df.write.json(output, mode='overwrite')
def test_source_file_name(sql_context):
    expected = books_as_row(sql_context).withColumn(
        'convergdb_source_file_name', input_file_name()).collect()
    test = convergdb.source_file_name(books_as_row(sql_context)).collect()
    assert test == expected
def main():
    schema = StructType([
        StructField('LANG', StringType(), False),
        StructField('TITLE', StringType(), False),
        StructField('VIEWS', LongType(), False),
        StructField('BYTES', LongType(), False)
    ])
    df = spark.read.csv(inputs, schema=schema, sep=" ").withColumn(
        'filename', functions.input_file_name())
    # df.show()
    hour_udf = udf(extract_hour_from_filename, StringType())
    df = df.select('LANG', 'TITLE', 'VIEWS', 'BYTES', hour_udf("filename").alias("HOUR"))
    df = df.where(df.LANG == 'en')
    df = df.where((~df.TITLE.startswith('Main_Page')))
    df = df.where((~df.TITLE.startswith('Special:')))
    # df.show()
    df_max_views = df.groupby('HOUR').agg(max("VIEWS").alias('MAX_VIEW'))
    cond = [
        df['HOUR'] == df_max_views['HOUR'],
        df['VIEWS'] == df_max_views['MAX_VIEW']
    ]
    result = df.join(df_max_views, cond, 'inner').select(df['HOUR'], df['TITLE'],
                                                         df['VIEWS']).sort(df['HOUR'])
    # result.show()
    result.write.csv(output)
def main(in_dir, out_dir):
    # Read the csv file
    wiki_page = spark.read.csv(in_dir, schema=schema,
                               sep=' ').withColumn('filename', functions.input_file_name())

    # We need to find the most-viewed page each hour
    data = wiki_page.filter(wiki_page['language'] == 'en')
    data = data.filter(data['page_name'] != 'Main_Page')
    data = data.filter(data.page_name.startswith('Special:') == False)

    path_to_hour = functions.udf(lambda path: getdate(path),
                                 returnType=types.StringType())
    data = data.withColumn('date', path_to_hour(data.filename))
    data = data.drop('language', 'bytes', 'filename')

    groups = data.groupBy('date')
    most_viewed = groups.agg(functions.max(data['views']).alias('views'))
    most_viewed.cache()

    cond = ['views', 'date']
    data_join = most_viewed.join(data, cond)
    output = data_join.sort('date', 'page_name')
    output.show()
def read_pg_dump(spark, input_dir):
    """Read gzipped pg_dump data.

    -RECORD 0-----------------------------------------------------------------
     table_name     | submission_date_nightly_43_20191201
     aggregate_type | submission_date
     ds_nodash      | 20191201
     dimension      | {"os": "Windows_NT", "child": "false", "label": ""...
     aggregate      | {0,2,0,2,2}
    """
    # parse the table of contents
    toc_file = f"{input_dir}/toc.dat"
    with open(toc_file, "rb") as f:
        data = f.read()
    toc_df = spark.createDataFrame([Row(**d) for d in parse_toc(data)])

    data_df = (spark.read.csv(
        f"{input_dir}/*.dat.gz",
        sep="\t",
        schema="dimension string, aggregate string",
    ).withColumn("file_name", parse_filename(F.input_file_name())).select(
        "dimension", "aggregate", "file_name.*"))

    return data_df.join(toc_df, on="table_id", how="inner")
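# parse_filename() is defined elsewhere in that project; the sketch below is a hedged
# guess at its shape, assuming each data file is named "<table_id>.dat.gz" and that the
# struct only needs a table_id field for the join above. Field names are assumptions.
from pyspark.sql import functions as F

def parse_filename(path_col):
    """Turn a full input path into a one-field struct holding the table id."""
    base = F.element_at(F.split(path_col, "/"), -1)  # e.g. "1234.dat.gz"
    table_id = F.split(base, r"\.").getItem(0)       # e.g. "1234"
    return F.struct(table_id.alias("table_id"))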
def main(inputs, output):
    pathfunction = functions.udf(path_to_hour, returnType=types.StringType())
    comments_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.LongType()),
        types.StructField('bytes', types.LongType()),
    ])
    wikipage = spark.read.csv(inputs, schema=comments_schema, sep=' ').withColumn(
        'hour', pathfunction(functions.input_file_name()))
    filtered_page = wikipage.filter(
        (wikipage.language == 'en') &
        (wikipage.title != 'Main Page') &
        (~wikipage.title.startswith('Special:'))).cache()
    max_view = filtered_page.groupBy(wikipage.hour).agg(
        functions.max(wikipage.views).alias('total_views'))
    conditions = [
        filtered_page.views == max_view.total_views,
        filtered_page.hour == max_view.hour
    ]
    # regular join:
    join_page = filtered_page.join(functions.broadcast(max_view),
                                   conditions).select(filtered_page.hour, 'title', 'views')
    # broadcast join as following:
    join_page = filtered_page.join(functions.broadcast(max_view),
                                   conditions).select(filtered_page.hour, 'title', 'views')
    join_page.sort('hour', 'title').write.json(output, mode='overwrite')
    join_page.explain()
def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])

    # reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))

    # filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en') &
                                (wikiData['title'] != 'Main_Page') &
                                (wikiData['title'] != 'Special:Page')].cache()

    # finding max views per hour.
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))

    # joining data to obtain hour and title.
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])

    # sorting data based on hour and storing it in json file.
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')
def main(in_directory, out_directory):
    path_to_hour = functions.udf(path_to_hour_non_udf, returnType=types.StringType())
    stats = spark.read.csv(in_directory, schema=wiki_data_schema, sep=' ').withColumn(
        'filename', path_to_hour(functions.input_file_name()))
    stats = stats.filter((stats['lan'] == 'en') &
                         (stats['name'] != 'Main_Page') &
                         ~(stats['name'].startswith('Special:'))).cache()  # <------ Caching here!
    max_stats = stats.groupBy('filename').agg(
        functions.max('count')).alias('max(count)')
    stats = max_stats.join(stats, on='filename')
    # stats.explain()
    stats = stats.filter(stats['max(count)'] == stats['count'])
    stats_sorted = stats.sort('filename', 'name')
    cleaned_data = stats_sorted.select(stats_sorted['filename'],
                                       stats_sorted['name'],
                                       stats_sorted['count'])
    cleaned_data.write.csv(out_directory, mode='overwrite')
def main(input, output):
    # Read in CSV data and hold onto filename
    fname = '*/*/*/*/*/Play by Play - All (Parsed).csv'
    df = spark.read.csv(os.path.join(input, fname), header='true', schema=play_by_play_schema) \
        .withColumn('filename', functions.input_file_name()) \
        .withColumn('split', functions.split('filename', '/')) \
        .withColumn('TimeLeft_split', functions.split('TimeLeft', ':')) \
        .withColumn('Score_split', functions.split('Score', '-'))

    # Parse the file name into columns
    df = df \
        .where((df['Action'] != 'Enters Game') &
               (df['Action'] != 'Leaves Game')) \
        .withColumn('Gender', df['split'].getItem(4)) \
        .withColumn('Year', df['split'].getItem(5)) \
        .withColumn('Division', df['split'].getItem(6)) \
        .withColumn('FileTeam', df['split'].getItem(7)) \
        .withColumn('Seconds_Left',
                    df['TimeLeft_split'].getItem(0).cast(types.IntegerType()) * 60
                    + df['TimeLeft_split'].getItem(1).cast(types.IntegerType())
                    + period_mins_left(df['Period']) * 60) \
        .withColumn('Away_Score', df['Score_split'].getItem(0)) \
        .withColumn('Home_Score', df['Score_split'].getItem(1)) \
        .withColumn('File_Team', functions.regexp_replace('FileTeam', '%20', ' '))

    # Add Home_Margin column
    df = df.withColumn('Home_Margin',
                       (df['Home_Score'] - df['Away_Score']).cast(types.IntegerType()))

    # Specify final columns and write data
    final_columns = ['Gender', 'Year', 'Division', 'Date', 'Time',
                     'Score', 'Team', 'Player', 'Status', 'Action', 'Shot_Clock',
                     'Seconds_Left', 'Away_Score', 'Home_Score', 'Home_Margin', 'File_Team']
    df = df.select(final_columns).drop_duplicates() \
        .write.parquet(output, mode='append', compression='gzip')
def process(self):
    # file names follow certain patterns, and we want to identify which file type the data belongs to
    def find_brand(s3_path):
        if s3_path.count('xxx') > 0:
            return 'type1'
        elif s3_path.count('yyy') > 0:
            return 'type2'
        elif s3_path.count('zzz') > 0:
            return 'type3'
        else:
            return 'invalid type'

    udf_find_brand = udf(find_brand, StringType())
    df_stg_user_profile = self.sc.read.json(
        self.user_profile_input_location, schema=user_tbl_json_ingest_schema) \
        .withColumn("s3_path", input_file_name()) \
        .withColumn("brand", udf_find_brand('s3_path')).drop('s3_path')
    df_stg_user_profile.printSchema()
    write_and_partition(self.sc, df_stg_user_profile, self.user_profile_stg_location,
                        stg_user_table, number_of_files=200, keep_latest_n=2,
                        table_create_statements=[
                            create_stg_user_tbl_template.format(location=self.user_profile_stg_location)])
def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    def split_filename(filename):
        file = filename.rsplit('/', 1)[1]
        date = file.split("-", 1)[1]
        name = date[:-7]
        return name

    path_to_hour = functions.udf(split_filename, returnType=types.StringType())
    data = spark.read.csv(in_directory, sep=" ", schema=schema).withColumn(
        'filename', path_to_hour(functions.input_file_name()))

    filter_data = data.filter(
        (data['language'] == "en") &
        (data["page"] != "Main_Page") &
        (~data["page"].startswith("Special:"))).drop_duplicates()

    grouped = filter_data.groupby("filename").max("views")
    joined_data = filter_data.join(
        grouped,
        on=((filter_data['views'] == grouped['max(views)']) &
            (filter_data['filename'] == grouped['filename']))).drop(filter_data['filename'])
    joined_data = joined_data.sort(joined_data['filename'])
    max_view = joined_data.select(joined_data['filename'], joined_data['page'],
                                  joined_data['views'])
    max_view.show()
    max_view.write.csv(out_directory + '-max_view', mode='overwrite')
def main():
    schema = types.StructType([
        types.StructField('lan', types.StringType(), True),
        types.StructField('title', types.StringType(), True),
        types.StructField('total_req', types.IntegerType(), True),
        types.StructField('byte_trans', types.DoubleType(), True),
    ])
    pagecount_df = spark.read.csv(inputs, schema, " ").withColumn(
        'time', get_access_time_udf(functions.input_file_name()))
    pagecount_df = pagecount_df.where(
        (pagecount_df.lan == 'en') &
        (pagecount_df.title != 'Main_Page') &
        (pagecount_df.title.startswith('Special:') == False))
    max_hit_page_df = pagecount_df.groupBy('time').max('total_req').select(
        'time', functions.col('max(total_req)').alias('total_req'))
    pagecount_max_df = pagecount_df.join(
        max_hit_page_df, ['time', 'total_req']).select(
            'time', 'title', 'total_req').sort(['time', 'title'], ascending=True)
    # pagecount_max_df.show()
    if not os.path.exists(output):
        os.makedirs(output)
    pagecount_max_df.write.csv(output, sep=',', mode='overwrite')
def fill_days(year: str, month: str, day: str) -> None:
    S3_FULL_PATH = f"{S3_PATH}/{year}/{month}/{day}"
    try:
        # read all parquet files from a specific yyyy/mm/dd
        df = spark.read.format("parquet").load(f"{S3_FULL_PATH}/*.parquet")

        # create the extracted_at column from the date encoded in the file path
        df = df.withColumn(
            "extracted_at",
            from_unixtime(  # convert unixtime to timestamp
                unix_timestamp(
                    substring(
                        regexp_replace(input_file_name(), f"{S3_PATH}/", ""), 1, 10
                    ),
                    "yyyy/MM/dd",
                )
            ),
        )

        # overwrite the dataframe with the new column
        df.write.mode("overwrite").parquet(f"{S3_FULL_PATH}/")
        print(f"files updated from {S3_FULL_PATH}/")
    # some dates don't have paths.
    except AnalysisException as err:
        print(err)
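# Illustrative check (not in the original) of the path-to-timestamp logic above, using a
# literal column instead of input_file_name(); S3_PATH is assumed to be "s3://bucket/raw"
# and the file path below is a made-up placeholder.
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_unixtime, regexp_replace, substring, unix_timestamp

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("s3://bucket/raw/2021/03/05/part-0.parquet",)], ["path"])
demo.select(
    from_unixtime(
        unix_timestamp(
            substring(regexp_replace("path", "s3://bucket/raw/", ""), 1, 10),
            "yyyy/MM/dd",
        )
    ).alias("extracted_at")
).show(truncate=False)  # -> 2021-03-05 00:00:00 (in the session time zone)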
def load_data(spark, s_path2s3):
    # define schema
    print('loading {}...'.format(s_path2s3))
    this_schema = StructType([
        StructField("ticker", StringType(), True),
        StructField("open", StringType(), True),
        StructField("settle", StringType(), True),
        StructField("lclose", StringType(), True),
        StructField("max", StringType(), True),
        StructField("min", StringType(), True),
        StructField("bid", StringType(), True),
        StructField("ask", StringType(), True),
        StructField("qbid", StringType(), True),
        StructField("qask", StringType(), True),
        StructField("qlast", IntegerType(), True),
        StructField("last", StringType(), True),
        StructField("group", StringType(), True),
        StructField("ntrades", StringType(), True),
        StructField("name", StringType(), True),
        StructField("qtotal", StringType(), True),
        StructField("strike", StringType(), True),
        StructField("expdate", StringType(), True),
        StructField("totalvol", StringType(), True)
    ])

    # read csv
    df = (spark.read
          .option("header", "true")
          .schema(this_schema)
          .csv(s_path2s3)
          .withColumn("input_file", input_file_name()))

    # drop rows where ticker is null
    df = df.na.drop(subset=["ticker"])
    return df
def test_input_file_name_udf(self):
    df = self.spark.read.text('python/test_support/hello/hello.txt')
    df = df.select(udf(lambda x: x)("value"), input_file_name().alias('file'))
    file_name = df.collect()[0].file
    self.assertTrue("python/test_support/hello/hello.txt" in file_name)
def test_udf_with_input_file_name(self):
    from pyspark.sql.functions import input_file_name
    sourceFile = udf(lambda path: path, StringType())
    filePath = "python/test_support/sql/people1.json"
    row = self.spark.read.json(filePath).select(
        sourceFile(input_file_name())).first()
    self.assertTrue(row[0].find("people1.json") != -1)
def main(inputs, outputs):
    pages = spark.read.csv(inputs, schema=pages_schema, sep=' ')

    # extract filename from input_file_name()
    filename_to_time = functions.udf(getfilename, returnType=types.StringType())
    pages = pages.withColumn('filename', filename_to_time(functions.input_file_name()))
    pages.show()

    # apply all filters
    filtered_pages = pages.filter(
        (pages['language'] == 'en') &
        (pages['page_name'] != 'Main_Page') &
        (pages['page_name'].startswith('Special:') == False))
    filtered_pages.cache()

    # find max viewed pages
    grouped_pages = filtered_pages.groupBy('filename')
    max = grouped_pages.agg(functions.max('count').alias('max_views'))

    # To allow tie
    joined_pages = filtered_pages.join(
        max,
        on=((filtered_pages['count'] == max['max_views']) &
            (filtered_pages['filename'] == max['filename'])))

    # filename to int type to sort
    inttime = functions.udf(makeint, returnType=types.FloatType())
    new_pages = joined_pages.withColumn('date', inttime(filtered_pages['filename']))
    output = new_pages.select(filtered_pages['filename'], 'page_name', 'max_views')
    output.explain()
    output.sort('date').write.csv(outputs, mode='overwrite')
    output.sort('date').show()
def main(inputs, output):
    # main logic starts here
    observation_schema = types.StructType([
        types.StructField('language', types.StringType(), False),
        types.StructField('title', types.StringType(), False),
        types.StructField('page_count', types.IntegerType(), False),
        types.StructField('bytes', types.IntegerType(), False),
    ])
    wiki_data = spark.read.csv(inputs, sep=' ', schema=observation_schema).withColumn(
        'hour', path_to_hour(functions.input_file_name()))
    wiki_data_filter = wiki_data\
        .filter((wiki_data.language == 'en') &
                ~(wiki_data.title == 'Main_Page') &
                ~(wiki_data.title.startswith('Special:')))
    wiki_data_filter.cache()
    wiki_max = wiki_data_filter.select('hour', 'page_count').groupBy('hour').max('page_count')\
        .orderBy(wiki_data_filter.hour.asc())
    wiki_max_rename = wiki_max.select(functions.col("hour").alias("hours"),
                                      functions.col("max(page_count)").alias("views"))
    cond = [
        wiki_data_filter.page_count == wiki_max_rename.views,
        wiki_data_filter.hour == wiki_max_rename.hours
    ]
    wiki_join = wiki_data_filter.join(functions.broadcast(wiki_max_rename), cond, 'inner')
    wiki_projection = wiki_join.select('hour', 'title', 'views')
    wiki_projected_sort = wiki_projection.orderBy(wiki_projection.hour.asc(),
                                                  wiki_projection.title.asc())
    wiki_projected_sort.explain()
    # coalesce is used as the question requires an output in a newline-delimited JSON
    wiki_projected_sort.coalesce(1).write.json(output, mode='overwrite')
def main(inputs, output):
    pages = spark.read.csv(inputs, page_schema, ' ').withColumn('filename', functions.input_file_name())
    pages = pages.withColumn('hour', path_to_hour(pages.filename))
    pages = pages.where(pages.language == 'en').where(pages.title != 'Main_Page')\
        .where(~pages.title.startswith('Special')).cache()
    maxviews = pages.groupby(pages.hour).max('views').orderBy(pages.hour)\
        .withColumnRenamed('hour', 'hour1')
    maxviews = maxviews.join(pages,
                             (pages.hour == maxviews.hour1) &
                             (pages.views == maxviews['max(views)']))\
        .select(pages.hour, pages.title, pages.views).orderBy(pages.hour)
    maxviews.write.json(output, mode='overwrite')
    maxviews.explain()
def add_year(self, df):
    df2 = (
        df.withColumn('file_name', slice(split(input_file_name(), '/'), -1, 1)[0])
          .withColumn('flight_year', col('file_name').substr(1, 4).cast(IntegerType()))
    )
    return df2
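# An alternative sketch (not the project's code): element_at(..., -1) also grabs the last
# path segment, avoiding the slice(...)[0] indexing; requires Spark 2.4+. The assumption
# that file names start with a four-digit year is carried over from add_year above.
from pyspark.sql.functions import col, element_at, input_file_name, split
from pyspark.sql.types import IntegerType

def add_year_alt(df):
    return (
        df.withColumn('file_name', element_at(split(input_file_name(), '/'), -1))
          .withColumn('flight_year', col('file_name').substr(1, 4).cast(IntegerType()))
    )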
def test_udf_with_input_file_name_for_hadooprdd(self):
    from pyspark.sql.functions import udf, input_file_name

    def filename(path):
        return path

    sameText = udf(filename, StringType())

    rdd = self.sc.textFile('python/test_support/sql/people.json')
    df = self.spark.read.json(rdd).select(input_file_name().alias('file'))
    row = df.select(sameText(df['file'])).first()
    self.assertTrue(row[0].find("people.json") != -1)

    rdd2 = self.sc.newAPIHadoopFile(
        'python/test_support/sql/people.json',
        'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
        'org.apache.hadoop.io.LongWritable',
        'org.apache.hadoop.io.Text')
    df2 = self.spark.read.json(rdd2).select(input_file_name().alias('file'))
    row2 = df2.select(sameText(df2['file'])).first()
    self.assertTrue(row2[0].find("people.json") != -1)
def test_udf_with_input_file_name(self):
    from pyspark.sql.functions import udf, input_file_name
    sourceFile = udf(lambda path: path, StringType())
    filePath = "python/test_support/sql/people1.json"
    row = self.spark.read.json(filePath).select(sourceFile(input_file_name())).first()
    self.assertTrue(row[0].find("people1.json") != -1)