def minute(self) -> ks.Series:
    """
    The minutes of the datetime.
    """
    # Apply Spark's `minute` function through the accessor wrapper, declaring
    # the result as LongType, then restore this accessor's name on the result.
    minutes = _wrap_accessor_spark(self, F.minute, LongType())
    return minutes.alias(self.name)
# Window over each DISTRICT partition, rows ordered by latitude descending.
windowSpec = Window.partitionBy("DISTRICT") \
    .orderBy(col("Lat").desc())

# Show each row's district and latitude next to the max latitude computed
# over the window above.
crimeFacts.select(
    "DISTRICT", "Lat", func.max("Lat").over(windowSpec).alias("max_lat")).show()

# Peek at the first five rows through the RDD API.
print(crimeFacts.rdd.take(5))

from pyspark.sql.types import LongType
from pyspark.sql.functions import udf


def squared_typed(s):
    """Return ``s`` multiplied by itself."""
    return s * s


# Register the Python function as a Spark UDF whose declared return type is LongType.
squared_udf = udf(squared_typed, LongType())
crimeFacts.select("hour", squared_udf("hour")).show()

# Load the offense-code lookup table from CSV (header row, inferred schema).
offenseCodes = spark.read\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .csv("boston_crimes/offense_codes.csv")
offenseCodes.show()

# Count crimes per offense name for offenses whose name contains "ROBBERY",
# most frequent first.
robberyStats = crimeFacts\
    .join(offenseCodes, offenseCodes.CODE == crimeFacts.OFFENSE_CODE)\
    .filter(offenseCodes.NAME.contains("ROBBERY"))\
    .groupBy(offenseCodes.NAME)\
    .count()\
    .orderBy(col("count").desc())
def attach_id_column(self, id_type: str, column: Union[Any, Tuple]) -> "DataFrame":
    """
    Attach a column to be used as an identifier of rows, similar to the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

          .. note:: this uses Spark's Window without specifying partition specification.
              This leads to move all data into single partition in single machine and
              could cause serious performance degradation.
              Avoid this method against very large dataset.

        - 'distributed-sequence' : a sequence that increases one by one,
          by group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
          monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The column name.

    Returns
    -------
    DataFrame
        The DataFrame with the id column attached.

    Raises
    ------
    ValueError
        If ``id_type`` is not one of the supported values, if ``column`` does not
        have as many levels as the existing column labels, or if ``column``
        already exists.

    Examples
    --------
    >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
       x  0
    0  a  0
    1  b  1
    2  c  2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x  0.0
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
       x   0
       y 1.0
    0  a   0
    1  b   1
    2  c   2
    """
    from pyspark.pandas.frame import DataFrame

    # Pick the InternalFrame helper matching the requested id type.
    supported = (
        ("sequence", InternalFrame.attach_sequence_column),
        ("distributed-sequence", InternalFrame.attach_distributed_sequence_column),
        ("distributed", InternalFrame.attach_distributed_column),
    )
    for type_name, candidate in supported:
        if id_type == type_name:
            attach_func = candidate
            break
    else:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )

    assert is_name_like_value(column, allow_none=False), column
    # Column labels are always handled as tuples internally.
    label = column if is_name_like_tuple(column) else (column,)

    internal = self._psdf._internal

    if len(label) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns.".format(
                label
            )
        )
    if label in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(label))
        )

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    renamed_index_scols = [
        scol.alias(SPARK_INDEX_NAME_FORMAT(level))
        for level, scol in enumerate(internal.index_spark_columns)
    ]
    renamed_data_scols = [
        scol.alias(name_like_string(existing_label))
        for scol, existing_label in zip(internal.data_spark_columns, internal.column_labels)
    ]
    sdf = internal.spark_frame.select(renamed_index_scols + renamed_data_scols)

    id_column_name = name_like_string(label)
    sdf, force_nullable = attach_func(sdf, id_column_name)

    # When the attach helper reports force_nullable, every existing field is
    # re-declared as nullable; otherwise the existing fields are reused as-is.
    if force_nullable:
        index_fields = [field.copy(nullable=True) for field in internal.index_fields]
        existing_data_fields = [field.copy(nullable=True) for field in internal.data_fields]
    else:
        index_fields = internal.index_fields
        existing_data_fields = internal.data_fields

    id_field = InternalField.from_struct_field(
        StructField(id_column_name, LongType(), nullable=False)
    )

    new_internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=[
            scol_for(sdf, SPARK_INDEX_NAME_FORMAT(level))
            for level in range(internal.index_level)
        ],
        index_names=internal.index_names,
        index_fields=index_fields,
        column_labels=internal.column_labels + [label],
        data_spark_columns=(
            [
                scol_for(sdf, name_like_string(existing_label))
                for existing_label in internal.column_labels
            ]
            + [scol_for(sdf, id_column_name)]
        ),
        data_fields=existing_data_fields + [id_field],
        column_label_names=internal.column_label_names,
    )
    return DataFrame(new_internal.resolved_copy)
def process_log_data(spark, input_data, output_data):
    """
    Build the users, time and songplays tables from the JSON event logs.

    Reads the log JSON found under ``input_data + 'log-data'``, keeps only the
    ``NextSong`` page events, and writes three parquet datasets under
    ``output_data``:

    - ``users``: user id, first/last name, gender and level;
    - ``time``: start time plus hour, day, week, month and year, partitioned
      by year and month;
    - ``songplays``: log events joined with the song metadata read from
      ``input_data + 'song-data/*/*/*/*.json'`` on artist name, partitioned by
      year and month.

    Parameters
    ----------
    spark : SparkSession
        Session used to read the JSON inputs.
    input_data : str
        Input path prefix. It is concatenated directly with the dataset names,
        so it must already end with a path separator.
    output_data : str
        Output path prefix, concatenated the same way.
    """
    # Explicit JSON structure for the log files, so Spark does not infer it.
    logdata_schema = StructType([
        StructField("artist", StringType(), True),
        StructField("auth", StringType(), True),
        StructField("firstName", StringType(), True),
        StructField("gender", StringType(), True),
        StructField("itemInSession", LongType(), True),
        StructField("lastName", StringType(), True),
        StructField("length", DoubleType(), True),
        StructField("level", StringType(), True),
        StructField("location", StringType(), True),
        StructField("method", StringType(), True),
        StructField("page", StringType(), True),
        StructField("registration", DoubleType(), True),
        StructField("sessionId", LongType(), True),
        StructField("song", StringType(), True),
        StructField("status", LongType(), True),
        StructField("ts", LongType(), True),
        StructField("userAgent", StringType(), True),
        StructField("userId", StringType(), True),
    ])

    # get filepath to log data file
    log_data = input_data + 'log-data'

    # read log data file, JSON structure
    df = spark.read.json(log_data, schema=logdata_schema)

    # filter by actions for song plays
    df = df.filter(col("page") == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col("userId").alias("user_id"),
        col("firstName").alias("first_name"),
        col("lastName").alias("last_name"),
        "gender",
        "level",
    )

    # write users table to parquet files
    users_table.write.parquet(output_data + "users")

    # In Spark datetime patterns 'MM' is month-of-year and 'mm' is
    # minute-of-hour. The time-of-day part previously used 'MM', which put the
    # month where the minutes belong and lost the minute of every start_time.
    tsFormat = "yyyy-MM-dd HH:mm:ss z"

    def to_start_time(ts_col):
        """Convert the ``ts`` column (divided by 1000 -- presumably epoch
        milliseconds, confirm against the data) into a timestamp column."""
        as_timestamp = (ts_col / 1000).cast(dataType=TimestampType())
        # Format/parse round trip kept from the original implementation.
        return to_timestamp(date_format(as_timestamp, tsFormat), tsFormat)

    # Converting ts to a timestamp format
    time_table = df.withColumn('ts', to_start_time(df.ts))

    # extract columns to create time table
    time_table = time_table.select(
        col("ts").alias("start_time"),
        hour(col("ts")).alias("hour"),
        dayofmonth(col("ts")).alias("day"),
        weekofyear(col("ts")).alias("week"),
        month(col("ts")).alias("month"),
        year(col("ts")).alias("year"),
    )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month").parquet(output_data + "time")

    # read in song data to use for songplays table
    song_data = input_data + "song-data/*/*/*/*.json"
    song_df = spark.read.json(song_data)

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (
        song_df.join(df, song_df.artist_name == df.artist)
        .withColumn("songplay_id", monotonically_increasing_id())
        .withColumn('start_time', to_start_time(col("ts")))
        .select(
            "songplay_id",
            "start_time",
            col("userId").alias("user_id"),
            "level",
            "song_id",
            "artist_id",
            col("sessionId").alias("session_id"),
            col("artist_location").alias("location"),
            "userAgent",
            month(col("start_time")).alias("month"),
            year(col("start_time")).alias("year"),
        )
    )

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy("year", "month").parquet(output_data + "songplays")
# Item_based Collaborative Filtering ---GraphFrames, DataFrame sc.addPyFile("/Users/soober/Downloads/graphframes-0.7.0-spark2.4-s_2.11.jar") from graphframes import * from pyspark.sql.types import StructType, StructField, LongType, IntegerType, StringType UserAnimeSchema = StructType([ StructField("username", StringType(), True), StructField("animeId", IntegerType(), True), StructField("watched_episodes", LongType(), True), StructField("start_date", StringType(), True), StructField("finish_date", StringType(), True), StructField("rating", IntegerType(), True), StructField("status", IntegerType(), True), StructField("rewatching", StringType(), True), StructField("rewatching_ep", LongType(), True), StructField("last_updated_date", StringType(), True), StructField("tags", StringType(), True), ]) user_anime = spark.read.schema(UserAnimeSchema) \ .option("header", "false") \ .option("mode", "DROPMALFORMED") \ .csv(RATIING_DATASET) raw_edges = user_anime.select("username", "animeId", "rating") \ .na.drop(subset=["username", "animeId"]) \ .filter("rating <= 10 and rating >= 1") userlist = spark.read.csv(USER_DATASET, header="true", inferSchema="true") \ .select("username", "user_id") \
import pyspark.sql.functions as fspark
import collections

# Build (or reuse) a local SparkSession with Hive support enabled.
spark = SparkSession.builder \
    .master("local")\
    .appName('abc') \
    .config("spark.some.config.option", "some-value")\
    .enableHiveSupport()\
    .getOrCreate()
# Bare expression: has no effect when run as a script.
spark.conf
spark.range(3).show()

# Hand-written schema: two nullable string columns and one non-nullable long column.
myManualSchema = StructType([
    StructField("some", StringType(), True),
    StructField("col", StringType(), True),
    StructField("names", LongType(), False)
])
myRow = Row("Hello", "hi", 1)
myDf = spark.createDataFrame([myRow], myManualSchema)
myDf.show()

# Column selection: by name, by SQL expression, by Column object, with a literal.
myDf.select("col")
myDf.select(expr("col as c"),col("col").alias("c1"),lit(1)).show()
myDf.select(expr("col as c"),col("col").alias("c1"),lit(1)).distinct().count()
myDf.selectExpr("*","col as newColumnName", "col as wer" ).show(2)
# Backticks quote a column name containing spaces and a dash.
myDf.selectExpr("col as `This Long Column-Name`" ).show(2)

# Adding, renaming and dropping columns (each call returns a new DataFrame).
myDf.withColumn("numberOne", lit(1) * expr("names")).show(2)
myDf.withColumn("This Long Column-Name",expr("col")).show()
myDf.withColumnRenamed("col", "dest").columns
myDf.drop("col").columns

# The same filter written with a Column expression and with a SQL string.
myDf.filter(col("names") == 1).show()
myDf.filter("names == 1").show()
return { 'engine': engine, 'source': source, 'count': count, } addons_type = ArrayType( StructType([ StructField('addon_id', StringType(), False), StructField('blocklisted', BooleanType(), True), StructField('name', StringType(), True), StructField('user_disabled', BooleanType(), True), StructField('app_disabled', BooleanType(), True), StructField('version', StringType(), True), StructField('scope', LongType(), True), StructField('type', StringType(), True), StructField('foreign_install', BooleanType(), True), StructField('has_binary_components', BooleanType(), True), StructField('install_day', LongType(), True), StructField('update_day', LongType(), True), StructField('signed_state', LongType(), True), StructField('is_system', BooleanType(), True), StructField('is_web_extension', BooleanType(), True), StructField('multiprocess_compatible', BooleanType(), True), ])) def generate_addon(addon_id, name, version): return { 'addon_id': addon_id,
# signature.append(repoName) # signature.append(repoPath) # For each of the random hash functions... for i in range(0, numHashes): minHashCode = nextPrime + 1 for shingleID in shinglesInDoc: hashCode = (valueA[i] * shingleID + valueB[i]) % nextPrime if hashCode < minHashCode: minHashCode = hashCode signature.append(minHashCode) #signatures.append(signature) elapsed = (time.time() - t0) return signature #print(signatures) func_udf = udf(generate_shingel_minhash, ArrayType(LongType())) df = df.withColumn('ten_signatures', func_udf("content")) #df.select('ten_signatures').show() df.printSchema() def insert_db(df2): config = configparser.ConfigParser() config.read('config.ini') url = "jdbc:mysql://localhost/insight" properties = { "user": config['mysqlDB']['user'], "password": config['mysqlDB']['pass'], "driver": config['mysqlDB']['driver'] }
# Combine the `food` and `price` columns into a single column `X` via `zip_`,
# then drop the two source columns.
df_coll_need = df_coll.withColumn('X', zip_(df_coll.food, df_coll.price))\
    .drop('food').drop('price')
df_coll_need.show(truncate=False)

# ---------------------------
# 3- Single-column aggregation: agg() + dict
# ---------------------------
df.groupBy("name").agg({'price': 'mean'}).show()
df.groupBy("name").agg({'price': 'max'}).show()

# ---------------------------
# 4- udf
# ---------------------------
data = [('alex', 5), ('jane', 7), ('bob', 9)]
df_udf = spark.createDataFrame(data, ['name', 'age'])
# NOTE(review): the UDF is declared as returning LongType; if `m_sqrt`
# returns a float the column may come back null -- confirm against m_sqrt.
sqrt_udf = udf(m_sqrt, LongType())
df_udfed = df_udf.select('name', 'age', sqrt_udf('age').alias('age_sqrt'))
df_udfed.show()

# ---------------------------
# 5- df from pandas
# ---------------------------
pd_df = pd.DataFrame(
    data={
        'integers': [2, 5, 7, 8, 9],
        'floats': [1.2, -2.0, 1.5, 2.7, 3.6],
        'int_arrays': [[6], [1, 2], [3, 4, 5], [6, 7, 8, 9], [10, 11, 12]]
    })
spark_df = spark.createDataFrame(pd_df)
spark_df.show()
# Convert back to pandas
import findspark
# Called before the pyspark imports below; keep this ordering.
findspark.init()
import json
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType, FloatType, BooleanType, \
    DoubleType, ShortType

# Maps type-name strings to the corresponding Spark SQL data type instances.
type_map = {
    'int': IntegerType(),
    'bigint': LongType(),
    'smallint': ShortType(),
    'float': FloatType(),
    'double': DoubleType(),
    'string': StringType(),
    'boolean': BooleanType(),
}


class LoadProdDataJob:
    """Holds the parameters of a data-load job.

    Only the constructor is visible here; it stores its arguments unchanged
    on the instance.
    """

    def __init__(self, database, target_table, file_path, partition_columns, dfs_path, schema_path):
        """Store the job parameters.

        NOTE(review): the meanings below are inferred from the parameter
        names only -- confirm against the code that uses them.

        database / target_table: presumably the destination database and table.
        file_path: presumably the location of the input data.
        partition_columns: presumably the partition column names of the target.
        dfs_path: presumably a distributed-filesystem location.
        schema_path: presumably the location of a schema description.
        """
        self.database = database
        self.target_table = target_table
        self.file_path = file_path
        self.partition_columns = partition_columns
        self.dfs_path = dfs_path
        self.schema_path = schema_path
def compute_churn_week(df, week_start):
    """Compute the churn data for one week.

    Note that it takes 10 days from the end of this period for all the
    activity to arrive. This data should be from Sunday through Saturday.

    The function filters ``df`` to the requested week, computes per-client
    aggregates, joins them onto the newest ping per client, converts each row
    with ``convert`` and finally sums the measures over the set of dimension
    columns described by ``churn_schema``.

    df: DataFrame of the dataset relevant to computing the churn
    week_start: datestring of this time period, parsed as ``%Y%m%d``

    Returns a DataFrame with the ``churn_schema`` columns, where
    ``unique_domains_count`` has been replaced by
    ``unique_domains_count_per_profile``.

    Raises RuntimeError if ``week_start`` is not a Sunday, or if the week's
    data may still be arriving (week end + 10 days is not yet in the past).
    """
    week_start_date = datetime.strptime(week_start, "%Y%m%d")
    week_end_date = week_start_date + timedelta(6)
    # Re-render both bounds through `fmt` so they are compared in one format
    # (presumably %Y%m%d by default -- confirm against fmt).
    week_start = fmt(week_start_date)
    week_end = fmt(week_end_date)

    # Verify that the start date is a Sunday
    # (datetime.weekday() numbers Monday as 0, so Sunday is 6).
    if week_start_date.weekday() != 6:
        msg = "Week start date {} is not a Sunday".format(week_start)
        raise RuntimeError(msg)

    # If the data for this week can still be coming, don't try to compute the
    # churn.
    week_end_slop = fmt(week_end_date + timedelta(10))
    today = fmt(datetime.utcnow())
    if week_end_slop >= today:
        msg = ("Skipping week of {} to {} - Data is still arriving until {}.".
               format(week_start, week_end, week_end_slop))
        raise RuntimeError(msg)

    logger.info("Starting week from {} to {}".format(week_start, week_end))

    # the subsession_start_date field has a different form than
    # submission_date_s3, so needs to be formatted with hyphens.
    week_end_excl = fmt(week_end_date + timedelta(1), date_format="%Y-%m-%d")
    week_start_hyphenated = fmt(week_start_date, date_format="%Y-%m-%d")

    # Keep pings submitted between the week start and the end of the slop
    # period whose subsession started inside the week itself.
    current_week = (df.filter(df['submission_date_s3'] >= week_start).filter(
        df['submission_date_s3'] <= week_end_slop).filter(
            df['subsession_start_date'] >= week_start_hyphenated).filter(
                df['subsession_start_date'] < week_end_excl))

    # take a subset and rename the app_version field
    current_week = (current_week.select(source_columns).withColumnRenamed(
        "scalar_parent_browser_engagement_total_uri_count",
        "total_uri_count").withColumnRenamed(
            "scalar_parent_browser_engagement_unique_domains_count",
            "unique_domains_count").withColumnRenamed("app_version",
                                                      "version"))

    # clean some of the aggregate fields
    current_week = current_week.na.fill(
        0, ["total_uri_count", "unique_domains_count"])

    # Clamp broken subsession values in the [0, MAX_SUBSESSION_LENGTH] range.
    clamped_subsession_subquery = (F.when(
        F.col('subsession_length') > MAX_SUBSESSION_LENGTH,
        MAX_SUBSESSION_LENGTH).otherwise(
            F.when(F.col('subsession_length') < 0,
                   0).otherwise(F.col('subsession_length'))))

    # Compute per client aggregates lost during newest client computation
    per_client_aggregates = (current_week.select(
        'client_id', 'total_uri_count', 'unique_domains_count',
        clamped_subsession_subquery.alias('subsession_length')).groupby(
            'client_id').agg(
                F.sum('subsession_length').alias('usage_seconds'),
                F.sum('total_uri_count').alias('total_uri_count_per_client'),
                F.avg('unique_domains_count').alias(
                    'average_unique_domains_count_per_client')))

    # Get the newest ping per client and append to original dataframe
    newest_per_client = get_newest_per_client(current_week)
    newest_with_usage = newest_per_client.join(per_client_aggregates,
                                               'client_id', 'inner')

    # Build the "effective version" cache:
    d2v = make_d2v(get_release_info())

    converted = newest_with_usage.rdd.map(
        lambda x: convert(d2v, week_start, x))

    # The bare string below is a no-op statement documenting the output columns.
    """
     - channel (appUpdateChannel)
     - geo (bucketed into top 30 countries + "rest of world")
     - is_funnelcake (contains "-cck-"?)
     - acquisition_period (cohort_week)
     - start_version (effective version on profile creation date)
     - sync_usage ("no", "single" or "multiple" devices)
     - current_version (current appVersion)
     - current_week (week)
     - source (associated attribution)
     - medium (associated with attribution)
     - campaign (associated with attribution)
     - content (associated with attribution)
     - distribution_id (funnelcake associated with profile)
     - default_search_engine
     - locale
     - is_active (were the client_ids active this week or not)
     - n_profiles (count of matching client_ids)
     - usage_hours (sum of the per-client subsession lengths,
            clamped in the [0, MAX_SUBSESSION_LENGTH] range)
     - sum_squared_usage_hours (the sum of squares of the usage hours)
     - total_uri_count (sum of per-client uri counts)
     - unique_domains_count_per_profile (average of the average unique
            domains per-client)
    """
    # Column order must match the key tuple followed by the value tuple
    # built in `countable` below.
    churn_schema = StructType([
        StructField('channel', StringType(), True),
        StructField('geo', StringType(), True),
        StructField('is_funnelcake', StringType(), True),
        StructField('acquisition_period', StringType(), True),
        StructField('start_version', StringType(), True),
        StructField('sync_usage', StringType(), True),
        StructField('current_version', StringType(), True),
        StructField('current_week', LongType(), True),
        StructField('source', StringType(), True),
        StructField('medium', StringType(), True),
        StructField('campaign', StringType(), True),
        StructField('content', StringType(), True),
        StructField('distribution_id', StringType(), True),
        StructField('default_search_engine', StringType(), True),
        StructField('locale', StringType(), True),
        StructField('is_active', StringType(), True),
        StructField('n_profiles', LongType(), True),
        StructField('usage_hours', DoubleType(), True),
        StructField('sum_squared_usage_hours', DoubleType(), True),
        StructField('total_uri_count', LongType(), True),
        StructField('unique_domains_count', DoubleType(), True)
    ])

    # Don't bother to filter out non-good records - they will appear
    # as 'unknown' in the output.
    # Each record becomes a (dimensions, measures) pair: the first tuple is
    # the grouping key, the second holds the values that are summed per key.
    countable = converted.map(lambda x: (
        (
            # attributes unique to a client
            x.get('channel', 'unknown'),
            x.get('geo', 'unknown'),
            "yes" if x.get('is_funnelcake', False) else "no",
            datetime.strftime(x.get('acquisition_period', date(2000, 1, 1)),
                              "%Y-%m-%d"),
            x.get('start_version', 'unknown'),
            x.get('sync_usage', 'unknown'),
            x.get('current_version', 'unknown'),
            x.get('current_week', -1),
            x.get('source', 'unknown'),
            x.get('medium', 'unknown'),
            x.get('campaign', 'unknown'),
            x.get('content', 'unknown'),
            x.get('distribution_id', 'unknown'),
            x.get('default_search_engine', 'unknown'),
            x.get('locale', 'unknown'),
            x.get('is_active', 'unknown')),
        (
            1,  # active users
            x.get('usage_hours', 0.0),
            # read as 'squared_usage_hours'; lands in the
            # 'sum_squared_usage_hours' schema column once summed
            x.get('squared_usage_hours', 0.0),
            x.get('total_uri_count', 0),
            x.get('unique_domains_count', 0.0))))

    def reduce_func(x, y):
        """Add two measure tuples element by element."""
        return tuple(map(sum, zip(x, y)))

    aggregated = countable.reduceByKey(reduce_func)

    # Flatten (key, values) back into a single row matching churn_schema.
    records_df = aggregated.map(lambda x: x[0] + x[1]).toDF(churn_schema)

    # Apply some post-processing for other aggregates
    # (i.e. unique_domains_count). This needs to be done when you want
    # something other than just a simple sum
    def average(total, n):
        """Return ``total / n`` as a float, or 0.0 when ``n`` is falsy."""
        if not n:
            return 0.0
        return float(total) / n

    average_udf = F.udf(average, DoubleType())

    # Create new derived columns and drop any unnecessary ones
    records_df = (
        records_df
        # The total number of unique domains divided by the number of profiles
        # over a set of dimensions. This should be aggregated using a weighted
        # mean, i.e. sum(unique_domains_count_per_profile * n_profiles)
        .withColumn('unique_domains_count_per_profile',
                    average_udf(F.col('unique_domains_count'),
                                F.col('n_profiles')))
        # This value is meaningless because of overlapping domains between
        # profiles
        .drop('unique_domains_count')
    )

    return records_df
def expected_search_clients_daily_data(define_dataframe_factory):
    """Build the expected rows for the search-clients-daily output.

    A dataframe factory is defined from a template of column specs, each
    passed through ``to_field``, and then called with a list of per-row
    overrides.
    """
    # template for the expected results
    # Each spec looks like (name, default value, Spark type, nullable) --
    # presumably; confirm against `to_field`.
    column_specs = [
        ("client_id", "a", StringType(), False),
        ("sample_id", "42", StringType(), False),
        ("submission_date", "20170101", StringType(), False),
        ("os", "windows", StringType(), True),
        ("channel", "release", StringType(), True),
        ("country", "DE", StringType(), True),
        ("locale", "de", StringType(), True),
        ("search_cohort", None, StringType(), True),
        ("app_version", "54.0.1", StringType(), True),
        ("distribution_id", None, StringType(), True),
        ("addon_version", "0.9.5", StringType(), False),
        ("engine", "google", StringType(), True),
        ("source", "urlbar", StringType(), True),
        ("tagged-sap", None, LongType(), True),
        ("tagged-follow-on", None, LongType(), True),
        ("tagged_sap", None, LongType(), True),
        ("tagged_follow_on", None, LongType(), True),
        ("sap", 4, LongType(), True),
        ("organic", None, LongType(), True),
        ("unknown", None, LongType(), True),
        # Roughly 2016-01-01
        ("profile_creation_date", 16801, LongType(), False),
        ("default_search_engine", "google", StringType(), False),
        (
            "default_search_engine_data_load_path",
            "jar:[app]/omni.ja!browser/google.xml",
            StringType(),
            False,
        ),
        (
            "default_search_engine_data_submission_url",
            "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b",
            StringType(),
            False,
        ),
        ("sessions_started_on_this_day", 1, LongType(), True),
        ("profile_age_in_days", 366, LongType(), True),
        ("subsession_hours_sum", 1.0, DoubleType(), True),
        ("active_addons_count_mean", 2.0, DoubleType(), True),
        ("max_concurrent_tab_count_max", 10, LongType(), True),
        ("tab_open_event_count_sum", 5, LongType(), True),
        ("active_hours_sum", 0.5, DoubleType(), True),
    ]
    factory = define_dataframe_factory([to_field(spec) for spec in column_specs])

    row_overrides = [
        {
            "client_id": "b",
            "country": "US"
        },
        # Covers 5 dupe rows and custom app_version, distribution_id rows
        {
            "app_version": "52.0.3",
            "sap": 28,
            "sessions_started_on_this_day": 7,
            "subsession_hours_sum": 7.0,
            "tab_open_event_count_sum": 35,
            "active_hours_sum": 3.5,
        },
        {
            "engine": "bing"
        },
        {
            "engine": "yahoo"
        },
        {
            "client_id": "c",
            "unknown": None,
            "sap": 0,
            "tagged-sap": None,
            "tagged-follow-on": None,
            "tagged_sap": None,
            "tagged_follow_on": None,
            "source": None,
            "engine": None,
        },
    ]
    return factory(row_overrides)
# Boilerplate for generating example main_summary tables
def generate_search_count(engine="google", source="urlbar", count=4):
    """Return a search-count record with the given engine, source and count."""
    return dict(engine=engine, source=source, count=count)


# Array-of-struct type describing one add-on entry; each triple below is
# (field name, Spark type, nullable).
addons_type = ArrayType(
    StructType([
        StructField(field_name, field_type, nullable)
        for field_name, field_type, nullable in (
            ("addon_id", StringType(), False),
            ("blocklisted", BooleanType(), True),
            ("name", StringType(), True),
            ("user_disabled", BooleanType(), True),
            ("app_disabled", BooleanType(), True),
            ("version", StringType(), True),
            ("scope", LongType(), True),
            ("type", StringType(), True),
            ("foreign_install", BooleanType(), True),
            ("has_binary_components", BooleanType(), True),
            ("install_day", LongType(), True),
            ("update_day", LongType(), True),
            ("signed_state", LongType(), True),
            ("is_system", BooleanType(), True),
            ("is_web_extension", BooleanType(), True),
            ("multiprocess_compatible", BooleanType(), True),
        )
    ]))


def generate_addon(addon_id, name, version):
    """Return an add-on record carrying only its id, name and version."""
    return dict(addon_id=addon_id, name=name, version=version)
def second(self) -> ks.Series:
    """
    The seconds of the datetime.
    """
    # Apply Spark's `second` function through the accessor wrapper, declaring
    # the result as LongType, then restore this accessor's name on the result.
    seconds = _wrap_accessor_spark(self, F.second, LongType())
    return seconds.alias(self.name)
# 2. Create DataFrames
# 2.1. Create from a variable
stringCSVRDD = spark.sparkContext.parallelize([(123, "Katie", 19, "brown"),
                                               (234, "Michael", 22, "green"),
                                               (345, "Simone", 23, "blue")])
# Specify the schema: StructField(name, dataType, nullable)
# where:
#   name: the name of the field,
#   dataType: the data type of the field,
#   nullable: whether the field's values may be null
from pyspark.sql.types import StructType, StructField, LongType, StringType  # import the types
schema = StructType([
    StructField("id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("age", LongType(), True),
    StructField("eyeColor", StringType(), True)
])
# Apply the schema to the RDD and create the DataFrame
swimmers = spark.createDataFrame(stringCSVRDD, schema)
# registerTempTable has been deprecated since Spark 2.0;
# createOrReplaceTempView is its documented replacement and registers the
# same temporary view name.
swimmers.createOrReplaceTempView("swimmers")
# Print the number of rows in the DataFrame
print(swimmers.count())

# 2.2. Create from a variable
# Create the DataFrame using automatic type inference
data = [(123, "Katie", 19, "brown"),
        (234, "Michael", 22, "green"),
        (345, "Simone", 23, "blue")]
def with_idx(sdf):
    """Return ``sdf`` with an extra trailing ``idx`` column.

    The new column is a non-nullable LongType holding the position that
    ``RDD.zipWithIndex`` assigns to each row.
    """
    idx_field = StructField("idx", LongType(), False)
    indexed_schema = StructType(sdf.schema.fields + [idx_field])

    def _append_index(row_and_index):
        # zipWithIndex yields (row, index) pairs; append the index to the row.
        row, index = row_and_index
        return row + (index,)

    indexed_rdd = sdf.rdd.zipWithIndex().map(_append_index)
    return indexed_rdd.toDF(schema=indexed_schema)
def analyze(spark):
    """Preprocess the student/course score data and write per-faculty splits.

    Steps, in order:

    1. Read the course-code mapping CSV and the main input CSV.
    2. Fit a ``FullCourseMappingEstimator`` on the mapping and use it to
       replace ``F_MAMH`` with the mapped ``F_MAMH_new`` code.
    3. Apply ``FilterDuplicateUserItemGetMaxTransformer`` and two
       ``FilterCountTransformer`` stages (courses with limit 50, faculties
       with limit 500), printing row and distinct counts along the way.
    4. Keep rows with ``F_NIENKHOA >= 14``.
    5. For every faculty, filter courses again (limit 15), split the distinct
       users 60/20/20 into train/validation/test and write each subset as a
       single CSV under ``preprocess_output_namhoc/<faculty>/``.

    spark: the SparkSession used for all reads and DataFrame creation.
    Returns nothing; the results are the printed counts and written files.
    """
    input_file = "data/df.csv"
    mapping_file = "data/preprocess_test/mapping/mapping.csv"
    mapping_df = spark.read \
        .option("header", "true") \
        .option("treatEmptyValuesAsNulls", "true") \
        .option("inferSchema", "true") \
        .option("charset", "UTF-8") \
        .csv(mapping_file)

    # Where a course code can be cast to an integer, `+ lit(".0")` is applied.
    # NOTE(review): `+` on Spark columns is arithmetic addition, not string
    # concatenation -- confirm this yields the intended "NNN.0" style codes.
    fix_mapping_df = mapping_df\
        .withColumn("F_MAMH", when(col("F_MAMH").cast(IntegerType()).isNotNull(), col("F_MAMH") + lit(".0")).otherwise(col("F_MAMH")))\
        .withColumn("F_MAMH_new", when(col("F_MAMH_new").cast(IntegerType()).isNotNull(), col("F_MAMH_new").cast(IntegerType()) + lit(".0")).otherwise(col("F_MAMH_new")))

    # original input file
    df = spark.read \
        .option("header", "true") \
        .option("treatEmptyValuesAsNulls", "true") \
        .option("inferSchema", "true") \
        .option("charset", "UTF-8") \
        .csv(input_file)
    df = df.select("MASV1", "F_MAMH", "F_MAKH", "TKET", "F_NIENKHOA")

    # Baseline statistics before any mapping or filtering.
    print("Original df count: {}".format(str(df.count())))
    print("Original df distinct SV_MH distinct count: {}".format(
        str(df.select("MASV1", "F_MAMH", "F_MAKH").distinct().count())))
    print("Original df distinct F_MAMH count: {}".format(
        str(df.select("F_MAMH").distinct().count())))
    print("Original df distinct F_MAKH count: {}".format(
        str(df.select("F_MAKH").distinct().count())))

    # Pipeline stages. The exact meaning of `limit` is defined by
    # FilterCountTransformer; the log messages below describe these as
    # "MH < 50" and "KH < 500" filters.
    course_mapping = FullCourseMappingEstimator()\
        .setItemCol("F_MAMH")\
        .setOutputCol("F_MAMH_new")\
        .fit(fix_mapping_df)
    course_filter = FilterCountTransformer(limit=50)\
        .setItemCol("F_MAMH")
    faculty_filter = FilterCountTransformer(limit=500)\
        .setItemCol("F_MAKH")
    get_max_filter = FilterDuplicateUserItemGetMaxTransformer()\
        .setUserCol("MASV1")\
        .setItemCol("F_MAMH")\
        .setValueCol("TKET")

    # Overwrite F_MAMH with the mapped code and keep the working columns.
    mapping_output_df = course_mapping.transform(df).withColumn(
        "F_MAMH", col("F_MAMH_new")).select("MASV1", "F_MAMH", "F_MAKH",
                                            "TKET", "F_NIENKHOA")
    # Rebuild the DataFrame from its own RDD and schema.
    mapping_output_df = spark.createDataFrame(mapping_output_df.rdd,
                                              mapping_output_df.schema)
    remove_duplicate_df = get_max_filter.transform(mapping_output_df)
    course_filter_output_df = course_filter.transform(remove_duplicate_df)
    faculty_filter_output_df = faculty_filter.transform(
        course_filter_output_df)

    # Statistics after each stage.
    print("After mapping MH count: {}".format(str(mapping_output_df.count())))
    print("After mapping SV_MH distinct count: {}".format(
        str(
            mapping_output_df.select("MASV1", "F_MAMH",
                                     "F_MAKH").distinct().count())))
    print("After remove duplicate MH count: {}".format(
        str(remove_duplicate_df.count())))
    print("After remove duplicate SV_MH distinct count: {}".format(
        str(
            remove_duplicate_df.select("MASV1", "F_MAMH",
                                       "F_MAKH").distinct().count())))
    print("After filter MH < 50 data F_MAMH distinct count: {}".format(
        str(course_filter_output_df.select("F_MAMH").distinct().count())))
    print("After filter MH < 50 data SV_MH distinct count: {}".format(
        str(
            course_filter_output_df.select("MASV1", "F_MAMH",
                                           "F_MAKH").distinct().count())))
    print("After filter KH < 500 data F_MAKH distinct count: {}".format(
        str(faculty_filter_output_df.select("F_MAKH").distinct().count())))
    print("After filter KH < 500 data SV_MH distinct count: {}".format(
        str(
            faculty_filter_output_df.select("MASV1", "F_MAMH",
                                            "F_MAKH").distinct().count())))

    # Keep only rows whose F_NIENKHOA is at least 14, then drop that column.
    faculty_filter_output_df = faculty_filter_output_df.filter(col("F_NIENKHOA") >= 14)\
        .select("MASV1", "F_MAMH", "F_MAKH", "TKET")

    # split major
    # Collect the distinct faculty codes to the driver.
    list_faculty = faculty_filter_output_df.select(
        "F_MAKH").distinct().rdd.flatMap(lambda x: x).collect()
    output_path = "preprocess_output_namhoc"
    print(list_faculty)
    for faculty in list_faculty:
        # Per-faculty course filter with a smaller limit.
        course_filter_faculty = FilterCountTransformer(limit=15) \
            .setItemCol("F_MAMH")
        faculty_filter_df = course_filter_faculty.transform(
            faculty_filter_output_df.filter(col("F_MAKH") == faculty))
        faculty_filter_df = spark.createDataFrame(faculty_filter_df.rdd,
                                                  faculty_filter_df.schema)
        # Distinct user ids of this faculty, collected to the driver.
        list_user = faculty_filter_df.select("MASV1").distinct().rdd.flatMap(
            lambda x: x).collect()

        # Split users (not rows): 20% validation first, then 25% of the
        # remaining 80% as test, i.e. 60/20/20 train/validation/test.
        train, validation = train_test_split(np.array(list_user),
                                             test_size=0.2,
                                             random_state=1)
        train, test = train_test_split(np.array(train),
                                       test_size=0.25,
                                       random_state=1)

        # Joining on a one-column DataFrame of user ids keeps only the rows
        # belonging to the users of each subset.
        user_schema = StructType([StructField("MASV1", LongType())])
        train_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in train.tolist()],
                                  schema=user_schema), ["MASV1"])
        validation_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in validation.tolist()],
                                  schema=user_schema), ["MASV1"])
        test_data_df = faculty_filter_df.join(
            spark.createDataFrame([[x] for x in test.tolist()],
                                  schema=user_schema), ["MASV1"])

        # coalesce(1) makes each subset a single CSV part file.
        train_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "train"))
        validation_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "validation"))
        test_data_df.coalesce(1).write.option("header", "true").option(
            "charset", "UTF-8").csv("{}/{}/{}".format(output_path, faculty,
                                                      "test"))
from pyspark.sql.types import ( ArrayType, BinaryType, DoubleType, StructType, StructField, StringType, IntegerType, LongType, ) ARCHIVE_ORG_SCHEMA = StructType( [ StructField("created", LongType(), True), StructField("d1", StringType(), True), StructField("d2", StringType(), True), StructField("dir", StringType(), True), StructField( "files", ArrayType( StructType( [ StructField("bitrate", StringType(), True), StructField("btih", StringType(), True), StructField("crc32", StringType(), True), StructField("format", StringType(), True), StructField("height", StringType(), True), StructField("length", StringType(), True), StructField("license", StringType(), True), StructField("md5", StringType(), True), StructField("mtime", StringType(), True),
class CCSparkJob:
    # Base class for Spark jobs that read a list of WARC/WAT/WET paths,
    # call process_record() for every record and store the per-key sums of
    # the yielded (key, value) pairs as a Parquet table.
    #
    # NOTE: this description is a comment rather than a class docstring on
    # purpose: parse_arguments() appends self.__doc__ to the command-line
    # description, so a docstring here would change the --help output.

    name = 'CCSparkJob'

    # Schema of the table written by run_job().
    output_schema = StructType([
        StructField("key", StringType(), True),
        StructField("val", LongType(), True)
    ])

    # When False, ArchiveIterator is asked not to parse the records.
    warc_parse_http_header = True

    args = None
    # Spark accumulators, created in run().
    records_processed = None
    warc_input_processed = None
    warc_input_failed = None

    log_level = 'INFO'
    logging.basicConfig(level=log_level, format=LOGGING_FORMAT)

    num_input_partitions = 400
    num_output_partitions = 10

    def parse_arguments(self):
        """ Returns the parsed arguments from the command line """

        description = self.name
        if self.__doc__ is not None:
            description += " - "
            description += self.__doc__
        arg_parser = argparse.ArgumentParser(description=description)

        arg_parser.add_argument("input",
                                help="Path to file listing input paths")
        arg_parser.add_argument("output",
                                help="Name of output table"
                                " (saved in spark.sql.warehouse.dir)")

        arg_parser.add_argument("--num_input_partitions", type=int,
                                default=self.num_input_partitions,
                                help="Number of input splits/partitions")
        arg_parser.add_argument("--num_output_partitions", type=int,
                                default=self.num_output_partitions,
                                help="Number of output partitions")

        # The two literals are concatenated, so the first one needs the
        # trailing space.
        arg_parser.add_argument("--local_temp_dir", default=None,
                                help="Local temporary directory, used to "
                                "buffer content from S3")

        arg_parser.add_argument("--log_level", default=self.log_level,
                                help="Logging level")

        self.add_arguments(arg_parser)
        args = arg_parser.parse_args()
        self.validate_arguments(args)
        self.init_logging(args.log_level)

        return args

    def add_arguments(self, parser):
        """Hook for subclasses to register additional arguments."""
        pass

    def validate_arguments(self, args):
        """Hook for subclasses to check the parsed arguments."""
        return True

    def init_logging(self, level=None):
        """Apply `level` (or the current self.log_level) to the job logger."""
        if level is None:
            level = self.log_level
        else:
            if isinstance(level, str):
                # Level names are upper-case; accept e.g. "debug" as well.
                level = level.upper()
            self.log_level = level
        logging.basicConfig(level=level, format=LOGGING_FORMAT)
        # basicConfig() does nothing once the root logger has handlers, and
        # the class body has already configured it. Set the level on the job
        # logger itself so that --log_level takes effect.
        logging.getLogger(self.name).setLevel(level)

    def get_logger(self, spark_context=None):
        """Get logger from SparkContext or (if None) from logging module"""
        if spark_context is None:
            return logging.getLogger(self.name)
        return spark_context._jvm.org.apache.log4j.LogManager \
            .getLogger(self.name)

    def run(self):
        """Parse arguments, set up Spark, run the job and stop Spark."""
        self.args = self.parse_arguments()

        conf = SparkConf().setAll((
            ("spark.task.maxFailures", "10"),
            ("spark.locality.wait", "20s"),
            ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ))
        sc = SparkContext(appName=self.name, conf=conf)
        try:
            sqlc = SQLContext(sparkContext=sc)

            self.records_processed = sc.accumulator(0)
            self.warc_input_processed = sc.accumulator(0)
            self.warc_input_failed = sc.accumulator(0)

            self.run_job(sc, sqlc)
        finally:
            # Stop the context even when the job raises.
            sc.stop()

    def log_aggregator(self, sc, agg, descr):
        """Log `descr` formatted with the current value of accumulator `agg`."""
        self.get_logger(sc).info(descr.format(agg.value))

    def log_aggregators(self, sc):
        """Log the values of the three job accumulators."""
        self.log_aggregator(sc, self.warc_input_processed,
                            'WARC input files processed = {}')
        self.log_aggregator(sc, self.warc_input_failed,
                            'WARC input files failed = {}')
        self.log_aggregator(sc, self.records_processed,
                            'records processed = {}')

    @staticmethod
    def reduce_by_key_func(a, b):
        """Combine two values of the same key (sum)."""
        return a + b

    def run_job(self, sc, sqlc):
        """Process all listed inputs and save the reduced result as a table."""
        input_data = sc.textFile(self.args.input,
                                 minPartitions=self.args.num_input_partitions)

        output = input_data.mapPartitionsWithIndex(self.process_warcs) \
            .reduceByKey(self.reduce_by_key_func)

        sqlc.createDataFrame(output, schema=self.output_schema) \
            .coalesce(self.args.num_output_partitions) \
            .write \
            .format("parquet") \
            .saveAsTable(self.args.output)

        # Reports the input-file counters in addition to the record count.
        self.log_aggregators(sc)

    def process_warcs(self, id_, iterator):
        """Open every URI of a partition and yield the results of its records.

        `iterator` yields input URIs: 's3://' objects are downloaded into a
        temporary file, 'hdfs://' is not implemented, anything else is opened
        as a local file relative to this module's directory.
        """
        s3pattern = re.compile('^s3://([^/]+)/(.+)')
        base_dir = os.path.abspath(os.path.dirname(__file__))

        # S3 client (not thread-safe, initialize outside parallelized loop)
        no_sign_request = botocore.client.Config(
            signature_version=botocore.UNSIGNED)
        s3client = boto3.client('s3', config=no_sign_request)

        for uri in iterator:
            self.warc_input_processed.add(1)
            if uri.startswith('s3://'):
                self.get_logger().info('Reading from S3 {}'.format(uri))
                s3match = s3pattern.match(uri)
                if s3match is None:
                    self.get_logger().error("Invalid S3 URI: " + uri)
                    continue
                bucketname = s3match.group(1)
                path = s3match.group(2)
                warctemp = TemporaryFile(mode='w+b',
                                         dir=self.args.local_temp_dir)
                try:
                    s3client.download_fileobj(bucketname, path, warctemp)
                except botocore.client.ClientError as exception:
                    self.get_logger().error(
                        'Failed to download {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    warctemp.close()
                    continue
                # Rewind so the archive is read from its first byte.
                warctemp.seek(0)
                stream = warctemp
            elif uri.startswith('hdfs://'):
                self.get_logger().error("HDFS input not implemented: " + uri)
                continue
            else:
                self.get_logger().info('Reading local stream {}'.format(uri))
                if uri.startswith('file:'):
                    uri = uri[5:]
                uri = os.path.join(base_dir, uri)
                try:
                    stream = open(uri, 'rb')
                except IOError as exception:
                    self.get_logger().error(
                        'Failed to open {}: {}'.format(uri, exception))
                    self.warc_input_failed.add(1)
                    continue

            no_parse = (not self.warc_parse_http_header)
            try:
                for record in ArchiveIterator(stream,
                                              no_record_parse=no_parse):
                    for res in self.process_record(record):
                        yield res
                    self.records_processed.add(1)
            except ArchiveLoadFailed as exception:
                self.warc_input_failed.add(1)
                self.get_logger().error(
                    'Invalid WARC: {} - {}'.format(uri, exception))
            finally:
                stream.close()

    def process_record(self, record):
        """Yield (key, value) pairs for one record; must be overridden."""
        raise NotImplementedError('Processing record needs to be customized')

    @staticmethod
    def is_wet_text_record(record):
        """Return true if WARC record is a WET text/plain record"""
        return (record.rec_type == 'conversion' and
                record.content_type == 'text/plain')

    @staticmethod
    def is_wat_json_record(record):
        """Return true if WARC record is a WAT record"""
        return (record.rec_type == 'metadata' and
                record.content_type == 'application/json')
def _attach_distributed_sequence_column(sdf, column_name):
    """
    Return `sdf` with a leading zero-based, gap-free sequence column.

    Rows are counted per Spark partition, every partition receives the total
    of the partitions sorted before it as its offset, and rows are numbered
    inside their own partition. The new column is named `column_name` and is
    followed by all original columns.

    >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
    >>> sdf = InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
    >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
    +--------+---+
    |sequence|  0|
    +--------+---+
    |       0|  a|
    |       1|  b|
    |       2|  c|
    +--------+---+
    """
    original_columns = [scol_for(sdf, name) for name in sdf.columns]

    partition_col = verify_temp_column_name(sdf, "__spark_partition_id__")
    offset_col = verify_temp_column_name(sdf, "__offset__")
    rownum_col = verify_temp_column_name(sdf, "__row_number__")

    # Step 1: tag every row with its partition id and count rows per id.
    sdf = sdf.withColumn(partition_col, F.spark_partition_id())

    # Checkpoint the DataFrame to fix the partition ID.
    sdf = sdf.localCheckpoint(eager=False)

    partition_sizes = sdf.groupby(
        sdf[partition_col].alias("key")).count().collect()

    # Step 2: running total of the counts in partition-id order. The order
    # itself is irrelevant; it only has to be one fixed sequence.
    ordered = sorted(
        ((row["key"], row["count"]) for row in partition_sizes),
        key=lambda pair: pair[0],
    )
    partition_keys = [key for key, _ in ordered]
    running_totals = [0] + list(accumulate(size for _, size in ordered))
    # zip() drops the surplus grand total at the end of running_totals.
    offset_by_partition = dict(zip(partition_keys, running_totals))

    # Step 3: attach the offset of its partition to every row.
    @pandas_udf(LongType(), PandasUDFType.SCALAR)
    def offset(partition_ids):
        # The first id of the batch is used for the whole batch.
        base = offset_by_partition[partition_ids.iloc[0]]
        return pd.Series(base).repeat(len(partition_ids))

    sdf = sdf.withColumn(offset_col, offset(partition_col))

    # Step 4: number the rows inside every partition.
    within_partition = Window.partitionBy(partition_col).orderBy(
        F.monotonically_increasing_id())
    sdf = sdf.withColumn(rownum_col, F.row_number().over(within_partition))

    # Step 5: offset + row number, shifted by one so the sequence starts at 0.
    sequence = (sdf[offset_col] + sdf[rownum_col] - 1).alias(column_name)
    return sdf.select(sequence, *original_columns)
def expected_search_clients_daily_data(define_dataframe_factory):
    """Build the expected search_clients_daily rows.

    A factory is created from the template fields below and then called with
    one dict of overrides per expected row.
    """
    # Template for the expected results; each tuple is handed to `to_field`
    # (presumably name, default value, Spark type, nullable -- confirm
    # against `to_field`).
    template_fields = [
        ("client_id", "a", StringType(), False),
        ("sample_id", "42", StringType(), False),
        ("submission_date", "20170101", StringType(), False),
        ("os", "windows", StringType(), True),
        ("channel", "release", StringType(), True),
        ("country", "DE", StringType(), True),
        ("locale", "de", StringType(), True),
        ("search_cohort", None, StringType(), True),
        ("app_version", "54.0.1", StringType(), True),
        ("distribution_id", None, StringType(), True),
        ("addon_version", "0.9.5", StringType(), False),
        ("engine", "google", StringType(), True),
        ("source", "urlbar", StringType(), True),
        ("tagged-sap", None, LongType(), True),
        ("tagged-follow-on", None, LongType(), True),
        ("tagged_sap", None, LongType(), True),
        ("tagged_follow_on", None, LongType(), True),
        ("sap", 4, LongType(), True),
        # Roughly 2016-01-01
        ("profile_creation_date", 16801, LongType(), False),
        ("default_search_engine", "google", StringType(), False),
        ("default_search_engine_data_load_path",
         "jar:[app]/omni.ja!browser/google.xml", StringType(), False),
        ("default_search_engine_data_submission_url",
         "https://www.google.com/search?q=&ie=utf-8&oe=utf-8&client=firefox-b",
         StringType(), False),
        ("sessions_started_on_this_day", 1, LongType(), True),
        ("profile_age_in_days", 366, LongType(), True),
        ("subsession_hours_sum", 1.0, DoubleType(), True),
        ("active_addons_count_mean", 2.0, DoubleType(), True),
        ("max_concurrent_tab_count_max", 10, LongType(), True),
        ("tab_open_event_count_sum", 5, LongType(), True),
        ("active_hours_sum", 0.5, DoubleType(), True),
    ]
    factory = define_dataframe_factory(map(to_field, template_fields))

    expected_rows = [
        {"client_id": "b", "country": "US"},
        # Covers 5 dupe rows and custom app_version, distribution_id rows
        {
            "app_version": "52.0.3",
            "sap": 28,
            "sessions_started_on_this_day": 7,
            "subsession_hours_sum": 7.0,
            "tab_open_event_count_sum": 35,
            "active_hours_sum": 3.5,
        },
        {"engine": "bing"},
        {"engine": "yahoo"},
        {
            "client_id": "c",
            "sap": 0,
            "tagged-sap": None,
            "tagged-follow-on": None,
            "tagged_sap": None,
            "tagged_follow_on": None,
            "source": None,
            "engine": None,
        },
    ]
    return factory(expected_rows)
"org.postgresql:postgresql:42.2.19") \ .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:log4j.properties " "-Dspark.yarn.app.container.log.dir=app-logs " "-Dlogfile.name=hello-spark") \ .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\ .config("spark.sql.catalog.spark_catalog", "spark.sql.catalog.spark_catalog")\ .getOrCreate() # conf_out = spark.sparkContext.getConf() # print(conf_out.toDebugString()) logger = Log4j(spark) schema = StructType([ StructField("InvoiceNumber", StringType()), StructField("CreatedTime", LongType()), StructField("StoreID", StringType()), StructField("PosID", StringType()), StructField("CashierID", StringType()), StructField("CustomerType", StringType()), StructField("CustomerCardNo", StringType()), StructField("TotalAmount", DoubleType()), StructField("NumberOfItems", IntegerType()), StructField("PaymentMethod", StringType()), StructField("CGST", DoubleType()), StructField("SGST", DoubleType()), StructField("CESS", DoubleType()), StructField("DeliveryType", StringType()), StructField( "DeliveryAddress", StructType([
def process_log_data(spark, input_data_path):
    """
    Read the JSON event log and derive the log, users, time and listen tables.

    Parameters:
        spark: Spark session used to read the data and run the SQL queries.
        input_data_path: location of the JSON log data (local path or S3).

    Returns:
        log_df: 'NextSong' events with the extra columns `ts_converted`
            (timestamp) and `registration_converted` (long).
        users_table: distinct user_id, first_name, last_name, gender, level.
        time_table: distinct start_time with day, month, year, hour, minute,
            second, week and weekday.
        user_listen: distinct first_name with the paid (puCount) and free
            (fuCount) event counts of users that appear under both levels.

    Note: everything here is a lazy Spark transformation, so the elapsed
    times printed below mostly measure how long it takes to build the plans.
    """
    pl_start = time()
    print('Starting to process log data')

    # read log data file
    log_schema = StructType([
        StructField("artist", StringType()),
        StructField("auth", StringType()),
        StructField("firstName", StringType()),
        StructField("gender", StringType()),
        StructField("itemInSession", LongType()),
        StructField("lastName", StringType()),
        StructField("length", DoubleType()),
        StructField("level", StringType()),
        StructField("location", StringType()),
        StructField("method", StringType()),
        StructField("page", StringType()),
        StructField("registration", DoubleType()),
        StructField("sessionId", LongType()),
        StructField("song", StringType()),
        StructField("status", StringType()),
        StructField("ts", StringType()),
        StructField("userAgent", StringType()),
        StructField("userId", StringType())
    ])
    log_df = spark.read.json(input_data_path, schema=log_schema)

    # Number of log events per user at each level (counted on the full log,
    # before the NextSong filter below).
    paid_users = log_df.select(['userId', 'level']) \
        .filter(log_df['level'] == 'paid')
    paid_users = paid_users.groupby(['userId']).count()
    free_users = log_df.select(['userId', 'level']) \
        .filter(log_df['level'] == 'free')
    free_users = free_users.groupby(['userId']).count()

    paid_users.createOrReplaceTempView('paid_users')
    free_users.createOrReplaceTempView('free_users')
    user_listen = spark.sql("""
        select a.userId, a.count puCount, b.count fuCount
        from paid_users a
        join free_users b on a.userId = b.userId
        where a.userId != ''
    """)

    # Filter only column page with value "NextSong". The filter stays a
    # distributed transformation; the rows are not collected to the driver.
    log_df = log_df.filter(log_df.page == 'NextSong')

    # Convert ts (epoch milliseconds stored as string) to a timestamp.
    # A missing ts stays null instead of failing the task.
    convert_ts = udf(
        lambda x: None if x is None
        else datetime.datetime.fromtimestamp(float(x) / 1000.0),
        TimestampType())
    log_df = log_df.withColumn("ts_converted", convert_ts(log_df.ts))

    # Convert registration from double to long
    log_df = log_df.withColumn("registration_converted",
                               log_df.registration.cast(LongType()))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Read & Transformation', round(pl_et, 2)))

    print('Creating users table')
    temp_start = time()

    # extract columns for users table
    # creating users table with columns user_id, first_name, last_name,
    # gender, level
    users_table = log_df.select(
        ['userId', 'firstName', 'lastName', 'gender', 'level'])\
        .withColumnRenamed('userId', 'user_id')\
        .withColumnRenamed('firstName', 'first_name')\
        .withColumnRenamed('lastName', 'last_name').dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating users table', round(pl_et, 2)))

    print('Creating user_listen table')
    temp_start = time()

    # Attach the first name to the per-user paid/free counts.
    user_listen.createOrReplaceTempView('user_listen')
    users_table.createOrReplaceTempView('users')
    user_listen = spark.sql("""
        select distinct b.first_name, a.puCount, a.fuCount
        from user_listen a
        join users b on a.userId = b.user_id
    """)

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating user_listen table', round(pl_et, 2)))

    # extract columns to create time table
    # Creating time table with columns start_time, day, month, year, hour,
    # minute, second, week, weekday
    print('Creating time table')
    temp_start = time()
    time_table = log_df.select(['ts_converted'])\
        .withColumnRenamed('ts_converted', 'start_time')

    time_table = time_table.withColumn('day', F.dayofmonth('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('hour', F.hour('start_time')) \
        .withColumn('minute', F.minute('start_time')) \
        .withColumn('second', F.second('start_time')) \
        .withColumn('week', F.weekofyear('start_time')) \
        .withColumn('weekday', F.dayofweek('start_time')).dropDuplicates()

    pl_et = time() - temp_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Creating time table', round(pl_et, 2)))

    pl_et = time() - pl_start
    print("=== {} Total Elapsed time is {} sec\n".format(
        'Process log files : Total', round(pl_et, 2)))

    return log_df, users_table, time_table, user_listen
"r", encoding='ISO-8859-1', errors='ignore') as f: for line in f: fields = line.split('|') movieNames[int(fields[0])] = fields[1] return movieNames spark = get_spark_session('ALSExample') moviesSchema = StructType([ \ StructField("userID", IntegerType(), True), \ StructField("movieID", IntegerType(), True), \ StructField("rating", IntegerType(), True), \ StructField("timestamp", LongType(), True)]) names = loadMovieNames() ratings = spark.read.option("sep", "\t").schema(moviesSchema) \ .csv(f"{SPARK_DATA_PATH}/ml-100k/u.data") print("Training recommendation model...") als = ALS().setMaxIter(5).setRegParam(0.01).setUserCol("userID").setItemCol("movieID") \ .setRatingCol("rating") model = als.fit(ratings) # Manually construct a dataframe of the user ID's we want recs for userID = int(sys.argv[1])
"""This module contains the schema of the song and log data on S3.""" from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, LongType song_schema = (StructType([ StructField('artist_id', StringType(), True), StructField('artist_latitude', DoubleType(), True), StructField('artist_location', StringType(), True), StructField('artist_longitude', DoubleType(), True), StructField('artist_name', StringType(), True), StructField('duration', DoubleType(), True), StructField('num_songs', LongType(), True), StructField('song_id', StringType(), True), StructField('title', StringType(), True), StructField('year', LongType(), True) ])) log_schema = (StructType([ StructField('artist', StringType(), True), StructField('auth', StringType(), True), StructField('firstName', StringType(), True), StructField('gender', StringType(), True), StructField('itemInSession', LongType(), True), StructField('lastName', StringType(), True), StructField('length', DoubleType(), True), StructField('level', StringType(), True), StructField('location', StringType(), True), StructField('method', StringType(), True), StructField('page', StringType(), True), StructField('registration', DoubleType(), True), StructField('sessionId', LongType(), True),
# A one-column DataFrame holding the numbers 0..999.
df = spark.range(1000).toDF("nums")

spark.range(5).collect()

# Column arithmetic: a transformation, nothing is computed here.
df.select(df["nums"] + 10)

# Read the JSON file and let Spark infer the schema.
df = spark.read.format("json").load("data/flight-data/json/2015-summary.json")

dfS = spark.read.format("json").load(
    "data/flight-data/json/2015-summary.json").schema

dfS

df.printSchema()

from pyspark.sql.types import StructField, StructType, StringType, LongType

# The same file read with an explicit schema instead of inference.
myManualSchema = StructType([
    StructField("DEST_COUNTRY_NAME", StringType(), True),
    StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
    StructField("count", LongType(), False, metadata={"hello": "world"}),
])
df = spark.read.format("json").schema(myManualSchema)\
    .load("data/flight-data/json/2015-summary.json")

df

from pyspark.sql.functions import col, column

# Reference a column of this specific DataFrame. `df.col("count")` is the
# Scala API; in PySpark it raises AttributeError because attribute access
# looks for a column literally named "col".
df["count"]

from pyspark.sql import Row

myManualSchema = StructType([
    StructField("id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("rank", LongType(), False)
])
from util import get_catlog from pyspark.sql.functions import * from pyspark.sql.types import StringType, BooleanType, FloatType, IntegerType, LongType, DateType, TimestampType, Row, \ StructField, StructType from config import * from pyspark.sql import DataFrame from pyspark import SparkConf, SparkContext, HiveContext, RDD from transformData import USER_TITLE, exception, logger, execute_func import sys TYPE_DICT = { "string": StringType(), "boolean": BooleanType(), "float": FloatType(), "int": IntegerType(), "long": LongType(), "date": DateType(), "datetime": TimestampType() } TITTLE = USER_TITLE CONNECT_TABLE_CATELOG = { "table": { "namespace": "default", "name": "ORG_CONNECT_TAB" }, "rowkey": "", "domain": "", "columns": { "org_id": { "cf": "rowkey",
def _select_rows(self, rows_sel):
    """
    Translate a row selector into a Spark filter condition.

    `rows_sel` may be a boolean Series, a slice, a list-like of index values,
    or a scalar / tuple of index values.

    Returns a 3-tuple whose first element is the Spark condition, or None
    when every row is selected. The second element is always None in this
    method. The third element is None except for the scalar/tuple case,
    where it is the number of index levels minus the number of given values.

    Raises KeyError when a slice bound is missing from the index and the
    index is neither monotonically increasing nor decreasing, and
    SparkPandasIndexingError when a tuple has more values than index levels.
    """
    from databricks.koalas.series import Series
    if isinstance(rows_sel, Series):
        # A boolean Series is used directly as the filter condition.
        assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
        return rows_sel._scol, None, None
    elif isinstance(rows_sel, slice):
        assert len(self._internal.index_columns) > 0
        if rows_sel.step is not None:
            LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            return None, None, None
        elif len(self._internal.index_columns) == 1:
            sdf = self._internal.sdf
            index = self._kdf_or_kser.index
            index_column = index.to_series()
            index_data_type = index_column.spark_type
            start = rows_sel.start
            stop = rows_sel.stop
            # Fetch the natural-order value of the rows whose index equals
            # start or stop, so the slice can be expressed as a range over
            # the natural order. This runs a Spark job (collect).
            start_and_stop = (sdf.select(
                index_column._scol, NATURAL_ORDER_COLUMN_NAME
            ).where(
                (index_column._scol == F.lit(start).cast(index_data_type)) |
                (index_column._scol == F.lit(stop).cast(index_data_type))
            ).collect())
            # From here on start/stop hold natural-order values (or None when
            # the bound was not found): the first match for start, the last
            # match for stop.
            start = [row[1] for row in start_and_stop if row[0] == start]
            start = start[0] if len(start) > 0 else None
            stop = [row[1] for row in start_and_stop if row[0] == stop]
            stop = stop[-1] if len(stop) > 0 else None
            cond = []
            if start is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                        LongType()))
            if stop is not None:
                cond.append(
                    F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                        LongType()))
            # A requested bound that does not exist in the index can only be
            # honoured by comparing index values, which needs a monotonic
            # index; if the index is neither increasing nor decreasing,
            # raise KeyError.
            if ((start is None and rows_sel.start is not None) or
                    (stop is None and rows_sel.stop is not None)):
                # Second Spark job: is the index monotonically increasing /
                # decreasing? Null flags are treated as True.
                inc, dec = sdf.select(
                    index_column._is_monotonic()._scol.alias('__increasing__'),
                    index_column._is_monotonic_decreasing()._scol.alias('__decreasing__')) \
                    .select(F.min(F.coalesce('__increasing__', F.lit(True))),
                            F.min(F.coalesce('__decreasing__', F.lit(True)))).first()
                if start is None and rows_sel.start is not None:
                    start = rows_sel.start
                    if inc is not False:
                        cond.append(index_column._scol >= F.lit(
                            start).cast(index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol <= F.lit(
                            start).cast(index_data_type))
                    else:
                        raise KeyError(rows_sel.start)
                if stop is None and rows_sel.stop is not None:
                    stop = rows_sel.stop
                    if inc is not False:
                        cond.append(index_column._scol <= F.lit(stop).cast(
                            index_data_type))
                    elif dec is not False:
                        cond.append(index_column._scol >= F.lit(stop).cast(
                            index_data_type))
                    else:
                        raise KeyError(rows_sel.stop)
            # All collected bounds must hold at the same time.
            if len(cond) > 0:
                return reduce(lambda x, y: x & y, cond), None, None
        else:
            LocIndexer._raiseNotImplemented(
                "Cannot use slice for MultiIndex with Spark.")
    elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
        rows_sel = list(rows_sel)
        if len(rows_sel) == 0:
            # An empty list selects no rows.
            return F.lit(False), None, None
        elif len(self._internal.index_columns) == 1:
            index_column = self._kdf_or_kser.index.to_series()
            index_data_type = index_column.spark_type
            if len(rows_sel) == 1:
                return (index_column._scol == F.lit(
                    rows_sel[0]).cast(index_data_type), None, None)
            else:
                return (index_column._scol.isin([
                    F.lit(r).cast(index_data_type) for r in rows_sel
                ]), None, None)
        else:
            LocIndexer._raiseNotImplemented(
                "Cannot select with MultiIndex with Spark.")
    else:
        # Scalar or tuple: compare the leading index levels with the values.
        if not isinstance(rows_sel, tuple):
            rows_sel = (rows_sel, )
        if len(rows_sel) > len(self._internal.index_map):
            raise SparkPandasIndexingError('Too many indexers')
        rows = [
            scol == value
            for scol, value in zip(self._internal.index_scols, rows_sel)
        ]
        return (reduce(lambda x, y: x & y, rows), None,
                len(self._internal.index_map) - len(rows_sel))
def _cast_spark_column_timestamp_to_long(self, scol: Column) -> Column:
    """Return `scol` cast to Spark's ``LongType``."""
    long_type = LongType()
    return scol.cast(long_type)
def hour(self) -> ks.Series:
    """
    The hours of the datetime.
    """
    # Apply Spark's `hour` function with a long result type, then keep the
    # name of the source series.
    hours = _wrap_accessor_spark(self, F.hour, LongType())
    return hours.alias(self.name)