Created on Thu Mar 15 19:07:02 2018

@author: zhcao
"""
from pyspark.sql import SparkSession
import pyspark.sql.types as typ
import pyspark.ml.feature as ft
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

if __name__ == "__main__":
    spark = SparkSession.builder.appName("XiGuaKMeans").getOrCreate()

    labels = [('Num', typ.IntegerType()),
              ('VIB1', typ.FloatType()),
              ('VIB2', typ.FloatType())]
    schema = typ.StructType(
        [typ.StructField(e[0], e[1], False) for e in labels])

    data = spark.read.csv(
        "file:///home/hadoop/zhaco/workspace/spark_test/watermelon.csv",
        header=True, schema=schema)
    data.createOrReplaceTempView("data_clu")
    data.printSchema()
    data.cache()
    data.show()
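    # The fragment above stops right after loading the data. A minimal sketch
    # of how the imported KMeans and Pipeline are typically wired together;
    # the feature columns and k=2 are assumptions, not from the original file.
    assembler = ft.VectorAssembler(inputCols=['VIB1', 'VIB2'], outputCol='features')
    kmeans = KMeans(k=2, featuresCol='features', predictionCol='cluster')
    model = Pipeline(stages=[assembler, kmeans]).fit(data)
    model.transform(data).select('Num', 'cluster').show()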
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
import pyspark.sql.types as typ

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[*]").setAppName("Test7_1")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    hiveContext = HiveContext(sc)

    labels = [('INFANT_ALIVE_AT_REPORT', typ.StringType()),
              ('BIRTH_YEAR', typ.IntegerType()),
              ('BIRTH_MONTH', typ.IntegerType()),
              ('BIRTH_PLACE', typ.StringType()),
              ('MOTHER_AGE_YEARS', typ.IntegerType()),
              ('MOTHER_RACE_6CODE', typ.StringType()),
              ('MOTHER_EDUCATION', typ.StringType()),
              ('FATHER_COMBINED_AGE', typ.IntegerType()),
              ('FATHER_EDUCATION', typ.StringType()),
              ('MONTH_PRECARE_RECODE', typ.StringType()),
              ('CIG_BEFORE', typ.IntegerType()),
              ('CIG_1_TRI', typ.IntegerType()),
              ('CIG_2_TRI', typ.IntegerType()),
              ('CIG_3_TRI', typ.IntegerType()),
              ('MOTHER_HEIGHT_IN', typ.IntegerType()),
              ('MOTHER_BMI_RECODE', typ.IntegerType()),
              ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    types.StructField('2008', types.DoubleType()),
    types.StructField('2009', types.DoubleType()),
    types.StructField('2010', types.DoubleType()),
    types.StructField('2011', types.DoubleType()),
    types.StructField('2012', types.DoubleType()),
    types.StructField('2013', types.DoubleType()),
    types.StructField('2014', types.DoubleType()),
    types.StructField('2015', types.DoubleType()),
    types.StructField('2016', types.DoubleType()),
    types.StructField('2017', types.DoubleType()),
    types.StructField('2018', types.DoubleType()),
    types.StructField('2019', types.DoubleType()),
    types.StructField('2020', types.DoubleType()),
    types.StructField('2021', types.DoubleType()),
    types.StructField('2022', types.DoubleType()),
    types.StructField('Estimates Start After', types.IntegerType())
])

FDindex_schema = types.StructType([
    types.StructField('ifs', types.IntegerType()),
    types.StructField('code', types.StringType()),
    types.StructField('country', types.StringType()),
    types.StructField('imf_region', types.StringType()),
    types.StructField('imf_income', types.StringType()),
    types.StructField('year', types.IntegerType()),
    types.StructField('FD', types.DoubleType()),
    types.StructField('FI', types.DoubleType()),
    types.StructField('FM', types.DoubleType()),
    types.StructField('FID', types.DoubleType()),
    types.StructField('FIA', types.DoubleType()),
    types.StructField('FIE', types.DoubleType()),
import sys
from pyspark.sql import SparkSession, functions, types
import pandas as pd

spark = SparkSession.builder.appName('wikipedia popular').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('language', types.StringType(), False),
    types.StructField('page', types.StringType(), False),
    types.StructField('views', types.IntegerType(), False),
    types.StructField('bytes', types.IntegerType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    def split(filename):
        # Extract the date/hour portion of the file name from its path.
        file = filename.rsplit('/', 1)[1]
        date = file.split("-", 1)[1]
        name = date[:-7]
        return name

    path_to_hour = functions.udf(split, returnType=types.StringType())

    data = spark.read.csv(in_directory, sep=" ", schema=schema).withColumn(
        'filename', path_to_hour(functions.input_file_name()))
import numpy as np
import pandas as pd
import pyspark.sql.types as pt
from pyspark.sql import SparkSession


def get_common_spark_testing_client(data_directory, connect):
    spark = (SparkSession.builder
             .config('spark.default.parallelism', 4)
             .config('spark.driver.bindAddress', '127.0.0.1')
             .getOrCreate())
    _spark_testing_client = connect(spark)
    s = _spark_testing_client._session
    num_partitions = 4

    df_functional_alltypes = (
        s.read.csv(
            path=str(data_directory / 'functional_alltypes.csv'),
            schema=pt.StructType([
                pt.StructField('index', pt.IntegerType(), True),
                pt.StructField('Unnamed: 0', pt.IntegerType(), True),
                pt.StructField('id', pt.IntegerType(), True),
                # cast below, Spark can't read 0/1 as bool
                pt.StructField('bool_col', pt.ByteType(), True),
                pt.StructField('tinyint_col', pt.ByteType(), True),
                pt.StructField('smallint_col', pt.ShortType(), True),
                pt.StructField('int_col', pt.IntegerType(), True),
                pt.StructField('bigint_col', pt.LongType(), True),
                pt.StructField('float_col', pt.FloatType(), True),
                pt.StructField('double_col', pt.DoubleType(), True),
                pt.StructField('date_string_col', pt.StringType(), True),
                pt.StructField('string_col', pt.StringType(), True),
                pt.StructField('timestamp_col', pt.TimestampType(), True),
                pt.StructField('year', pt.IntegerType(), True),
                pt.StructField('month', pt.IntegerType(), True),
            ]),
            mode='FAILFAST',
            header=True,
        ).repartition(num_partitions).sort('index'))
    df_functional_alltypes = df_functional_alltypes.withColumn(
        "bool_col", df_functional_alltypes["bool_col"].cast("boolean"))
    df_functional_alltypes.createOrReplaceTempView('functional_alltypes')

    df_batting = (s.read.csv(
        path=str(data_directory / 'batting.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('stint', pt.IntegerType(), True),
            pt.StructField('teamID', pt.StringType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('G', pt.IntegerType(), True),
            pt.StructField('AB', pt.DoubleType(), True),
            pt.StructField('R', pt.DoubleType(), True),
            pt.StructField('H', pt.DoubleType(), True),
            pt.StructField('X2B', pt.DoubleType(), True),
            pt.StructField('X3B', pt.DoubleType(), True),
            pt.StructField('HR', pt.DoubleType(), True),
            pt.StructField('RBI', pt.DoubleType(), True),
            pt.StructField('SB', pt.DoubleType(), True),
            pt.StructField('CS', pt.DoubleType(), True),
            pt.StructField('BB', pt.DoubleType(), True),
            pt.StructField('SO', pt.DoubleType(), True),
            pt.StructField('IBB', pt.DoubleType(), True),
            pt.StructField('HBP', pt.DoubleType(), True),
            pt.StructField('SH', pt.DoubleType(), True),
            pt.StructField('SF', pt.DoubleType(), True),
            pt.StructField('GIDP', pt.DoubleType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_batting.createOrReplaceTempView('batting')

    df_awards_players = (s.read.csv(
        path=str(data_directory / 'awards_players.csv'),
        schema=pt.StructType([
            pt.StructField('playerID', pt.StringType(), True),
            pt.StructField('awardID', pt.StringType(), True),
            pt.StructField('yearID', pt.IntegerType(), True),
            pt.StructField('lgID', pt.StringType(), True),
            pt.StructField('tie', pt.StringType(), True),
            pt.StructField('notes', pt.StringType(), True),
        ]),
        header=True,
    ).repartition(num_partitions).sort('playerID'))
    df_awards_players.createOrReplaceTempView('awards_players')

    df_simple = s.createDataFrame([(1, 'a')], ['foo', 'bar'])
    df_simple.createOrReplaceTempView('simple')

    df_struct = s.createDataFrame([((1, 2, 'a'),)], ['struct_col'])
    df_struct.createOrReplaceTempView('struct')

    df_nested_types = s.createDataFrame(
        [([1, 2], [[3, 4], [5, 6]], {'a': [[2, 4], [3, 5]]})],
        [
            'list_of_ints',
            'list_of_list_of_ints',
            'map_string_list_of_list_of_ints',
        ],
    )
    df_nested_types.createOrReplaceTempView('nested_types')

    df_complicated = s.createDataFrame(
        [({(1, 3): [[2, 4], [3, 5]]},)], ['map_tuple_list_of_list_of_ints'])
    df_complicated.createOrReplaceTempView('complicated')

    df_udf = s.createDataFrame(
        [('a', 1, 4.0, 'a'), ('b', 2, 5.0, 'a'), ('c', 3, 6.0, 'b')],
        ['a', 'b', 'c', 'key'],
    )
    df_udf.createOrReplaceTempView('udf')

    df_udf_nan = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(10, dtype=float),
            'b': [3.0, np.NaN] * 5,
            'key': list('ddeefffggh'),
        }))
    df_udf_nan.createOrReplaceTempView('udf_nan')

    df_udf_null = s.createDataFrame(
        [(float(i), None if i % 2 else 3.0, 'ddeefffggh'[i])
         for i in range(10)],
        ['a', 'b', 'key'],
    )
    df_udf_null.createOrReplaceTempView('udf_null')

    df_udf_random = s.createDataFrame(
        pd.DataFrame({
            'a': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'b': np.arange(4, dtype=float).tolist() + np.random.rand(3).tolist(),
            'key': list('ddeefff'),
        }))
    df_udf_random.createOrReplaceTempView('udf_random')

    return _spark_testing_client
from datetime import datetime
from pyspark.sql import functions, types
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def main():
    get_pred_error = functions.udf(get_sq_error, types.FloatType())
    convert_to_int = functions.udf(lambda value: int(value), types.IntegerType())
    # Return strings so the values match the declared StringType of the UDFs.
    get_day_of_week = functions.udf(
        lambda timestamp: str(datetime.strptime(timestamp, '%Y-%m-%d').weekday()),
        types.StringType())
    get_month = functions.udf(
        lambda timestamp: str(datetime.strptime(timestamp, '%Y-%m-%d').month),
        types.StringType())

    training_data = spark.read.csv('train_rating.txt', header=True)
    test_data = spark.read.csv('test_rating.txt', header=True)

    for column in training_data.columns:
        if column != 'date':
            training_data = training_data.withColumn(column, convert_to_int(training_data[column]))
        else:
            training_data = training_data.withColumn('dow', get_day_of_week(training_data[column]))
            training_data = training_data.withColumn('month', get_month(training_data[column]))

    for column in test_data.columns:
        if column != 'date':
            test_data = test_data.withColumn(column, convert_to_int(test_data[column]))
        else:
            test_data = test_data.withColumn('dow', get_day_of_week(test_data[column]))
            test_data = test_data.withColumn('month', get_month(test_data[column]))

    training_data = training_data.drop('date')
    test_data = test_data.drop('date')

    discrete_columns = ['dow', 'month']
    string_indexer = [StringIndexer(inputCol='{}'.format(column), outputCol='{}_ind'.format(column))
                      for column in discrete_columns]
    hot_encoders = [OneHotEncoder(inputCol='{}_ind'.format(column), outputCol='{}_he'.format(column))
                    for column in discrete_columns]
    vector_assembler = VectorAssembler(
        inputCols=["user_id", "business_id", "dow_he", "month_he"],
        outputCol="features")
    rf = RandomForestClassifier(numTrees=25, maxDepth=10, labelCol="rating", seed=42)
    models = [
        ('Rand-forest', Pipeline(stages=string_indexer + hot_encoders + [vector_assembler, rf]))
    ]
    evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol='rating')

    # Split the data into training and testing sets.
    train, test = training_data.randomSplit([0.8, 0.2])
    train = train.cache()
    test = test.cache()

    for label, pipeline in models:
        model = pipeline.fit(train)
        predictions = model.transform(test)
        predictions = predictions.withColumn(
            'sq_error', get_pred_error(predictions['rating'], predictions['prediction']))
        rmse_score = predictions.groupBy().avg('sq_error').head()[0]
        # Calculate a score.
        score = evaluator.evaluate(predictions)
        print(label, rmse_score ** 0.5)

    # Uncomment when you are satisfied with training:
    # final_pred = model.transform(test_data)
    # final_pred = final_pred.withColumnRenamed('prediction', 'rating')
    # final_pred.select("test_id", "rating").toPandas().to_csv(
    #     'submission.csv', sep=',', encoding='utf-8', index=False)
    return
Zhukun Luo
Jiangxi University of Finance and Economics
'''
# Import the Spark SQL package.
from pyspark.sql import SparkSession
from pyspark.sql import types as T

# Create a Spark session.
spark = (
    SparkSession.builder.master("local[4]")  # run locally with 4 threads
    .appName("Exploring Joins")
    .config("spark.some.config.option", "some-value")
    .getOrCreate())
sc = spark.sparkContext

# Define the schema for the data.
schema = T.StructType([
    T.StructField("user_id", T.IntegerType(), False),
    T.StructField("name", T.StringType(), True),
    T.StructField("sex", T.StringType(), True),
    T.StructField("age", T.IntegerType(), True),
])
data = [
    (1, "ming Li", "male", 13),
    (2, "fang Zhang", "female", 12),
    (2, "hong Wang", "female", 1),
]
# Create a DataFrame from the rows and the schema.
user_df = spark.createDataFrame(data=data, schema=schema)
user_df.toPandas()
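# The session is named "Exploring Joins", but the fragment ends before any
# join appears. A minimal sketch of the kind of join that presumably follows;
# the score_df data and its columns are hypothetical.
score_df = spark.createDataFrame(
    data=[(1, 90), (2, 85)],
    schema=T.StructType([
        T.StructField("user_id", T.IntegerType(), False),
        T.StructField("score", T.IntegerType(), True),
    ]))
user_df.join(score_df, on="user_id", how="left").show()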
    select('id', 'connections_count')

# Total connection count per superhero id.
superhero_connections_by_id_df = superhero_connections_df.groupBy('id') \
    .agg(func.sum('connections_count').alias('connections_freq'))
superhero_connections_by_id_df.show()

# Find the minimum connection frequency.
# first(): convert the result to a Row object.
min_frequency = superhero_connections_by_id_df \
    .agg(func.min(func.col('connections_freq')).alias('min_freq')) \
    .first()['min_freq']

# Get the dataset of superhero names.
superhero_names_schema = types.StructType([
    types.StructField("id", types.IntegerType(), False),
    types.StructField("name", types.StringType(), False)
])
superhero_names_df = spark.read.schema(superhero_names_schema).csv(
    'dataset/Marvel+Names', sep=' ').cache()

# Get the superheroes (with names) whose connection frequency is the minimum.
# First filter, then join the two dataframes.
most_obscure_superheros = superhero_connections_by_id_df \
    .filter(func.col('connections_freq') == min_frequency) \
    .join(superhero_names_df, on='id')
most_obscure_superheros.show()

spark.stop()
def process_immigration_data(spark, input_data, output_data, date_string):
    # Get execution_date.
    execution_date = stringToDatetimeYYYYMMDD(date_string)
    # Extract the last two digits of the year.
    year = datetimeToYearShort(execution_date)
    # Extract the short month name (e.g. 'apr').
    month = datetimeToMonthShort(execution_date)
    # Extract the day of the month.
    day = datetimeToDayClasic(execution_date)

    path = input_data + "i94_{0}{1}_sub.sas7bdat".format(month.lower(), year)
    df_immigration_data = spark.read.format(
        'com.github.saurfang.sas.spark').load(path)
    # df_immigration_data = df_immigration_data.filter('date="2016-04-06"')
    # df_immigration_data = spark.read.options(delimiter=",", header=True, encoding="UTF-8").csv(
    #     input_data)

    # Trim spaces from arrdate.
    df_immigration_data = trimStrings(df_immigration_data, ['arrdate'])
    # Cast arrdate to integer because it is a timestamp.
    df_immigration_data = df_immigration_data.withColumn(
        "arrdate", df_immigration_data.arrdate.cast("integer"))
    # Transform the timestamp into a date.
    df_immigration_data = df_immigration_data.withColumn(
        "date", udf_date_timedelta(df_immigration_data.arrdate).cast("date"))
    # Create the day column.
    df_immigration_data = df_immigration_data.withColumn(
        'day', datetimeToDay(df_immigration_data.date))

    # Filter by the day column.
    df_immigration_data.createOrReplaceTempView("immigration_data")
    df_immigration_data = spark.sql(
        """SELECT arrdate, day, date, admnum, i94cit, i94res,
                  i94bir, i94port, i94mode, i94addr, i94visa, gender, i94yr, i94mon
           FROM immigration_data WHERE day={0}""".format(day))

    # Trim spaces from the rest of the columns.
    df_immigration_data = trimStrings(df_immigration_data, [
        'admnum', 'i94cit', 'i94res', 'i94bir', 'i94port', 'i94mode',
        'i94addr', 'i94visa', 'gender'
    ])

    ### Set the rest of the columns for the time table.
    # Transform the timestamp into a datetime object to be able to get the hour.
    df_immigration_data = df_immigration_data.withColumn(
        "datetime", get_datetime(df_immigration_data.arrdate))
    df_immigration_data = df_immigration_data.withColumnRenamed('i94yr', 'year')
    df_immigration_data = df_immigration_data.withColumn(
        'hour', datetimeToHour(df_immigration_data.datetime))
    df_immigration_data = df_immigration_data.withColumn(
        'week', datetimeToWeek(df_immigration_data.date))
    df_immigration_data = df_immigration_data.withColumn(
        'day', datetimeToDay(df_immigration_data.date))
    df_immigration_data = df_immigration_data.withColumnRenamed('i94mon', 'month')
    df_immigration_data = df_immigration_data.withColumn(
        'weekday', datetimeToWeekDay(df_immigration_data.date))

    df_immigration_data = df_immigration_data[
        'admnum', 'arrdate', 'year', 'month', 'day', 'weekday', 'hour',
        'week', 'i94cit', 'i94res', 'i94bir', 'i94port', 'i94mode',
        'i94addr', 'i94visa', 'gender']

    # Make sure the columns are delivered with the right data types.
    df_immigration_data = df_immigration_data \
        .withColumn("admnum", castAbsInt_udf(F.col("admnum"))) \
        .withColumn("arrdate", castInt_udf(F.col("arrdate"))) \
        .withColumn("year", F.col("year").cast(T.IntegerType())) \
        .withColumn("month", F.col("month").cast(T.IntegerType())) \
        .withColumn("day", F.col("day").cast(T.IntegerType())) \
        .withColumn("weekday", F.col("weekday").cast(T.StringType())) \
        .withColumn("hour", F.col("hour").cast(T.IntegerType())) \
        .withColumn("week", F.col("week").cast(T.IntegerType())) \
        .withColumn("i94cit", castInt_udf(F.col("i94cit"))) \
        .withColumn("i94res", castInt_udf(F.col("i94res"))) \
        .withColumn("i94bir", castInt_udf(F.col("i94bir"))) \
        .withColumn("i94port", F.col("i94port").cast(T.StringType())) \
        .withColumn("i94mode", castInt_udf(F.col("i94mode"))) \
        .withColumn("i94addr", F.col("i94addr").cast(T.StringType())) \
        .withColumn("i94visa", castInt_udf(F.col("i94visa"))) \
        .withColumn("gender", F.col("gender").cast(T.StringType()))

    # Filter out immigrants without an admission number.
    df_immigration_data = df_immigration_data.na.drop(subset=["admnum"])

    df_immigration_data.write.mode("overwrite") \
        .parquet(output_data + "immigration")
    # Is the input JSON?
    "es.input.json": "yes"
}
## Is there a field in the mapping that should be used to specify the ES document ID?
# "es.mapping.id": "id"

# Define the training set structure.
tweetSchema = tp.StructType([
    # TODO: use a proper timestamp type.
    tp.StructField(name='timestamp', dataType=tp.LongType(), nullable=True),
    tp.StructField(name='tweet', dataType=tp.StringType(), nullable=True)
])

schema = tp.StructType([
    tp.StructField(name='id', dataType=tp.StringType(), nullable=True),
    tp.StructField(name='subjective', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='positive', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='negative', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='ironic', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='lpositive', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='lnegative', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='top', dataType=tp.IntegerType(), nullable=True),
    tp.StructField(name='tweet', dataType=tp.StringType(), nullable=True)
])

# Create the Spark context.
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)
sc.setLogLevel("WARN")
import datetime as dt

spark = get_spark_context(__name__)

df = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "prices")
    .option("startingOffsets", "earliest")
    .load()
    .selectExpr(
        "CAST(key AS STRING)",
        "CAST(value AS STRING)",
        "timestamp",
    )
    .select(
        psf.col("key").cast(pst.IntegerType()).alias("match_id"),
        psf.col("value").alias("body"),
        "timestamp",
        psf.to_date(psf.col("timestamp")).alias("arrival_date"),
    )
    .coalesce(1)
    .writeStream.format("json")
    .outputMode("append")
    .option("checkpointLocation", "./_checkpoints/raw")
    .partitionBy("arrival_date")
    .trigger(processingTime='1 minute')
    .start("./raw/prices")
)

spark.streams.awaitAnyTermination()
def data_describe(self):
    sqlContext = SQLContext(self.sc)
    print('start to read data after explore_spark_step1_cross:')
    rootPath = self.parser.get("hdfs_path", "hdfs_data_path")

    print('start to read actLog_test_single_cross')
    test_file_path = rootPath + 'actLog_test_single_cross'
    actLog_test_rdd = self.sc.pickleFile(test_file_path)
    # Check these labels against the data to make sure they still fit.
    labels = [
        ('duration_time', typ.IntegerType()),
        ('device', typ.IntegerType()),
        ('music_id', typ.IntegerType()),
        ('item_city', typ.IntegerType()),
        ('author_id', typ.IntegerType()),
        ('item_id', typ.IntegerType()),
        ('user_city', typ.IntegerType()),
        ('uid', typ.IntegerType()),
        ('channel', typ.IntegerType()),
        ('finish', typ.IntegerType()),
        ('like', typ.IntegerType()),
        ('time_day', typ.IntegerType()),
        ('item_pub_month', typ.IntegerType()),
        ('item_pub_day', typ.LongType()),
        ('item_pub_hour', typ.IntegerType()),
        ('item_pub_minute', typ.IntegerType()),
        ('uid_count_bin', typ.IntegerType()),
        ('user_city_count_bin', typ.IntegerType()),
        ('user_city_count_ratio', typ.DoubleType()),
        ('item_id_count_bin', typ.IntegerType()),
        ('item_id_count_ratio', typ.DoubleType()),
        ('author_id_count_bin', typ.IntegerType()),
        ('author_id_count_ratio', typ.DoubleType()),
        ('item_city_count_bin', typ.IntegerType()),
        ('item_city_count_ratio', typ.DoubleType()),
        ('music_id_count_bin', typ.IntegerType()),
        ('music_id_count_ratio', typ.DoubleType()),
        ('device_count_bin', typ.IntegerType()),
        ('device_count_ratio', typ.DoubleType()),
        ('uid_author_id_count_bin', typ.IntegerType()),
        ('uid_author_id_count_ratio', typ.DoubleType()),
        ('uid_item_city_count_bin', typ.IntegerType()),
        ('uid_item_city_count_ratio', typ.DoubleType()),
        ('uid_channel_count_bin', typ.IntegerType()),
        ('uid_channel_count_ratio', typ.DoubleType()),
        ('uid_music_id_count_bin', typ.IntegerType()),
        ('uid_music_id_count_ratio', typ.DoubleType()),
        ('uid_device_count_bin', typ.IntegerType()),
        ('uid_device_count_ratio', typ.DoubleType()),
        ('author_id_channel_count_bin', typ.IntegerType()),
        ('author_id_channel_count_ratio', typ.DoubleType()),
        ('author_id_user_city_count_bin', typ.IntegerType()),
        ('author_id_user_city_count_ratio', typ.DoubleType()),
        ('author_id_item_city_count_bin', typ.IntegerType()),
        ('author_id_item_city_count_ratio', typ.DoubleType()),
        ('author_id_music_id_count_bin', typ.IntegerType()),
        ('author_id_music_id_count_ratio', typ.DoubleType()),
        ('uid_channel_device_count_bin', typ.IntegerType()),  # renamed to uid_channel_device
        ('uid_channel_device_count_ratio', typ.DoubleType()),  # renamed to uid_channel_device
        ('author_id_item_city_music_id_count_bin', typ.IntegerType()),
        ('author_id_item_city_music_id_count_ratio', typ.DoubleType()),
    ]
    actionLogSchema = typ.StructType(
        [typ.StructField(e[0], e[1], True) for e in labels])

    df_actLog_test = sqlContext.createDataFrame(actLog_test_rdd, actionLogSchema)
    # df_actLog_test.show(1, truncate=False)

    print('start to read actLog_train_single_cross')
    train_file_path = rootPath + 'actLog_train_single_cross'
    actLog_train_rdd = self.sc.pickleFile(train_file_path)
    df_actLog_train = sqlContext.createDataFrame(actLog_train_rdd, actionLogSchema)
    # df_actLog_train.show(1, truncate=False)

    return df_actLog_train, df_actLog_test
def data_explore(self, df_train, df_test):
    sqlContext = SQLContext(self.sc)

    print("Discretize item_pub_hour into bins")

    def hourBin(x):
        if x >= 23 or x <= 2:
            return 1
        elif 3 <= x < 8:
            return 2
        elif 8 <= x < 12:
            return 3
        else:
            return 4

    converHourBin = udf(lambda x: hourBin(x), typ.IntegerType())
    df_train = df_train.withColumn("item_pub_hour", converHourBin(df_train.item_pub_hour))
    df_test = df_test.withColumn("item_pub_hour", converHourBin(df_test.item_pub_hour))

    print("----1. Compute statistical features: conditional probabilities "
          "between user features and item features---------")
    feats_list = []
    condition = ['uid']
    authors = ['music_id', 'item_pub_hour']  # 'author_id', 'item_city', 'channel',
    feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])

    df_tmp = df_train.select(condition)
    df2 = df_tmp.groupby(condition).count().withColumnRenamed('count', condition[0] + '_count')
    # df2.show(1, truncate=False)  # ['uid', 'uid_count']
    df2.cache()
    # df_train = df_train.join(df2, condition, 'left')
    # df_train.show(1, truncate=False)
    # cannot resolve '`uid_count`' given input columns: [time, user_city, like, author_id,
    # uid, device, music_id, finish, duration_time, channel, item_city, item_id]
    # del df2
    # gc.collect()

    for feature_group in feats_list:
        print(feature_group + [feature_group[0] + '_count'])
        df1 = df_train.select(feature_group).groupby(feature_group).count()
        # df1.show(1, truncate=False)  # in theory still only 3 fields, uid_count not included
        df1 = df1.join(df2, condition, 'left')
        df1.show(1, truncate=False)  # |uid|item_pub_hour|count|uid_count
        df1 = df1.withColumn(
            feature_group[1] + '_' + feature_group[0] + "_condition_ratio",
            fn.col('count') / fn.col(feature_group[0] + '_count'))
        df1 = df1.drop('count').drop(feature_group[0] + '_count')
        df1.show(1, truncate=False)
        print(df_train.columns)
        print(df1.columns)
        # |uid|item_pub_hour|item_pub_hour_uid_condition_ratio
        df_train = df_train.join(df1, feature_group, "left")
        df_train.show(1, truncate=False)
        # Fill missing values in the new ratio column.
        df_test = df_test.join(df1, feature_group, "left").na.fill(
            {feature_group[1] + '_' + feature_group[0] + "_condition_ratio": 0})
        df_test.show(1, truncate=False)

    feats_list = []
    condition = ['item_id']
    authors = ['uid_city', 'channel']
    feats_list.extend([[u_col, a_col] for u_col in condition for a_col in authors])

    df_tmp = df_train.select(condition)
    df2 = df_tmp.groupby(condition).count().withColumnRenamed('count', condition[0] + '_count')
    df2.cache()

    for feature_group in feats_list:
        print(feature_group + [feature_group[0] + '_count'])
        df1 = df_train.select(feature_group).groupby(feature_group).count()
        df1 = df1.join(df2, condition, 'left')
        df1.show(1, truncate=False)
        df1 = df1.withColumn(
            feature_group[1] + '_' + feature_group[0] + "_condition_ratio",
            fn.col('count') / fn.col(feature_group[0] + '_count'))
        df1 = df1.drop('count').drop(feature_group[0] + '_count')
        # df1.show(5)
        df_train = df_train.join(df1, feature_group, "left")
        df_train.show(1, truncate=False)
        # Fill missing values in the new ratio column.
        df_test = df_test.join(df1, feature_group, "left").na.fill(
            {feature_group[1] + '_' + feature_group[0] + "_condition_ratio": 0})
        df_test.show(1, truncate=False)

    df_train = df_train.drop('uid_count').drop('item_id_count')
    df_train.printSchema()
    df_test.printSchema()

    print('-------5. Save the preprocessing results-------')
    test_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_test_step3_try'
    os.system("hadoop fs -rm -r {}".format(test_file_path))
    df_test.rdd.map(tuple).saveAsPickleFile(test_file_path)
    del df_test
    gc.collect()

    train_file_path = self.parser.get("hdfs_path", "hdfs_data_path") + 'actLog_train_step3_try'
    # os.system(command) runs the given shell command.
    os.system("hadoop fs -rm -r {}".format(train_file_path))
    df_train.rdd.map(tuple).saveAsPickleFile(train_file_path)
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark import StorageLevel as S
from datetime import datetime as dt
import os

# ## Connect to the Spark cluster
spark = SparkSession \
    .builder \
    .appName("ProcessDoubleClick") \
    .getOrCreate()

# # Data Import
# Read in raw data
impressionFields = [
    T.StructField('advertiserID', T.IntegerType(), False),
    T.StructField('domain', T.StringType(), False),
    T.StructField('viewable', T.BooleanType(), False),
    T.StructField('city', T.StringType(), False),
    T.StructField('mobileDevice', T.StringType(), False),
    T.StructField('country', T.StringType(), False),
    T.StructField('sellerPrice', T.IntegerType(), False),
    T.StructField('userID', T.IntegerType(), False),
    T.StructField('impressionID', T.IntegerType(), False),
    T.StructField('postalCode', T.StringType(), False),
    T.StructField('carrier', T.StringType(), False),
    T.StructField('eventType', T.StringType(), False),
    T.StructField('lineItemID', T.IntegerType(), False),
    T.StructField('time', T.TimestampType(), False),
    T.StructField('duration', T.IntegerType(), False),
    T.StructField('browser', T.StringType(), False),
def lookup(mapping):
    def fn(v):
        return mapping.index(v)
    return F.udf(fn, returnType=T.IntegerType())
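# Usage sketch for lookup(): the returned UDF maps each value to its index in
# `mapping` (and raises ValueError for unknown values). Assumes an active
# `spark` session; the DataFrame below is hypothetical.
colours_df = spark.createDataFrame([('red',), ('blue',)], ['word'])
word_index = lookup(['red', 'green', 'blue'])
colours_df.withColumn('word_id', word_index(F.col('word'))).show()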
from pyspark.sql import *
from pyspark.sql import types as T
from pyspark import SparkContext


def parse(line):
    items = line.split(",")
    return (int(items[0]), items[1], int(items[2]))


sc = SparkContext()
spark = SparkSession.builder.master("local").config(
    "spark.some.config.option", "some-value").getOrCreate()

schema = T.StructType([
    T.StructField("class", T.IntegerType(), True),
    T.StructField("name", T.StringType(), True),
    T.StructField("score", T.IntegerType(), True),
])

rdd = sc.textFile('/Users/zdeng-ext/school/test.csv').map(parse)
df = spark.createDataFrame(rdd, schema)
df.write.format('com.databricks.spark.avro').save("people_avro")
import sys
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('temp_range_dataframe').getOrCreate()
assert spark.version >= '2.4'  # make sure we have Spark 2.4+

observation_schema = types.StructType([
    types.StructField('station', types.StringType()),
    types.StructField('date', types.StringType()),
    types.StructField('observation', types.StringType()),
    types.StructField('value', types.IntegerType()),
    types.StructField('mflag', types.StringType()),
    types.StructField('qflag', types.StringType()),
    types.StructField('sflag', types.StringType()),
    types.StructField('obstime', types.StringType()),
])


def main(inputs, output):
    weather = spark.read.csv(inputs, schema=observation_schema)
    filter_weather = weather.filter(weather.qflag.isNull()).cache()

    Max_weather = filter_weather.filter(
        weather.observation == 'TMAX').withColumn('tmax', weather.value)
    Max_temperature = Max_weather.select('station', 'date', 'tmax')
    Max_temperature.show(10)

    Min_weather = filter_weather.filter(
        weather.observation == 'TMIN').withColumn('tmin', weather.value)
    Min_temperature = Min_weather.select('station', 'date', 'tmin')
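    # The fragment breaks off here. A minimal sketch of the step the app name
    # promises (join the two frames and compute the daily temperature range);
    # the 'range' column name is an assumption.
    temp_range = Max_temperature.join(Min_temperature, ['station', 'date'])
    temp_range = temp_range.withColumn('range', temp_range.tmax - temp_range.tmin)
    temp_range.show(10)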
        config = tf.ConfigProto(device_count={'GPU': 0})
        config.inter_op_parallelism_threads = 1
        config.intra_op_parallelism_threads = 1
        K.set_session(tf.Session(config=config))

        # Restore from checkpoint.
        model = deserialize_model(model_bytes, tf.keras.models.load_model)

        # Perform predictions.
        for row in rows:
            fields = row.asDict().copy()
            # Predict log(Sales) for this row.
            log_sales = model.predict_on_batch([[row[col]] for col in all_cols])[0]
            # Convert from log domain to real Sales numbers and add a 'Sales'
            # column with the prediction results.
            fields['Sales'] = math.exp(log_sales)
            yield Row(**fields)

    return fn


# Submit a Spark job to do inference. The Horovod framework is not involved here.
pred_df = spark.read.parquet('%s/test_df.parquet' % args.data_dir) \
    .rdd.mapPartitions(predict_fn(best_model_bytes)).toDF()
submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()), pred_df.Sales).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv, index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()
def _generate_select_expression_for_extended_string_to_int(source_column, name):
    """
    More robust conversion from StringType to IntegerType.
    Is able to additionally handle (compared to implicit Spark conversion):

    * Preceding whitespace
    * Trailing whitespace
    * Preceding and trailing whitespace
    * Underscores as thousand separators

    Hint
    ----
    Please have a look at the tests to get a better feeling for how it behaves, under
    tests/unit/transformer/test_mapper_custom_data_types.py::TestExtendedStringConversions
    and tests/data/test_fixtures/mapper_custom_data_types_fixtures.py

    Example
    -------
    >>> from spooq2.transformer import Mapper
    >>>
    >>> input_df.head(3)
    [Row(input_string=" 123456 "),
     Row(input_string="Hello"),
     Row(input_string="123_456")]
    >>> mapping = [("output_value", "input_string", "extended_string_to_int")]
    >>> output_df = Mapper(mapping).transform(input_df)
    >>> output_df.head(3)
    [Row(output_value=123456),
     Row(output_value=None),
     Row(output_value=123456)]
    """
    return _generate_select_expression_for_extended_string_to_long(source_column, name).cast(T.IntegerType())
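# For illustration only: a rough sketch of the kind of cleanup the delegated
# _generate_select_expression_for_extended_string_to_long helper has to do,
# inferred solely from the behaviours listed in the docstring above (trim
# whitespace, drop underscore thousand separators, cast). This is an
# assumption, not the library's actual implementation.
def _extended_string_to_long_sketch(source_column, name):
    import pyspark.sql.functions as F
    cleaned = F.regexp_replace(F.trim(source_column), "_", "")
    return cleaned.cast(T.LongType()).alias(name)  # non-numeric strings become NULL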
def prepare_df(df):
    num_rows = df.count()

    # Expand dates.
    df = expand_date(df)

    df = df \
        .withColumn('Open', df.Open != '0') \
        .withColumn('Promo', df.Promo != '0') \
        .withColumn('StateHoliday', df.StateHoliday != '0') \
        .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

    # Merge in store information.
    store = store_csv.join(store_states_csv, 'Store')
    df = df.join(store, 'Store')

    # Merge in Google Trend information.
    google_trend_all = prepare_google_trend()
    df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

    # Merge in Google Trend for whole Germany.
    google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
    google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
    df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

    # Merge in weather.
    weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ['State', 'Date'])

    # Fix null values.
    df = df \
        .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
        .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
        .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
        .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

    # Days & months competition was open, cap to 2 years.
    df = df.withColumn(
        'CompetitionOpenSince',
        F.to_date(F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                  df.CompetitionOpenSinceMonth)))
    df = df.withColumn(
        'CompetitionDaysOpen',
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)))).otherwise(0))
    df = df.withColumn('CompetitionMonthsOpen',
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # Days & weeks of promotion, cap to 25 weeks.
    df = df.withColumn(
        'Promo2Since',
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ))
    df = df.withColumn(
        'Promo2Days',
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since)))).otherwise(0))
    df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

    # Check that we did not lose any rows through inner joins.
    assert num_rows == df.count(), 'lost rows in joins'
    return df
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')  # don't fail when on a headless server
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession, types

colour_schema = types.StructType([
    types.StructField('R', types.IntegerType(), False),
    types.StructField('G', types.IntegerType(), False),
    types.StructField('B', types.IntegerType(), False),
    types.StructField('word', types.StringType(), False),
    types.StructField('confidence', types.StringType(), False),
])


def rgb2lab_query(table_name='__THIS__', passthrough_columns=None,
                  input_bytes=True, r='R', g='G', b='B',
                  out_l='labL', out_a='labA', out_b='labB'):
    """
    Build a SQL query to convert RGB colours to LAB colours.

    table_name: name of the input table to query from. Must be '__THIS__' if
        being used in a SQLTransformer.
    passthrough_columns: list of column names that should be preserved and
        selected into the resulting table.
from pyspark.sql import SparkSession, types
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils


def call_split(line):
    arr = line.split('|')
    # print("size: ", len(arr))
    return [int(arr[0]) if arr[0] else None,
            int(arr[1]) if arr[1] else None,
            arr[2],
            int(arr[3]) if arr[3] else None,
            int(arr[4]) if arr[4] else None,
            int(arr[5]) if arr[5] else None]


def savetheresult(rdd):
    if not rdd.isEmpty():
        nrdd = rdd.map(call_split)
        df = sparkSess.createDataFrame(nrdd, schema)
        # df.show()
        # df.coalesce(2).write.mode("append").parquet("s3://bigdata-4/parquet/")
        df.coalesce(2).write.mode("append").parquet("s3://bigdata-4/post_history/")


sparkSess = SparkSession.builder.appName('post_history_table').getOrCreate()
sc = sparkSess.sparkContext

schema = types.StructType([
    types.StructField('unnamed', types.IntegerType()),
    types.StructField('id', types.IntegerType()),
    types.StructField('creation_date', types.StringType()),
    types.StructField('post_id', types.IntegerType()),
    types.StructField('post_history_type_id', types.IntegerType()),
    types.StructField('user_id', types.IntegerType())])

ssc = StreamingContext(sc, 1)
kvs = KafkaUtils.createDirectStream(ssc, ["post_history"],
                                    {"bootstrap.servers": "127.0.0.1:9092"})
lines = kvs.map(lambda x: x[1])
# lines.pprint()
lines.foreachRDD(savetheresult)

ssc.start()
ssc.awaitTermination()
# Get the data.
data = sqlContext.read.parquet(
    "CTU-Flows_main/Flows.parquet/_yyyymd={}".format(sys.argv[1]))
# data = sqlContext.read.parquet(sys.argv[1])
df = data.dropDuplicates()

# Filter TCP flows with dstIP inside the university range (147.32.80-83,
# mask /22) and srcIP outside of it.
df = data.filter(data.Proto == "tcp").filter(
    data.DstAddr.startswith("147.32.8")).filter(
        ~data.SrcAddr.startswith("147.32.8")).select(
            "DstAddr", "Dport", "State", "StartTime", "SrcAddr")

# Parse the timestamp and cast the port numbers to int.
df = df.withColumn('Dport', df["Dport"].cast(T.IntegerType()))
df = df.withColumn(
    'timestamp',
    unix_timestamp('StartTime', 'yyyy/MM/dd hh:mm:ss.SSSSSS').cast(T.TimestampType()))
# Add a column for the day.
df = df.withColumn(
    'day', unix_timestamp('StartTime', 'yyyy/MM/dd').cast(T.TimestampType()))
df = df.filter(col('Dport').isNotNull())

# Get flows from attackers.
attackers = df.select('SrcAddr', 'DstAddr', 'Dport', "day").distinct().groupBy(
    'SrcAddr', "day").agg(
        F.countDistinct('DstAddr').alias('addrCount'),
        F.countDistinct('Dport').alias('portCount')).filter(
import pyspark.sql.types as typ
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('spark0').getOrCreate()

import pyspark.ml.feature as ft
import pyspark.ml.classification as cl
import pyspark.ml.regression as reg
import pyspark.ml.clustering as clu
from pyspark.ml import Pipeline, PipelineModel
import pyspark.ml.evaluation as ev
import pyspark.ml.tuning as tune

# Define the data schema.
labels = [
    ('INFANT_ALIVE_AT_REPORT', typ.IntegerType()),
    ('BIRTH_PLACE', typ.StringType()),
    ('MOTHER_AGE_YEARS', typ.IntegerType()),
    ('FATHER_COMBINED_AGE', typ.IntegerType()),
    ('CIG_BEFORE', typ.IntegerType()),
    ('CIG_1_TRI', typ.IntegerType()),
    ('CIG_2_TRI', typ.IntegerType()),
    ('CIG_3_TRI', typ.IntegerType()),
    ('MOTHER_HEIGHT_IN', typ.IntegerType()),
    ('MOTHER_PRE_WEIGHT', typ.IntegerType()),
    ('MOTHER_DELIVERY_WEIGHT', typ.IntegerType()),
    ('MOTHER_WEIGHT_GAIN', typ.IntegerType()),
    ('DIABETES_PRE', typ.IntegerType()),
    ('DIABETES_GEST', typ.IntegerType()),
    ('HYP_TENS_PRE', typ.IntegerType()),
    ('HYP_TENS_GEST', typ.IntegerType()),
    ('PREV_BIRTH_PRETERM', typ.IntegerType())]

schema = typ.StructType([typ.StructField(e[0], e[1], False) for e in labels])
])
simple_df = spark.createDataFrame(simple_rdd)  # optionally can give column names
# simple_df.printSchema()  # prints the inferred schema; date is wrongly shown as string

#### Specify schema: faster
schema = [
    ('Date', types.DateType()),
    ('Name', types.StringType()),
    ('Age', types.IntegerType()),
    ('Weight', types.IntegerType()),
    ('Location', types.StringType()),
]
schema = types.StructType([types.StructField(e[0], e[1], False) for e in schema])

simple_df_schema = spark.createDataFrame(
    simple_rdd.map(lambda row: [dt.datetime.strptime(row[0], '%Y-%m-%d')] + row[1:]),
    schema=schema)
# simple_df_schema = spark.createDataFrame(simple_rdd, schema=schema)
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('first Spark app').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

schema = types.StructType([
    types.StructField('id', types.IntegerType(), False),
    types.StructField('x', types.FloatType(), False),
    types.StructField('y', types.FloatType(), False),
    types.StructField('z', types.FloatType(), False),
])


def main(in_directory, out_directory):
    # Read the data from the JSON files.
    xyz = spark.read.json(in_directory, schema=schema)
    # xyz.show(); return

    # Create a DF with what we need: x, (soon y,) and id%10, which we'll aggregate by.
    with_bins = xyz.select(
        xyz['x'],
        xyz['y'],  # TODO: also the y values
        (xyz['id'] % 10).alias('bin'),
    )
    # with_bins.show(); return

    # Aggregate by the bin number.
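    # The fragment ends at the aggregation comment. A minimal sketch of the
    # step it announces; the aggregate functions chosen here are assumptions.
    grouped = with_bins.groupBy('bin').agg(
        functions.avg('x').alias('avg_x'),
        functions.avg('y').alias('avg_y'),
    )
    grouped.show()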
def buildODMatrix(buste_data, datapath, filepath):
    clean_buste_data = buste_data.na.drop(subset=[
        "date", "route", "busCode", "tripNum", "stopPointId", "timestamp",
        "shapeLon", "shapeLat"
    ])

    filtered_boardings = clean_buste_data.na.drop(
        subset=['cardNum', 'cardTimestamp']).dropDuplicates(
            ['cardNum', 'date', 'cardTimestamp'])

    multiple_boardings = filtered_boardings.groupby('cardNum').count() \
        .filter(F.col('count') > 1) \
        .select(F.col("cardNum").alias("cardNum1"), F.col("count").alias("count1"))

    clean_boardings = filtered_boardings.join(
        multiple_boardings,
        filtered_boardings.cardNum == multiple_boardings.cardNum1, 'leftsemi')

    boarding_data = clean_boardings.withColumn('boarding_id', F.monotonically_increasing_id())

    user_boarding_w = Window.partitionBy(boarding_data.cardNum, boarding_data.date) \
        .orderBy(boarding_data.cardTimestamp)

    od_matrix_ids = boarding_data.select(
        F.col('cardNum'),
        F.col('boarding_id'),
        F.lead('boarding_id', default=-1).over(user_boarding_w).alias('next_boarding_id'),
        F.first('boarding_id', True).over(user_boarding_w).alias('first_boarding')
    ).withColumn(
        'next_boarding_id',
        F.when(F.col('next_boarding_id') == -1,
               F.col('first_boarding')).otherwise(F.col('next_boarding_id'))
    ).drop('first_boarding')

    origin_matrix = boarding_data.select(
        F.col("route").alias("o_route"),
        F.col("busCode").alias("o_bus_code"),
        F.col("date").alias("o_date"),
        F.col("tripNum").alias("o_tripNum"),
        F.col("cardTimestamp").alias("o_timestamp"),
        F.col("shapeId").alias("o_shape_id"),
        F.col("shapeSequence").alias("o_shape_seq"),
        F.col("shapeLat").alias("o_shape_lat"),
        F.col("shapeLon").alias("o_shape_lon"),
        F.col("stopPointId").alias("o_stop_id"),
        F.col("boarding_id").alias("o_boarding_id"))

    next_origin_matrix = boarding_data.select(
        F.col("route").alias("next_o_route"),
        F.col("busCode").alias("next_o_bus_code"),
        F.col("date").alias("next_o_date"),
        F.col("tripNum").alias("next_o_tripNum"),
        F.col("cardTimestamp").alias("next_o_timestamp"),
        F.col("shapeId").alias("next_o_shape_id"),
        F.col("shapeSequence").alias("next_o_shape_seq"),
        F.col("shapeLat").alias("next_o_shape_lat"),
        F.col("shapeLon").alias("next_o_shape_lon"),
        F.col("stopPointId").alias("next_o_stop_id"),
        F.col("boarding_id").alias("next_o_boarding_id"))

    user_trips_data = origin_matrix \
        .join(od_matrix_ids, origin_matrix.o_boarding_id == od_matrix_ids.boarding_id, 'inner') \
        .join(next_origin_matrix,
              od_matrix_ids.next_boarding_id == next_origin_matrix.next_o_boarding_id, 'inner') \
        .drop('boarding_id').drop('next_boarding_id') \
        .withColumn('o_unixtimestamp', F.unix_timestamp(F.col('o_timestamp'), 'HH:mm:ss')) \
        .withColumn('next_o_unixtimestamp', F.unix_timestamp(F.col('next_o_timestamp'), 'HH:mm:ss')) \
        .withColumn('leg_duration',
                    F.when(F.col('next_o_unixtimestamp') > F.col('o_unixtimestamp'),
                           ((F.col('next_o_unixtimestamp') - F.col('o_unixtimestamp')) / 60.0))
                    .otherwise(-1)) \
        .orderBy(['cardNum', 'o_date', 'o_timestamp'])
    # .withColumn('o_date', F.from_unixtime(F.unix_timestamp(F.col('o_date'), 'yyyy-MM-dd'), 'yyyy-MM-dd')) \
    # .withColumn('next_o_date', F.from_unixtime(F.unix_timestamp(F.col('next_o_date'), 'yyyy-MM-dd'), 'yyyy-MM-dd')) \

    bus_trip_data = clean_buste_data.orderBy(['route', 'busCode', 'tripNum', 'timestamp']) \
        .dropDuplicates(['route', 'busCode', 'tripNum', 'stopPointId']) \
        .drop('cardNum') \
        .withColumn('id', F.monotonically_increasing_id()) \
        .withColumn('route', F.col('route').cast(T.IntegerType())) \
        .withColumnRenamed('', 'cardNum')

    cond = [
        bus_trip_data.route == user_trips_data.o_route,
        bus_trip_data.busCode == user_trips_data.o_bus_code,
        bus_trip_data.date == user_trips_data.o_date,
        bus_trip_data.tripNum == user_trips_data.o_tripNum
    ]

    w = Window().partitionBy(
        ['cardNum', 'date', 'route', 'busCode', 'tripNum']).orderBy('dist')

    filtered_od_matrix = bus_trip_data.join(user_trips_data, cond, 'left_outer') \
        .withColumn('dist', dist(F.col('shapeLat'), F.col('shapeLon'),
                                 F.col('next_o_shape_lat'), F.col('next_o_shape_lon'))) \
        .filter('timestamp > o_timestamp') \
        .withColumn('rn', F.row_number().over(w)) \
        .where(F.col('rn') == 1) \
        .filter('dist <= 1.0') \
        .filter(user_trips_data.cardNum.isNotNull())

    trips_origins = filtered_od_matrix \
        .select(['o_date', 'o_route', 'o_bus_code', 'o_tripNum', 'o_stop_id', 'o_timestamp']) \
        .groupBy(['o_date', 'o_route', 'o_bus_code', 'o_tripNum', 'o_stop_id']) \
        .count() \
        .withColumnRenamed('count', 'boarding_cnt') \
        .withColumnRenamed('o_date', 'date') \
        .withColumnRenamed('o_route', 'route') \
        .withColumnRenamed('o_bus_code', 'busCode') \
        .withColumnRenamed('o_tripNum', 'tripNum') \
        .withColumnRenamed('o_stop_id', 'stopPointId')

    trips_destinations = filtered_od_matrix \
        .select(['date', 'route', 'busCode', 'tripNum', 'stopPointId', 'timestamp']) \
        .groupBy(['date', 'route', 'busCode', 'tripNum', 'stopPointId']) \
        .count() \
        .withColumnRenamed('count', 'alighting_cnt')

    trips_origins.write.csv(path=datapath + 'od/trips_origins/' + filepath,
                            header=True, mode='overwrite')
    trips_destinations.write.csv(path=datapath + 'od/trips_destinations/' + filepath,
                                 header=True, mode='overwrite')

    trips_o = sqlContext.read.csv(datapath + 'od/trips_origins/' + filepath,
                                  header=True, inferSchema=True, nullValue="-")
    trips_d = sqlContext.read.csv(datapath + 'od/trips_destinations/' + filepath,
                                  header=True, inferSchema=True, nullValue="-")

    trips_passengers = trips_o.join(
        trips_d,
        on=['date', 'route', 'busCode', 'tripNum', 'stopPointId'],
        how='outer')

    trips_window = Window.partitionBy(['date', 'route', 'busCode', 'tripNum']).orderBy('timestamp')

    od_matrix_route_boarding = filtered_od_matrix.groupby(['route']).count() \
        .withColumnRenamed('count', 'odmatrix_boarding')

    od_matrix_route_prop = bus_trip_data.groupby(['route']).count() \
        .withColumnRenamed('count', 'overall_boarding') \
        .join(od_matrix_route_boarding, 'route', 'left_outer') \
        .withColumn('extrap_factor',
                    F.when(((F.col('odmatrix_boarding') == 0) | (F.col('odmatrix_boarding').isNull())), 0.0)
                    .otherwise(F.col('overall_boarding').cast('float') / F.col('odmatrix_boarding')))

    buste_crowdedness_extrapolated = bus_trip_data \
        .join(trips_passengers, on=['date', 'route', 'busCode', 'tripNum', 'stopPointId'], how='left_outer') \
        .withColumn('crowd_bal', F.col('boarding_cnt') - F.col('alighting_cnt')) \
        .withColumn('num_pass', F.sum('crowd_bal').over(trips_window)) \
        .drop('numPassengers', 'gps_timestamp', 'gps_timestamp_in_secs') \
        .orderBy(['date', 'route', 'busCode', 'tripNum', 'timestamp']) \
        .join(od_matrix_route_prop, 'route', 'left') \
        .drop('overall_boarding', 'odmatrix_boarding') \
        .withColumn('ext_num_pass', F.col('num_pass') * F.col('extrap_factor'))

    return buste_crowdedness_extrapolated
    batch_size=args.batch_size,
    epochs=args.epochs,
    verbose=2)

keras_model = keras_estimator.fit(train_df).setOutputCols(['Sales'])

history = keras_model.getHistory()
best_val_rmspe = min(history['val_exp_rmspe'])
print('Best RMSPE: %f' % best_val_rmspe)

# Save the trained model.
keras_model.save(args.local_checkpoint_file)
print('Written checkpoint to %s' % args.local_checkpoint_file)

# ================ #
# FINAL PREDICTION #
# ================ #

print('================')
print('Final prediction')
print('================')

pred_df = keras_model.transform(test_df)
# Convert from log domain to real Sales numbers.
pred_df = pred_df.withColumn('Sales', F.exp(pred_df.Sales))
submission_df = pred_df.select(pred_df.Id.cast(T.IntegerType()), pred_df.Sales).toPandas()
submission_df.sort_values(by=['Id']).to_csv(args.local_submission_csv, index=False)
print('Saved predictions to %s' % args.local_submission_csv)

spark.stop()
import sys
from pyspark.sql import SparkSession, functions, types

spark = SparkSession.builder.appName('weather ETL').getOrCreate()
assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+
assert spark.version >= '2.1'  # make sure we have Spark 2.1+

observation_schema = types.StructType([
    types.StructField('station', types.StringType(), False),
    types.StructField('date', types.StringType(), False),
    types.StructField('observation', types.StringType(), False),
    types.StructField('value', types.IntegerType(), False),
    types.StructField('mflag', types.StringType(), False),
    types.StructField('qflag', types.StringType(), False),
    types.StructField('sflag', types.StringType(), False),
    types.StructField('obstime', types.StringType(), False),
])


def main():
    in_directory = sys.argv[1]
    out_directory = sys.argv[2]

    weather = spark.read.csv(in_directory, schema=observation_schema)
    # weather.show(); return

    qflagNull = weather.filter(weather['qflag'].isNull())
    # qflagNull.show(); return

    caStation = qflagNull.filter(
        functions.substring(qflagNull.station, 1, 2) == 'CA')
    # caStation.show(); return
import sys, os
assert sys.version_info >= (3, 5)  # make sure we have Python 3.5+
from pyspark.sql import SparkSession, functions, types, Row
from pyspark import SparkConf, SparkContext

app_name = "NCAA Basketball"
spark = SparkSession.builder.appName(app_name).getOrCreate()
assert spark.version >= '2.3'  # make sure we have Spark 2.3+
spark.sparkContext.setLogLevel('WARN')


# Function that maps the period to minutes remaining.
@functions.udf(returnType=types.IntegerType())
def period_mins_left(period):
    if period == '1st Period':
        return 30
    elif period == '2nd Period':
        return 20
    elif period == '3rd Period':
        return 10
    elif period == '4th Period':
        return 0
    elif period == '1st Half':
        return 20
    elif period == '2nd Half':
        return 0
    else:
        return 0


from resources import play_by_play_schema
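# Usage sketch for the decorated UDF above; the input path and the 'period'
# column name are assumptions for illustration.
plays = spark.read.csv('plays.csv', schema=play_by_play_schema)
plays = plays.withColumn('mins_left', period_mins_left(plays['period']))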