def dateTransform(self, columns, currentFormat, outputFormat):
    """
    :param columns       Name of the date column(s) to be transformed. Columns have to be
                         a string or a list of strings.
    :param currentFormat Current string date format of the columns specified. All columns
                         specified must share the same format; otherwise the function will
                         return null values, because the transformations on the columns
                         with a different format will fail.
    :param outputFormat  Output date string format to be expected.
    """
    # Check that the currentFormat argument is a string:
    self.__assertTypeStr(currentFormat, "currentFormat")
    # Check that the outputFormat argument is a string:
    self.__assertTypeStr(outputFormat, "outputFormat")
    # Check that the columns argument is a string or a list:
    self.__assertTypeStrOrList(columns, "columns")

    if isinstance(columns, str):
        columns = [columns]

    # Check that the columns to be processed are in the dataframe:
    self.__assertColsInDF(columnsProvided=columns, columnsDF=self.__df.columns)

    exprs = [date_format(unix_timestamp(c, currentFormat).cast("timestamp"), outputFormat).alias(c)
             if c in columns else c
             for c in self.__df.columns]

    self.__df = self.__df.select(*exprs)
    self.__addTransformation()  # checkpoint in case

    return self
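# A minimal standalone sketch of the same reformat pattern outside the class
# (hypothetical column name 'purchase_date'; assumes an existing `spark` session
# and `from pyspark.sql.functions import date_format, unix_timestamp`):
df = spark.createDataFrame([('25/11/1991',)], ['purchase_date'])
df.select(date_format(unix_timestamp('purchase_date', 'dd/MM/yyyy').cast('timestamp'),
                      'yyyy-MM-dd').alias('purchase_date')).show()
# expected value: 1991-11-25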
def ageCalculate(self, column, dateFormat, nameColAge):
    """
    This method computes the age of clients based on their birth dates.
    :param column     Name of the column that holds the birth dates.
    :param dateFormat String date format of the column provided.
    :param nameColAge Name of the new column that will hold the resulting ages.
    """
    # Check that the column argument is a string:
    self.__assertTypeStr(column, "column")
    # Check that the dateFormat argument is a string:
    self.__assertTypeStr(dateFormat, "dateFormat")
    # Assert that the column is in the dataframe:
    assert column in self.__df.columns, \
        "Error: Column assigned in column argument does not exist in dataFrame"

    # Output date format
    Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

    # months_between is negative for past dates, so take the absolute value
    # before converting months to years (the original `mag` helper is not a
    # PySpark function; abs is the standard equivalent):
    exprs = format_number(
        abs(
            months_between(date_format(
                unix_timestamp(column, dateFormat).cast("timestamp"), Format),
                current_date()) / 12), 4).alias(nameColAge)

    self.__df = self.__df.withColumn(nameColAge, exprs)
    self.__addTransformation()  # checkpoint in case

    return self
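# A modern equivalent sketch of the age computation above (hypothetical column
# 'born'; assumes `spark` and `from pyspark.sql.functions import *`):
spark.createDataFrame([('1990-06-15',)], ['born']) \
    .select(format_number(abs(months_between(
        to_date('born'), current_date())) / 12, 4).alias('age')).show()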
def parse_dates(df, format):
    """
    Parses the date into year, month, and day columns.
    :param df: input df (expects a 'transaction_date' string column)
    :param format: the format of the timestamp
    :return: dataframe
    """
    return df.withColumn('parsed_date', f.to_timestamp(f.col('transaction_date'), format)) \
        .withColumn("year", f.year(f.col('parsed_date'))) \
        .withColumn("month", f.month(f.col('parsed_date'))) \
        .withColumn("day", f.dayofmonth(f.col('parsed_date'))) \
        .withColumn("unix_ts", f.unix_timestamp('parsed_date')) \
        .drop("transaction_date")
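# A small usage sketch for parse_dates (toy data; assumes an existing `spark`
# session and `import pyspark.sql.functions as f`):
sample = spark.createDataFrame([('2020-07-04 12:30:00',)], ['transaction_date'])
parse_dates(sample, 'yyyy-MM-dd HH:mm:ss').show()
# yields year=2020, month=7, day=4, plus the parsed_date and unix_ts columns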
def do_something_only_once():
    # the command I use to run this script:
    # ~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda

    ## Loading of data
    sc = SparkContext(appName='Simple App')  # "local"
    sqlContext = SQLContext(sc)

    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')

    # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',', header='false', inferschema='true').load(topdisFile)
    # Strip the tuple delimiters that survive the CSV parse:
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0', strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1', strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20', strip_last_col(topdis['C20']))

    # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))

    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    # rowNumber() is the pre-Spark-2.0 name of row_number(); unpack the column
    # list so select() receives individual columns, and add the id column
    # before the join so lda.id exists when the join condition is built:
    lda = lda.select(rowNumber().alias('id'), *lda.columns)
    lda = lda.join(dic, dic.id == lda.id, 'inner').cache()

    # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)

    # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)

    # Merge topdis (which has the document id) with metadata, based on document id
    metasmall = meta.select('id', unix_timestamp(meta['date'], "yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0, 'inner').cache()
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
# query5.show()
# pd = query5.toPandas()
# pd.to_csv('static/Input/byDevice.csv', index=False)

# Query 6: Number of tweets by Language
query6 = spark.sql(
    "SELECT user.lang, count(*) AS Lang_Count FROM tweets where user.lang is "
    "not null group by user.lang order by Lang_Count desc LIMIT 10")
query6.show()
pd = query6.toPandas()
pd.to_csv('static/Input/byLanguage.csv', index=False)

year = df.select(
    date_format(from_unixtime(
        unix_timestamp('user.created_at', 'EEE MMM dd HH:mm:ss ZZZZZ yyyy')),
        'MM/dd/yyyy').alias('date'))
year.printSchema()
year.createOrReplaceTempView("year")

# Query 7: Number of tweets by Year (in the MM/dd/yyyy string the year is the
# 4 characters starting at position 7, so substr needs length 4, not 11)
query7 = spark.sql(
    "select count(*) as NumberOfTweets, substr(date, 7, 4) as TweetYear from year group by substr(date, 7, 4) "
    "order by substr(date, 7, 4) desc")
pd = query7.toPandas()
pd.to_csv('static/Input/byYear.csv', index=False)
query7.show()

# Query 8: Hashtag Count
spark = SparkSession.builder.appName('App1').config(
    conf=SparkConf()).getOrCreate()
today = datetime.date.today()
for x in range(1, 9):
    date1 = str(today - datetime.timedelta(days=x))
    data = 's3://paragon-datalake/googleanalytics/landing/product_details/product_details_page' + date1 + '.csv'
    df = spark.read.format('csv').options(header='true',
                                          inferSchema='true',
                                          sep='\t').load(data)
    # Replace spaces in the header names with dashes (use a dedicated variable
    # rather than shadowing the loop variable x):
    l1 = df.schema.fieldNames()
    l2 = []
    for header in l1:
        l2.append("-".join(header.split(" ")))
    data = df.toDF(*l2)
    # data.show()
    data = data.withColumn("date", data["date"].cast('String'))
    # data.printSchema()
    data = data.withColumn(
        'date',
        to_date(unix_timestamp(col('date'), 'yyyyMMdd').cast("timestamp")))
    data.write.mode("overwrite").parquet(
        's3://paragon-datalake/googleanalytics/processing/product_details/product_details_page' + date1)
from datetime import datetime
from pyspark.sql.functions import to_date, udf
from pyspark.sql.types import DateType
from pyspark.sql.functions import unix_timestamp, from_unixtime
from Pipelines.getSparkSession import getSparkSession

spark = getSparkSession()
sc = spark.sparkContext

df = spark.createDataFrame([('1997-02-28', )], ['t'])
df.select(to_date(df.t, 'yyyy-MM-dd').alias('dt')).collect()

df = spark.createDataFrame([("11/25/1991", ), ("11/24/1991", ),
                            ("11/30/1991", )], ['date_str'])
df2 = df.select(
    'date_str',
    from_unixtime(unix_timestamp('date_str', 'MM/dd/yyyy')).alias('date'))
df2.show()
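# Expected output of df2.show() (dates parsed at midnight, session time zone):
# +----------+-------------------+
# |  date_str|               date|
# +----------+-------------------+
# |11/25/1991|1991-11-25 00:00:00|
# |11/24/1991|1991-11-24 00:00:00|
# |11/30/1991|1991-11-30 00:00:00|
# +----------+-------------------+
# On Spark 2.2+ the round-trip can be collapsed into a single call (to_timestamp
# would need importing from pyspark.sql.functions):
# df.select('date_str', to_timestamp('date_str', 'MM/dd/yyyy').alias('date'))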
def get_static_features(spark):
    df_admissions = spark.read.csv(
        os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, 'ADMISSIONS.csv'),
        header=True,
        inferSchema="false").drop("EXPIRE_FLAG").drop("DISCHARGE_LOCATION")
    df_patients = spark.read.csv(
        os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, 'PATIENTS.csv'),
        header=True,
        inferSchema="false").select(['SUBJECT_ID', 'DOB', 'GENDER'])
    df_merge = df_admissions.join(df_patients, ['SUBJECT_ID'])

    # Age at admission in years (365.242 days per year):
    timeFmt = "yyyy-MM-dd' 'HH:mm:ss"  # 2153-09-03 07:15:00
    timeDiff = F.round((F.unix_timestamp('ADMITTIME', format=timeFmt) -
                        F.unix_timestamp('DOB', format=timeFmt)) /
                       (60 * 60 * 24 * 365.242)).cast('integer')
    df_merge = df_merge.withColumn("AGE_ADMISSION", timeDiff)
    df_merge = QuantileDiscretizer(
        numBuckets=5, inputCol='AGE_ADMISSION',
        outputCol='QAGE').fit(df_merge).transform(df_merge)
    t = {
        0.0: 'very-young',
        1.0: 'young',
        2.0: 'normal',
        3.0: 'old',
        4.0: 'very-old'
    }
    udf_age = udf(lambda x: t[x], StringType())
    df_merge = df_merge.withColumn('AGE', udf_age('QAGE'))
    df_merge = reduce(DataFrame.drop, [
        'SUBJECT_ID', 'ROW_ID', 'ADMITTIME', 'DISCHTIME', 'DEATHTIME',
        'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'LANGUAGE', 'RELIGION',
        'EDREGTIME', 'EDOUTTIME', 'DIAGNOSIS', 'HOSPITAL_EXPIRE_FLAG',
        'HAS_CHARTEVENTS_DATA', 'GENDER', 'DOB', 'DOD', 'DOD_HOSP', 'DOD_SSN',
        'EXPIRE_FLAG', 'AGE_ADMISSION', 'QAGE'
    ], df_merge)
    df_merge = df_merge.fillna({'MARITAL_STATUS': 'UNKNOWN_MARITAL'})
    categories = list(
        set(
            flatten([
                list(df_merge.select(c).distinct().collect())
                for c in df_merge.columns if c not in ['HADM_ID']
            ])))
    categories_dict = {}
    for i in range(len(categories)):
        categories_dict[categories[i]] = float(i)

    def mapFnLabels(row):
        one = categories_dict[row.AGE]
        two = categories_dict[row.ETHNICITY]
        three = categories_dict[row.MARITAL_STATUS]
        four = categories_dict[row.INSURANCE]
        feat_array = [one, two, three, four]
        return (row.HADM_ID, feat_array)

    hadmid_to_static_feats = df_merge.rdd.map(mapFnLabels)
    return hadmid_to_static_feats
# Assign unique numbers to rows of a dataframe (zipWithIndex would give
# consecutive ids; zipWithUniqueId is unique but not contiguous)
Z = spark.createDataFrame(
    d.select("colid").distinct().rdd.map(lambda x: x[0]).zipWithUniqueId())

# A window function calculates a return value for every input row of a table
# based on a group of rows, called the frame
from pyspark.sql.window import Window
window = Window.partitionBy(tmp['prediction']).orderBy(tmp['clust_count'].desc())
tmp.select('*', F.rank().over(window).alias('rank')) \
    .filter(col('rank') <= 5).head(5)

# K-means clustering
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler
# construct the scaler before use (assumes an assembled vector column named "features"):
scaler = StandardScaler(inputCol="features", outputCol="scaled")
d = scaler.fit(d).transform(d)
kmeans = KMeans(k=5, seed=1, featuresCol="scaled")
model = kmeans.fit(d)
centers = model.clusterCenters()

# Converting date to timestamp
x = x.withColumn("unix_time",
                 F.unix_timestamp(F.col("DATETIME"), format='yyyy-MM-dd HH:mm:ss'))

# pivot(pivot_col, values=None); pivot_col – name of the column to pivot.
# values – list of values that will be translated to columns in the output DataFrame
df4.groupBy("year").pivot("course", ["dotNET", "Java"]).sum("earnings").collect()

# Group by and count distinct / unique
df.groupby("device").agg(F.countDistinct(F.col("colid"))).toPandas()
df.groupby("device").agg(F.countDistinct(F.col("id"))).orderBy('count(DISTINCT id)', ascending=False)
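# A self-contained sketch of the top-N-per-group window pattern above
# (toy columns 'prediction' and 'clust_count'; assumes `spark`,
# `import pyspark.sql.functions as F`, and the Window import above):
toy = spark.createDataFrame(
    [(0, 10), (0, 7), (0, 3), (1, 9), (1, 2)], ['prediction', 'clust_count'])
w = Window.partitionBy('prediction').orderBy(F.col('clust_count').desc())
toy.withColumn('rank', F.rank().over(w)).filter(F.col('rank') <= 2).show()
# keeps the two largest clust_count rows per prediction group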
from pyspark.sql.functions import regexp_replace from pyspark.sql.functions import unix_timestamp from pyspark.sql.functions import * from pyspark.sql.types import * ## read data from mount point read_path = "/bmathew/clickstream/000000_0" jsonSchema = StructType([ StructField("http_vhost_name", StringType(), True), StructField("event_time", TimestampType(), True), StructField("page", StringType(), True), StructField("user", StringType(), True), StructField("session", StringType(), True) ]) df = spark.read.format("json").schema(jsonSchema).load(read_path) \ .select(current_timestamp().alias('event_load_time') ,from_unixtime(unix_timestamp(from_utc_timestamp(from_unixtime(unix_timestamp()), "PST")),"yyyy-MM-dd HH:mm:ss").alias("event_insert_time") ,col("http_vhost_name").alias("domain_name") ,regexp_replace(regexp_replace(col("event_time"),'T',' '),'Z','').alias("event_time") ,get_json_object('user', '$.browserId').alias('user_id'),get_json_object('page', '$.country').alias('country') ,get_json_object('page', '$.division').alias('division'),get_json_object('page', '$.app').alias('page_app') ,get_json_object('page', '$.id').alias('page_id'),get_json_object('user', '$.browser').alias('browser') ,get_json_object('user', '$.os').alias('os'),get_json_object('user', '$.platform').alias('platform')) \ .where("country IN ('MX','TW','PE','CO','UY','PR','CL','ID','CA','GB','JP','US','PA')") df.write.format("parquet").mode("overwrite").saveAsTable( "bmathew.test_databricks_api")
def test_string_unix_timestamp(data_gen, date_form): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, data_gen, seed=1).select( f.unix_timestamp(f.col('a'), date_form)))
fc_customer["SourceSystemCustomerId"]) va_customer = va_customer.withColumn("CustomerID", va_customer["SourceSystemCustomerId"]) sr_customer = sr_customer.withColumn("CustomerID", sr_customer["SourceSystemCustomerId"]) # COMMAND ---------- # DBTITLE 1,Convert FourthCoffee "CreatedDate" & "UpdatedDate" to date datatype from pyspark.sql.functions import unix_timestamp from pyspark.sql.functions import from_unixtime from pyspark.sql.functions import to_date fc_customer = fc_customer.withColumn( "CreatedDate", to_date(unix_timestamp("CreatedDate", "yyyy-MM-dd").cast("timestamp"))) fc_customer = fc_customer.withColumn( "UpdatedDate", to_date(unix_timestamp("UpdatedDate", "yyyy-MM-dd").cast("timestamp"))) # COMMAND ---------- # DBTITLE 1,Convert FourthCoffee "ZipCode" to a string datatype fc_customer.withColumn("ZipCode", fc_customer["ZipCode"].cast("string")) # COMMAND ---------- # DBTITLE 1,Convert "CustomerId" & "PhoneNumber" to a Long data type from pyspark.sql.types import IntegerType fc_customer = fc_customer.withColumn(
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import shutil
# init_notebook_mode(connected=True)

spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
df = sqlContext.read.load('/test_dev/mba-code/dataset.csv',
                          format='csv',
                          header='true',
                          inferSchema='true')
# In SimpleDateFormat patterns 'MM' is month and 'mm' is minutes, so the
# pattern below must be 'MM/dd/yyyy':
df.select(
    "InvoiceNo", "StockCode", "Description", "Quantity", "InvoiceDate",
    "InvoiceDateWS",
    date_format(from_unixtime(unix_timestamp('InvoiceDateWS', 'MM/dd/yyyy')),
                'EEEE').alias('weekday'), "CustomerID",
    "Country").write.save("Invoices.parquet", format="parquet")

parquetFile = spark.read.parquet("Invoices.parquet")
# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")

from operator import add
from apyori import apriori, load_transactions
import pyspark.sql.functions as f
from mlxtend.preprocessing import TransactionEncoder
from pyspark.sql.functions import explode

DescriptionGrp = spark.sql(
    "SELECT distinct InvoiceNo,StockCode FROM parquetFile group by InvoiceNo,StockCode"
)
print(DescriptionGrp.rdd.take(2))
minSupport = 0.05 * DescriptionGrp.rdd.count()
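# Quick check of the weekday extraction above (toy row; assumes `spark` and the
# same pyspark.sql.functions imports as the snippet):
spark.createDataFrame([('12/01/2010',)], ['InvoiceDateWS']).select(
    date_format(from_unixtime(unix_timestamp('InvoiceDateWS', 'MM/dd/yyyy')),
                'EEEE').alias('weekday')).show()
# 12/01/2010 was a Wednesday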
print("\n\n\nSample Records of Filtering are\n\n\n") df_filter.show(10, False) df1 = df_filter.withColumn( "prevTimestamp", lag(df_filter.timestamp, 1).over( Window.partitionBy(df_filter.ip_address).orderBy(df_filter.timestamp))) #definging the time frame window duration of 15 minutes in sesocnds and time format timeFmt = "yyyy-MM-dd'T'HH:mm:ss.SSS" timeWindow = 900 #Adding new columns that are needed in calcualtions df2 = df1.withColumn( "diff_seconds", F.unix_timestamp(df1.timestamp, format=timeFmt) - F.unix_timestamp(df1.prevTimestamp, format=timeFmt)) df3 = df2.withColumn("isNewSession", F.when(df2.diff_seconds <= timeWindow, 0).otherwise(1)) df4 = df3.withColumn( "session_id", sum(df3.isNewSession).over( Window.partitionBy(df3.ip_address).orderBy(df3.ip_address, df3.timestamp))) df5 = df4.withColumn( "session_count_distinct", max(df4.session_id).over(Window.partitionBy(df4.ip_address)))
# Time window / moving average
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# function to calculate number of seconds from number of days
days = lambda i: i * 86400

df = spark.createDataFrame([(17, "2017-03-10"), (15, "2017-03-10"),
                            (3, "2017-03-11"), (13, "2017-03-12"),
                            (19, "2017-03-13"), (25, "2017-03-14"),
                            (22, "2017-03-16")], ["dollars", "timestampGMT"])
df = df.withColumn('timestampGMT', df.timestampGMT.cast('timestamp'))

# create window by casting timestamp to long (number of seconds)
# w = (Window.orderBy(F.col("add_dt_timestamp").cast('long')).rangeBetween(-days(3), 0))
# note: -days(3) looks back three days *excluding* the current day, so the
# frame actually covers 4 days
w = (Window.orderBy(F.col("timestampGMT").cast('long')).rangeBetween(0, days(2)))
# this frame includes the current day, e.g. 03-10 plus the next two days
# through 03-12, i.e. 3 days in total
df = df.withColumn('rolling_average', F.avg("dollars").over(w))
# alternatively, aggregate by day first and then use rowsBetween

# group by time period
interval = 60 * 60 * 24 * 3  # 3 days
# (assumes an 'id' column on the frame being grouped)
df.withColumn('time_interval',
              F.from_unixtime(F.floor(F.unix_timestamp(F.col('timestampGMT'), "yyyy-MM-dd") / interval) * interval)) \
    .groupBy('time_interval') \
    .agg(F.collect_list("id").alias("id_list")).show()

# group by time period, variant 2
df = df.withColumn("date_index", F.dense_rank().over(Window.orderBy(F.col('timestampGMT'))))
date_bucket_func = F.udf(lambda x: x // 3 if x % 3 else x // 3 - 1)
df = df.withColumn("date_bucket", date_bucket_func(F.col('date_index')))
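# Quick check of the forward-looking window above: for 2017-03-10 the frame
# covers 03-10 through 03-12, i.e. (17 + 15 + 3 + 13) / 4 = 12.0
df.select('timestampGMT', 'rolling_average').show()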
# raw.describe().show() # COMMAND ---------- #The user and repository with metrics to be charted # USER_FOR_METRICS = getArgument("User") REPOSITORY_FOR_METRICS = getArgument("Repository") # COMMAND ---------- from pyspark.sql import functions, Row, Column raw = raw.withColumn( 'closed', functions.when(raw.closed.isNull(), functions.current_timestamp()).otherwise(raw.closed)) duration = functions.unix_timestamp(raw.closed) - functions.unix_timestamp( raw.created) processed_data = raw.withColumn('age', duration) # processed_data.show() # print(processed_data.count()) # display(processed_data) # COMMAND ---------- #Finds number of rows in which url (id) appears, generating count for number of review teams df1 = raw.withColumn('exploded_review_team', functions.explode(raw.review_teams)) num_teams = df1.groupBy(df1.url).count() num_teams = num_teams.withColumnRenamed('count',
tgtfilePathId = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/riskrun/" tgtfilePathHeader = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/header/" tgtfilePathDataLookup = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/datalookup/" tgtfilePathResult = "s3://" + bucket + "/" + enriched_path + vendor + "/DataModels/result/" dfParquet = sparkSession.read.format("parquet").load(srcfilePath) df = dfParquet.withColumn("year",sf.split("createdDate","\-")[0]) \ .withColumn("month",sf.split("createdDate","\-")[1]) \ .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0]) dfbaseData = df.select([col for col in df.columns]) dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \ (sf.unix_timestamp(dfbaseData.createdDatePT) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \ .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \ .select(dfbaseData.id \ ,dfbaseData.mvpApplicantId \ ,dfbaseData.loanApplicationId \ ,dfbaseData.mvpLoanApplicationId \ ,sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("ewsStatusTimestampUTC") \ ,dfbaseData.createdDatePT.alias("ewsStatusTimestampPT") \ ,dfrmi.id.alias("rmiId") \ ,dfrmi.date_created.alias("riskTimestampPT") \ ,dfrmsNS.score_type.alias("scoreType") \ ,dfbaseData.year \ ,dfbaseData.month \ ,dfbaseData.day) dfrankedId = dfjoin.withColumn("row_num", sf.row_number().over(Window.partitionBy("loanApplicationId","ewsStatusTimestampPT").orderBy(sf.desc("riskTimestampPT"),sf.desc("scoreType")))) \
def loadParquet(self):
    self.log.info('Exception Handling starts')

    validSourceFormat = self.isValidFormatInSource()
    if not validSourceFormat:
        self.log.error("Product Source files not in csv format.")

    validSourceSchema = self.isValidSchemaInSource()
    if not validSourceSchema:
        self.log.error(
            "Product Source schema does not have all the required columns.")

    if not validSourceFormat or not validSourceSchema:
        self.log.info(
            "Copy the source files to data processing error path and return.")
        self.copyFile(
            self.productRQ4File, self.dataProcessingErrorPath + '/' +
            self.productRQ4Name + self.fileFormat)
        self.copyFile(
            self.couponsFile, self.dataProcessingErrorPath + '/' +
            self.couponsName + self.fileFormat)
        self.copyFile(
            self.productIdentifierFile, self.dataProcessingErrorPath + '/' +
            self.productIdentifierName + self.fileFormat)
        return

    self.log.info('Source format and schema validation successful.')
    self.log.info('Reading the input parquet file')

    # all three sources share the same CSV reader configuration, so factor it out:
    def readCsv(path, header):
        return self.sparkSession.read.format("com.databricks.spark.csv"). \
            option("encoding", "UTF-8"). \
            option("ignoreLeadingWhiteSpace", "true"). \
            option("ignoreTrailingWhiteSpace", "true"). \
            option("header", "true"). \
            option("treatEmptyValuesAsNulls", "true"). \
            option("inferSchema", "true"). \
            option("escape", '"'). \
            option("quote", "\""). \
            option("multiLine", "true"). \
            load(path).toDF(*header)

    dfProduct = readCsv(self.productRQ4FilePath, self.productRQ4Header)
    dfProductIden = readCsv(self.productIdentifierFilePath, self.productIdentifierHeader)
    dfCoupons = readCsv(self.couponsFilePath, self.couponsHeader)

    dfProduct.coalesce(1).write.mode('overwrite').format('parquet').save(
        self.productRQ4OutputWorkingPath)
    dfCoupons.coalesce(1).write.mode('overwrite').format('parquet').save(
        self.couponsOutputWorkingPath)
    dfProductIden.coalesce(1).write.mode('overwrite').format(
        'parquet').save(self.productIdentifierOutputWorkingPath)

    # the year/month partition columns are derived from the current load time:
    def writePartitioned(df, path):
        df.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))). \
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)). \
            write.mode('append').partitionBy('year', 'month').format('parquet'). \
            save(path)

    writePartitioned(dfProduct, self.productRQ4OutputPartitionPath)
    writePartitioned(dfCoupons, self.couponsOutputPartitionPath)
    writePartitioned(dfProductIden, self.productIdentifierOutputPartitionPath)

    self.sparkSession.stop()
spark = SparkSession.builder.appName( 'sp500_agg_calculations').getOrCreate() # reading in the data data = spark.read.format('csv') \ .options(header='true', inferschema='false') \ .load(sys.argv[1]) data = data.select('Symbol', 'Name', 'Date', 'Log Return') \ .filter(col('Log Return') != '') data = data.withColumn( 'Date', to_date(unix_timestamp(col('Date'), 'yyyy-MM-dd').cast('timestamp'))) data = data.withColumn('Log Return', col('Log Return').cast('float').alias('Log Return')) # cumulative log return sum cum_window = Window.partitionBy(data['Symbol']).orderBy( data['Date']).rangeBetween(Window.unboundedPreceding, 0) cum_sum = data.withColumn('CumulativeSum', sum('Log Return').over(cum_window)) cum_sum = cum_sum.orderBy(col('Symbol'), col('Date')) # total log return sum sum_window = Window.partitionBy('Symbol') sp_sum = cum_sum.withColumn('MaxDate', max('Date').over(sum_window)) \ .where(col('Date') == col('MaxDate')) \ .drop('MaxDate', 'Log Return', 'Date') \
def main():
    sc = SparkContext()
    glueContext = GlueContext(sc)
    spark = glueContext.spark_session
    # job = Job(glueContext)
    # job.init(args['JOB_NAME'], args)
    spark.conf.set("spark.sql.session.timeZone", "GMT+07:00")

    is_dev = True
    ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh')
    today = datetime.now(ho_chi_minh_timezone)
    today_second = long(today.strftime("%s"))
    print('today_id: ', today_second)

    #------------------------------------------------------------------------------------------------------------------#
    # ------------------------------------------------------------------------------------------------------------------#
    def concaText(student_behavior_date, behavior_id, student_id, contact_id,
                  package_code, package_endtime, package_starttime,
                  student_level_code, student_status_code, transformed_at):
        text_concat = ""
        if student_behavior_date is not None:
            text_concat += str(student_behavior_date)
        if behavior_id is not None:
            text_concat += str(behavior_id)
        if student_id is not None:
            text_concat += str(student_id)
        if contact_id is not None:
            text_concat += str(contact_id)
        if package_code is not None:
            text_concat += str(package_code)
        if package_endtime is not None:
            text_concat += str(package_endtime)
        if package_starttime is not None:
            text_concat += str(package_starttime)
        if student_level_code is not None:
            text_concat += str(student_level_code)
        if student_status_code is not None:
            text_concat += str(student_status_code)
        if transformed_at is not None:
            text_concat += str(transformed_at)
        return text_concat

    concaText = f.udf(concaText, StringType())

    def convertStudentIdToLong(student_id):
        try:
            student_id_long = long(student_id)
            return student_id_long
        except:
            return 0L

    convertStudentIdToLong = f.udf(convertStudentIdToLong, LongType())

    # ------------------------------------------------------------------------------------------------------------------#
    ##################################################
    # Get the technical-test check data from the student_technical_test table
    dyf_datasourceTech = glueContext.create_dynamic_frame.from_catalog(
        database="dm_toa", table_name="student_technical_test_odin")

    # print('dyf_datasourceTech')
    # dyf_datasourceTech.printSchema()

    # Select the required fields
    dyf_datasourceTech = dyf_datasourceTech.select_fields([
        '_key', 'thoigianhenktkt', 'ketluan', 'emailhocvien', 'dauthoigian',
        'emailadvisor', 'nguoitiepnhan', 'trinhdohocvien'
    ])

    dyf_datasourceTech = dyf_datasourceTech.resolveChoice(
        specs=[('_key', 'cast:long')])

    if (dyf_datasourceTech.count() > 0):
        dyf_datasourceTech = Filter.apply(
            frame=dyf_datasourceTech,
            f=lambda x: x["emailhocvien"] is not None and x["emailhocvien"] != ''
            and x["thoigianhenktkt"] is not None and x["thoigianhenktkt"] != ''
            and x["ketluan"] == 'Pass')

        dyf_datasourceTech_number = dyf_datasourceTech.count()
        print("Count data 2: ", dyf_datasourceTech_number)

        if dyf_datasourceTech_number < 1:
            return

        dy_datasourceTech = dyf_datasourceTech.toDF()
        dy_datasourceTech = dy_datasourceTech.limit(100)

        print('dy_datasourceTech')
        dy_datasourceTech.printSchema()

        dy_datasourceTech = dy_datasourceTech.withColumn(
            'thoigianhenktkt_id',
            f.unix_timestamp('thoigianhenktkt', 'yyyy-MM-dd HH:mm:ss').cast('long'))

        print('dy_datasourceTech__2')
        dy_datasourceTech.printSchema()

        # get the first activation time
        w2 = Window.partitionBy("emailhocvien").orderBy(
            f.col("thoigianhenktkt_id").desc())
        # note: no trailing backslash after .where(...), which would otherwise
        # join this statement with the following line
        dy_datasourceTech = dy_datasourceTech.withColumn("row", f.row_number().over(w2)) \
            .where(f.col('row') <= 1)

        print('dy_datasourceTech__3')
        dy_datasourceTech.printSchema()
    #--------------------------------------------------------------------------------------------------------------#
    dyf_student_contact = glueContext.create_dynamic_frame.from_catalog(
        database="tig_advisor", table_name="student_contact")

    # select the fields we need
    dyf_student_contact = dyf_student_contact.select_fields(
        ['_key', 'contact_id', 'student_id', 'user_name'])

    dyf_student_contact = Filter.apply(
        frame=dyf_student_contact,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '' and
        x["student_id"] is not None and x["student_id"] != '' and x[
            "user_name"] is not None and x["user_name"] != '')

    dyf_student_contact_number = dyf_student_contact.count()
    print('dyf_student_contact_number::number: ', dyf_student_contact_number)
    if dyf_student_contact_number < 1:
        return

    dy_student_contact = dyf_student_contact.toDF()
    # assign the result back; dropDuplicates returns a new DataFrame
    dy_student_contact = dy_student_contact.dropDuplicates(['student_id'])

    dy_join_teach_concat = dy_datasourceTech.join(
        dy_student_contact,
        dy_datasourceTech.emailhocvien == dy_student_contact.user_name)

    print('dyf_join_teach_concat::schema')
    dy_join_teach_concat.printSchema()

    join_teach_concat_number = dy_join_teach_concat.count()
    print('join_teach_concat_number::number: ', join_teach_concat_number)
    if join_teach_concat_number < 1:
        return

    #--------------------------------------------------------------------------------------------------------------#
    dyf_student_package_status = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior", table_name="student_status")

    dyf_student_package_status = dyf_student_package_status\
        .select_fields(['contact_id', 'status_code', 'start_date', 'end_date'])\
        .rename_field('contact_id', 'contact_id_ps')

    print('dyf_student_package_status::drop_duplicates')
    df_student_package_status = dyf_student_package_status.toDF()
    print('dyf_student_package_status::drop_duplicates::before: ',
          df_student_package_status.count())
    df_student_package_status = df_student_package_status.drop_duplicates()
    print('dyf_student_package_status::drop_duplicates::after: ',
          df_student_package_status.count())

    print('dy_student_package_status')
    df_student_package_status.printSchema()

    # --------------------------------------------------------------------------------------------------------------#
    dyf_student_package = glueContext.create_dynamic_frame.from_catalog(
        database="od_student_behavior", table_name="student_package")

    print('dyf_student_package__0')
    dyf_student_package.printSchema()

    dyf_student_package = dyf_student_package \
        .select_fields(['student_id', 'package_code', 'start_time', 'end_time'])\
        .rename_field('student_id', 'student_id_pk')

    # --------------------------------------------------------------------------------------------------------------#
    print('dyf_student_package__1')
    dyf_student_package.printSchema()

    dyf_student_package = dyf_student_package.resolveChoice(
        specs=[('start_time', 'cast:long'), ('end_time', 'cast:long')])

    print('dyf_student_package__2')
    dyf_student_package.printSchema()

    df_student_package = dyf_student_package.toDF()
    print('df_student_package::drop_duplicates::before: ',
          df_student_package.count())
    df_student_package = df_student_package.drop_duplicates()
    print('df_student_package::drop_duplicates::after: ',
          df_student_package.count())

    print('df_student_package')
    df_student_package.printSchema()
    df_student_package.show(3)

    df_student_package_number = df_student_package.count()
    print('df_student_package_number: ', df_student_package_number)

    # --------------------------------------------------------------------------------------------------------------#
    # --------------------------------------------------------------------------------------------------------------#
    dy_join_teach_concat_number = dy_join_teach_concat.count()
    print('dy_join_teach_concat_number: ', dy_join_teach_concat_number)

    join_result = dy_join_teach_concat\
        .join(df_student_package_status,
              (dy_join_teach_concat.contact_id == df_student_package_status.contact_id_ps)
              & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package_status.start_date)
              & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package_status.end_date),
              'left'
              )\
        .join(df_student_package,
              (dy_join_teach_concat.student_id == df_student_package.student_id_pk)
              & (dy_join_teach_concat.thoigianhenktkt_id >= df_student_package.start_time)
              & (dy_join_teach_concat.thoigianhenktkt_id < df_student_package.end_time),
              'left'
              )

    print('join_result')
    join_result.printSchema()

    join_result_number = join_result.count()
    print('join_result_number: ', join_result_number)
    if join_result_number < 1:
        return

    join_result.show(3)

    student_id_unavailable = 0L
    package_endtime_unavailable = 99999999999L
    package_starttime_unavailable = 0L
    package_code_unavailable = 'UNAVAILABLE'
    student_level_code_unavailable = 'UNAVAILABLE'
    student_status_code_unavailable = 'UNAVAILABLE'
    measure1_unavailable = 0
    measure2_unavailable = 0
    measure3_unavailable = 0
    measure4_unavailable = float(0.0)

    # join_result = join_result.withColumnRenamed('student_id', 'student_id_a')
    join_result = join_result.select(
        join_result.thoigianhenktkt_id.alias('student_behavior_date'),
        f.lit(5L).alias('behavior_id'),
        'student_id',
        join_result.contact_id.alias('contact_id'),
        join_result.package_code.alias('package_code'),
        join_result.end_time.cast('long').alias('package_endtime'),
        join_result.start_time.cast('long').alias('package_starttime'),
        join_result.trinhdohocvien.cast('string').alias('student_level_code'),
        join_result.status_code.cast('string').alias('student_status_code'),
        f.lit(today_second).cast('long').alias('transformed_at'),
    )

    # fill each *_unavailable default into its matching column (endtime gets
    # the endtime default and starttime the starttime default):
    join_result = join_result.na.fill({
        'package_code': package_code_unavailable,
        'package_endtime': package_endtime_unavailable,
        'package_starttime': package_starttime_unavailable,
        'student_level_code': student_level_code_unavailable,
        'student_status_code': student_status_code_unavailable
    })

    print('join_result--1')
    join_result.printSchema()
    join_result.show(1)

    join_result = join_result.withColumn(
        'student_behavior_id',
        f.md5(
            concaText(join_result.student_behavior_date,
                      join_result.behavior_id, join_result.student_id,
                      join_result.contact_id, join_result.package_code,
                      join_result.package_endtime,
                      join_result.package_starttime,
                      join_result.student_level_code,
                      join_result.student_status_code,
                      join_result.transformed_at)))
    #
    print('join_result--2')
    join_result.printSchema()
    join_result.show(5)

    dyf_join_result = DynamicFrame.fromDF(join_result, glueContext,
                                          'dyf_join_result')

    dyf_join_result = Filter.apply(
        frame=dyf_join_result,
        f=lambda x: x["contact_id"] is not None and x["contact_id"] != '')

    apply_output = ApplyMapping.apply(
        frame=dyf_join_result,
        mappings=[("student_behavior_id", "string", "student_behavior_id",
                   "string"),
                  ("student_behavior_date", "long", "student_behavior_date",
                   "long"),
                  ("behavior_id", "long", "behavior_id", "long"),
                  ("student_id", "string", "student_id", "long"),
                  ("contact_id", "string", "contact_id", "string"),
                  ("package_code", "long", "package_code", "string"),
                  ("package_endtime", "long",
"package_endtime", "long"), ("package_starttime", "long", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long")]) dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2") # glueContext.write_dynamic_frame.from_options( frame=dfy_output, connection_type="s3", connection_options={ "path": "s3://dtsodin/student_behavior/student_behavior", "partitionKeys": ["behavior_id"] }, format="parquet")
# Entry point: create a Spark session
spark = SparkSession.builder.appName("PySpark Test").getOrCreate()

# ==> Reading from a local CSV file without having to define a schema:
# ==> df = spark.read.csv(r'./resources/data.csv', header=True)

# Create a DataFrame from the array of values and the defined schema
df = spark.createDataFrame(data=data, schema=schema)

# Create a new DataFrame that adds a new column,
# converting the String field to a date
# and dropping the (string-typed) source date column
df2 = df.withColumn(
    'Atualizacao',
    F.to_date(
        F.unix_timestamp(
            F.col('Data_Atualizacao'),
            'yyyy-MM-dd').cast("timestamp"))).drop('Data_Atualizacao')

# Create the final DataFrame, adding a new column
# that ranks by the update date,
# grouped by municipality and ordered
df_final = df2.withColumn(
    "Ordem Transacao",
    F.dense_rank().over(
        W.partitionBy("Municipio").orderBy("Atualizacao"))).orderBy(
            "Municipio", "Ordem Transacao")

# Show the result
df_final.show(truncate=False)
    .getOrCreate()

## creating spark and hive context
sc = spark.sparkContext
sqlctxt = HiveContext(sc)

## reading data from hive table
data = sqlctxt.sql(
    "SELECT nav_date,fund_id,client_name,datetime,date_identifier from h011gtcsandbox.xnd_pricing_data"
)
data = data.dropna()

## changing date column to required format ('yyyy' for a four-digit year)
data = data.withColumn(
    'nav_date',
    f.from_unixtime(f.unix_timestamp('nav_date', 'MM/dd/yyyy')).alias('nav_date'))
data = data.withColumn('nav_date', data.nav_date.cast(DateType()))

## changing datetime to required format
data = data.withColumn(
    'datetime',
    f.from_unixtime(f.unix_timestamp('datetime', 'MM/dd/yy HH:mm')))
data = data.withColumn('datetime', data.datetime.cast(TimestampType()))

# April has 30 days, so the upper bound must be 2018-04-30
data = data.filter((data.nav_date >= '2017-10-02')
                   & (data.nav_date <= '2018-04-30'))

## collecting date for partitioning
date = data.select(f.max('nav_date')).collect()[0][0]
year = date.year
month = date.month
def test_unix_timestamp(data_gen): assert_gpu_and_cpu_are_equal_collect(lambda spark: unary_op_df( spark, data_gen).select(f.unix_timestamp(f.col('a'))))
    by flight) order by UTC')

adsbTable = adsbTable.dropDuplicates(['flight', 'lon', 'lat'])

# The next set of lines handles the scenario where the same flight number from
# one airport arrives at another airport, then departs for a third. We do not
# want to treat this as one flight, but as multiple flights. For example, if
# flight ACA108 comes in from Montreal to Toronto and then leaves for Winnipeg,
# we will have ACA108-0 and ACA108-1.
#
# If there was just one flight number, then only -0 is appended.

# partition the coordinates by flight number, ordered by UTC
window_flightNum_UTC = Window.partitionBy("flight").orderBy("UTC")

# calculate the deltaTime between the previous and current coordinate for each flight
adsbDf = adsbTable.withColumn("deltaTime",
                              psf.unix_timestamp('UTC') - psf.lag(psf.unix_timestamp('UTC')). \
                              over(window_flightNum_UTC))

# Create a trigger when the time elapsed from the previous coordinate to the
# next is more than 1200 seconds (20 minutes). Note that this isn't foolproof:
# if there are gaps of 20 minutes or longer in your coverage, this will break
# one flight into two separate flights. Adjust the 1200 seconds below to suit
# your requirements.
adsbDf1 = adsbDf.withColumn(
    "trigger", psf.when(adsbDf.deltaTime > 1200, 1).otherwise(0))

# create the suffix for the flight by using the cumulative sum of the triggers for each flight
adsbDf2 = adsbDf1.withColumn(
    "suffix", psf.sum(adsbDf1.trigger).over(window_flightNum_UTC))
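# A hedged final step that the comments above describe but the snippet stops
# short of: appending the suffix to the flight number (ACA108-0, ACA108-1, ...):
adsbDf3 = adsbDf2.withColumn(
    'flight_leg',
    psf.concat(psf.col('flight'), psf.lit('-'), psf.col('suffix').cast('string')))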
def filter_chart_events(spark, orig_chrtevents_file_path,
                        admissions_csv_file_path,
                        filtered_chrtevents_outfile_path):
    # TAKES ONLY THE RELEVANT ITEM ROWS FROM THE CHARTEVENTS.CSV FILE
    item_mappings = get_event_key_ids()

    # use a subset of the large CHARTEVENTS.csv file for faster development
    chrtevents_file_path_to_use = orig_chrtevents_file_path
    use_sample_subset_lines = False
    if use_sample_subset_lines:
        chartevents_sample_temp_file = "CHARTEVENTS_SAMPLE.csv"
        chrtevents_file_path_to_use = chartevents_sample_temp_file
        temp_file = open(chartevents_sample_temp_file, "w+")
        with open(orig_chrtevents_file_path) as orig_file:
            i = 0
            for line in orig_file:
                temp_file.write(line)
                i = i + 1
                if i > 500000:
                    break
        temp_file.close()

    # LOS ***
    #*********
    los_path = os.path.join(PATH_MIMIC_ORIGINAL_CSV_FILES, "ICUSTAYS.csv")
    df_los = spark.read.csv(los_path, header=True, inferSchema="false")
    # Filter out stays with a length of less than one day
    df_los = df_los.filter(col('LOS') >= 1).select(['HADM_ID'])

    df_chartevents = spark.read.csv(chrtevents_file_path_to_use,
                                    header=True,
                                    inferSchema="false")
    filtered_chartevents = df_chartevents.filter(
        col('ITEMID').isin(list(item_mappings.keys())))
    # translate(...) is a project helper (defined elsewhere) that returns a
    # udf mapping ITEMID to its item name:
    filtered_chartevents = filtered_chartevents.withColumn(
        "ITEMNAME", translate(item_mappings)("ITEMID"))

    # join filtered_chartevents with ADMISSIONS.csv on HADM_ID --- only keep
    # the HADM_ID and ADMITTIME columns from ADMISSIONS
    df_admissions = spark.read.csv(admissions_csv_file_path,
                                   header=True,
                                   inferSchema="false").select(
                                       'HADM_ID', 'ADMITTIME')

    # add a column that contains the hour the observation occurred after admission (0 - X)
    filtered_chartevents = filtered_chartevents.join(df_admissions, ['HADM_ID'])
    timeFmt = "yyyy-MM-dd' 'HH:mm:ss"  # 2153-09-03 07:15:00
    # calc diff, convert seconds to hours, then round away the decimal places
    # (for hourly bin/aggregations)
    timeDiff = F.round(
        (F.unix_timestamp('CHARTTIME', format=timeFmt) -
         F.unix_timestamp('ADMITTIME', format=timeFmt)) / 60 / 60).cast('integer')
    filtered_chartevents = filtered_chartevents.withColumn(
        "HOUR_OF_OBS_AFTER_HADM", timeDiff)

    # filter out all observations where X > 48 (occurred after the initial 48 hours of admission)
    filtered_chartevents = filtered_chartevents.filter(
        (col('HOUR_OF_OBS_AFTER_HADM') <= 48)
        & (col('HOUR_OF_OBS_AFTER_HADM') >= 0))
    filtered_chartevents = df_los.join(filtered_chartevents, ['HADM_ID'])

    # remove columns that are not needed (keep the CHARTEVENTS cols, ITEMNAME,
    # and HOUR_OF_OBS_AFTER_HADM)
    filtered_chartevents = reduce(DataFrame.drop, ['ADMITTIME'],
                                  filtered_chartevents)

    with open(filtered_chrtevents_outfile_path, "w+") as f:
        w = csv.DictWriter(f, fieldnames=filtered_chartevents.schema.names)
        w.writeheader()
        for rdd_row in filtered_chartevents.rdd.toLocalIterator():
            w.writerow(rdd_row.asDict())
def main():
    """
    - Downloads outstanding data
    - Sets up Spark environment
    - Loads data
    - Summarises data
    - Merges data
    - Prepares data for modelling
    :return: None
    """
    # --- Download data (if it's not already downloaded)
    if config.MODE == 'prod':
        # In production mode we want to download all the csv files
        # Datasets in develop are controlled by the user
        # Download and save taxi journey data
        download_data(config.TAXI_DATA_URLS, config.TAXI_DATA_DIR)
        # Download and save road traffic accident data
        download_data(config.ACCIDENT_DATA_URLS, config.ACCIDENT_DATA_DIR)

    # --- Set up Spark environment
    spark = SparkSession.builder.appName('Basics').getOrCreate()

    # --- Load data
    # Load and parse taxi data, add an id column
    taxi_df = load_data(spark, data_dir=config.TAXI_DATA_DIR, schema=config.TAXI_DATA_SCHEMA)\
        .withColumn(config.TAXI_ID_COL, monotonically_increasing_id())

    logging.info(
        f'Selecting a random {config.SAMPLE_RATE * 100}% of data before merging'
    )
    # Take a random sample (config.SAMPLE_RATE) of the data with seed=1
    splits = taxi_df.randomSplit([1 - config.SAMPLE_RATE, config.SAMPLE_RATE],
                                 seed=1)
    taxi_df = splits[1]

    # Load and parse accident data, add an id column and a timestamp column
    accident_df = load_data(spark, data_dir=config.ACCIDENT_DATA_DIR, schema=config.ACCIDENT_DATA_SCHEMA)\
        .withColumn(config.ACCIDENT_ID_COL, monotonically_increasing_id())\
        .withColumn('accident_timestamp', unix_timestamp('date', 'MM/dd/yyyy').cast('timestamp'))

    # --- Summarise data
    # Plot and save data summary (if it's not already saved)
    # plot_summary(taxi_df, 'pickup_datetime', config.TAXI_ID_COL, config.TAXI_VOLUME_PLOT_FILE)
    # plot_summary(accident_df, 'accident_timestamp', config.ACCIDENT_ID_COL, config.ACCIDENT_VOLUME_PLOT_FILE)

    # --- Create ML features
    # Merge nearby accidents with taxi trips (this is a very long running process)
    df = merge_accidents(taxi_df, accident_df)

    # Create day of week, hour of day values from time stamp
    df = timestamps_to_features(df, 'pickup_datetime')
    df = timestamps_to_features(df, 'dropoff_datetime')

    # --- Log the results of the data preparation stages
    logging.info('Data preparation complete')
    # RDD has a countApprox() function that is quicker than .count(); unsure if the conversion from DataFrame to RDD
    # cancels out the savings
    logging.info(f'Number of rows: {df.rdd.countApprox(10)}')
    logging.info(f'Data schema: \n{df._jdf.schema().treeString()}')

    # TODO extract all this to function(s)
    # --- Train a gradient boosted trees model to predict the duration of a trip
    # Create the target variable ('label' is a special column name in MLlib)
    logging.info(
        'Creating label column (based on the delta between dropoff_datetime and pickup_datetime)'
    )
    df = df.withColumn(
        'label',
        unix_timestamp(df['dropoff_datetime']) -
        unix_timestamp(df['pickup_datetime']))

    logging.info('Dropping rows that contain null (for a subset of columns)')
    # Drop any samples with a NULL value. Note: list.extend() returns None, so
    # the subset has to be built with concatenation, not extend():
    df = df.na.drop(
        how="any",
        subset=config.TAXI_DATA_SCHEMA.fieldNames() + [
            'pickup_datetime_day_of_week', 'pickup_datetime_hour_of_day',
            'pickup_datetime_month_of_year', 'dropoff_datetime_day_of_week',
            'dropoff_datetime_hour_of_day', 'dropoff_datetime_month_of_year'
        ])

    logging.info('Filling any remaining null values with 99999')
    # Filling na values with code 99999
    df = df.na.fill(value=99999)

    logging.info('Splitting records into training and testing sets')
    # Split the data into training and test sets (30% held out for testing)
    (training_df, testing_df) = df.randomSplit([0.7, 0.3])

    # Define a "VectorAssembler", which joins multiple columns into a single vector
    ignore = [
        'label', 'pickup_datetime', 'dropoff_datetime',
        config.ACCIDENT_ID_COL, config.TAXI_ID_COL
    ]
    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col not in ignore],
        outputCol='features')

    # Transform the data using the defined assembler
    logging.info('Transforming data using VectorAssembler')
    df = assembler.transform(df)

    # Define a "VectorIndexer", which marks fields with fewer than 20 unique
    # values as categorical and maps their values to category indices
    feature_indexer = VectorIndexer(inputCol="features",
                                    outputCol="indexedFeatures").fit(
                                        df.select('features'))

    # Train a GBT model
    gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

    # Chain assembler, indexer and GBT in a Pipeline
    pipeline = Pipeline(stages=[assembler, feature_indexer, gbt])

    logging.info('Running model pipeline')
    # Train model. This also runs the indexer.
    model_pipeline = pipeline.fit(training_df)

    logging.info('Making model predictions')
    # Make predictions
    predictions = model_pipeline.transform(testing_df)

    # Select example rows to display
    predictions.select("prediction", "label", "features").show(5)

    logging.info('Evaluating predictions')
    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    logging.info(f'Root Mean Squared Error (RMSE) on test data: {rmse}')

    # TODO extract this to a function
    # Save the outputs (predictions and model)
    import uuid
    run_id = uuid.uuid4()
    date = datetime.date.today()
    model_pipeline.save(
        f'{config.MODEL_DIR}/{run_id}/model-{config.MODE}-{date}')
    predictions.select("prediction", "label").write.csv(
        f'{config.PREDICTIONS_DIR}/{run_id}/predictions-{config.MODE}-{date}.csv'
    )

    # Print some cool stuff about the model
    gbt_model = model_pipeline.stages[2]
    logging.info(gbt_model)
    attrs = sorted((attr["idx"], attr["name"]) for attr in (chain(
        *df.schema["features"].metadata["ml_attr"]["attrs"].values())))
    for idx, name in attrs:
        if gbt_model.featureImportances[idx]:
            print(name, gbt_model.featureImportances[idx])
def txtToPq_v2(inputFolder, pqFolder, pqFileName, searchString="*.txt", append=True):
    """
    Read in all txt files in a folder, convert to parquet, and either append to an
    existing parquet or create a new parquet.
    This version is compatible with some of the v1.1 files inside s3://flight.price.11
    Main difference: the first-leg field is renamed to leg1
    @params:
        inputFolder  - Required : input folder that contains json line txt files (Str)
        pqFolder     - Required : folder to save the parquet files into (Str)
        pqFileName   - Required : parquet file name (Str)
        searchString - Optional : search string that identifies all the json line text files (Str)
        append       - Optional : append to existing parquet or create new parquet (Bool)
    """
    flightv1_1 = spark.read.json(os.path.join(inputFolder, searchString))

    flightv1_1_2 = (flightv1_1.withColumn('trip', col('trip').cast('string'))
                    .withColumn('stayDays', correct_stay_days_UDF(col('trip'), col('stayDays')))
                    .withColumn('depDate', to_date('depDate'))
                    .withColumn('searchDate', to_date('searchDate'))
                    .selectExpr('*', 'date_add(depDate, stayDays) as retDate')  # this is when the return trip starts; it might arrive a day later
                    .withColumn('airline_code', flightv1_1.leg1.carrierSummary.airlineCodes.getItem(0))
                    .withColumn('airline_codes', flightv1_1.leg1.carrierSummary.airlineCodes)
                    .withColumn('airline_codes_leg2', flightv1_1.leg2.carrierSummary.airlineCodes)
                    .withColumn('departureTime', flightv1_1.leg1.departureTime.isoStr)
                    .withColumn('departureTime_leg2', flightv1_1.leg2.departureTime.isoStr)
                    .withColumn('arrivalTime', flightv1_1.leg1.arrivalTime.isoStr)
                    .withColumn('arrivalTime_leg2', flightv1_1.leg2.arrivalTime.isoStr)
                    # .withColumn('check_bag_inc', flightv1_1.leg1.arrivalTime)
                    .withColumn('airlineName', flightv1_1.leg1.carrierSummary.airlineName)
                    .withColumn('airlineName_leg2', flightv1_1.leg2.carrierSummary.airlineName)
                    .withColumn('duration_m', (F.unix_timestamp('arrivalTime', format=timeFmt) -
                                               F.unix_timestamp('departureTime', format=timeFmt)) / 60)
                    .withColumn('duration_m_leg2', (F.unix_timestamp('arrivalTime_leg2', format=timeFmt) -
                                                    F.unix_timestamp('departureTime_leg2', format=timeFmt)) / 60)
                    # .withColumn('duration', flightv1_1.timeline1.getItem(1).duration)
                    .withColumn('airlineCode', flightv1_1.timeline1.getItem(0).carrier.airlineCode)
                    .withColumn('flightNumber', flightv1_1.timeline1.getItem(0).carrier.flightNumber.cast('string'))
                    .select('*', F.concat(col('airlineCode'), col('flightNumber')).alias('flight_code'))
                    .drop('airlineCode', 'flightNumber')
                    .withColumn('plane', flightv1_1.timeline1.getItem(0).carrier.plane)
                    .withColumn('stops', flightv1_1.leg1.stops.cast('byte'))
                    .withColumn('stops_leg2', flightv1_1.leg2.stops.cast('byte'))
                    # .withColumn('stop_list', flightv1_1.leg1.stop_list)  # need to do more work
                    .withColumn('stop_airport', take_all_level1_str(flightv1_1.leg1.stop_list, lit('airport')))
                    .withColumn('stop_duration', take_all_level1_str(flightv1_1.leg1.stop_list, lit('duration')))
                    # .withColumn('stop_list_leg2', flightv1_1.leg2.stop_list)
                    .withColumn('stop_airport_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('airport')))
                    .withColumn('stop_duration_leg2', take_all_level1_str(flightv1_1.leg2.stop_list, lit('duration')))
                    .withColumn('noOfTicketsLeft', correct_tickets_left_UDF(flightv1_1.leg1.carrierSummary.noOfTicketsLeft))
                    .withColumn('noOfTicketsLeft', col('noOfTicketsLeft').cast('byte'))
                    .withColumn('noOfTicketsLeft_leg2', correct_tickets_left_UDF(flightv1_1.leg2.carrierSummary.noOfTicketsLeft))
                    .withColumn('noOfTicketsLeft_leg2', col('noOfTicketsLeft_leg2').cast('byte'))
.withColumn('fromCityAirportCode', flightv1_1.leg1.departureLocation.airportCode) .withColumn('toCityAirportCode', flightv1_1.leg1.arrivalLocation.airportCode) .withColumn('fromCityAirportCode_leg2', flightv1_1.leg2.departureLocation.airportCode) .withColumn('toCityAirportCode_leg2', flightv1_1.leg2.arrivalLocation.airportCode) # carrier leg 1 .withColumn('carrierAirProviderId', flightv1_1.leg1.carrierSummary.airProviderId) .withColumn('carrierAirlineImageFileName', flightv1_1.leg1.carrierSummary.airlineImageFileName) .withColumn('carrierMixedCabinClass', flightv1_1.leg1.carrierSummary.mixedCabinClass) .withColumn('carrierMultiStop', flightv1_1.leg1.carrierSummary.multiStop) .withColumn('carrierNextDayArrival', flightv1_1.leg1.carrierSummary.nextDayArrival) # carrier leg 2 .withColumn('carrierAirProviderId_leg2', flightv1_1.leg2.carrierSummary.airProviderId) .withColumn('carrierAirlineImageFileName_leg2', flightv1_1.leg2.carrierSummary.airlineImageFileName) .withColumn('carrierMixedCabinClass_leg2', flightv1_1.leg2.carrierSummary.mixedCabinClass) .withColumn('carrierMultiStop_leg2', flightv1_1.leg2.carrierSummary.multiStop) .withColumn('carrierNextDayArrival_leg2', flightv1_1.leg2.carrierSummary.nextDayArrival) ### Leg 1 ## Leg 1 departure # .withColumn('timeline_departureAirport', take_all_airport(flightv1_1.timeline1, lit('departureAirport'))) .withColumn('timeline_departureAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('airportCityState'))) .withColumn('timeline_departureAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('city'))) .withColumn('timeline_departureAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('code'))) .withColumn('timeline_departureAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('localName'))) .withColumn('timeline_departureAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('longName'))) .withColumn('timeline_departureAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('departureAirport'), lit('name'))) .withColumn('timeline_departureTime', take_all_level2_str(flightv1_1.timeline1, lit('departureTime'), lit('isoStr'))) ## Leg 1 arrival .withColumn('timeline_arrivalAirport_cityState', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('airportCityState'))) .withColumn('timeline_arrivalAirport_city', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('city'))) .withColumn('timeline_arrivalAirport_code', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('code'))) .withColumn('timeline_arrivalAirport_localName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('localName'))) .withColumn('timeline_arrivalAirport_longName', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('longName'))) .withColumn('timeline_arrivalAirport_name', take_all_level2_str(flightv1_1.timeline1, lit('arrivalAirport'), lit('name'))) .withColumn('timeline_arrivalTime', take_all_level2_str(flightv1_1.timeline1, lit('arrivalTime'), lit('isoStr'))) # distance .withColumn('timeline_distance', take_all_level2_str(flightv1_1.timeline1, lit('distance'), lit('formattedTotal'))) # carrier .withColumn('timeline_plane', take_all_level2_str(flightv1_1.timeline1, lit('carrier'), lit('plane'))) # brandedFareName .withColumn('timeline_brandedFareName', take_all_level1_str(flightv1_1.timeline1, lit('brandedFareName'))) # 
                    type
                    .withColumn('timeline_type', take_all_level1_str(flightv1_1.timeline1, lit('type')))
                    ### Leg 2
                    ## Leg 2 departure
                    .withColumn('timeline_departureAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('airportCityState')))
                    .withColumn('timeline_departureAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('city')))
                    .withColumn('timeline_departureAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('code')))
                    .withColumn('timeline_departureAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('localName')))
                    .withColumn('timeline_departureAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('longName')))
                    .withColumn('timeline_departureAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureAirport'), lit('name')))
                    .withColumn('timeline_departureTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('departureTime'), lit('isoStr')))
                    ## Leg 2 arrival
                    .withColumn('timeline_arrivalAirport_cityState_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('airportCityState')))
                    .withColumn('timeline_arrivalAirport_city_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('city')))
                    .withColumn('timeline_arrivalAirport_code_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('code')))
                    .withColumn('timeline_arrivalAirport_localName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('localName')))
                    .withColumn('timeline_arrivalAirport_longName_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('longName')))
                    .withColumn('timeline_arrivalAirport_name_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalAirport'), lit('name')))
                    .withColumn('timeline_arrivalTime_leg2', take_all_level2_str(flightv1_1.timeline2, lit('arrivalTime'), lit('isoStr')))
                    # distance
                    .withColumn('timeline_distance_leg2', take_all_level2_str(flightv1_1.timeline2, lit('distance'), lit('formattedTotal')))
                    # carrier
                    .withColumn('timeline_plane_leg2', take_all_level2_str(flightv1_1.timeline2, lit('carrier'), lit('plane')))
                    # brandedFareName
                    .withColumn('timeline_brandedFareName_leg2', take_all_level1_str(flightv1_1.timeline2, lit('brandedFareName')))
                    # type
                    .withColumn('timeline_type_leg2', take_all_level1_str(flightv1_1.timeline2, lit('type')))
                    # create variables dropped from v1.0
                    .withColumn('span_days', lit(99))
                    .withColumn('power', lit(False))
                    .withColumn('video', lit(False))
                    .withColumn('wifi', lit(False))
                    .withColumn('stop_info', col('stop_airport'))  # placeholder. 
can't figure out how to create struct literal .select('price', 'version', 'searchDate', 'tableName', 'task_id', 'currencyCode', 'fromCity', 'toCity', 'trip', 'depDate', 'retDate', 'stayDays', 'departureTime', 'arrivalTime', 'departureTime_leg2', 'arrivalTime_leg2', 'airlineName', 'airlineName_leg2', 'duration_m', 'duration_m_leg2', 'flight_code', 'plane', 'stops', 'stops_leg2', 'stop_airport', 'stop_duration', 'stop_airport_leg2', 'stop_duration_leg2', 'noOfTicketsLeft', 'noOfTicketsLeft_leg2', 'airline_code', 'airline_codes', 'airline_codes_leg2', 'fromCityAirportCode', 'toCityAirportCode', 'fromCityAirportCode_leg2', 'toCityAirportCode_leg2', 'carrierAirProviderId', 'carrierAirlineImageFileName', 'carrierMixedCabinClass', 'carrierMultiStop', 'carrierNextDayArrival', 'carrierAirProviderId_leg2', 'carrierAirlineImageFileName_leg2', 'carrierMixedCabinClass_leg2', 'carrierMultiStop_leg2', 'carrierNextDayArrival_leg2', #'url', ## leg 1 # departure 'timeline_departureAirport_cityState', 'timeline_departureAirport_city', 'timeline_departureAirport_code', 'timeline_departureAirport_localName', 'timeline_departureAirport_longName', 'timeline_departureAirport_name', 'timeline_departureTime', # arrival 'timeline_arrivalAirport_cityState', 'timeline_arrivalAirport_city', 'timeline_arrivalAirport_code', 'timeline_arrivalAirport_localName', 'timeline_arrivalAirport_longName', 'timeline_arrivalAirport_name', 'timeline_arrivalTime', 'timeline_distance', 'timeline_plane', 'timeline_brandedFareName', 'timeline_type', ## leg 2 # departure 'timeline_departureAirport_cityState_leg2', 'timeline_departureAirport_city_leg2', 'timeline_departureAirport_code_leg2', 'timeline_departureAirport_localName_leg2', 'timeline_departureAirport_longName_leg2', 'timeline_departureAirport_name_leg2', 'timeline_departureTime_leg2', # arrival 'timeline_arrivalAirport_cityState_leg2', 'timeline_arrivalAirport_city_leg2', 'timeline_arrivalAirport_code_leg2', 'timeline_arrivalAirport_localName_leg2', 'timeline_arrivalAirport_longName_leg2', 'timeline_arrivalAirport_name_leg2', 'timeline_arrivalTime_leg2', 'timeline_distance_leg2', 'timeline_plane_leg2', 'timeline_brandedFareName_leg2', 'timeline_type_leg2', # variables dropped from v1.0 'span_days', 'power', 'video', 'wifi', 'stop_info' ) ) if append: flightv1_1_2.repartition(1).write.mode('append').parquet(os.path.join(pqFolder, pqFileName)) else: flightv1_1_2.repartition(1).write.parquet(os.path.join(pqFolder, pqFileName))
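The take_all_level1_str / take_all_level2_str helpers used above are not defined in this excerpt. A minimal sketch of what they might look like, assuming timeline1/timeline2 are array-of-struct columns; the implementations below are inferred from the call sites, not confirmed from the original code:

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

def _take_all_level1_str(rows, key):
    # For an array-of-structs column, collect rows[i][key] as a string per element.
    if rows is None:
        return None
    return [str(r[key]) if r is not None and r[key] is not None else None for r in rows]

def _take_all_level2_str(rows, key1, key2):
    # Same, one level deeper: rows[i][key1][key2] per element, tolerating nulls.
    if rows is None:
        return None
    out = []
    for r in rows:
        inner = r[key1] if r is not None else None
        out.append(str(inner[key2]) if inner is not None and inner[key2] is not None else None)
    return out

take_all_level1_str = udf(_take_all_level1_str, ArrayType(StringType()))
take_all_level2_str = udf(_take_all_level2_str, ArrayType(StringType()))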
spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false") spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false") ## Read in data by pointing to its table name in Glue Data Catalog schema = StructType() \ .add('source', StringType()) \ .add('type', StringType()) \ .add('data', StringType()) \ .add('ts', StringType()) src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/', format='parquet', schema=schema) ## Build out some new columns with data extracted from the JSON string df = src \ .withColumn('n_ts', F.unix_timestamp('ts', "yyyy-MM-dd'T'HH:mm:ss").cast('timestamp')) \ .withColumn('agerange', \ F.from_json( \ F.get_json_object('data', '$.facedetails[*].agerange'), \ StructType().add('high', IntegerType()).add('low', IntegerType()) \ ) \ ) \ .withColumn('smile', \ F.from_json( \ F.get_json_object('data', '$.facedetails[*].smile'), \ StructType().add('confidence', DoubleType()).add('value', BooleanType()) \ ) \ ) \ .withColumn('eyeglasses', \ F.from_json( \ F.get_json_object('data', '$.facedetails[*].eyeglasses'), \
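                StructType().add('confidence', DoubleType()).add('value', BooleanType()) \
            ) \
        )
# NOTE: the snippet is truncated in the source at this point; the closing schema
# for 'eyeglasses' above is an assumed completion that mirrors the 'smile'
# column (same confidence/value shape), not the authors' confirmed code.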
def main(): parser = argparse.ArgumentParser() parser.add_argument("--lang", "-l", default="enwiki", type=str, help="language to parse (en or enwiki)") parser.add_argument( "--start", "-t1", default=None, type=str, help= "start day to parse [inclusive] (YYYY-MM-DD); default: previous day - 7 days" ) parser.add_argument( "--end", "-t2", default=None, type=str, help="end day to parse [exclusive] (YYYY-MM-DD); default: current day") args = parser.parse_args() lang = args.lang.replace('wiki', '') wiki_db = lang + 'wiki' t1 = args.start t2 = args.end if t1 is not None and t2 is not None: try: date_start = datetime.datetime.strptime(t1, '%Y-%m-%d') date_end = datetime.datetime.strptime(t2, '%Y-%m-%d') except ValueError: print('Provide correct day-format YYYY-MM-DD') return else: date_start = datetime.date.today() - datetime.timedelta(days=8) date_end = datetime.date.today() date_start_str = date_start.strftime('%Y-%m-%d') date_end_str = date_end.strftime('%Y-%m-%d') #### other parameters ## filter out pageviews from actors with more than 500 pageviews ## the aim is to filter automated traffic that is not tagged as spider n_p_max = 500 ## maximum number of pageviews/user/day n_p_min = 1 ## minimum number of pageviews/user/day ## filtering sessions dt = 3600 ## cutoff for splitting sessions (inter-event time between 2 pageviews) nlen_min = 2 ## min length of session nlen_max = 30 ## max length of session ## sessions will be saved locally in filename_save path_save = os.path.abspath('../data/%s/' % lang) # filename_save = '%s.reading-sessions-%s--%s'%(lang,date_start_str,date_end_str) filename_save = '%s.reading-sessions' % (lang) ## tmp-directory for data on hive (will be deleted) base_dir_hdfs = '/tmp/mwaddlink/sessions' ### start spark = SparkSession.builder\ .master('yarn')\ .appName('reading-sessions')\ .enableHiveSupport()\ .getOrCreate() ######## ## query ################################################ ## time-window ts_start = calendar.timegm(date_start.timetuple()) ts_end = calendar.timegm(date_end.timetuple()) row_timestamp = F.unix_timestamp( F.concat(F.col('year'), F.lit('-'), F.col('month'), F.lit('-'), F.col('day'), F.lit(' '), F.col('hour'), F.lit(':00:00'))) ## window for counting pageviews per actor per day w_p = Window.partitionBy(F.col('actor_signature_per_project_family'), F.col('year'), F.col('month'), F.col('day')) ### actor table (filtered webrequests) ## https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_actor df_actor = ( spark.read.table('wmf.pageview_actor').where( row_timestamp >= ts_start).where(row_timestamp < ts_end).where( F.col('is_pageview') == True) ## agent-type user to filter spiders ## https://meta.wikimedia.org/wiki/Research:Page_view/Tags#Spider .where(F.col('agent_type') == "user") ## user: desktop/mobile/mobile app; isaac filters != mobile app .where(F.col('access_method') != "mobile app") ## only wikis .where(F.col('normalized_host.project_family') == 'wikipedia') ## only namespace 0 .where(F.col('namespace_id') == 0).withColumn( 'wiki_db', F.concat(F.col('normalized_host.project'), F.lit('wiki')))) ## filter only specific wiki (or all if wiki_db=='wikidata') if wiki_db == 'wikidata': pass else: df_actor = df_actor.where(F.col('wiki_db') == wiki_db) ## checkpoint for inspecting table # df_actor.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet') # filter maximum and minimum pageviews per user # n_p is the number of pageviews per actor per day (across projects) df_actor = (df_actor.withColumn( 'n_p', 
F.sum(F.lit(1)).over(w_p)).where(F.col('n_p') >= n_p_min).where( F.col('n_p') <= n_p_max)) ## join the wikidata-item to each pageview ## we keep only pageviews for which we have a corresponding wikidata-item id ## table with mapping wikidata-ids to page-ids ## partition wikidb and page-id ordered by snapshot w_wd = Window.partitionBy(F.col('wiki_db'), F.col('page_id')).orderBy( F.col('snapshot').desc()) df_wd = ( spark.read.table('wmf.wikidata_item_page_link') ## snapshot: this is a partition! .where( F.col('snapshot') >= '2020-07-01') ## resolve issues with non-matching wikidata-items ## only wikis (enwiki, ... not: wikisource) .where(F.col('wiki_db').endswith('wiki'))) ## filter only specific wiki (or all if wiki_db=='wikidata') if wiki_db == 'wikidata': pass else: df_wd = df_wd.where(F.col('wiki_db') == wiki_db) ## get the most recent wikidata-item for each pid+wikidb df_wd = (df_wd.withColumn( 'item_id_latest', F.first(F.col('item_id')).over(w_wd)).select( 'wiki_db', 'page_id', F.col('item_id_latest').alias('item_id')).drop_duplicates()) df_actor_wd = (df_actor.join(df_wd, on=['page_id', 'wiki_db'], how='inner')) ## aggregate all pageviews with same actor-signature across wikis to get sessions df_actor_wd_agg = ( df_actor_wd.groupby('actor_signature_per_project_family').agg( F.first(F.col('access_method')).alias( 'access_method'), ## this could change along a session F.first(F.col('geocoded_data')).alias('geocoded_data'), # F.first(F.col('n_p_by_user')).alias('session_length'), F.array_sort( F.collect_list( F.struct( F.col('ts'), F.col('page_id'), F.col('pageview_info.page_title').alias('page_title'), F.col('wiki_db'), F.col('item_id').alias('qid'), ))).alias('session'))) # df_actor_wd_agg.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet') ## apply filter to the sessions try: os.mkdir(path_save) except FileExistsError: pass PATH_TMP = os.path.join(path_save, 'tmp') try: os.mkdir(PATH_TMP) except FileExistsError: pass ## hdfs-storing, some temporary files which will be deleted later output_hdfs_dir = os.path.join(base_dir_hdfs, filename_save) os.system('hadoop fs -rm -r %s' % output_hdfs_dir) ## local storing base_dir_local = path_save output_local_dir_tmp = os.path.join(base_dir_local, 'tmp', filename_save) output_local_file = os.path.join(base_dir_local, filename_save) ## load data # requests = spark.read.load(filename).rdd.map(lambda x: x['session']) requests = df_actor_wd_agg.rdd.map(lambda x: x['session']) ## keep only pageviews that have a page_id requests = requests.map(lambda rs: [r for r in rs if r['page_id'] is not None]) to_str = lambda x: ' '.join([str(e['page_id']) for e in x]) (requests.map(parse_requests).filter( filter_blacklist_qid) ## remove main_page .filter(lambda x: len(x) >= nlen_min ) ## only sessions with at least length nlen_min .map(filter_consecutive_articles ) ## remove consecutive calls to same article .filter(lambda x: len(x) >= nlen_min ) ## only sessions with at least length nlen_min .flatMap(lambda x: sessionize(x, dt=dt) ) ## break sessions if inter-event time is too large .filter(lambda x: len(x) >= nlen_min ) ## only sessions with at least length nlen_min .filter(lambda x: len(x) <= nlen_max ) ## only sessions with at most length nlen_max .map(to_str) ## concatenate session as a single string ## write to hdfs .saveAsTextFile( output_hdfs_dir, compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec")) ## copy to local (set of tmp-dirs) os.system('hadoop fs -copyToLocal %s %s' % (output_hdfs_dir, output_local_dir_tmp)) ## 
concatenate and gunzip into a single file os.system('cat %s/* | gunzip > %s' % (output_local_dir_tmp, output_local_file)) # ## remove set of tmp-dirs os.system('rm -rf %s' % output_local_dir_tmp) # ## remove hadoop data os.system('hadoop fs -rm -r %s' % output_hdfs_dir) print('Path to reading sessions: %s' % output_local_file) return filename_save
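Neither sessionize() nor the other helpers (parse_requests, filter_blacklist_qid, filter_consecutive_articles) are defined in this excerpt. A minimal sketch of the session-splitting step, assuming parse_requests leaves each pageview with a numeric 'ts' in unix seconds (an assumption, not the authors' confirmed implementation):

def sessionize(pageviews, dt=3600):
    # Split a time-ordered list of pageviews into sub-sessions wherever the
    # gap between consecutive views exceeds dt seconds.
    if not pageviews:
        return []
    sessions, current = [], [pageviews[0]]
    for prev, cur in zip(pageviews, pageviews[1:]):
        if cur['ts'] - prev['ts'] > dt:
            sessions.append(current)
            current = []
        current.append(cur)
    sessions.append(current)
    return sessions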
def accident_fact_dimension(): #Read in all the required csv files sc_2005 = spark.read.csv( '/home/workspace/uk-data/accidents_2005_to_2007.csv', header=True, sep=",") sc_2011 = spark.read.csv( '/home/workspace/uk-data/accidents_2009_to_2011.csv', header=True, sep=",") sc_2014 = spark.read.csv( '/home/workspace/uk-data/accidents_2012_to_2014.csv', header=True, sep=",") sc_LAD = spark.read.csv('/home/workspace/uk-data/LAD.csv', header=True, sep=",") sc_LAD3 = spark.read.csv('/home/workspace/uk-data/LAD3.csv', header=True, sep=",") sc_LAD4 = spark.read.csv('/home/workspace/uk-data/LAD4.csv', header=True, sep=",") #Combine the accident data def unionAll(*dfs): return reduce(DataFrame.unionAll, dfs) sc_accidents = unionAll(sc_2005, sc_2011, sc_2014) ##Convert Columns to their respective data types int_cols = ['Location_Easting_OSGR', 'Location_Northing_OSGR', 'Police_Force', 'Accident_Severity', \ 'Number_of_Vehicles', 'Number_of_Casualties', 'Urban_or_Rural_Area', '1st_Road_Number', '2nd_Road_Number'] float_cols = ['Longitude', 'Latitude'] for col_name in int_cols: sc_accidents = sc_accidents.withColumn(col_name, F.col(col_name).cast('int')) for col_name in float_cols: sc_accidents = sc_accidents.withColumn(col_name, F.col(col_name).cast('float')) #copy the dataframe into another one sc_uk = sc_accidents.select("*") #### DROP COLUMNS WITH MORE THAN 10,000 NULL VALUES ##### ##### AND AFTER THAT REMOVE NULL VALUES FROM THE DF ##### drop_items = [] for col in sc_uk.columns: null_length = sc_uk.select(col).filter(F.col(col).isNull()).count() none_length = sc_uk.select(col).filter(F.col(col) == 'None').count() if (null_length >= 10000) or (none_length >= 10000): #print(col, ':', null_length, ' ', none_length) drop_items.append(col) sc_uk = sc_uk.drop(*drop_items) sc_uk = sc_uk.na.drop() #create a new column called date_time sc_uk = sc_uk.withColumn( 'date_time', F.concat(F.col("Date"), F.lit(" "), F.col("Time"))) #Drop rows where accidents share the same time and position more than once; those are duplicates sc_uk = sc_uk.dropDuplicates(['date_time', 'Longitude', 'Latitude']) #### CREATE LOCATION DIMENSION TABLE #### #Accident Index -- as the primary key for the location table loc_table = sc_uk.select('Accident_Index', 'Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude', 'Latitude').distinct() loc_table.show(10) ##### CREATE TIME DIMENSION TABLE ###### #this requires some feature engineering sc_uk = sc_uk.drop(*['Date', 'Time']) sc_uk = sc_uk.withColumn( 'date_time', F.from_unixtime(F.unix_timestamp("date_time", "dd/MM/yy HH:mm")).cast('timestamp')) #derive all calendar fields directly from the parsed date_time timestamp sc_uk = sc_uk.withColumn('Day_of_Week', dayofweek('date_time')) sc_uk = sc_uk.withColumn('Year', year('date_time')) sc_uk = sc_uk.withColumn('Month', month('date_time')) sc_uk = sc_uk.withColumn('week_of_year', weekofyear(to_date('date_time'))) sc_uk = sc_uk.withColumn( "week_of_month", date_format("date_time", "W")) time_table = sc_uk.select('date_time', 'Day_of_Week', 'week_of_year', 'week_of_month', 'Month', 'Year').dropDuplicates() ##Time Table time_table.show(10) ###### CREATE LOCAL AUTHORITY DIMENSION TABLE ###### sc_uk = sc_uk.withColumnRenamed("Local_Authority_(Highway)", "LAD_Code") sc_LAD = sc_LAD.withColumnRenamed("Supergroup name","Geographical Area") \ .withColumnRenamed("Code","LAD_Code") #join uk accident DF with the LAD DF sc_uk = sc_uk.join(sc_LAD, ["LAD_Code"], how='inner') sc_LAD2 = sc_LAD3.join(sc_LAD4, ["Sub-Group"], how='inner') sc_LAD2 = 
sc_LAD2.withColumnRenamed("ONS Code", "LAD_Code") sc_uk = sc_uk.join(sc_LAD2, ["LAD_Code"], how='inner') sc_uk = sc_uk.drop('Name') #LAD table LAD_table = sc_uk.select( "LAD_Code", F.col('Sub-Group').alias('SubGroup'), F.col('Highway Authority').alias('HighwayAuthority'), F.col('Sub-Group Description').alias('SubGroup_Description'), F.col('Super-Group').alias('SuperGroup'), F.col('Network Density Range').alias('Network_Density_Range'), F.col('Urban Road Range').alias('Urban_Road_Range'), F.col('Region/Country').alias('Region_Country'), F.col('Geographical Area').alias('Geographical_Area')).distinct() LAD_table.show(10) #### CREATE CONDITION DIMENSION TABLE ##### #consists of 'Light_Conditions', 'Weather_Conditions', 'Road_Surface_Conditions' ##Change "Other" value in Weather Condition table to "Unknown" sc_uk = sc_uk.withColumn( "Weather_Conditions", regexp_replace("Weather_Conditions", "Other", "Unknown")) #create a column known as conditions that consists of all the conditions sc_uk = sc_uk.withColumn('conditions', F.concat(F.col('Light_Conditions'), \ F.lit(" and "), F.col('Weather_Conditions'), F.lit(" and "), F.col('Road_Surface_Conditions'))) #Extract values from columns def extract_values(df, column): values = df.select(column).distinct().collect() values = [x[0] for x in values] return values values = extract_values(sc_uk, 'conditions') #this function will be used to create primary ids for the dimension tables def new_column(column): if column in values: return values.index(column) + 1 udfValue = udf(new_column, IntegerType()) sc_uk = sc_uk.withColumn('Condition_Index', udfValue('conditions')) #condition table condition_table = sc_uk.select("Condition_Index", 'Weather_Conditions', 'Light_Conditions', 'Road_Surface_Conditions').distinct() condition_table.sort('Condition_Index').show(10) #### CREATE ROAD DIMENSION TABLE ##### #'1st_Road_Class', 'Road_Type', '2nd_Road_Class' sc_uk = sc_uk.withColumn( 'road_classification', F.concat(F.col('1st_Road_Class'), F.lit(" and "), F.col('Road_Type'), F.lit(" and "), F.col('2nd_Road_Class'))) values = extract_values(sc_uk, 'road_classification') udfValue = udf(new_column, IntegerType()) sc_uk = sc_uk.withColumn('Road_Index', udfValue('road_classification')) #road table road_table = sc_uk.select('Road_Index', '1st_Road_Class', 'Road_Type', '2nd_Road_Class').distinct() road_table.sort('Road_Index').show(5) #### CREATE PEDESTRIAN DIMENSION TABLE ##### # Pedestrian_Crossing-Human_Control, Pedestrian_Crossing-Physical_Facilities sc_uk = sc_uk.withColumn( 'pedestrian', F.concat(F.col('Pedestrian_Crossing-Human_Control'), F.lit(" and "), F.col('Pedestrian_Crossing-Physical_Facilities'))) values = extract_values(sc_uk, 'pedestrian') udfValue = udf(new_column, IntegerType()) sc_uk = sc_uk.withColumn('Pedestrian_Index', udfValue('pedestrian')) #pedestrian table pedestrian_table = sc_uk.select( 'Pedestrian_Index', 'Pedestrian_Crossing-Human_Control', 'Pedestrian_Crossing-Physical_Facilities').distinct() pedestrian_table.sort('Pedestrian_Index').show(10) #drop the features used to create ids for the dimension table drop_list = ['conditions', 'road_classification', 'pedestrian'] sc_uk = sc_uk.drop(*drop_list) ##### CREATE ACCIDENT TABLE --- FACT TABLE ##### #ACCIDENT INDEX -- PRIMARY KEY FOR LOCATION TABLE #LAD_CODE -- PRIMARY KEY FOR LOCAL AUTHORITY DISTRICT TABLE #DATE TIME -- PRIMARY KEY FOR TIME TABLE #CONDITION INDEX -- PRIMARY KEY FOR CONDITIONS TABLE #ROAD INDEX -- PRIMARY KEY FOR ROAD TABLE #PEDESTRIAN INDEX -- PRIMARY KEY FOR PEDESTRIAN TABLE accident_fact_table = sc_uk.select( 
'Accident_Index', 'LAD_Code', 'date_time', 'Year', 'Police_Force', 'Accident_Severity', 'Number_of_Vehicles', 'Number_of_Casualties', '1st_Road_Number', 'Speed_limit', '2nd_Road_Number', 'Urban_or_Rural_Area', 'Did_Police_Officer_Attend_Scene_of_Accident', 'Condition_Index', 'Road_Index', 'Pedestrian_Index') #write the tables into csv format loc_table.toPandas().to_csv('/home/workspace/uk-accident/loc_table.csv', index=False, header=None) time_table.toPandas().to_csv('/home/workspace/uk-accident/time_table.csv', index=False, header=None) LAD_table.toPandas().to_csv('/home/workspace/uk-accident/LAD_table.csv', index=False, header=None) condition_table.toPandas().to_csv( '/home/workspace/uk-accident/condition_table.csv', index=False, header=None) pedestrian_table.toPandas().to_csv( '/home/workspace/uk-accident/pedestrian_table.csv', index=False, header=None) road_table.toPandas().to_csv('/home/workspace/uk-accident/rd_table.csv', index=False, header=None) accident_fact_table.toPandas().to_csv( '/home/workspace/uk-accident/accident_fact_table.csv', index=False, header=None)
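Aside: the two-step from_unixtime(unix_timestamp(...)).cast('timestamp') round-trip used in the time-dimension step above can be collapsed on Spark 2.2+; a sketch using the same source pattern (assumes sc_uk is in scope):

from pyspark.sql import functions as F

# One-step parse; equivalent to from_unixtime(unix_timestamp(...)).cast('timestamp')
sc_uk = sc_uk.withColumn('date_time', F.to_timestamp('date_time', 'dd/MM/yy HH:mm'))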
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source ho_chi_minh_timezone = pytz.timezone('Asia/Ho_Chi_Minh') today = datetime.now(ho_chi_minh_timezone) today_second = long(today.strftime("%s")) print('today_id: ', today_second) #------------------------------------------------------------------------------------------------------------------# def getSolanBaoLuu(solan_baoluu, songay_baoluu): if solan_baoluu is None: solan_baoluu = 0 if songay_baoluu is None: songay_baoluu = 0 if solan_baoluu > songay_baoluu: return songay_baoluu return solan_baoluu getSolanBaoLuu = udf(getSolanBaoLuu, LongType()) def getSoNgayBaoLuu(solan_baoluu, songay_baoluu): if solan_baoluu is None: solan_baoluu = 0 if songay_baoluu is None: songay_baoluu = 0 if songay_baoluu > solan_baoluu: return songay_baoluu return solan_baoluu getSoNgayBaoLuu = udf(getSoNgayBaoLuu, LongType()) def getContactId(code, contact_id_advisor): if code is not None: return code return contact_id_advisor getContactId = udf(getContactId, StringType()) def concaText(student_behavior_date, behavior_id, student_id, contact_id, package_code, package_endtime,package_starttime, student_level_code, student_status_code, transformed_at): text_concat = "" if student_behavior_date is not None: text_concat += str(student_behavior_date) if behavior_id is not None: text_concat += str(behavior_id) if student_id is not None: text_concat += str(student_id) if contact_id is not None: text_concat += str(contact_id) if package_code is not None: text_concat += str(package_code) if package_endtime is not None: text_concat += str(package_endtime) if package_starttime is not None: text_concat += str(package_starttime) if student_level_code is not None: text_concat += str(student_level_code) if student_status_code is not None: text_concat += str(student_status_code) if transformed_at is not None: text_concat += str(transformed_at) return text_concat concaText = udf(concaText, StringType()) # ------------------------------------------------------------------------------------------------------------------# #------------------------------------------------------------------------------------------------------------------# dyf_poss_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog(database='poss', table_name='ghinhan_hp') dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.select_fields(['_key', 'id', 'ngay_thanhtoan', 'so_tien', 'khoa_hoc_makh', 'trang_thai']) dyf_poss_ghinhan_hp = dyf_poss_ghinhan_hp.resolveChoice(specs=[('_key', 'cast:long')]) try: df_flag = spark.read.parquet("s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet") read_from_index = df_flag.collect()[0]['flag'] print('read from index: ', read_from_index) dyf_poss_ghinhan_hp = Filter.apply(frame=dyf_poss_ghinhan_hp, f=lambda x: x["_key"] > read_from_index) except: print('read flag file error ') dyf_poss_ghinhan_hp_number = dyf_poss_ghinhan_hp.count() print('dyf_poss_ghinhan_hp_number: ', dyf_poss_ghinhan_hp_number) if dyf_poss_ghinhan_hp_number < 1: return #-------------------------------------------------------------------------------------------------------------------# dyf_poss_khoa_hoc = glueContext.create_dynamic_frame.from_catalog(database='poss', table_name='khoa_hoc') dyf_poss_khoa_hoc = dyf_poss_khoa_hoc.select_fields(['makh', 'mahv', 'goi_sanpham_id', 'trang_thai']) # 
-------------------------------------------------------------------------------------------------------------------# dyf_poss_hoc_vien = glueContext.create_dynamic_frame.from_catalog(database='poss', table_name='hoc_vien') dyf_poss_hoc_vien = dyf_poss_hoc_vien.select_fields(['mahv', 'crm_id', 'trang_thai']).rename_field( 'mahv', 'mahv_hv') # -------------------------------------------------------------------------------------------------------------------# dyf_poss_goi_sanpham = glueContext.create_dynamic_frame.from_catalog(database='poss', table_name='goi_sanpham') dyf_poss_goi_sanpham = dyf_poss_goi_sanpham.select_fields(['ma', 'id', 'solan_baoluu', 'songay_baoluu', 'trang_thai']) # -------------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------------# dyf_crm_goi_contacts = glueContext.create_dynamic_frame.from_catalog(database='crm_native', table_name='contacts') # print('dyf_crm_goi_contacts::full') # # dyf_crm_goi_contacts.printSchema() dyf_crm_goi_contacts = dyf_crm_goi_contacts.select_fields(['Code']).rename_field('Code', 'code') dyf_crm_goi_contacts = Filter.apply(frame=dyf_crm_goi_contacts, f=lambda x: x["code"] is not None and x["code"] != '') dy_crm_goi_contacts = dyf_crm_goi_contacts.toDF() dy_crm_goi_contacts = dy_crm_goi_contacts.dropDuplicates() # print('dy_crm_goi_contacts') # dy_crm_goi_contacts.printSchema() # -------------------------------------------------------------------------------------------------------------------# dyf_advisor_student_contact = glueContext.create_dynamic_frame.from_catalog(database='tig_advisor', table_name='student_contact') dyf_advisor_student_contact = dyf_advisor_student_contact.select_fields(['student_id', 'contact_id']) dyf_advisor_student_contact = Filter.apply(frame=dyf_advisor_student_contact, f=lambda x: x["student_id"] is not None and x["student_id"] != '' and x["contact_id"] is not None and x["contact_id"] != '')\ .rename_field('student_id', 'student_id_advisor')\ .rename_field('contact_id', 'contact_id_advisor') dy_advisor_student_contact = dyf_advisor_student_contact.toDF() dy_advisor_student_contact = dy_advisor_student_contact.dropDuplicates(['student_id_advisor']) # print('dy_advisor_student_contact') # dy_advisor_student_contact.printSchema() # -------------------------------------------------------------------------------------------------------------------# # print('dyf_poss_ghinhan_hp') # dyf_poss_ghinhan_hp.printSchema() # # print('dyf_poss_khoa_hoc') # dyf_poss_khoa_hoc.printSchema() # # print('dyf_poss_hoc_vien') # dyf_poss_hoc_vien.printSchema() # # print('dyf_poss_goi_sanpham') # dyf_poss_goi_sanpham.printSchema() dy_poss_ghinhan_hp = dyf_poss_ghinhan_hp.toDF() dy_poss_ghinhan_hp = dy_poss_ghinhan_hp.dropDuplicates(['id']) dy_poss_khoa_hoc = dyf_poss_khoa_hoc.toDF() dy_poss_khoa_hoc = dy_poss_khoa_hoc.dropDuplicates(['makh', 'mahv']) dy_poss_hoc_vien = dyf_poss_hoc_vien.toDF() dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['mahv_hv']) dy_poss_goi_sanpham = dyf_poss_goi_sanpham.toDF() dy_poss_hoc_vien = dy_poss_hoc_vien.dropDuplicates(['crm_id']) poss_ghinhan_hp_number = dy_poss_ghinhan_hp.count() # print('poss_ghinhan_hp_number: ', poss_ghinhan_hp_number) if poss_ghinhan_hp_number < 1: return df_dong_tien = dy_poss_ghinhan_hp.join(dy_poss_khoa_hoc, dy_poss_ghinhan_hp.khoa_hoc_makh == dy_poss_khoa_hoc.makh, 'left')\ .join(dy_poss_hoc_vien, 
dy_poss_hoc_vien.mahv_hv == dy_poss_khoa_hoc.mahv, 'left')\ .join(dy_poss_goi_sanpham, dy_poss_goi_sanpham.id == dy_poss_khoa_hoc.goi_sanpham_id, 'left') df_dong_tien = df_dong_tien.select('ngay_thanhtoan', 'ma', 'crm_id', 'so_tien', getSolanBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu']) .alias('solan_baoluu_t'), getSoNgayBaoLuu(df_dong_tien['solan_baoluu'], df_dong_tien['songay_baoluu']) .alias('songay_baoluu_t')) # print('df_dong_tien') # df_dong_tien.printSchema() #check lms_id and contact_id df_dong_tien_student = df_dong_tien.join(dy_crm_goi_contacts, df_dong_tien.crm_id == dy_crm_goi_contacts.code, 'left')\ .join(dy_advisor_student_contact, df_dong_tien.crm_id == dy_advisor_student_contact.student_id_advisor, 'left') # print('df_dong_tien_student-----') # df_dong_tien_student.printSchema() df_dong_tien_student = df_dong_tien_student.filter(df_dong_tien_student.code.isNotNull() | (df_dong_tien_student.contact_id_advisor.isNotNull())) df_dong_tien_student = df_dong_tien_student.limit(100) student_id_unavailable = 0L package_endtime_unavailable = 0L package_starttime_unavailable = 0L student_level_code_unavailable = 'UNAVAILABLE' student_status_code_unavailable = 'UNAVAILABLE' measure1_unavailable = 0 measure2_unavailable = 0 measure3_unavailable = 0 measure4_unavailable = float(0.0) df_dong_tien_student = df_dong_tien_student.select( f.unix_timestamp(df_dong_tien_student.ngay_thanhtoan, 'yyyy-MM-dd').alias('student_behavior_date'), f.lit(1L).alias('behavior_id'), f.lit(student_id_unavailable).cast('long').alias('student_id'), getContactId(df_dong_tien_student.code, df_dong_tien_student.contact_id_advisor).alias('contact_id'), df_dong_tien_student.ma.alias('package_code'), f.lit(package_endtime_unavailable).cast('long').alias('package_endtime'), f.lit(package_starttime_unavailable).cast('long').alias('package_starttime'), f.lit(student_level_code_unavailable).cast('string').alias('student_level_code'), f.lit(student_status_code_unavailable).cast('string').alias('student_status_code'), f.lit(today_second).alias('transformed_at'), 'so_tien', 'solan_baoluu_t', 'songay_baoluu_t', f.lit(measure4_unavailable).alias('measure4') ) print('df_dong_tien_student--1') df_dong_tien_student.printSchema() df_dong_tien_student.show(1) df_dong_tien_student2 = df_dong_tien_student.withColumn('student_behavior_id', f.md5(concaText(df_dong_tien_student.student_behavior_date, df_dong_tien_student.behavior_id, df_dong_tien_student.student_id, df_dong_tien_student.contact_id, df_dong_tien_student.package_code, df_dong_tien_student.package_endtime, df_dong_tien_student.package_starttime, df_dong_tien_student.student_level_code, df_dong_tien_student.student_status_code, df_dong_tien_student.transformed_at))) print('df_dong_tien_student2--2') df_dong_tien_student2.printSchema() df_dong_tien_student2.show(5) dyf_dong_tien_student = DynamicFrame.fromDF(df_dong_tien_student2, glueContext, 'dyf_dong_tien_student') dyf_dong_tien_student = Filter.apply(frame=dyf_dong_tien_student, f=lambda x: x["contact_id"] is not None and x["contact_id"] != '') apply_ouput = ApplyMapping.apply(frame=dyf_dong_tien_student, mappings=[("student_behavior_id", "string", "student_behavior_id", "string"), ("student_behavior_date", "long", "student_behavior_date", "long"), ("behavior_id", "long", "behavior_id", "long"), ("student_id", "long", "student_id", "long"), ("contact_id", "string", "contact_id", "string"), ("package_code", "string", "package_code", "string"), ("package_endtime", "long", "package_endtime", "long"), 
("package_starttime", "long", "package_starttime", "long"), ("student_level_code", "string", "student_level_code", "string"), ("student_status_code", "string", "student_status_code", "string"), ("transformed_at", "long", "transformed_at", "long") ]) dfy_output = ResolveChoice.apply(frame=apply_ouput, choice="make_cols", transformation_ctx="resolvechoice2") glueContext.write_dynamic_frame.from_options( frame=dfy_output, connection_type="s3", connection_options={"path": "s3://dtsodin/student_behavior/student_behavior", "partitionKeys": ["behavior_id"]}, format="parquet") apply_general = ApplyMapping.apply(frame=dyf_dong_tien_student, mappings=[("student_behavior_id", "string", "student_behavior_id", "string"), ("so_tien", "double", "measure1", "float"), ("solan_baoluu_t", "long", "measure2", "float"), ("songay_baoluu_t", "long", "measure3", "float"), ("measure4", "float", "measure4", "float"), ("behavior_id", "long", "behavior_id", "long") ]) dfy_output2 = ResolveChoice.apply(frame=apply_general, choice="make_cols", transformation_ctx="resolvechoice2") print('dfy_output2::') dfy_output2.show(5) glueContext.write_dynamic_frame.from_options( frame=dfy_output2, connection_type="s3", connection_options={"path": "s3://dtsodin/student_behavior/student_general_behavior", "partitionKeys": ["behavior_id"]}, format="parquet") flag = dy_poss_ghinhan_hp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet("s3a://dtsodin/flag/student_behavior/sb1_dong_tien.parquet", mode="overwrite")
# COMMAND ---------- dfNew.select("CallType").distinct().show(20, False) # COMMAND ---------- display( dfNew.select("CallType").groupBy("CallType").count().orderBy( "count", ascending=False)) # COMMAND ---------- from pyspark.sql.functions import unix_timestamp # COMMAND ---------- pattern1 = 'MM/dd/yyyy' pattern2 = 'MM/dd/yyyy hh:mm:ss a' to_pattern1 = 'yyyy/MM/dd' display( dfNew.withColumn("newCol", unix_timestamp( dfNew["CallDate"], pattern1 ).cast("timestamp") ).drop("CallDate")\ .withColumn("newCol2", unix_timestamp( dfNew["WatchDate"], pattern1 ).cast("timestamp") ).drop("WatchDate")\ .withColumn("newCol3", unix_timestamp( dfNew['ReceivedDtTm'], pattern2).cast("timestamp")) ) # COMMAND ---------- dfNew.select('ReceivedDtTm').show() # COMMAND ----------
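On Spark 2.2+ the unix_timestamp(...).cast("timestamp") pattern above collapses into a single to_timestamp call; a sketch reusing pattern2 (assumes dfNew and pattern2 are still in scope):

# COMMAND ----------

from pyspark.sql.functions import to_timestamp

display(
    dfNew.withColumn("ReceivedTs", to_timestamp(dfNew["ReceivedDtTm"], pattern2))
         .select("ReceivedTs")
)

# COMMAND ----------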
"Accept NULL hypothesis - No significant difference between groups." ) #----------------------------------------------------------------------------------------------------------------------- #[1] Telecommunication data ingestion cdr = spark.read.option("delimiter", "\t")\ .csv('inputs/telecom/november/*.txt')\ .toDF('square_id', 'time_interval', 'country_code', 'SMS_in', 'SMS_out', 'Call_in', 'Call_out', 'internet_traffic') cdr = cdr.withColumn( 'time_istant', func.from_unixtime(cdr.time_interval / 1000, 'yyyy-MM-dd HH:mm:ss:SSS"')) cdr = cdr.withColumn( 'time_istant', func.unix_timestamp(cdr.time_istant, 'yyyy-MM-dd HH:mm:ss:SSS"').cast('timestamp')) cdr = cdr.withColumn('weekday', func.dayofweek(cdr['time_istant'])) cdr = cdr.withColumn('time_istant', func.date_trunc('hour', 'time_istant')) cdr = cdr.withColumn( "time_istant", func.from_unixtime(func.unix_timestamp(cdr.time_istant), "yyyy/MM/dd HH:mm")) cdr = cdr.withColumn('SMS_in', cdr['SMS_in'].cast(FloatType())) cdr = cdr.withColumn('SMS_out', cdr['SMS_out'].cast(FloatType())) cdr = cdr.withColumn('Call_in', cdr['Call_in'].cast(FloatType())) cdr = cdr.withColumn('Call_out', cdr['Call_out'].cast(FloatType())) cdr= cdr.groupBy(cdr.time_istant,cdr.weekday)\ .agg(func.sum("SMS_out").alias("smsInUscita"),func.sum("SMS_in").alias("smsInEntrata"),func.sum("Call_in").alias("chiamateInEntrata"),func.sum("Call_out").alias("chiamateInUscita"),func.sum("internet_traffic").alias("internet")) cdr = cdr.withColumn('SMS_total', cdr['smsInUscita'] + cdr['smsInEntrata']) cdr = cdr.withColumn('Call_total', cdr['chiamateInEntrata'] + cdr['chiamateInUscita'])
def spark_data_flow(): get_region_label_udf = fun.udf(get_region_label, tp.StringType()) get_region_relation_label_udf = fun.udf(partial(lambda r: r, 'BELONG'), tp.StringType()) # intermediate city-level data prd_basic_df = spark.read.parquet("{path}/prd_basic_df/{version}".format( path=TMP_PATH, version=RELATION_VERSION)) # region mapping table mapping_df = spark.read.csv('{path}/{file_name}'.format( path=IN_PATH, file_name=FILE_NAME), sep='\t', header=True).dropDuplicates(['company_county']) # get the unique ID of each region node (separate unique IDs for province, city and county) region_id_df = mapping_df.where(mapping_df.county.isNotNull()).select( 'county', 'company_county').union( mapping_df.where(mapping_df.county.isNull()).where( mapping_df.city.isNotNull()).select( 'city', 'company_county')).union( mapping_df.where(mapping_df.county.isNull()).where( mapping_df.city.isNull()).where( mapping_df.province.isNotNull()).select( 'province', 'company_county')).withColumnRenamed( 'county', 'region').cache() # distribution of company counts by region raw_region_df = prd_basic_df.select('company_county').groupBy( 'company_county').count().withColumnRenamed('count', 'company_num') # per-province distribution (append a column to mapping_df holding the total number of companies in each region) tmp_region_df = mapping_df.join(raw_region_df, 'company_county', 'left_outer').fillna(0L).dropDuplicates( ['company_county']).cache() # total number of companies for each province, city and county tmp_region_2_df = tmp_region_df.groupBy('province').agg({ 'company_num': 'sum' }).union(tmp_region_df.groupBy('city').agg({'company_num': 'sum'})).union( tmp_region_df.groupBy('county').agg({'company_num': 'sum'})) # output nodes (region code, company count, region name, node create time, node update time, label) prd_region_node_df = tmp_region_2_df.join( region_id_df, tmp_region_2_df.province == region_id_df.region).select( region_id_df.company_county.alias('region_code:ID'), fun.when(tmp_region_2_df['sum(company_num)'].isNotNull(), tmp_region_2_df['sum(company_num)']).otherwise(0).alias( 'company_num:int'), region_id_df.region.alias('name'), fun.unix_timestamp().alias('create_time:long'), fun.unix_timestamp().alias('update_time:long'), get_region_label_udf().alias(':LABEL')).dropDuplicates( ['region_code:ID']).cache() ''' relations between region nodes and other nodes ''' # county-company relations prd_region_edge_1_df = prd_basic_df.join( mapping_df.where(mapping_df.county.isNotNull()).select( 'county', 'company_county'), 'company_county').select( prd_basic_df.bbd_qyxx_id.alias(':START_ID'), prd_basic_df.company_county.alias(':END_ID'), fun.unix_timestamp().alias('create_time:long'), get_region_relation_label_udf().alias(':TYPE')) # county-city relations prd_region_edge_2_df = mapping_df.select( 'city', 'county', 'company_county').where(mapping_df.county.isNotNull()).join( prd_region_node_df, prd_region_node_df.name == mapping_df.city).select( mapping_df.company_county.alias(':START_ID'), prd_region_node_df['region_code:ID'].alias(':END_ID'), fun.unix_timestamp().alias('create_time:long'), get_region_relation_label_udf().alias(':TYPE')) # city-province relations prd_region_edge_3_df = mapping_df.select( 'province', 'city', 'county', 'company_county').where(mapping_df.county.isNull()).where( mapping_df.city.isNotNull()).where( mapping_df.province.isNotNull()).join( prd_region_node_df, prd_region_node_df.name == mapping_df.province).select( mapping_df.company_county.alias(':START_ID'), prd_region_node_df['region_code:ID'].alias(':END_ID'), fun.unix_timestamp().alias('create_time:long'), get_region_relation_label_udf().alias(':TYPE')) # event-region relations # this relation is hard to parse and has too many variants, so it is skipped for now # if it is needed for storytelling later, such relations can be built by hand # prd_region_edge_4_df = '' prd_region_edge_df = prd_region_edge_1_df.union( prd_region_edge_2_df).union(prd_region_edge_3_df) return prd_region_node_df, prd_region_edge_df
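A hypothetical driver for the function above, materializing the node and edge frames (the output paths are placeholders, not from the original job):

prd_region_node_df, prd_region_edge_df = spark_data_flow()
prd_region_node_df.write.mode('overwrite').csv('/tmp/region_nodes', sep='\t', header=True)
prd_region_edge_df.write.mode('overwrite').csv('/tmp/region_edges', sep='\t', header=True)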
Flight_date=p[6], Airline=p[7], Change=int(p[8]), Price=float(p[9]), Depart_hour=p[10], Depart_from=p[11], Arrival_hour=p[12], Arrive_to=p[13])) sqlContext = SQLContext(sc) dt = sqlContext.createDataFrame(flights) dt.registerTempTable("flights") timeFmt1 = "yyyy-MM-dd" timeFmt2 = "dd/MM/yy" timeDiff = ((F.unix_timestamp('Flight_date', format=timeFmt2) - F.unix_timestamp('Scrap_date', format=timeFmt1)) / (24 * 3600)).cast(IntegerType()) dt = dt.withColumn("Days", timeDiff) processed = dt df = processed.groupBy([ "Scrap_time", "Country_from", "Country_to", "There_or_Back", "Flight_date", "Airline", "Change", "Depart_hour", "Depart_from", "Arrival_hour", "Arrive_to" ]).agg( F.mean(processed.Price).alias('Mean'), F.stddev(processed.Price).alias('Stddev'), F.count('Price').alias('Count'))
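Because the frame was registered as the temp table "flights", the same aggregation can be spot-checked in SQL (a sketch using the columns defined above):

sqlContext.sql(
    "SELECT Airline, AVG(Price) AS mean_price, COUNT(*) AS n "
    "FROM flights GROUP BY Airline ORDER BY n DESC"
).show(10)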
StructField("boost_flag", IntegerType(), True), StructField("app_name", StringType(), True), StructField("site_name", StringType(), True), StructField("publisher_id", StringType(), True), StructField("publisher_name", StringType(), True), StructField("app_bundle", StringType(), True), StructField("store_url", StringType(), True), StructField("secure", IntegerType(), True) ]) request_log_filtered=spark.read.option("sep", "|")\ .option("header", "True")\ .schema(schema)\ .csv(args.requests)\ .where( (F.col("request_source")==4) & (F.col("consumer_id_method")==150))\ .select(F.unix_timestamp(F.col("request_time"), "yyyy-MM-dd HH:mm:ss.SSS ZZZZZ").alias("time_stamp"), F.col("timezone_offset"), F.col("consumer_id"), F.col("gender"), F.col("birth_year"), F.col("latitude"), F.col("longitude"), F.col("location_type"), F.col("ip"), F.col("request_id"), F.col("at_home_or_away"), F.col("app_id"), F.col("app_bundle"), F.col("request_source")).cache() schema_meta_log = StructType([