def tfunc(t, rdd, rddb):
    # texts
    try:
        #----- texts
        if topic == "TT_raw":
            rowRdd = rdd.map(lambda w: Row(id=w['id'], author=w['user_screen_name'], \
                                           body=w['body'], created_utc=str(int(w['timestamp_ms'])/1000), \
                                           pharmatags=w['pharmatags'], conditiontags=w['conditiontags'], symptomtags=w['symptomtags']))
        else:  # this is reddit
            rowRdd = rdd.map(lambda w: Row(id=w['id'], author=w['author'], \
                                           body=w['body'], created_utc=w['created_utc'], \
                                           pharmatags=w['pharmatags'], conditiontags=w['conditiontags'], symptomtags=w['symptomtags']))

        texts = getSqlContextInstance(rdd.context).createDataFrame(rowRdd)
        texts.registerTempTable("texts")
        texts = texts.select(texts.id, from_unixtime(texts.created_utc).alias('created_utc'), texts.author, texts.body,
                             explode(texts.pharmatags).alias('pharmatag'), texts.conditiontags, texts.symptomtags)
        # return texts.rdd

        #----- bids
        rowRdd2 = rddb.map(lambda w: Row(price=w['price'], pharmatag=w['pharmatags']))
        bids = getSqlContextInstance(rddb.context).createDataFrame(rowRdd2)
        bids.registerTempTable("bids")
        getSqlContextInstance(rdd.context).cacheTable('bids')
        bids = bids.select(bids.price, bids.pharmatag)

        #---- text ids joined with pharma bids; the Java web service has already sorted the bids by price
        idbids = bids.join(texts, texts.pharmatag == bids.pharmatag, 'inner') \
                     .select(texts.id, texts.author, texts.created_utc, texts.body, texts.conditiontags,
                             texts.symptomtags, bids.pharmatag, bids.price).limit(1)
        idbids.registerTempTable("idbids")
        idbids.show()
        return idbids.rdd

        #----- text ids & bids, find min
        # DEPRECATED: just return the top match, since the Java service already sorts by price
        # idsbidsmin = getSqlContextInstance(rddb.context).sql("SELECT id, author, created_utc, body, pharmatag, conditiontags, symptomtags, max(price) as price FROM idbids GROUP BY id, author, created_utc, body, conditiontags, symptomtags, pharmatag")
        # idsbidsmin.registerTempTable("idsbidsmin")  # dataframe
        # idsbidsmin.show()
        # return idsbidsmin.rdd
    except Exception:
        pass
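# tfunc() relies on a getSqlContextInstance() helper that is not shown in this snippet.
# A common implementation is the lazily instantiated singleton used in the Spark Streaming
# examples -- a sketch of what it likely looks like, not necessarily the author's version:
from pyspark.sql import SQLContext

def getSqlContextInstance(sparkContext):
    # Reuse one SQLContext per SparkContext so each micro-batch does not create a new one.
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
    return globals()['sqlContextSingletonInstance']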
config = ConfigParser.ConfigParser()
config.read('config.ini')
kuduMaster = config.get('hadoop', 'kudu_masters')
kuduPort = config.get('hadoop', 'kudu_port')

# ### Create a Spark Session
spark = SparkSession.builder.appName("Sensor Analytics").getOrCreate()
sc = spark.sparkContext
sqc = SQLContext(sc)

# ## Analyze Maintenance Costs
# We start our analysis by visualizing the distribution of maintenance costs
rawMaintCosts = sqc.read.format('org.apache.kudu.spark.kudu')\
    .option('kudu.master', kuduMaster)\
    .option('kudu.table', 'impala::sensors.maintenance').load()\
    .withColumn('day', F.to_date(F.from_unixtime('maint_date')))\
    .withColumn('month', F.date_format(F.from_unixtime('maint_date'), 'yyyy-MMM'))\
    .orderBy('maint_date')

maintCosts = rawMaintCosts.toPandas()

# ### Summary Statistics on Maintenance Costs
maintCosts.describe()

# ### Boxplot of Monthly Maintenance Costs
sb.set(style="ticks", palette="muted", color_codes=True)
sb.boxplot(x="cost", y="month", data=maintCosts, whis=np.inf, color='r')
sb.despine(trim=True)

# ### Pairplot Comparing Maintenance Cost and Duration
sb.pairplot(maintCosts, hue="type", vars=['cost', 'duration'])
def process_data(self): ############################################################################## # DECLARE VARIABLES ############################################################################## dt_range = self.study_dates("2020-07-30") dt = dt_range s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s1_initial_bucket_depth = 'cuebiq/daily-feed/US/' s1_bucket_output = 'cuebiq/daily-feed-reduced/US/' s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/' s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/' anchor_dist = 430 time_thresh = 28800 part_num = 9 gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True) ]) s2_gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True), StructField("row_number", IntegerType(), True) ]) ############################################################################## # WINDOWS ############################################################################## w = Window().partitionBy('device_id').orderBy('utc_timestamp') l = Window().partitionBy('device_id', 'lin_grp').orderBy('utc_timestamp') w2 = Window().partitionBy('device_id').orderBy('row_number') ############################################################################## # BEGIN DAILY ITERATION ############################################################################## print("Reading in files for {}".format(str(dt['study_dt'])[:10])) print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name, s1_initial_bucket_depth, dt['s3_before'], dt['s3_study_dt'], dt['s3_after'])) print("") ################################################################################################# # START STEP 1 ################################################################################################# df1 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_before'] +"/*.gz") # the day before df2 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_study_dt'] +"/*.gz") # actual study date df3 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_after'] +"/*.gz") # the day after # Union data from three inputs into 1 dataframe df = df1.union(df2).union(df3) \ .repartition(part_num, 'device_id') del df1 del df2 del df3 ############################################################################## # FILTER INITIAL JUNK RECORDS # Removes duplicated records (based on time and id), poor 
accuracy, bad coordinates, and timestamps outside of study range ############################################################################## df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \ .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \ & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \ & (df['utc_timestamp'] + df['tz_offset']) \ .between(dt['utc_study_dt'], dt['utc_after'])) \ .dropDuplicates(['utc_timestamp','device_id']) ############################################################################## # EXCESSIVE SPEED REMOVAL ############################################################################## df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \ lead(df['longitude'],1).over(w))) \ .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \ .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \ .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \ .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \ .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \ .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \ | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \ & ((col('speed_from') < 600) & (col('speed_to') < 600)) \ & ((col('speed_from') < 20) | (col('speed_to') < 20))) \ .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') ############################################################################## # LINEAR TRAVEL PING REMOVAL # Break pings out into groups of 4 to measure the linear distance ############################################################################## #Assign a record number and linear grouping and lead distance df = df.withColumn('RecordNum',row_number().over(w)) \ .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \ .withColumn('dist_to', distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters')) # Create aggregated table for linear groupings expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \ py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')] dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr) dfl_grp.createOrReplaceTempView('dfl_grp') df.createOrReplaceTempView('dfl') # Grab just the first and last records in each linear grouping and append aggregated info dfls = spark.sql( "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.min_utc_timestamp \ UNION ALL \ SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.max_utc_timestamp") # Measure the distance between first and last in each linear grouping and compare to sum distance of all points # Only keep groups that meet criteria for being straight-line df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \ lead(dfls['latitude'],1).over(l), \ lead(dfls['longitude'],1).over(l), 
'meters')) \ .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \ .na.drop(subset=['strt_dist']) \ .filter((dfls['min_dist'] > 0) \ & (col('strt_dist').between(150, 2000)) \ & (dfls['cnt'] == 4) \ & (col('lin') >= .99825)) \ .select('device_id','lin_grp', 'lin') # Outer join main dataframe to linears groups to filter non-linear pings df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \ .filter(col('lin').isNull()) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') del dfl_grp del dfls del df_j ####################################### # CHAIN # Calculating the dynamic chain threshold to find proximate ping relationships ####################################### df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \ .withColumn('chain', when((distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1) .when((distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \ .filter(col('chain') == 1) \ .withColumn('row_number', row_number().over(w)) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \ .persist() df \ .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \ .write \ .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",") #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") ############################################################################################## # START STEP 2 ############################################################################################## print('Begin micro-clustering') # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \ .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \ .when(distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \ >= anchor_dist, col('tz_timestamp')) \ .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \ .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \ .repartition(part_num, 'device_id') \ .persist() print('df_dist starting count = {}'.format( self.df_dist.count())) # Materialize table for caching df.unpersist() del df ##################################################################################################### # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration ###################################################################################################### df_anchor1 = self.anchor_func(3, 3) df_anchor2 = self.anchor_func(5, 5) df_anchor3 = self.anchor_func(12, 6) df_anchor4 = self.anchor_func(20, 5) df_anchor5 = self.anchor_func(30, 5) df_anchor6 = self.anchor_func(50, 5) df_anchor7 = self.anchor_func(80, 5, 1000000) df_anchor8 = self.anchor_func(1000, 5, 1000000) 
    ##################################################################################################
    # Collect remaining pings to driver for Python analysis
    print('collect remaining pings')

    anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp': row[0], 'device_id': row[1], 'latitude': row[3], \
                                                    'longitude': row[4], 'anchor': row[7]}).collect()

    # Sort elements in list by device_id and timestamp
    anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp'))

    # Python analysis on driver of final remaining pings
    print('iterate through remaining pings on driver')

    anchor_dr = []
    for r in anchor_list:
        if r['anchor'] is not None:
            anchor_dr.append(r)
        else:
            if anchor_dr[-1]['device_id'] == r['device_id']:
                # Boolean 'and' with explicit parentheses: a bitwise '&' here binds tighter
                # than the comparisons and silently changes the condition.
                if (distance_dr(r['latitude'], r['longitude'],
                                anchor_dr[-1]['latitude'],
                                anchor_dr[-1]['longitude'], 'feet') <= anchor_dist) \
                        and (r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh):
                    anchor_dr.append({'timestamp': r['timestamp'], 'device_id': r['device_id'], \
                                      'latitude': anchor_dr[-1]['latitude'], 'longitude': anchor_dr[-1]['longitude'], \
                                      'anchor': anchor_dr[-1]['anchor']})
                else:
                    r['anchor'] = r['timestamp']
                    anchor_dr.append(r)

    # Condense result table for dataframe distribution
    print('generate driver anchor table')
    new_anchor = []
    for r in anchor_dr:
        new_anchor.append([r['timestamp'], r['device_id'], r['anchor']])

    # Bring driver results back into a distributed dataframe and join results
    print('disperse driver anchor table back to cluster')
    new_anchor_schema = StructType([
        StructField('tz_timestamp', IntegerType(), True),
        StructField('device_id', StringType(), True),
        StructField('anchor', IntegerType(), True)
    ])

    df_anchor_dr = spark.createDataFrame(new_anchor, new_anchor_schema) \
        .repartition(part_num, 'device_id')

    # Join remaining anchors to main analysis table
    self.df_dist = self.df_dist.select('tz_timestamp', 'device_id', 'os', 'latitude', 'longitude', \
                                       'accuracy', 'row_number') \
        .join(df_anchor_dr, ['tz_timestamp', 'device_id'])

    # Union all anchor tables together and sort
    print('finalizing anchor results into central table')
    df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \
        .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \
        .repartition(part_num, 'device_id') \
        .persist()

    self.df_dist.unpersist()

    #######################################################################################
    # Calculate centroids
    #######################################################################################
    print('start calculating centroids')

    # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings
    df_anchor_grp = df_anchors_fnl.groupBy('device_id', 'anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \
                                                                        py.count(col('tz_timestamp')).alias('cnt')]) \
        .withColumn('max_acc_1', col('max_accuracy') + 1) \
        .filter(col('cnt') > 1) \
        .select('device_id', 'anchor', 'max_acc_1', 'cnt')

    # Calculate the numerator of the accuracy weight for each ping in a micro-cluster
    df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id', 'anchor']) \
        .withColumn('nom', col('max_acc_1') - col('accuracy'))

    df_denom = df_anchors_fnl.groupBy('device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')])

    # weight = (max accuracy + 1 - accuracy) / sum of the same over the cluster,
    # so pings with a smaller accuracy radius pull the centroid harder
    df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id', 'anchor']) \
        .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \
        .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \
        .withColumn('lon', df_anchors_fnl['longitude'] * col('weight'))

    expr = [py.sum(col('lat')).alias('new_latitude'), py.sum(col('lon')).alias('new_longitude'), \
            py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \
            py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \
            py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \
            py.avg(col('accuracy')).alias('avg_accuracy')]

    # Blend the accuracy-weighted centroid (3/4) with the simple average (1/4), geohash it,
    # and keep clusters with a dwell time longer than one second
    df_micro = df_anchors_fnl.groupBy('device_id', 'anchor').agg(*expr) \
        .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \
        .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \
        .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \
        .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \
        .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \
        .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \
        .filter(col('dwell_seconds') > 1) \
        .select('device_id', 'os', 'start_tm', 'end_tm', \
                'dwell_seconds', 'cluster_png_cnt', col('fnl_lat').alias('latitude'), \
                col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy')

    df_micro \
        .repartition(100, 'device_id').sortWithinPartitions('device_id', 'start_tm') \
        .write \
        .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")
        #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",")

    df_anchors_fnl.unpersist()

    return
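##############################################################################
# HELPER SKETCHES (assumptions)
# process_data() calls distance(), rate_of_speed(), distance_dr() and geohash_udf_9(),
# which are defined elsewhere in the project. Below is a minimal haversine-based sketch
# of the distance helpers, written to match how they are called above and assuming 'py'
# is the pyspark.sql.functions alias used in this file; the unit handling is an
# assumption, not the original implementation.
##############################################################################
import math

def distance(lat1, lon1, lat2, lon2, unit='miles'):
    # Column-expression haversine distance, so it can be used inside select/withColumn
    # and still accept a plain string for the unit.
    r_miles = 3958.8  # mean earth radius in miles
    dlat = py.radians(lat2 - lat1)
    dlon = py.radians(lon2 - lon1)
    a = py.sin(dlat / 2) ** 2 + \
        py.cos(py.radians(lat1)) * py.cos(py.radians(lat2)) * py.sin(dlon / 2) ** 2
    miles = py.lit(2 * r_miles) * py.asin(py.sqrt(a))
    if unit == 'feet':
        return miles * 5280.0
    if unit == 'meters':
        return miles * 1609.344
    return miles

def rate_of_speed(dist, seconds, period='hour'):
    # Speed from a distance column and an elapsed-seconds column
    # (the distance unit is whatever unit 'dist' was computed in).
    return (dist / seconds) * 3600 if period == 'hour' else dist / seconds

def distance_dr(lat1, lon1, lat2, lon2, unit='miles'):
    # Plain-Python haversine for the driver-side loop over collected pings.
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(phi1) * math.cos(phi2) * math.sin(dlmb / 2) ** 2
    miles = 2 * 3958.8 * math.asin(math.sqrt(a))
    if unit == 'feet':
        return miles * 5280.0
    if unit == 'meters':
        return miles * 1609.344
    return miles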
def process_log_data(spark, input_data, output_data): ''' Get the files from log folders and compose a DataFrame. Create the users, time and songplays tables with the desired columns and format. Parameters: spark (object): Previous created spark object. input_data(string): Key for AWS S3 objects to read. output_data(string): Key for AWS S3 objects to save. Returns: None ''' # get filepath to log data file log_data = input_data + 'log_data' # read log data file # smaller data to test: s3a://{}:{}@udacity-dend/log_data/2018/11/2018-11-12*.json df = spark.read.json("s3a://{}:{}@udacity-dend/log_data/*/*/*.json"\ .format(os.environ['AWS_ACCESS_KEY_ID'],os.environ['AWS_SECRET_ACCESS_KEY'])) # filter by actions for song plays df = df.filter(df['page'] == 'NextSong') # extract columns for users table users_columns = ['userId', 'firstName', 'lastName', 'gender', 'level'] users_table = df.select(*users_columns).dropDuplicates() # write users table to parquet files users_table.write.parquet(output_data + '/users', mode='overwrite') # create datetime column from original timestamp column df = df.withColumn('datetime', from_unixtime(col('ts') / 1000)) # extract columns to create time table df_time = df.select('datetime').dropDuplicates() time_table = df_time.withColumnRenamed('datetime', 'start_time')\ .orderBy('start_time', ascending=True)\ .withColumn('hour', hour(col('start_time')))\ .withColumn('day', dayofmonth(col('start_time')))\ .withColumn('week', weekofyear(col('start_time')))\ .withColumn('month', month(col('start_time')))\ .withColumn('year', year(col('start_time')))\ .withColumn('weekday', dayofweek(col('start_time'))) # write time table to parquet files partitioned by year and month time_table.write.parquet(output_data + '/time', mode='overwrite', partitionBy=['year', 'month']) # read in song data to use for songplays table basePath = output_data + '/songs/' song_df = spark.read.option("basePath", basePath).parquet(output_data + '/songs/*') # extract columns from joined song and log datasets to create songplays table songplays_table = df.join(song_df, df.song == song_df.title, how='left') songplays_table = songplays_table.drop('song', 'artist', 'title', 'year', 'duration') columns_name = [ 'start_time', 'user_id', 'level', 'session_id', 'location', 'user_agent', 'song_id', 'artist_id' ] songplays_table = songplays_table.toDF(*columns_name) songplays_table = songplays_table.withColumn('month', month(col('start_time')))\ .withColumn('year', year(col('start_time'))) # write songplays table to parquet files partitioned by year and month songplays_table.write.parquet(output_data + '/songplays', mode='overwrite', partitionBy=['year', 'month'])
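# process_log_data() reads the AWS keys from the environment and expects the song table
# parquet files to already exist under output_data + '/songs/'. A minimal, hypothetical
# driver for it (the config file name and bucket names are placeholders, not from the
# original script):
import os
import configparser
from pyspark.sql import SparkSession

def main():
    config = configparser.ConfigParser()
    config.read('dl.cfg')                                     # assumed credentials file
    os.environ['AWS_ACCESS_KEY_ID'] = config['AWS']['AWS_ACCESS_KEY_ID']
    os.environ['AWS_SECRET_ACCESS_KEY'] = config['AWS']['AWS_SECRET_ACCESS_KEY']

    spark = SparkSession.builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()

    input_data = "s3a://udacity-dend/"        # source bucket referenced inside the function
    output_data = "s3a://my-sparkify-lake/"   # placeholder output bucket

    # a process_song_data(spark, ...) step (not shown) would run first to write the song table
    process_log_data(spark, input_data, output_data)

if __name__ == "__main__":
    main()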
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source dyf_ds_results = glueContext.create_dynamic_frame.from_catalog(database='dts-odin_ncsbasic', table_name='results') dyf_ds_results = dyf_ds_results.resolveChoice(specs=[('_key', 'cast:long')]) # try: # df_flag = spark.read.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_ncsb.parquet") # read_from_index = df_flag.collect()[0]['flag'] # print('read from index: ', read_from_index) # dyf_ds_results = Filter.apply(frame=dyf_ds_results, # f=lambda x: x["_key"] > read_from_index) # except: # print('read flag file error ') # # dyf_ds_results = dyf_ds_results.select_fields( # ['_key', '_id', 'userid', 'time_begin', 'time_end', 'timecreated']).rename_field( # '_id', 'id') dy_cache = dyf_ds_results.toDF() dy_cache = dy_cache.cache() dyf_ds_results = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_ds_results') #doc moc flag tu s3 print('dyf_ds_results::schema') dyf_ds_results.printSchema() dyf_ds_results.show(5) if (dyf_ds_results.count() > 0): #--------------------------------------------------------------------------------------------------------------# dyf_student_contact_email = glueContext.create_dynamic_frame.from_catalog(database='tig_advisor', table_name='student_contact_email') dyf_student_contact_email = dyf_student_contact_email.select_fields(['email', 'contact_id']) dyf_student_contact_email = Filter.apply(frame=dyf_student_contact_email, f=lambda x: x["email"] is not None and x["email"] != '') df_student_contact_email = dyf_student_contact_email.toDF() df_student_contact_email = df_student_contact_email.dropDuplicates(['contact_id', 'email']) dyf_student_contact_email = DynamicFrame.fromDF(df_student_contact_email, glueContext, "dyf_student_contact_email") # -------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------# dyf_users = glueContext.create_dynamic_frame.from_catalog(database='dts-odin_ncsbasic', table_name='users') dyf_users = dyf_users.select_fields(['_id', 'email']) # -------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------# dyf_ds_results_nscb = Filter.apply(frame=dyf_ds_results, f=lambda x: x["time_begin"] is not None and x["time_begin"] != '' and x["time_end"] is not None and x["time_end"] != '' and x["time_begin"] < x["time_end"] and x["timecreated"] is not None and x["timecreated"] != '') # -------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------# # ds_df_results = ds_results.toDF() # ds_df_results = ds_df_results.where('time_begin IS NOT NULL AND time_end IS NOT NULL') # ds_results_nscb = DynamicFrame.fromDF(ds_df_results, glueContext, 'ds_results_nscb') # map ls ncsb vs contact_id join_ncsb1 = Join.apply(dyf_ds_results_nscb, dyf_users, 'userid', '_id') join_ncsb2 = Join.apply(join_ncsb1, dyf_student_contact_email, 'email', 'email') print('join_ncsb2::schema') join_ncsb2.printSchema() join_ncsb2.show(5) # convert data join_ncsb2 = Filter.apply(frame=join_ncsb2, f=lambda x: x["contact_id"] is not 
None) data_df_ncsb = join_ncsb2.toDF() data_df_ncsb = data_df_ncsb.withColumn('sogio', (data_df_ncsb.time_end - data_df_ncsb.time_begin) / 3600) data_df_ncsb = data_df_ncsb.withColumn("giovao", from_unixtime(data_df_ncsb.time_begin)) data_df_ncsb = data_df_ncsb.withColumn("ngay_tao", from_unixtime(data_df_ncsb.timecreated)) data_df_ncsb = data_df_ncsb.withColumn('id_time', from_unixtime(data_df_ncsb.time_begin, "yyyyMMdd")) # data_df_ncsb = data_df_ncsb.where("contact_id IS NOT NULL") data_df_ncsb = data_df_ncsb.where("sogio > 0.0") data_df_ncsb = data_df_ncsb.groupby('contact_id', 'id_time').agg(f.sum('sogio').alias("tong_so_gio"), f.count('contact_id')) data_df_ncsb = data_df_ncsb.dropDuplicates(['contact_id', 'id_time']) data_ncsb = DynamicFrame.fromDF(data_df_ncsb, glueContext, 'data_ncsb') data_ncsb = data_ncsb.resolveChoice(specs=[('tong_so_gio', 'cast:float')]) # -------------------------------------------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------# # tinh bang "fact_hieusuathoctap" # df_hieusuathoctap = dropnullfields1.toDF() print ('data_ncsb::data_ncsb::data_ncsb::printSchema------------------') data_ncsb.printSchema() # print ('data_ncsb::data_ncsb::data_ncsb::show------------------') data_ncsb.show(10) print('data_ncsb::number: ', data_ncsb.count()) # tinh so ca hoc, thoi gian hoc cua hoc vien trong ngay id_time applymapping2 = ApplyMapping.apply(frame=data_ncsb, mappings=[("contact_id", "string", "contact_id", "string"), ("id_time", 'string', 'id_time', 'bigint'), ("count(contact_id)", 'long', 'soca', 'int'), ("tong_so_gio", 'float', 'sogio', 'float')]) resolvechoice2 = ResolveChoice.apply(frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields2 = DropNullFields.apply(frame=resolvechoice2, transformation_ctx="dropnullfields2") datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf(frame=dropnullfields2, catalog_connection="glue_redshift", connection_options={"dbtable": "temp_staging_lich_su_tu_hoc_ncsb_v2", "database": "dts_odin", "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2) SELECT um.user_id, hwb.id_time, 52, hwb.soca, hwb.sogio FROM temp_staging_lich_su_tu_hoc_ncsb_v2 hwb LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = hwb.contact_id WHERE um.user_id is not null; DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_ncsb_v2""" }, redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/ncsb_2", transformation_ctx="datasink4") df_datasource = dyf_ds_results.toDF() flag = df_datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet("s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_ncsb.parquet", mode="overwrite") dy_cache.unpersist()
          StructField('work_or_load', IntegerType(), True), \
          StructField('plug_id', IntegerType(), True), \
          StructField('household_id', IntegerType(), True), \
          StructField('house_id', IntegerType(), True)]
schema = StructType(fields)

# load the csv file into the dataframe 'df'
df = spark.read.load("sample-00.csv", format="csv", sep=",", schema=schema, header="false")

# convert the 'ts' column from a unix timestamp to the yyyy-MM-dd HH:mm format
# that will be used when grouping the data
df = df.withColumn('ts', from_unixtime('ts', "yyyy-MM-dd HH:mm"))

# filter the 'work_or_load' column to keep only the rows that are load
df = df.filter(df.work_or_load == 1)

# group by house ('house_id'), room ('household_id'), plug ('plug_id') and a
# one-hour window ('window('ts', "1 hour")'), then aggregate by averaging the
# 'value' column.
# Since work (W) = power (P) * time interval (delta_t), this operation computes
# the energy in Wh: the average power over one hour multiplied by a one-hour
# interval. Finally, the data is sorted.
#
# NOTE: because the time interval is exactly one hour, the multiplication does
# not actually need to be performed.
df = df.groupBy('house_id', 'household_id', 'plug_id', window('ts', "1 hour")) \
    .agg({"value": "avg"}) \
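# Quick sanity check of the W -> Wh shortcut described above (assumes an active
# SparkSession named 'spark'); the power readings are made up:
from pyspark.sql import functions as F

sample = spark.createDataFrame(
    [("2024-01-01 10:05", 100.0), ("2024-01-01 10:25", 60.0), ("2024-01-01 10:45", 80.0)],
    ["ts", "value"])

# average power over the 1-hour window = (100 + 60 + 80) / 3 = 80 W,
# which over a one-hour interval is 80 Wh -- no explicit multiplication needed
(sample
    .groupBy(F.window(F.col("ts").cast("timestamp"), "1 hour"))
    .agg(F.avg("value").alias("energy_wh"))
    .show(truncate=False))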
    })

#if you have multiple saves below this prevents reloading the data every time
pw_df.cache()

#join on the grid to get the feeder and tx for each outage
pw_df = pw_df.join(grid, on='site_id', how='inner')

#We should mark every row with the number of unique sensors reporting in +-5 days
#so we know the denominator for SAIDI/SAIFI
pw_distinct_core_id = pw_df.select("time", "core_id", "feeder_id", "tx")

pw_distinct_core_id_by_feeder = pw_distinct_core_id.groupBy(
    F.window("time", '10 days', '1 day'),
    "feeder_id").agg(F.countDistinct("core_id"))
pw_distinct_core_id_by_feeder = pw_distinct_core_id_by_feeder.withColumn(
    "window_mid_point",
    F.from_unixtime((F.unix_timestamp(col("window.start")) +
                     F.unix_timestamp(col("window.end"))) / 2))
pw_distinct_core_id_by_feeder = pw_distinct_core_id_by_feeder.select(
    "feeder_id",
    col("count(DISTINCT core_id)").alias("sensors_reporting"),
    "window_mid_point")

pw_distinct_core_id_by_tx = pw_distinct_core_id.groupBy(
    F.window("time", '10 days', '1 day'),
    "tx").agg(F.countDistinct("core_id"))
pw_distinct_core_id_by_tx = pw_distinct_core_id_by_tx.withColumn(
    "window_mid_point",
    F.from_unixtime((F.unix_timestamp(col("window.start")) +
                     F.unix_timestamp(col("window.end"))) / 2))
pw_distinct_core_id_by_tx = pw_distinct_core_id_by_tx.select(
    "tx",
    col("count(DISTINCT core_id)").alias("sensors_reporting"),
    "window_mid_point")
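# The 'sensors_reporting' counts above provide the denominator for SAIDI/SAIFI-style rates.
# A hypothetical continuation (assuming each row of pw_df is one outage event, as the join
# comment above suggests): count outages per feeder per day and divide by the number of
# sensors reporting within +-5 days of that day. Variable and column names here are
# illustrative, not from the original pipeline.
outages_per_feeder_day = (pw_df
    .groupBy(F.window("time", '1 day'), "feeder_id")
    .agg(F.count("*").alias("outage_events"))
    .withColumn("day", F.to_date(col("window.start")))
    .select("feeder_id", "day", "outage_events"))

sensors_per_feeder_day = pw_distinct_core_id_by_feeder.withColumn(
    "day", F.to_date(col("window_mid_point")))

saifi_like_by_feeder = (outages_per_feeder_day
    .join(sensors_per_feeder_day, on=["feeder_id", "day"], how="inner")
    .withColumn("outages_per_sensor", col("outage_events") / col("sensors_reporting")))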
def main(): "Main function" optmgr = OptionParser() opts = optmgr.parser.parse_args() # setup spark/sql context to be used for communication with HDFS sc = SparkContext(appName="phedex_br") if not opts.yarn: sc.setLogLevel("ERROR") sqlContext = HiveContext(sc) schema_def = schema() # read given file(s) into RDD if opts.fname: pdf = sqlContext.read.format('com.databricks.spark.csv')\ .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(opts.fname, schema = schema_def) elif opts.basedir: fromdate, todate = defDates(opts.fromdate, opts.todate) files = getFileList(opts.basedir, fromdate, todate) msg = "Between dates %s and %s found %d directories" % (fromdate, todate, len(files)) print msg if not files: return pdf = unionAll([sqlContext.read.format('com.databricks.spark.csv') .options(treatEmptyValuesAsNulls='true', nullValue='null')\ .load(file_path, schema = schema_def) \ for file_path in files]) else: raise ValueError("File or directory not specified. Specify fname or basedir parameters.") # parsing additional data (to given data adding: group name, node kind, acquisition era, data tier, now date) groupdic, nodedic = getJoinDic() acquisition_era_reg = r"^/[^/]*/([^/^-]*)-[^/]*/[^/]*$" data_tier_reg = r"^/[^/]*/[^/^-]*-[^/]*/([^/]*)$" groupf = udf(lambda x: groupdic[x], StringType()) nodef = udf(lambda x: nodedic[x], StringType()) ndf = pdf.withColumn("br_user_group", groupf(pdf.br_user_group_id)) \ .withColumn("node_kind", nodef(pdf.node_id)) \ .withColumn("now", from_unixtime(pdf.now_sec, "YYYY-MM-dd")) \ .withColumn("acquisition_era", when(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, acquisition_era_reg, 1))) \ .withColumn("data_tier", when(regexp_extract(pdf.dataset_name, data_tier_reg, 1) == "",\ lit("null")).otherwise(regexp_extract(pdf.dataset_name, data_tier_reg, 1))) # print dataframe schema if opts.verbose: ndf.show() print("pdf data type", type(ndf)) ndf.printSchema() # process aggregation parameters keys = [key.lower().strip() for key in opts.keys.split(',')] results = [result.lower().strip() for result in opts.results.split(',')] aggregations = [agg.strip() for agg in opts.aggregations.split(',')] order = [orde.strip() for orde in opts.order.split(',')] if opts.order else [] asc = [asce.strip() for asce in opts.asc.split(',')] if opts.order else [] filtc, filtv = opts.filt.split(":") if opts.filt else (None,None) validateAggregationParams(keys, results, aggregations, order, filtc) if filtc and filtv: ndf = ndf.filter(getattr(ndf, filtc) == filtv) # if delta aggregation is used if DELTA in aggregations: validateDeltaParam(opts.interval, results) result = results[0] #1 for all dates generate interval group dictionary datedic = generateDateDict(fromdate, todate, opts.interval) boundic = generateBoundDict(datedic) max_interval = max(datedic.values()) interval_group = udf(lambda x: datedic[x], IntegerType()) interval_start = udf(lambda x: boundic[x][0], StringType()) interval_end = udf(lambda x: boundic[x][1], StringType()) #2 group data by block, node, interval and last result in the interval ndf = ndf.select(ndf.block_name, ndf.node_name, ndf.now, getattr(ndf, result)) idf = ndf.withColumn("interval_group", interval_group(ndf.now)) win = Window.partitionBy(idf.block_name, idf.node_name, idf.interval_group).orderBy(idf.now.desc()) idf = idf.withColumn("row_number", rowNumber().over(win)) rdf = idf.where((idf.row_number == 1) & (idf.interval_group != 0))\ .withColumn(result, when(idf.now == 
interval_end(idf.interval_group), getattr(idf, result)).otherwise(lit(0))) rdf = rdf.select(rdf.block_name, rdf.node_name, rdf.interval_group, getattr(rdf, result)) rdf.cache() #3 create intervals that not exist but has minus delta win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) adf = rdf.withColumn("interval_group_aft", lead(rdf.interval_group, 1, 0).over(win)) hdf = adf.filter(((adf.interval_group + 1) != adf.interval_group_aft) & (adf.interval_group != max_interval))\ .withColumn("interval_group", adf.interval_group + 1)\ .withColumn(result, lit(0))\ .drop(adf.interval_group_aft) #4 join data frames idf = rdf.unionAll(hdf) #3 join every interval with previous interval win = Window.partitionBy(idf.block_name, idf.node_name).orderBy(idf.interval_group) fdf = idf.withColumn("delta", getattr(idf, result) - lag(getattr(idf, result), 1, 0).over(win)) #5 calculate delta_plus and delta_minus columns and aggregate by date and node ddf =fdf.withColumn("delta_plus", when(fdf.delta > 0, fdf.delta).otherwise(0)) \ .withColumn("delta_minus", when(fdf.delta < 0, fdf.delta).otherwise(0)) aggres = ddf.groupBy(ddf.node_name, ddf.interval_group).agg(sum(ddf.delta_plus).alias("delta_plus"),\ sum(ddf.delta_minus).alias("delta_minus")) aggres = aggres.select(aggres.node_name, interval_end(aggres.interval_group).alias("date"), aggres.delta_plus, aggres.delta_minus) else: resAgg_dic = zipResultAgg(results, aggregations) order, asc = formOrdAsc(order, asc, resAgg_dic) # perform aggregation if order: aggres = ndf.groupBy(keys).agg(resAgg_dic).orderBy(order, ascending=asc) else: aggres = ndf.groupBy(keys).agg(resAgg_dic) # output results if opts.fout: fout_header = formFileHeader(opts.fout) if opts.header: aggres.write.format('com.databricks.spark.csv').options(header = 'true').save(fout_header) else: aggres.write.format('com.databricks.spark.csv').save(fout_header) else: aggres.show(50)
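    # Toy walk-through of the delta split above for one (block, node), with made-up sizes:
    #
    #   interval_group:        1     2     3     4
    #   size at interval end: 10    25    25     5
    #   delta (vs previous):  10    15     0   -20
    #   delta_plus:           10    15     0     0   -> data added during the interval
    #   delta_minus:           0     0     0   -20   -> data removed during the interval
    #
    # Summing delta_plus / delta_minus over all blocks per node and interval gives the
    # per-node growth and cleanup that the final aggregation reports.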
def preprocess_data(input, output): """Based on preprocess_data.ipynb.""" print('input=%s, output=%s' % (input, output)) sc = SparkContext.getOrCreate() sql_sc = SQLContext(sc) schema = StructType([ StructField('VendorID', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampType(), True), StructField('tpep_dropoff_datetime', TimestampType(), True), StructField('passenger_count', IntegerType(), True), StructField('trip_distance', DoubleType(), True), StructField('pickup_longitude', DoubleType(), True), StructField('pickup_latitude', DoubleType(), True), StructField('RateCodeID', IntegerType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('dropoff_longitude', DoubleType(), True), StructField('dropoff_latitude', DoubleType(), True), StructField('payment_type', IntegerType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), ]) raw_sdf = sql_sc.read.csv(input, header=True, schema=schema, timestampFormat='yyyy-MM-dd HH:mm:ss') # Convert timestamp from EST to UTC. clean_sdf = raw_sdf.withColumn( 'tpep_pickup_timestamp_ms', unix_timestamp(raw_sdf['tpep_pickup_datetime']) * 1000 + 5 * 60 * 60 * 1000) clean_sdf = clean_sdf.withColumn( 'tpep_dropoff_timestamp_ms', unix_timestamp(raw_sdf['tpep_dropoff_datetime']) * 1000 + 5 * 60 * 60 * 1000) # Only consider the first 2 days of data. end_timestamp = pd.Timestamp('2015-03-03 00:00:00').tz_localize( 'Etc/GMT+5') filtered_sdf = clean_sdf.filter('tpep_dropoff_timestamp_ms <= %d' % int(end_timestamp.value / 1e6)) all_events_rdd = filtered_sdf.rdd.flatMap(create_events) all_events_sdf = sql_sc.createDataFrame(all_events_rdd) all_events2_sdf = all_events_sdf.withColumn( 'timestamp_str', from_unixtime(all_events_sdf['timestamp'] / 1000)) # Sort all events so streaming_data_generator.py can read events in time order. sorted_sdf = all_events2_sdf.orderBy('timestamp') sorted_sdf.write.mode('overwrite').format('json').save(output)
import shutil

#init_notebook_mode(connected=True)
spark = SparkSession.builder.appName(
    "Python Spark SQL basic example").getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

df = sqlContext.read.load('/test_dev/mba-code/dataset.csv',
                          format='csv',
                          header='true',
                          inferSchema='true',
                          encoding='UTF-8')

# parse InvoiceDateWS as month/day/year ('MM' is month; lowercase 'mm' would be minutes)
# to derive the weekday name
df.select(
    "InvoiceNo", "StockCode", "Description", "Quantity", "InvoiceDate",
    "InvoiceDateWS",
    date_format(from_unixtime(unix_timestamp('InvoiceDateWS', 'MM/dd/yyyy')),
                'EEEE').alias('weekday'),
    "CustomerID", "Country").write.save("Invoices.parquet", format="parquet")

parquetFile = spark.read.parquet("Invoices.parquet")

# Parquet files can also be used to create a temporary view and then used in SQL statements.
parquetFile.createOrReplaceTempView("parquetFile")
DescriptionGrp = spark.sql(
    "SELECT distinct InvoiceNo, StockCode FROM parquetFile GROUP BY InvoiceNo, StockCode"
)
#print(DescriptionGrp.rdd.take(2))

minSupport = 0.05 * DescriptionGrp.rdd.count()
apr_tem = DescriptionGrp.rdd.map(lambda x: (x[0], list([x[1]]))).reduceByKey(
    lambda x, y: x + y)

schema = StructType([
    StructField("id", StringType(), True),
    StructField("items", ArrayType(StringType()), True)
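# The snippet above is cut off after building the (InvoiceNo, [StockCode, ...]) baskets and
# starting the id/items schema. One plausible continuation uses Spark ML's FPGrowth for the
# market-basket analysis -- this is an assumption, not necessarily how the original notebook
# proceeded. Note that pyspark.ml.fpm.FPGrowth takes minSupport as a *fraction* of
# transactions, whereas the minSupport variable above is an absolute count.
from pyspark.ml.fpm import FPGrowth

baskets = sqlContext.createDataFrame(apr_tem, ["id", "items"])

fp = FPGrowth(itemsCol="items", minSupport=0.05, minConfidence=0.3)
model = fp.fit(baskets)
model.freqItemsets.show(10, truncate=False)      # frequent item sets
model.associationRules.show(10, truncate=False)  # rules with confidence/lift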
from pyspark.sql.types import FloatType
#import statsmodels.formula.api as smf
# from pyspark.sql.functions import regexp_replace, col
# from pyspark.ml.regression import LinearRegression
# from sklearn.linear_model import LinearRegression
from pyspark.sql.functions import broadcast
from pyspark.sql.functions import *

if __name__ == '__main__':
    sc = SparkContext()
    spark = SparkSession(sc)

    pv = spark.read.csv('hdfs:///tmp/bdm/nyc_parking_violation/', header=True, inferSchema=True)
    pv = pv.select('Issue Date', 'Violation County', 'Street Name', 'House Number')
    pv = pv.withColumn('Date', from_unixtime(unix_timestamp('Issue Date', 'MM/dd/yyyy')))
    pv = pv.withColumn('Year', f.year(pv['Date']))
    pv = pv.filter(pv["Year"] >= (2015)) \
           .filter(pv["Year"] <= (2019))
    pv = pv.na.drop()
    pv = pv.withColumn('street name', f.lower(pv['Street Name']))

    borough_dict = {'NY': 1, 'MAN': 1, 'MH': 1, 'NEWY': 1, 'NEW': 1, 'Y': 1,
                    'BX': 2, 'BRONX': 2,
                    'K': 3, 'BK': 3, 'KING': 3, 'KINGS': 3,
                    'Q': 4, 'QN': 4, 'QNS': 4, 'QU': 4, 'QUEEN': 4,
                    'R': 5, 'RICHMOND': 5}

    mapping_expr = create_map([lit(x) for x in chain(*borough_dict.items())])

    pv = pv.withColumn("BOROCODE", mapping_expr.getItem(col("Violation County")))
    pv = pv.withColumn("HN_int", (f.regexp_replace("House Number", "-", "")))
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source #------------------------------------------------------------------------------------------------------------------# dyf_native_talk = glueContext.create_dynamic_frame.from_catalog( database='native_talk', table_name='native_talk_history_log_api') dyf_native_talk = dyf_native_talk.resolveChoice(specs=[('_key', 'cast:long')]) try: df_flag = spark.read.parquet( "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk_thanh_cong.parquet" ) read_from_index = df_flag.collect()[0]['flag'] print('read from index: ', read_from_index) dyf_native_talk = Filter.apply(frame=dyf_native_talk, f=lambda x: x["_key"] > read_from_index) except: print('read flag file error ') dyf_native_talk = dyf_native_talk.select_fields([ '_key', 'learning_date', 'speaking_dialog_score', 'username', 'updated_time' ]) dy_cache = dyf_native_talk.toDF() dy_cache = dy_cache.cache() dyf_native_talk = DynamicFrame.fromDF(dy_cache, glueContext, 'dyf_native_talk') print('dy_cache------------') dy_cache.printSchema() print('dy_cache: ', dy_cache.count()) dy_cache.show(2) #------------------------------------------------------------------------------------------------------------------# if (dyf_native_talk.count() > 0): #---------------------------------------------------------datasource0-----------------------------------------------------# dyf_native_talk = Filter.apply( frame=dyf_native_talk, f=lambda x: x["username"] is not None and x["username"] != '' and x["speaking_dialog_score"] is not None and x[ "speaking_dialog_score"] > 0 and x[ "learning_date"] is not None and x["learning_date"] != '') # ----------------------------------datasource1---------------------------------------------------------------------------# if (dyf_native_talk.count() > 0): dyf_nt_account_mapping = glueContext.create_dynamic_frame.from_catalog( database='native_talk', table_name='native_talk_account_mapping') dyf_nt_account_mapping = dyf_nt_account_mapping.select_fields( ['contact_id', 'username']).rename_field('username', 'nativetalk_user') dy_cache_2 = dyf_nt_account_mapping.toDF() dy_cache_2 = dy_cache_2.cache() dyf_nt_account_mapping = DynamicFrame.fromDF( dy_cache_2, glueContext, 'dyf_nt_account_mapping') dyf_nt_account_mapping = Filter.apply( frame=dyf_nt_account_mapping, f=lambda x: x["nativetalk_user"] is not None and x[ "nativetalk_user"] != '') # ----------------------------------datasource1---------------------------------------------------------------------------# # -------------------------------------------------------------------------------------------------------------# join = Join.apply(dyf_native_talk, dyf_nt_account_mapping, 'username', 'nativetalk_user') if (join.count() > 0): df_nativetalk = join.toDF() df_nativetalk = df_nativetalk.withColumn( 'sogio', f.lit(0.083333)) #5 phut df_nativetalk = df_nativetalk.withColumn( 'id_time', from_unixtime( unix_timestamp(df_nativetalk.learning_date, "yyyy-MM-dd"), "yyyyMMdd")) df_nativetalk = df_nativetalk.where("contact_id IS NOT NULL") data_nativetalk = DynamicFrame.fromDF(df_nativetalk, glueContext, 'data_nativetalk') data_nativetalk = data_nativetalk.resolveChoice( specs=[('sogio', 'cast:float')]) # -------------------------------------------------------------------------------------------------------------# print('data_nativetalk----------') data_nativetalk.printSchema() # tinh bang "fact_hieusuathoctap" 
df_hieusuathoctap = data_nativetalk.toDF() # tinh so ca hoc, thoi gian hoc cua hoc vien trong ngay id_time df_hieusuathoctap = df_hieusuathoctap.groupby( 'contact_id', 'id_time').agg(f.sum('sogio'), f.count('contact_id')) df_hieusuathoctap = df_hieusuathoctap.withColumn( 'tu_hoc_type_id', f.lit(400)) data_hieusuathoctap = DynamicFrame.fromDF( df_hieusuathoctap, glueContext, 'data_hieusuathoctap') data_hieusuathoctap = data_hieusuathoctap.resolveChoice( specs=[('sum(sogio)', 'cast:double')]) print( 'data_hieusuathoctap::data_hieusuathoctap::data_hieusuathoctap------------------------------------------' ) data_hieusuathoctap.printSchema() applymapping2 = ApplyMapping.apply( frame=data_hieusuathoctap, mappings=[("contact_id", "string", "contact_id", "string"), ("id_time", 'string', 'id_time', 'bigint'), ("count(contact_id)", 'long', 'soca', 'int'), ("sum(sogio)", 'double', 'sogio', 'double'), ("tu_hoc_type_id", 'int', "tu_hoc_type_id", "int")]) resolvechoice2 = ResolveChoice.apply( frame=applymapping2, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields2 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields2") print('dropnullfields2 number: ', dropnullfields2.count()) datasink2 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields2, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_staging_lich_su_tu_hoc_native_talk___", "database": "dts_odin", "postactions": """INSERT into mapping_changed_status_student(user_id, change_status_date_id, to_status_id, measure1, measure2) SELECT um.user_id, hwb.id_time, 56, hwb.soca, round(hwb.sogio, 4) FROM temp_staging_lich_su_tu_hoc_native_talk___ hwb LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = hwb.contact_id; DROP TABLE IF EXISTS public.temp_staging_lich_su_tu_hoc_native_talk___ """ }, redshift_tmp_dir="s3n://dts-odin/temp/tu-hoc/hwb/", transformation_ctx="datasink2") df_datasource = dyf_native_talk.toDF() flag = df_datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet( "s3a://dts-odin/flag/student_status/tu_hoc/tu_hoc_native_talk_thanh_cong.parquet", mode="overwrite") dy_cache.unpersist() dy_cache_2.unpersist()
from __future__ import print_function

import pyspark
from pyspark.sql import functions as F

import drpyspark


drpyspark.enable_debug_output()
with pyspark.SparkContext() as sc:
    sqlContext = pyspark.sql.SQLContext(sc)
    logs = sc.parallelize([
        {'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2'},
        {'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2'},
    ])
    logs = logs.map(lambda l: pyspark.sql.Row(**l))
    # to_date() truncates the converted timestamp to midnight, so the 'minute' column
    # below always shows hour 00 and the groupBy is effectively per day.
    logs = (sqlContext.createDataFrame(logs)
            .withColumn('timestamp', F.to_date(F.from_unixtime('timestamp')))
            .withColumn('minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH")))
    (logs
     .groupBy(['minute', 'url'])
     .count()
     .show())
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session student_id_unavailable = '0' package_endtime_unavailable = 99999999999L package_starttime_unavailable = 0L student_level_code_unavailable = 'UNAVAILABLE' student_status_code_unavailable = 'UNAVAILABLE' package_endtime = 'package_endtime' package_starttime = 'package_starttime' student_level_code = 'student_level_code' student_status_code = 'student_status_code' EXPIRED = 'EXPIRED' dyf_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_enduser_used_product_history") dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.select_fields( [ '_key', 'contact_id', 'used_product_id', 'status_old', 'status_new', 'status_description', 'timecreated' ]) # .rename_field('contact_id', 'contactid') dyf_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.resolveChoice( specs=[('_key', 'cast:long')]) # try: # df_flag = spark.read.parquet("s3://dtsodin/flag/flag_trang_thai_tai_khoan_expired_lan_n.parquet") # max_key = df_flag.collect()[0]['flag'] # print("max_key: ", max_key) # # Chi lay nhung ban ghi lon hon max_key da luu, ko load full # dyf_tpe_enduser_used_product_history = Filter.apply(frame=dyf_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key) # except: # print('read flag file error ') print dyf_tpe_enduser_used_product_history.count() if dyf_tpe_enduser_used_product_history.count() > 0: try: dyf_tpe_invoice_product_details = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_invoice_product_details") dyf_tpe_invoice_product_details = dyf_tpe_invoice_product_details.select_fields( ['id', 'cat_code']) dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id']).rename_field('contact_id', 'contactid') ##################### Join and Filter data df_tpe_enduser_used_product_history = dyf_tpe_enduser_used_product_history.toDF( ) df_tpe_used_product_history_step1 = df_tpe_enduser_used_product_history.groupby('contact_id', 'used_product_id').agg( f.max("timecreated").alias("max_timecreated")) \ .withColumnRenamed("contact_id", "contact_id_temp") print df_tpe_used_product_history_step1.count() df_tpe_used_product_history_step1.show(20) df_tpe_used_product_history_step2 = df_tpe_used_product_history_step1.groupby( 'contact_id_temp').agg( f.max("max_timecreated").alias("max_timecreated"), f.count("used_product_id").alias("count_used_product_id")) print df_tpe_used_product_history_step2.count() df_tpe_used_product_history_step2.show(20) print "EEEEEEEEEEEEEEEEEEEEEEEEE" dyf_tpe_used_product_history = DynamicFrame.fromDF( df_tpe_used_product_history_step2, glueContext, "dyf_tpe_used_product_history") dyf_part_one = Filter.apply( frame=dyf_tpe_used_product_history, f=lambda x: x["count_used_product_id"] > 1) # dyf_part_two = Filter.apply(frame=df_tpe_enduser_used_product_history, # f=lambda x: x["used_product_id"] > 1) df_part_one = dyf_part_one.toDF() df_part_one = df_part_one.join( df_tpe_enduser_used_product_history, (df_part_one.contact_id_temp == df_tpe_enduser_used_product_history.contact_id) & (df_part_one.max_timecreated == df_tpe_enduser_used_product_history.timecreated) & (df_part_one.used_product_id_temp == df_tpe_enduser_used_product_history.used_product_id)) dyf_part_one = 
DynamicFrame.fromDF(df_part_one, glueContext, "dyf_part_one") dyf_part_one = dyf_part_one.select_fields([ 'contact_id', 'used_product_id', 'status_old', 'status_new', 'status_description', 'timecreated' ]) dyf_join_part_one_product_details = Join.apply( dyf_part_one, dyf_tpe_invoice_product_details, 'used_product_id', 'id') dyf_join_part_one_product_details.printSchema() print "total 01: ", dyf_join_part_one_product_details.count() dyf_join_part_one_product_details.toDF().show(2) dyf_join_part_one_contact = Join.apply( dyf_join_part_one_product_details, dyf_student_contact, 'contact_id', 'contactid') dyf_join_part_one_contact = dyf_join_part_one_contact \ .select_fields(['contact_id', 'student_id', 'status_new', 'status_description', 'timecreated']) dyf_join_part_one_contact.printSchema() print "total 02: ", dyf_join_part_one_contact.count() dyf_join_part_one_contact.toDF().show(2) # df_join_part_one = dyf_join_part_one_contact.toDF() ###################################### ######## START cancelled dyf_join_cancelled_status = Filter.apply( frame=dyf_join_part_one_contact, f=lambda x: x["status_new"] == CANCELLED) print "dyf_join_cancelled_status ", dyf_join_cancelled_status.count( ) dyf_join_cancelled_status.toDF().show(2) df_join_cancelled_status = dyf_join_cancelled_status.toDF() df_join_cancelled_status = df_join_cancelled_status \ .withColumn("change_status_date_id", from_unixtime(df_join_cancelled_status.timecreated, 'yyyyMMdd').cast("long")) \ .withColumn("from_status_id", f.lit(None).cast("long")) \ .withColumn("to_status_id", f.lit(214).cast("long")) \ .withColumn("measure1", f.lit(None).cast("long")) \ .withColumn("measure2", f.lit(None).cast("long")) \ .withColumn("description", df_join_cancelled_status.status_description) \ .withColumn("timestamp1", f.lit(None).cast("long")) df_join_cancelled_status.show(3) dyf_join_cancelled_status = DynamicFrame.fromDF( df_join_cancelled_status, glueContext, "dyf_join_cancelled_status") dyf_join_cancelled_status = dyf_join_cancelled_status \ .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id', 'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1']) dyf_join_cancelled_status.printSchema() df_join_cancelled_status = dyf_join_cancelled_status.toDF() ####### END ######## START expired dyf_join_expired_status = Filter.apply( frame=dyf_join_part_one_contact, f=lambda x: x["status_new"] == EXPIRED) print "dyf_join_expired_status ", dyf_join_expired_status.count() dyf_join_expired_status.toDF().show(2) df_join_expired_status = dyf_join_expired_status.toDF() df_join_expired_status = df_join_expired_status \ .withColumn("change_status_date_id", from_unixtime(df_join_expired_status.timecreated, 'yyyyMMdd').cast("long")) \ .withColumn("from_status_id", f.lit(None).cast("long")) \ .withColumn("to_status_id", f.lit(215).cast("long")) \ .withColumn("measure1", f.lit(None).cast("long")) \ .withColumn("measure2", f.lit(None).cast("long")) \ .withColumn("description", df_join_expired_status.status_description) \ .withColumn("timestamp1", f.lit(None).cast("long")) df_join_expired_status.show(3) dyf_join_expired_status = DynamicFrame.fromDF( df_join_expired_status, glueContext, "dyf_join_expired_status") dyf_join_expired_status = dyf_join_expired_status \ .select_fields(['contact_id', 'student_id', 'change_status_date_id', 'from_status_id', 'to_status_id', 'measure1', 'measure2', 'description', 'timestamp1']) dyf_join_expired_status.printSchema() df_join_expired_status = dyf_join_expired_status.toDF() ####### END 
df_join_expired_status = df_join_expired_status.withColumn( "user_id", f.lit(None).cast("long")) dyf_join_status = DynamicFrame.fromDF(df_join_expired_status, glueContext, "dyf_join_status") applymapping1 = ApplyMapping.apply( frame=dyf_join_status, mappings=[("student_id", "string", "student_id", "long"), ("user_id", "long", "user_id", "long"), ("change_status_date_id", "long", "change_status_date_id", "long"), ("from_status_id", "long", "from_status_id", "long"), ("to_status_id", "long", "to_status_id", "long"), ("measure1", "long", "measure1", "double"), ("measure2", "long", "measure2", "double"), ("description", "string", "description", "string"), ("timestamp1", "long", "timestamp1", "long"), ("contact_id", "string", "contact_id", "string")]) resolvechoice1 = ResolveChoice.apply( frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice1") dropnullfields1 = DropNullFields.apply( frame=resolvechoice1, transformation_ctx="dropnullfields1") print resolvechoice1.count() resolvechoice1.printSchema() resolvechoice1.show(5) print('START WRITE TO REDSHIFT -------------------------') datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields1, catalog_connection="glue_redshift", connection_options={ "dbtable": "mapping_changed_status_student_temp", "database": "dts_odin" }, redshift_tmp_dir= "s3a://dtsodin/temp/mapping_changed_status_student_temp/", transformation_ctx="datasink1") print('START WRITE TO S3-------------------------') # datasink6 = glueContext.write_dynamic_frame.from_options(frame=dropnullfields1, connection_type="s3", # connection_options={ # "path": "s3://dtsodin/student_behavior/student_behavior/", # "partitionKeys": ["behavior_id"]}, # format="parquet", # transformation_ctx="datasink6") print('END WRITE TO S3-------------------------') df_temp = dyf_tpe_enduser_used_product_history.toDF() flag = df_temp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://dtsodin/flag/flag_trang_thai_tai_khoan_expired_lan_n.parquet", mode="overwrite") except Exception as e: print "Something was wrong ", e
# Some data points have not registered a correct dropoff latitude and longitude
# and show up as zero
df2 = df1[(df1['dropoff_longitude'] < 0)\
          & (df1['dropoff_latitude'] > 0)\
          & (df1['pickup_longitude'] != df1['dropoff_longitude'])\
          & (df1['pickup_latitude'] != df1['dropoff_latitude'])]

# Add a time duration for each taxi ride.
# For machine learning, the hour-of-day integer and the day-of-week integer will be
# important features for the algorithm to learn from one month's worth of data points.
time_duration = unix_timestamp("tpep_dropoff_datetime", format=time_format)\
                - unix_timestamp("tpep_pickup_datetime", format=time_format)

df3 = df2.withColumn("time_duration", time_duration)\
         .withColumn("hour", hour(df2.tpep_pickup_datetime))\
         .withColumn("dayOfWeek", from_unixtime(unix_timestamp\
             (df1.tpep_pickup_datetime, time_format), "uuuuu").cast("Integer"))

# A taxi will not drive more than 500 miles for a single ride.
# A taxi ride will take more than 10 seconds even if you go just 10 metres.
df4 = df3[(df3['trip_distance'] < 500)\
          & (df3['time_duration'] > 10)]

# Remove negative cost fields
df5 = df4[(df4['fare_amount'] > 0)\
          & (df4['extra'] >= 0)\
          & (df4['mta_tax'] >= 0)\
          & (df4['tip_amount'] >= 0)\
          & (df4['tolls_amount'] >= 0)\
          & (df4['improvement_surcharge'] >= 0)\
          & (df4['total_amount'] > 0)]
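# Spark 2.3+ also has a built-in dayofweek() that avoids re-parsing the timestamp string.
# Shown here only as an alternative sketch: its numbering (1 = Sunday ... 7 = Saturday)
# differs from the legacy 'u' pattern used above (1 = Monday ... 7 = Sunday), so the two
# are not drop-in replacements for each other.
from pyspark.sql.functions import dayofweek

df3_alt = df2.withColumn("time_duration", time_duration)\
             .withColumn("hour", hour(df2.tpep_pickup_datetime))\
             .withColumn("dayOfWeek", dayofweek(df2.tpep_pickup_datetime))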
def process_log_data(spark, input_data, output_data):
    """
    This function takes the log data from Udacity's S3 input file and processes it.
    This is done by extracting the user, time and songplay tables and then loading
    them back to the S3 bucket I've created in AWS.

    Parameters:
        spark       : Spark Session
        input_data  : The S3 bucket location of song_data, think 'input'
        output_data : The S3 bucket location of the song_data, think 'output'
    """
    # Using print statement to understand where in spark statement we are
    print("\n Taking in log data as variable from S3's input location....")

    # get full filepath to song data file
    # log_data = input_data + 'log_data/*/*/*.json'
    # utilizing exact folder of the data set to speed up execution in the Workspace
    # (please use the commented-out log_data variable above to run the full ETL with wildcards)
    log_data = input_data + 'log_data/2018/11/*.json'

    # Using print statement to understand where in spark statement we are
    print("\n Defining log Schema....")

    log_schema = Struct([SFld("artist", Str()), SFld("auth", Str()),
                         SFld("firstName", Str()), SFld("gender", Str()),
                         SFld("itemInSession", Lng()), SFld("lastName", Str()),
                         SFld("length", Dbl()), SFld("level", Str()),
                         SFld("location", Str()), SFld("method", Str()),
                         SFld("page", Str()), SFld("registration", Dbl()),
                         SFld("sessionId", Lng()), SFld("song", Str()),
                         SFld("status", Str()), SFld("ts", Str()),
                         SFld("userAgent", Str()), SFld("userId", Str())])

    # Using print statement to understand where in spark statement we are
    print("\n Reading log data JSON files from S3's input location....")

    # read log data file
    df = spark.read.json(log_data, schema=log_schema, mode='PERMISSIVE',
                         columnNameOfCorruptRecord='corruptRecord').drop_duplicates()

    # Using print statement to understand where in spark statement we are
    print("\n Filtering page by NextSong....")

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong').drop_duplicates()

    # Using print statement to understand where in spark statement we are
    print("\n Creating select statement for users data creation....")

    # extract columns for users table (isNotNull() instead of != None, which
    # would evaluate to NULL for every row and filter everything out)
    users_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') \
                    .where(df.userId.isNotNull()).drop_duplicates()

    # Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for users table....")

    # write users table to parquet files
    users_table.write.mode('overwrite').parquet(output_data + 'users_table/')

    # Using print statement to understand where in spark statement we are
    print("\n Creating timeStamp variable....")

    # create timestamp column from original timestamp column
    df = df.withColumn("timestamp", to_timestamp(from_unixtime(col("ts") / 1000)))

    # Using print statement to understand where in spark statement we are
    print("\n Creating select statement for time data creation....")

    # extract columns to create time table
    time_table = (
        df.select("timestamp")
          .withColumn("hour", hour("timestamp"))
          .withColumn("day", dayofmonth("timestamp"))
          .withColumn("week", weekofyear("timestamp"))
          .withColumn("weekday", dayofweek("timestamp"))
          .withColumn("weekdayName", date_format("timestamp", "E"))
          .withColumn("month", month("timestamp"))
          .withColumn("year", year("timestamp"))
          .drop_duplicates()
    )

    # Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for time table and partitioned by year and month....")

    # write time table to parquet files partitioned by year and month
    time_table.write.mode('overwrite').partitionBy('year', 'month').parquet(output_data + 'time_table/')
    # Using print statement to understand where in spark statement we are
    print("\n Reading song data JSON files from S3's input location....")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'songs_table/')

    # Using print statement to understand where in spark statement we are
    print("\n Creating select statement for song play data creation....")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn('songplayId', F.monotonically_increasing_id()) \
                        .join(song_df, song_df.title == df.song) \
                        .select('songplayId', col('timestamp').alias('start_time'), col('userId'),
                                'level', 'song_id', 'artist_id', col('sessionId'), 'location',
                                col('userAgent'))

    songplays_table = songplays_table.join(time_table, songplays_table.start_time == time_table.timestamp, how="inner") \
        .select("songplayId", songplays_table.start_time, "userId", "level", "song_id",
                "artist_id", "sessionId", "location", "userAgent", "month", "year").drop_duplicates()

    # Using print statement to understand where in spark statement we are
    print("\n Writing parquet file for songplays table partitioned by year and month....")

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.mode('overwrite').partitionBy("year", "month").parquet(output_data + 'songplays_table/')
def get_learnig_info(start_year_month_id, end_year_month_id, start_date, end_date): push_down_predicate_v = "((behavior_id == \"" + BEHAVIOR_ID_LS + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_SC + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_LT + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_VOXY + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_HW + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_NCSB + "\" " \ + " or behavior_id == \"" + BEHAVIOR_ID_NT + "\") " \ + " and year_month_id >= \"" + str(start_year_month_id) + "\" " \ + " and year_month_id <= \"" + str(end_year_month_id) + "\")" dyf_sb_student_behavior = connectGlue(database="olap_student_behavior", table_name="sb_student_behavior", select_fields=["student_behavior_id", "contact_id", "student_behavior_date"], push_down_predicate=push_down_predicate_v ) df_sb_student_behavior = dyf_sb_student_behavior.toDF() df_sb_student_behavior = df_sb_student_behavior.drop_duplicates(["student_behavior_id"]) df_sb_student_behavior = df_sb_student_behavior.select("student_behavior_id", "contact_id", f.from_unixtime("student_behavior_date", format="yyyyMMdd").cast("long").alias( "date_id")) dyf_sb_student_learning = connectGlue(database="olap_student_behavior", table_name="sb_student_learning", select_fields=["student_behavior_id", "behavior_id", "duration", "role_in_class"], push_down_predicate=push_down_predicate_v ).rename_field("student_behavior_id", "student_behavior_id_learning") dyf_sb_student_learning = dyf_sb_student_learning.resolveChoice(specs=[("behavior_id", "cast:int")]) dyf_sb_student_learning = Filter.apply(frame=dyf_sb_student_learning, f=lambda x: (x["behavior_id"] > 12 and x["duration"] > 59) or (x["behavior_id"] < 13 and x["duration"] >= 2100)) df_sb_student_learning = dyf_sb_student_learning.toDF() join = df_sb_student_behavior.join(df_sb_student_learning, df_sb_student_behavior["student_behavior_id"] == df_sb_student_learning[ "student_behavior_id_learning"]) join = join.drop("student_behavior_id", "student_behavior_id_learning") join = join.groupby("contact_id", "date_id", "behavior_id", "role_in_class").agg(f.count("duration").alias("total")) join = join.select( "contact_id", "date_id", f.struct("behavior_id", "total", "role_in_class").alias("type_role_and_total") ) df_group_by = join.groupBy("contact_id", "date_id") \ .agg(f.collect_list("type_role_and_total").alias("l_type_role_and_total")) join_total = df_group_by.select( "contact_id", "date_id", get_final_total("l_type_role_and_total").alias("list_total") ) df_latest = join_total.select( "contact_id", "date_id", f.col("list_total").getItem("total_ls").alias("total_ls"), f.col("list_total").getItem("total_sc").alias("total_sc"), f.col("list_total").getItem("total_voxy").alias("total_voxy"), f.col("list_total").getItem("total_hw").alias("total_hw"), f.col("list_total").getItem("total_nt").alias("total_nt"), f.col("list_total").getItem("total_ncsb").alias("total_ncsb"), f.col("list_total").getItem("total_audit").alias("total_audit"), f.col("list_total").getItem("total_lt").alias("total_lt") ) df_lo = get_lo(start_date, end_date) df_latest = df_latest.join(df_lo, (df_lo["contact_id_lo"] == df_latest["contact_id"]) & (df_lo["created_date_id"] == df_latest["date_id"]), "outer") df_latest = df_latest.fillna(0) df_latest = df_latest.select("total_lt", "total_voxy", "total_hw", "total_nt", "total_ncsb", "total_audit", "total_ls", "total_sc", "total_starter_ait", "total_starter_aip", "total_micro", check_value(df_latest.contact_id, 
df_latest.contact_id_lo).alias("contact_id"), check_date(df_latest.created_date_id, df_latest.date_id).alias("date_id")) return df_latest
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.functions import to_timestamp, collect_list
from pyspark.sql.types import IntegerType
from pyspark.sql.types import ArrayType
from pyspark.sql import Row
import operator
from pyspark.sql.window import Window

conf = SparkConf().setAppName("Ex2").setMaster("local")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

df = sc.textFile(
    '/host/HieldshiemMasters/Semester1/DistributedDataAnalytics/Exercises/Ex9_Solution/ml-10M100K/tags.dat'
).map(lambda x: x.split("::"))
df = df.toDF(['UserID', 'MovieID', 'Tag', 'Timestamp'])

df_Update = df.withColumn('time_datestring', func.from_unixtime('timestamp'))
df_Update = df_Update.withColumn(
    'time_date', to_timestamp(df_Update.time_datestring, 'yyyy-MM-dd HH:mm:ss'))
# print(df_Update)
# df_Update.show()

# ===== get all the time stamps for each user ========================
# test = df_Update.groupBy(['UserID'])
new = df_Update.groupBy(['UserID']).agg(collect_list("time_date"))
# test.show()

# ========== sort time stamps for each user ===========================
# func = udf(lambda x: sorted(x.tolist()))
def sorter(l):
def process_log_data(spark, input_data, output_data): ''' load and process log json files input data is the log directory output data is the output directory for star-schema tables (can be a S3 or HDFS bucket) input logs files should be stored in a tree hierarchy : <input_data>/<year>/<month> ''' # get filepath to log data file log_data = os.path.join(input_data, "log_data", "*", "*") # read log data file df = spark.read.json(log_data) print("EXTRACT USERS") # filter by actions for song plays df = df.filter("page == 'NextSong' ") # extract columns for users table users_table = df.select(col("userId").cast("long").alias("user_id"), col("firstName").alias("first_name"), col("lastName").alias("last_name"), "gender", "level" )\ .distinct()\ .orderBy("user_id") # write users table to parquet files out_users = os.path.join(output_data, "USERS") users_table.write.mode("overwrite").parquet(out_users) # create timestamp column from original timestamp column get_timestamp = udf(lambda x: int(x / 1000.), LongType()) df = df.withColumn("timestamp", get_timestamp("ts")) spark.udf.register("get_timestamp", get_timestamp) # create datetime column from original timestamp column #get_datetime = udf() df = df.withColumn("datetime", from_unixtime("timestamp")).withColumn( "hour", hour("datetime")).withColumn("day", dayofmonth("datetime")).withColumn( "week", weekofyear("datetime")).withColumn( "month", month("datetime")).withColumn("year", year("datetime")).withColumn( "weekday", dayofweek("datetime")) # extract columns to create time table time_table = df.select("ts", "hour", "day", "week", "month", "year", "weekday").distinct() # write time table to parquet files partitioned by year and month out_time = os.path.join(output_data, "TIMESTAMPS") time_table.write.partitionBy("year", "month").mode("overwrite").parquet(out_time) # read in song data to use for songplays table song_db = os.path.join(output_data, "SONGS") song_df = spark.read.parquet(song_db) df.createOrReplaceTempView("lg") song_df.createOrReplaceTempView("sg") # extract columns from joined song and log datasets to create songplays table songplays_table = spark.sql(""" SELECT lg.ts AS start_time, lg.year AS year, lg.month AS month, lg.userId AS user_id, lg.level, sg.song_id, sg.artist_id, lg.sessionId AS session_id, lg.location, lg.userAgent AS user_agent FROM lg JOIN sg ON sg.title = lg.song """) songplays_table = songplays_table.withColumn("songplay_id", monotonically_increasing_id()) rearrange_col = songplays_table.schema.names[:] rearrange_col.insert(0, "songplay_id") rearrange_col.pop() songplays_table = songplays_table.select(*rearrange_col) # write songplays table to parquet files partitioned by year and month out_songplay = os.path.join(output_data, "SONGPLAYS") songplays_table.write.partitionBy( "year", "month").mode("overwrite").parquet(out_songplay)
spark = SparkSession.builder.appName('nyansa').getOrCreate() #Read text file in a data frame df1 = spark.read.option("header", "false") \ .option("delimiter", ",") \ .option("inferSchema", "true") \ .csv(sys.argv[1]) #put data in appropriate columns split_col = F.split(df1['_c0'], "\\|") df1 = df1.withColumn('to_time_stamp', split_col.getItem(0)) df1 = df1.withColumn('url', split_col.getItem(1)) #convert to date df1 = df1.withColumn('date', F.from_unixtime('to_time_stamp','MM/dd/yyyy')).withColumn('date', F.to_date('date','MM/dd/yyyy')) #group by date and url, count urls df_grouped= df1.groupby('date','url').count() \ .orderBy(["date", "count"], ascending=[1, 0]) \ .withColumn('date', F.date_format('date','MM/dd/yyyy'))\ .withColumnRenamed('count','counts') l = df_grouped.collect() #print as required def print_result(list_data): prev_date = 0 for each_row_val in list_data: if prev_date != each_row_val.date: sys.stdout.write(each_row_val.date + " GMT"+ '\n')
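# Hedged note on the conversion above: the two-step form (from_unixtime to
# "MM/dd/yyyy", then to_date with the same pattern) can be collapsed into a
# single to_date(from_unixtime(...)) call. Minimal sketch on made-up epochs
# showing both forms agree; assumes a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("to-date-sketch").getOrCreate()

hits = spark.createDataFrame([(1470663000,), (1470663277,)], ["to_time_stamp"])

hits = (hits
        .withColumn("date_two_step",
                    F.to_date(F.from_unixtime("to_time_stamp", "MM/dd/yyyy"), "MM/dd/yyyy"))
        .withColumn("date_direct", F.to_date(F.from_unixtime("to_time_stamp"))))
hits.show()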
execfile('__pyfiles__/load.py') # execfile('src/load.py') from pyspark.sql.types import * from pyspark import SQLContext import json if __name__ == "__main__": _, df = load_data(sc, sample=None) # df.show() df = df.withColumn( 'created', func.from_unixtime(df['created_utc'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType())) df.registerTempTable("comments") daily_metrics = spark.sql(""" SELECT *, AVG(count_of_comments) OVER ( ORDER BY created RANGE BETWEEN 30 PRECEDING AND 30 FOLLOWING ) AS count_of_comments_60d_avg, AVG(count_of_users) OVER ( ORDER BY created
# StructField("review/profileName", StringType(), nullable=False), # StructField("review/score", FloatType(), nullable=False), # StructField("review/summary", StringType(), nullable=False), # StructField("review/text", StringType(), nullable=False), # StructField("review/time", LongType(), nullable=False), # StructField("review/userId", StringType(), nullable=False) # ]) # ``` df = spark.read.json("movies/movies.json") split_col = split(df['review/helpfulness'], '/') df = df.withColumn('helpfulness_agreed', split_col.getItem(0).cast("int")) df = df.withColumn('helpfulness_reviewed', split_col.getItem(1).cast("int")) df = df.withColumn('score', df['review/score'].cast("float")) df = df.withColumn('reviewed_at', from_unixtime(df['review/time'])) df2 = df.selectExpr("`product/productId` as product_id", "`review/profileName` as profile_name", "`review/summary` as summary", "`review/text` as text", "`review/userId` as user_id", "score", "helpfulness_agreed", "helpfulness_reviewed", "reviewed_at") df2.show() df2.write.saveAsTable('amazon_movie_reviews', format="parquet", mode='overwrite') ## Original schema of json for meta.json # ```
    LANDING_DB_NAME, LANDING_DB_TABLE, transformation_ctx="orders")
ordersDF = orders.toDF()
ordersDF1 = ordersDF.select("invoicedate", "stockcode", "quantity", "storelocation")
ordersDF2 = ordersDF1.withColumnRenamed(
    "stockcode", "item_id").withColumnRenamed("quantity", "demand").withColumnRenamed(
        "storelocation", "location").withColumnRenamed("invoicedate", "timestamp")
# note: month and 24-hour fields need the upper-case patterns "MM" and "HH";
# "mm"/"hh" would be parsed as minutes and a 12-hour clock respectively
ordersDF3 = ordersDF2.withColumn(
    'timestamp',
    F.from_unixtime(F.unix_timestamp('timestamp', 'dd/MM/yyyy HH:mm:ss'),
                    'yyyy-MM-dd HH:mm:ss'))
ordersDF4 = ordersDF3.repartition(1)
ordersDF4.write.csv("s3://" + PROCESSED_BUCKET + "/orders/raw")
productsDF1 = ordersDF.select("stockcode", "description", "unitprice")
productsDF2 = productsDF1.withColumnRenamed("stockcode", "item_id")
productsDF3 = productsDF2.repartition(1)
productsDF3.write.csv("s3://" + PROCESSED_BUCKET + "/products/raw")
client = boto3.client('s3')
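# Hedged sketch of the invoice-date normalisation above on a made-up value;
# the sample string and the "dd/MM/yyyy HH:mm:ss" source pattern are
# assumptions about the feed, not taken from the original job.
# Assumes a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("invoice-date-sketch").getOrCreate()

orders = spark.createDataFrame([("01/12/2010 08:26:00",)], ["timestamp"])

orders = orders.withColumn(
    "timestamp",
    F.from_unixtime(F.unix_timestamp("timestamp", "dd/MM/yyyy HH:mm:ss"),
                    "yyyy-MM-dd HH:mm:ss"))
orders.show(truncate=False)   # expected: 2010-12-01 08:26:00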
spark = SparkSession.builder.getOrCreate() tv_sessions = spark.read.parquet(data_catalog['foo ']) tv_sessions = ( tv_sessions.filter((tv_sessions.year == 2018) & (tv_sessions.month >= 1) & (tv_sessions.mediaType == 'series-videos') & (tv_sessions.reach60Srd == 1)).filter( tv_sessions.source.isin('box', 'corder')).select( 'customerNumber', 'sessionRecordStartTime', 'programGenreLevel', 'programSeriesName', 'programSeasonEpisode').withColumn( "date", fun.from_unixtime( fun.unix_timestamp( tv_sessions.sessionRecordStartTime), "yyyy-MM-dd"))) # tv_sessions.sessionRecordStartTime is a _timestamp_ type. import pyspark.sql.functions as sparkfun def substring_f(startpos, lengte): return sparkfun.udf( lambda kolom: kolom[startpos - 1:startpos - 1 + lengte]) tv_sessions = (tv_sessions.withColumn( "kijkmaand", substring_f(startpos=6, lengte=2)(tv_sessions.date)).withColumn(
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session mdl_tpe_enduser_used_product_history = glueContext.create_dynamic_frame.from_catalog( database="tig_market", table_name="tpe_enduser_used_product_history") mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.select_fields( [ '_key', 'id', 'used_product_id', 'contact_id', 'status_new', 'status_old', 'timecreated' ]) mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice( specs=[('_key', 'cast:long')]) df_flag = spark.read.parquet("s3a://dts-odin/flag/flag_LS_S0.parquet") max_key = df_flag.collect()[0]['flag'] mdl_tpe_enduser_used_product_history = Filter.apply( frame=mdl_tpe_enduser_used_product_history, f=lambda x: x["_key"] > max_key) if (mdl_tpe_enduser_used_product_history.count() > 0): mdl_tpe_enduser_used_product_history = Filter.apply( frame=mdl_tpe_enduser_used_product_history, f=lambda x: x["contact_id"] is not None and x[ "used_product_id"] is not None and x[ "status_old"] is None and x["status_new"] == 'DEACTIVED') mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.resolveChoice( specs=[('timecreated', 'cast:long')]) df_mdl_tpe_enduser_used_product_history = mdl_tpe_enduser_used_product_history.toDF( ) # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.groupby('contact_id', 'used_product_id') df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn( 'ngay_kich_hoat', from_unixtime( df_mdl_tpe_enduser_used_product_history['timecreated'], "yyyyMMdd")) df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.withColumn( 'timestemp', df_mdl_tpe_enduser_used_product_history['timecreated'] * f.lit(1000)) # df_mdl_tpe_enduser_used_product_history = df_mdl_tpe_enduser_used_product_history.select('used_product_id', # 'contact_id', # 'ngay_kich_hoat', # 'id').withColumnRenamed( # 'used_product_id', 'id_product_buy') data_mdl_tpe_enduser_used_product_history = DynamicFrame.fromDF( df_mdl_tpe_enduser_used_product_history, glueContext, "datasource") data_mdl_tpe_enduser_used_product_history = data_mdl_tpe_enduser_used_product_history.resolveChoice( specs=[('timestemp', 'cast:long')]) applymapping1 = ApplyMapping.apply( frame=data_mdl_tpe_enduser_used_product_history, mappings=[("used_product_id", "string", "used_product_id", "string"), ("contact_id", "string", "contact_id", "string"), ("ngay_kich_hoat", "string", "ngay_kich_hoat", "int"), ("id", "string", "id", "string"), ("timestemp", "long", "timestamp", "timestamp")]) resolvechoice2 = ResolveChoice.apply( frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields3 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields3") datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields3, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_ls_trang_thai_s0_1", "database": "dts_odin", "postactions": """ INSERT INTO mapping_changed_status_student (user_id, change_status_date_id, to_status_id, timestamp1, measure1) SELECT um.user_id, tltta.ngay_kich_hoat, 101, tltta.timestamp, 1 FROM temp_ls_trang_thai_s0_1 tltta INNER JOIN user_map um on um.source_type = 1 and um.source_id = tltta.contact_id; DROP TABLE IF EXISTS temp_ls_trang_thai_s0_1;""" }, redshift_tmp_dir="s3n://datashine-dwh/temp1/", transformation_ctx="datasink4") # ghi flag # lay max key trong data source 
datasourceTmp = mdl_tpe_enduser_used_product_history.toDF() flag = datasourceTmp.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') # ghi de _key vao s3 df.write.parquet( "s3a://datashine-dev-redshift-backup/flag/flag_LS_S0.parquet", mode="overwrite")
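# Hedged sketch of the "_key high-watermark" checkpoint pattern used above:
# read the previously stored flag, keep only newer rows, then persist the new
# maximum. The path and sample keys are illustrative placeholders, not the
# original S3 locations. Assumes a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.master("local[*]").appName("flag-sketch").getOrCreate()
flag_path = "/tmp/flag_sketch.parquet"   # placeholder path

source = spark.createDataFrame([(101,), (102,), (103,)], ["_key"])

try:
    max_key = spark.read.parquet(flag_path).collect()[0]["flag"]
except Exception:
    max_key = 0   # first run: no flag written yet

new_rows = source.where(f.col("_key") > max_key)

new_max = new_rows.agg({"_key": "max"}).collect()[0][0]
if new_max is not None:
    spark.createDataFrame([new_max], "long").toDF("flag") \
        .write.parquet(flag_path, mode="overwrite")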
def process_log_data(spark, input_data, output_data):
    '''
    Process the log data from the file(s) specified in the parameters.

    Args:
        spark: the spark session
        input_data: path prefix of the input log JSON files
        output_data: path prefix the parquet tables are written to
    Returns:
        modeled data from logs and songs json files that are written to parquet files back on S3
    '''
    # get filepath to log data file
    log_data = input_data + "log_data/*/*"

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender').alias('gender'),
        col('level').alias('level')).distinct()

    # write users table to parquet files
    users_table.write.parquet(output_data + "users.parquet", mode="overwrite")

    # create timestamp column from original timestamp column (ts is epoch milliseconds)
    df = df.withColumn(
        'timestamp',
        f.to_timestamp(
            f.from_unixtime((col('ts') / 1000),
                            'yyyy-MM-dd HH:mm:ss.SSS')).cast("timestamp"))

    # create datetime (calendar date) column from the derived timestamp column
    df = df.withColumn('ts_datetime', f.to_date(col('timestamp')))

    # extract columns to create time table
    time_table = df.withColumn("hour", hour(col("timestamp"))) \
        .withColumn("day", dayofmonth(col("timestamp"))) \
        .withColumn("week", weekofyear(col("timestamp"))) \
        .withColumn("month", month(col("timestamp"))) \
        .withColumn("year", year(col("timestamp"))) \
        .withColumn("weekday", f.dayofweek(col("timestamp"))) \
        .select(
            col("timestamp").alias("start_time"),
            col("hour"), col("day"), col("week"),
            col("month"), col("year"), col("weekday")
        )

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy("year", "month") \
        .parquet(output_data + "time.parquet", mode="overwrite")

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + "songs.parquet")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.withColumn(
        'songplay_id', f.monotonically_increasing_id()).join(
            song_df, song_df.title == df.song).select(
                'songplay_id',
                col('timestamp').alias('start_time'),
                col('userId').alias('user_id'),
                'level', 'song_id', 'artist_id',
                col('sessionId').alias('session_id'),
                'location',
                col('userAgent').alias('user_agent'))

    # write songplays table to parquet files
    songplays_table.write.parquet(output_data + "songplays.parquet", mode="overwrite")
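# Minimal hedged sketch of the time-table derivation above on a single made-up
# epoch-millisecond value; dayofweek() returns 1 (Sunday) .. 7 (Saturday) and
# date_format(..., "E") gives the day name as a readable cross-check.
# Assumes a local SparkSession; the sample ts value is illustrative.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.master("local[*]").appName("weekday-sketch").getOrCreate()

events = spark.createDataFrame([(1542300000000,)], ["ts"])   # epoch milliseconds

events = (events
          .withColumn("timestamp", f.to_timestamp(f.from_unixtime(f.col("ts") / 1000)))
          .withColumn("weekday", f.dayofweek("timestamp"))
          .withColumn("weekday_name", f.date_format("timestamp", "E")))
events.show(truncate=False)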
.format("csv") \ .load("stream") ############### Writing the raw stream to memory ################# df.writeStream \ .queryName("Row stream") \ .format("parquet") \ .option("path", os.path.join(os.getcwd(), 'sink', 'sink_stream_raw')) \ .option("checkpointLocation", os.path.join(os.getcwd(), 'checkpoint', 'checkpoint_stream_raw')) \ .start() df = df.drop(df["id_node"]) # Add latitude and longitude to the dataframe and cast the timestamp into the TimestampType df_modified = df.withColumn("timestamp_modified", F.from_unixtime(df["timestamp"] /1000, format='yyyy-MM-dd HH:mm:ss').cast(TimestampType()))\ .withColumn("location", query_udf(df["latitude"],df["longitude"])) df_modified = df_modified.withColumn( "timestamp", F.regexp_extract(df["timestamp"], ".{3}$", 0)) df_modified = df_modified.withColumn( "timestamp_millisecond", F.concat(df_modified["timestamp_modified"], F.lit('.'), df_modified["timestamp"]).cast(TimestampType())) # Splitting the column into different columns using Spark's split function split_col = F.split(df_modified["location"], ',') df_modified = df_modified.withColumn("name", split_col.getItem(0))\ .withColumn("highway", split_col.getItem(1))\ .withColumn("lanes", split_col.getItem(2))\
'timestamp': 1470663000, 'url': 'http://example.com/', 'ip': '192.168.1.1' }, { 'timestamp': 1470663163, 'url': 'http://example.com/', 'ip': '192.168.1.1' }, { 'timestamp': 1470663277, 'url': 'http://example.com/article1', 'ip': '192.168.1.2' }, { 'timestamp': 1470663277, 'url': 'http://example.com/article2', 'ip': '192.168.1.2' }, { 'timestamp': 1470663277, 'url': 'http://example.com/article3', 'ip': '192.168.1.2' }, ]) logs = logs.map(lambda l: pyspark.sql.Row(**l)) logs = (sqlContext.createDataFrame(logs).withColumn( 'timestamp', F.to_date(F.from_unixtime('timestamp'))).withColumn( 'minute', F.date_format('timestamp', "yyyy-MM-dd'T'HH"))) (logs.groupBy(['minute', 'url']).count().show())
def process_log_data(spark, input_data, output_data): # get filepath to log data file log_data = '{0}/log_data/*/*/*.json'.format(input_data) # read log data file df = spark.read.json(log_data) df.createOrReplaceTempView('logs') # filter by actions for song plays df = spark.sql(''' select * from logs where page = 'NextSong' ''') # extract columns for users table users_table = spark.sql(''' select cast(e.userid as int) as user_id, e.firstname, e.lastname, e.gender, e.level from logs e join ( select max(ts) as ts, userid from logs where page = 'NextSong' group by userid ) last_event on last_event.userid = e.userid and last_event.ts = e.ts ''') # write users table to parquet files output_users_path = '{0}/users/'.format(output_data) users_table.write.parquet(output_users_path, mode='overwrite') # create timestamp column from original timestamp column df = df.withColumn('start_time', F.from_unixtime(F.col('ts') / 1000)) # create datetime column from original timestamp column time_table = df.select('ts', 'start_time') \ .withColumn('year', F.year('start_time')) \ .withColumn('month', F.month('start_time')) \ .withColumn('week', F.weekofyear('start_time')) \ .withColumn('weekday', F.dayofweek('start_time')) \ .withColumn('day', F.dayofyear('start_time')) \ .withColumn('hour', F.hour('start_time')).dropDuplicates() # write time table to parquet files partitioned by year and month output_times_path = '{0}/times/'.format(output_data) time_table.write.parquet(output_times_path, mode='overwrite', partitionBy=['year', 'month']) # read in song data to use for songplays table song_data = '{0}/song_data/*/*/*/*.json'.format(input_data) song_df = spark.read.json(song_data) song_df.createOrReplaceTempView('songs') # extract columns from joined song and log datasets to create songplays table time_table.createOrReplaceTempView('times') songplays_table = spark.sql(''' select distinct t.start_time, cast(e.userid as int) as user_id, e.level, s.song_id, s.artist_id, cast(e.sessionid as int) as session_id, e.location as location_id, e.useragent as user_agent, t.year, t.month from logs e join songs s on e.song = s.title and e.artist = s.artist_name join times t on t.ts = e.ts where e.page = 'NextSong' ''') # write songplays table to parquet files partitioned by year and month output_songplays_path = '{0}/songplays/'.format(output_data) songplays_table.write.parquet(output_songplays_path, mode='overwrite', partitionBy=['year', 'month'])
now = int(time.time()) data = [] # Building a df with a sequence of chronological timestamps for i in range(0, 1000): data = data + [(i, now)] now = now + (random.randint(1, 3) + 1) df = spark.createDataFrame(data, schema) df.show() df.printSchema() # Turning the timestamps to Timestamp datatype # timestamp, format='yyyy-MM-dd HH:mm:ss') df = df.withColumn('date', F.from_unixtime(df.original_ts).cast('timestamp')) df.show(truncate=False) df.printSchema() # Turning back the timestamps to epoch df = df.withColumn('epoch', F.unix_timestamp(df.date)) df.show(truncate=False) df.printSchema() # Collecting the result and printing out timeRows = [row for row in df.collect()] for row in timeRows: print("{} : {} ({})".format(row[0], row[1], row[2])) spark.stop()
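# Hedged note on the round trip above: from_unixtime renders the epoch in the
# session time zone and unix_timestamp parses it back in the same zone, so the
# original epoch survives regardless of the configured zone. Minimal sketch
# assuming a local SparkSession; the zone and epoch values are illustrative.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("roundtrip-sketch").getOrCreate()
spark.conf.set("spark.sql.session.timeZone", "UTC")

df = spark.createDataFrame([(1600000000,)], ["original_ts"])
df = (df.withColumn("date", F.from_unixtime("original_ts").cast("timestamp"))
        .withColumn("epoch", F.unix_timestamp("date")))
df.show(truncate=False)   # epoch matches original_ts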
def main(): sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session spark.conf.set("spark.sql.session.timeZone", "GMT+07:00") # get dynamic frame source dyf_3cx_advisor_call = glueContext.create_dynamic_frame.from_catalog( database='callcenter', table_name='advisorcall') dyf_3cx_advisor_call = dyf_3cx_advisor_call.resolveChoice( specs=[('_key', 'cast:long')]) # print schema and select fields print('original schema') dyf_3cx_advisor_call.printSchema() try: df_flag = spark.read.parquet( "s3a://dtsodin/flag/student_status/temp_ls_rating_3cx_v1.parquet") read_from_index = df_flag.collect()[0]['flag'] print('read from index: ', read_from_index) dyf_3cx_advisor_call = Filter.apply( frame=dyf_3cx_advisor_call, f=lambda x: x["_key"] > read_from_index) except: print('read flag file error ') print('the number of new contacts: ', dyf_3cx_advisor_call.count()) dyf_3cx_advisor_call = dyf_3cx_advisor_call.select_fields([ '_key', 'device', 'hanguptvts', 'status', 'phonenumber', 'rating', 'calldate' ]) # .rename_field('statuss', 'status') dy_source_3cx_cache = dyf_3cx_advisor_call.toDF() dy_source_3cx_cache = dy_source_3cx_cache.dropDuplicates(['_key']) dy_source_3cx_cache = dy_source_3cx_cache.cache() dyf_3cx_advisor_call = DynamicFrame.fromDF(dy_source_3cx_cache, glueContext, 'dyf_3cx_advisor_call') if (dyf_3cx_advisor_call.count() > 0): dyf_3cx_advisor_call = Filter.apply( frame=dyf_3cx_advisor_call, f=lambda x: x["device"] == '3CX' and x["status"] == 'ANSWER' and x[ "hanguptvts"] == 1 and x["phonenumber"] is not None and x[ "phonenumber"] != '' and x["calldate"] is not None and x[ "calldate"] != '' and x["rating"] is not None and x[ "rating"] > 0 and x["rating"] < 6) print('dyf_3cx_advisor_call::corrcect') print('dyf_3cx_advisor_call number', dyf_3cx_advisor_call.count()) if (dyf_3cx_advisor_call.count() > 0): dyf_3cx_advisor_call = dyf_3cx_advisor_call.resolveChoice( specs=[('phonenumber', 'cast:string')]) dyf_3cx_advisor_call.printSchema() #convert data df_advisor_call = dyf_3cx_advisor_call.toDF() df_advisor_call = df_advisor_call.withColumn( 'id_time', from_unixtime( unix_timestamp(df_advisor_call.calldate, "yyyy-MM-dd HH:mm:ss"), "yyyyMMdd")) df_advisor_call = df_advisor_call.groupby( 'phonenumber', 'id_time', 'rating').agg(f.count('_key').alias("so_lan")) df_advisor_call = df_advisor_call.withColumn( 'phonenumber_correct', f.concat(f.lit('0'), df_advisor_call.phonenumber)) df_advisor_call = df_advisor_call.withColumn( 'rating_status', f.lit(60) + df_advisor_call.rating) dyf_3cx_advisor_call_rating_number = DynamicFrame.fromDF( df_advisor_call, glueContext, 'dyf_3cx_advisor_call_rating_number') dyf_3cx_advisor_call_rating_number = dyf_3cx_advisor_call_rating_number.resolveChoice( specs=[('so_lan', 'cast:int')]) print('dyf_3cx_advisor_call::after::group::schema') dyf_3cx_advisor_call_rating_number.printSchema() dyf_3cx_advisor_call_rating_number.show(10) print('dyf_3cx_advisor_call after::group: ', dyf_3cx_advisor_call_rating_number.count()) dyf_ad_contact_phone = glueContext.create_dynamic_frame.from_catalog( database='tig_advisor', table_name='student_contact_phone') dyf_ad_contact_phone = dyf_ad_contact_phone.select_fields( ['phone', 'contact_id']) dyf_ad_contact_phone = Filter.apply( frame=dyf_ad_contact_phone, f=lambda x: x["phone"] is not None and x["phone"] != '' and x[ "contact_id"] is not None and x["contact_id"] != '') print('dyf_ad_contact_phone::schema') dyf_ad_contact_phone.printSchema() 
#-----------------------------------------------------------------------------------------------------------# join = Join.apply(dyf_3cx_advisor_call_rating_number, dyf_ad_contact_phone, 'phonenumber_correct', 'phone') print('join::schema------------') join.printSchema() join.show(2) print('join: ', join.count()) # # chon field applymapping1 = ApplyMapping.apply( frame=join, mappings=[("contact_id", "string", "contact_id", "string"), ("id_time", "string", "id_time", "bigint"), ("phone", "string", "phone", "string"), ("rating_status", "int", "rating_status", "int"), ("rating", "int", "rating", "int"), ("so_lan", "int", "so_lan", "int")]) resolvechoice2 = ResolveChoice.apply( frame=applymapping1, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields3 = DropNullFields.apply( frame=resolvechoice2, transformation_ctx="dropnullfields3") print('dropnullfields3::printSchema') dropnullfields3.printSchema() dropnullfields3.show(2) # ghi data vao redshift datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields3, catalog_connection="glue_redshift", connection_options={ "dbtable": "temp_ls_rating_3cx_v1", "database": "dts_odin", "postactions": """ INSERT into mapping_changed_status_student(contact_id, change_status_date_id, user_id,to_status_id, measure1) SELECT t3cx.contact_id, t3cx.id_time, um.user_id, t3cx.rating_status, t3cx.so_lan FROM temp_ls_rating_3cx_v1 t3cx LEFT JOIN user_map um ON um.source_type = 1 AND um.source_id = t3cx.contact_id WHERE len(t3cx.contact_id) < 33 ; DROP TABLE IF EXISTS public.temp_ls_rating_3cx_v1 """ }, redshift_tmp_dir="s3n://dts-odin/temp/temp_ls_rating_3cx_v1", transformation_ctx="datasink4") df_datasource = dyf_3cx_advisor_call.toDF() flag = df_datasource.agg({"_key": "max"}).collect()[0][0] flag_data = [flag] df = spark.createDataFrame(flag_data, "long").toDF('flag') df.write.parquet( "s3a://dtsodin/flag/student_status/temp_ls_rating_3cx_v1.parquet", mode="overwrite") dy_source_3cx_cache.unpersist()
def main(): glueContext = GlueContext(SparkContext.getOrCreate()) spark = glueContext.spark_session # ----------------------------------------------DYF-----------------------------------------------------------------# dyf_student_contact = glueContext.create_dynamic_frame.from_catalog( database="tig_advisor", table_name="student_contact") dyf_student_contact = dyf_student_contact.select_fields( ['contact_id', 'student_id']) #-----------------------------------------DYF-----------------------------------------------------------------------# dyf_ghinhan_hp = glueContext.create_dynamic_frame.from_catalog( database="poss", table_name="ghinhan_hp") dyf_ghinhan_hp = dyf_ghinhan_hp.select_fields( ['ngay_thanhtoan', 'khoa_hoc_makh', 'trang_thai']).rename_field('trang_thai', 'trang_thai_gnhp') dyf_ghinhan_hp = Filter.apply(frame=dyf_ghinhan_hp, f=lambda x: x["trang_thai_gnhp"] == True) # -----------------------------------------DYF-----------------------------------------------------------------------# dyf_khoa_hoc = glueContext.create_dynamic_frame.from_catalog( database="poss", table_name="khoa_hoc") dyf_khoa_hoc = dyf_khoa_hoc.select_fields(['makh', 'mahv', 'trang_thai']).rename_field( 'trang_thai', 'trang_thai_kh') dyf_khoa_hoc = Filter.apply(frame=dyf_khoa_hoc, f=lambda x: x["trang_thai_kh"] == True) # -----------------------------------------DYF-----------------------------------------------------------------------# dyf_hoc_vien = glueContext.create_dynamic_frame.from_catalog( database="poss", table_name="hoc_vien") dyf_hoc_vien = dyf_hoc_vien.select_fields([ 'mahv', 'crm_id', 'trang_thai' ]).rename_field('mahv', 'mahv_hv').rename_field('trang_thai', 'trang_thai_hv') dyf_hoc_vien = Filter.apply(frame=dyf_hoc_vien, f=lambda x: x["trang_thai_hv"] == True) #-------------------------------------------------------------------------------------------------------------------# df_student_contact_1 = dyf_student_contact.toDF() df_student_contact_1.drop_duplicates() df_student_contact = df_student_contact_1.groupby( 'contact_id', 'student_id').agg( f.count('contact_id').alias("contact_id_after_count")) dyf_student_contact = DynamicFrame.fromDF(df_student_contact, glueContext, "dyf_student_contact") dyf_student_contact = Filter.apply( frame=dyf_student_contact, f=lambda x: x["contact_id_after_count"] > 1) df_student_contact = dyf_student_contact.toDF() df_student_contact.drop_duplicates() df_student_contact.cache() df_student_contact.printSchema() df_student_contact.show(2) print('df_student_contact count::', df_student_contact.count()) df_ghinhan_hp = dyf_ghinhan_hp.toDF() df_khoa_hoc = dyf_khoa_hoc.toDF() df_hoc_vien = dyf_hoc_vien.toDF() #------------------------------------------___JOIN___---------------------------------------------------------------# df_join = df_ghinhan_hp.join( df_khoa_hoc, df_ghinhan_hp.khoa_hoc_makh == df_khoa_hoc.makh) df_join.printSchema() print('df_join count::', df_join.count()) df_join1 = df_join.join(df_hoc_vien, df_join.mahv == df_hoc_vien.mahv_hv) df_join1.printSchema() print('df_join1 count::', df_join1.count()) df_join2 = df_join1.join(df_student_contact, df_join1.crm_id == df_student_contact.contact_id) df_join2 = df_join2.withColumn( 'change_status_date_id', from_unixtime(unix_timestamp(df_join2.ngay_thanhtoan, "yyyy-MM-dd"), "yyyyMMdd")) df_join2.drop_duplicates() df_join2.printSchema() df_join2.show(2) print('df_join2 count::', df_join2.count()) # df_join2.printSchema() # print('df_join2 count::', df_join2.count()) 
#-----------------------------------_____choose_name_field______----------------------------------------------------# to_status_id = 201L df_result = df_join2.select('student_id', 'change_status_date_id', f.lit(to_status_id).alias('to_status_id'), 'contact_id') df_result.printSchema() df_result.show(3) df_result = df_result.drop_duplicates() df_result.cache() print('count df_result::', df_result.count()) dyf_result = DynamicFrame.fromDF(df_result, glueContext, "dyf_result") dyf_result = Filter.apply( frame=dyf_result, f=lambda x: x["student_id"] is not None and x[ "change_status_date_id"] is not None and x[ "to_status_id"] is not None and x["contact_id"] is not None) apply_output = ApplyMapping.apply( frame=dyf_result, mappings=[ ("student_id", "string", "student_id", "long"), # ("user_id", "long", "user_id", "long"), ("change_status_date_id", "string", "change_status_date_id", "long" ), # ("from_status_id", "long", "from_status_id", "long"), ("to_status_id", "long", "to_status_id", "long"), # ("measure1", "double", "measure1", "double"), # ("measure2", "double", "measure2", "double"), # ("description", "string", "description", "string"), # ("timestamp1", "string", "timestamp1", "string"), ("contact_id", "string", "contact_id", "string"), # ("teacher_id", "long", "teacher_id", "long"), # ("contact_id1", "string", "contact_id1", "string"), # ("measure1_int", "int", "measure1_int", "int"), # ("measure2_int", "int", "measure2_int", "int"), # ("contact_id_str", "string", "contact_id_str", "string"), # ("lc", "string", "lc", "string"), # ("student_id_string", "string", "student_id_string", "string") ]) df_apply_output = apply_output.toDF() df_apply_output.drop_duplicates() print('df_apply_output.count', df_apply_output.count()) dyf_apply_output = DynamicFrame.fromDF(df_apply_output, glueContext, "dyf_apply_output") resolve_choice = ResolveChoice.apply(frame=dyf_apply_output, choice="make_cols", transformation_ctx="resolvechoice2") dropnullfields = DropNullFields.apply(frame=resolve_choice, transformation_ctx="dropnullfields") datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf( frame=dropnullfields, catalog_connection="glue_redshift", connection_options={ "dbtable": "mapping_changed_status_student_v1", "database": "dts_odin" }, redshift_tmp_dir="s3n://datashine-dwh/temp1/", transformation_ctx="datasink4") df_result.unpersist() df_student_contact.unpersist() print( '------------------------>___complete__________------------------------------>' )
lambda x, y: pygeohash.encode(float(x), float(y), precision=6), StringType()) spark = SparkSession.builder.appName('supply-to-demand-app').getOrCreate() #'hdfs:///grab_data/user_values_staging/' driver_messages_hdfs_path = sys.argv[1] user_messages_hdfs_path = sys.argv[2] driver_msgs_df = spark.read.parquet(driver_messages_hdfs_path).withColumn( "geo_hash", calcualte_the_geohash_udf("lat", "long")).select( "geo_hash", F.col("timestamp").cast('timestamp').alias('time')).select( "geo_hash", F.from_unixtime(F.unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd HH:mm').alias('date_time')).groupBy( "geo_hash", "date_time").agg( F.count("*").alias("supply_count")) user_msgs_df = spark.read.parquet(user_messages_hdfs_path).withColumn( "geo_hash", calcualte_the_geohash_udf("lat", "long")).select( "geo_hash", F.col("timestamp").cast('timestamp').alias('time')).select( "geo_hash", F.from_unixtime(F.unix_timestamp('time', 'yyyy-MM-dd HH:mm:ss'), 'yyyy-MM-dd HH:mm').alias('date_time')).groupBy( "geo_hash", "date_time").agg( F.count("*").alias("demand_count")) driver_msgs_df.write.format('parquet').mode('append').save( driver_messages_for_batch_processing_hdfs_path) user_msgs_df.write.format('parquet').mode('append').save(
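# Hedged sketch of the minute-level bucketing used above for the supply/demand
# counts, on made-up pings; the geo_hash values and timestamps are
# illustrative, and date_trunc("minute", ...) on a timestamp column would be
# an equivalent built-in alternative. Assumes a local SparkSession.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.master("local[*]").appName("minute-bucket-sketch").getOrCreate()

pings = spark.createDataFrame(
    [("abc123", "2019-05-01 10:15:07"),
     ("abc123", "2019-05-01 10:15:59"),
     ("abc124", "2019-05-01 10:16:02")],
    ["geo_hash", "time"])

supply = (pings
          .withColumn("date_time",
                      F.from_unixtime(F.unix_timestamp("time", "yyyy-MM-dd HH:mm:ss"),
                                      "yyyy-MM-dd HH:mm"))
          .groupBy("geo_hash", "date_time")
          .agg(F.count("*").alias("supply_count")))
supply.show(truncate=False)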
def rdd_to_recordstore(rdd_transform_context_rdd): if rdd_transform_context_rdd.isEmpty(): MonMetricsKafkaProcessor.log_debug( "rdd_to_recordstore: nothing to process...") else: sql_context = SQLContext(rdd_transform_context_rdd.context) data_driven_specs_repo = DataDrivenSpecsRepoFactory.\ get_data_driven_specs_repo() pre_transform_specs_df = data_driven_specs_repo.\ get_data_driven_specs( sql_context=sql_context, data_driven_spec_type=DataDrivenSpecsRepo. pre_transform_specs_type) # # extract second column containing raw metric data # raw_mon_metrics = rdd_transform_context_rdd.map( lambda nt: nt.rdd_info[1]) # # convert raw metric data rdd to dataframe rdd # raw_mon_metrics_df = \ MonMetricUtils.create_mon_metrics_df_from_json_rdd( sql_context, raw_mon_metrics) # # filter out unwanted metrics and keep metrics we are interested in # cond = [ raw_mon_metrics_df.metric.name == pre_transform_specs_df.event_type] filtered_metrics_df = raw_mon_metrics_df.join( pre_transform_specs_df, cond) # # validate filtered metrics to check if required fields # are present and not empty # In order to be able to apply filter function had to convert # data frame rdd to normal rdd. After validation the rdd is # converted back to dataframe rdd # # FIXME: find a way to apply filter function on dataframe rdd data validated_mon_metrics_rdd = filtered_metrics_df.rdd.filter( MonMetricsKafkaProcessor._validate_raw_mon_metrics) validated_mon_metrics_df = sql_context.createDataFrame( validated_mon_metrics_rdd, filtered_metrics_df.schema) # # record generator # generate a new intermediate metric record if a given metric # metric_id_list, in pre_transform_specs table has several # intermediate metrics defined. # intermediate metrics are used as a convenient way to # process (aggregated) metric in mutiple ways by making a copy # of the source data for each processing # gen_mon_metrics_df = validated_mon_metrics_df.select( validated_mon_metrics_df.meta, validated_mon_metrics_df.metric, validated_mon_metrics_df.event_processing_params, validated_mon_metrics_df.event_type, explode(validated_mon_metrics_df.metric_id_list).alias( "this_metric_id"), validated_mon_metrics_df.service_id) # # transform metrics data to record_store format # record store format is the common format which will serve as # source to aggregation processing. # converting the metric to common standard format helps in writing # generic aggregation routines driven by configuration parameters # and can be reused # record_store_df = gen_mon_metrics_df.select( (gen_mon_metrics_df.metric.timestamp / 1000).alias( "event_timestamp_unix"), from_unixtime( gen_mon_metrics_df.metric.timestamp / 1000).alias( "event_timestamp_string"), gen_mon_metrics_df.event_type.alias("event_type"), gen_mon_metrics_df.event_type.alias("event_quantity_name"), (gen_mon_metrics_df.metric.value / 1.0).alias( "event_quantity"), when(gen_mon_metrics_df.metric.dimensions.state != '', gen_mon_metrics_df.metric.dimensions.state).otherwise( 'NA').alias("event_status"), lit('1.0').alias('event_version'), lit('metrics').alias("record_type"), # resource_uuid when(gen_mon_metrics_df.metric.dimensions.instanceId != '', gen_mon_metrics_df.metric.dimensions.instanceId).when( gen_mon_metrics_df.metric.dimensions.resource_id != '', gen_mon_metrics_df.metric.dimensions.resource_id). 
otherwise('NA').alias("resource_uuid"), when(gen_mon_metrics_df.metric.dimensions.tenantId != '', gen_mon_metrics_df.metric.dimensions.tenantId).when( gen_mon_metrics_df.metric.dimensions.tenant_id != '', gen_mon_metrics_df.metric.dimensions.tenant_id).when( gen_mon_metrics_df.metric.dimensions.project_id != '', gen_mon_metrics_df.metric.dimensions.project_id).otherwise( 'NA').alias("tenant_id"), when(gen_mon_metrics_df.metric.dimensions.mount != '', gen_mon_metrics_df.metric.dimensions.mount).otherwise( 'NA').alias("mount"), when(gen_mon_metrics_df.metric.dimensions.device != '', gen_mon_metrics_df.metric.dimensions.device).otherwise( 'NA').alias("device"), when(gen_mon_metrics_df.meta.userId != '', gen_mon_metrics_df.meta.userId).otherwise('NA').alias( "user_id"), when(gen_mon_metrics_df.meta.region != '', gen_mon_metrics_df.meta.region).when( gen_mon_metrics_df.event_processing_params .set_default_region_to != '', gen_mon_metrics_df.event_processing_params .set_default_region_to).otherwise( 'NA').alias("region"), when(gen_mon_metrics_df.meta.zone != '', gen_mon_metrics_df.meta.zone).when( gen_mon_metrics_df.event_processing_params .set_default_zone_to != '', gen_mon_metrics_df.event_processing_params .set_default_zone_to).otherwise( 'NA').alias("zone"), when(gen_mon_metrics_df.metric.dimensions.hostname != '', gen_mon_metrics_df.metric.dimensions.hostname).when( gen_mon_metrics_df.metric.value_meta.host != '', gen_mon_metrics_df.metric.value_meta.host).otherwise( 'NA').alias("host"), when(gen_mon_metrics_df.service_id != '', gen_mon_metrics_df.service_id).otherwise( 'NA').alias("service_group"), when(gen_mon_metrics_df.service_id != '', gen_mon_metrics_df.service_id).otherwise( 'NA').alias("service_id"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'yyyy-MM-dd').alias("event_date"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'HH').alias("event_hour"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'mm').alias("event_minute"), from_unixtime(gen_mon_metrics_df.metric.timestamp / 1000, 'ss').alias("event_second"), gen_mon_metrics_df.this_metric_id.alias("metric_group"), gen_mon_metrics_df.this_metric_id.alias("metric_id")) # # get transform context # rdd_transform_context = rdd_transform_context_rdd.first() transform_context = rdd_transform_context.transform_context_info # # cache record store rdd # if cfg.CONF.service.enable_record_store_df_cache: storage_level_prop = \ cfg.CONF.service.record_store_df_cache_storage_level storage_level = StorageUtils.get_storage_level( storage_level_prop) record_store_df.persist(storage_level) # # start processing metrics available in record_store data # MonMetricsKafkaProcessor.process_metrics(transform_context, record_store_df) # remove df from cache if cfg.CONF.service.enable_record_store_df_cache: record_store_df.unpersist() # # extract kafka offsets and batch processing time # stored in transform_context and save offsets # offsets = transform_context.offset_info # batch time batch_time_info = \ transform_context.batch_time_info MonMetricsKafkaProcessor.save_kafka_offsets( offsets, rdd_transform_context_rdd.context.appName, batch_time_info) # call pre hourly processor, if its time to run if (cfg.CONF.stage_processors.pre_hourly_processor_enabled is True and PreHourlyProcessor.is_time_to_run( batch_time_info)): PreHourlyProcessor.run_processor( record_store_df.rdd.context, batch_time_info)
from pyspark import SparkContext, SparkConf from pyspark.sql import SQLContext from pyspark.sql import functions as F import pandas as pd if __name__ == "__main__": appName = "stage-1" sparkmaster = open("/root/spark-ec2/cluster-url").read().strip() conf = SparkConf().setMaster(sparkmaster).setAppName(appName) sc = SparkContext(conf = conf) sqlContext = SQLContext(sc) data_input = "s3n://make-emr-data/input/weblog/*" df = sqlContext.read.json(data_input) dfview = df[df['data_type'] == "MODULE_VIEW"] postviews = dfview.select("payload.post_id", "payload.time_stamp", "payload.author").withColumnRenamed("post_id", "postid") cat_input = "s3n://make-emr-data/input/webprop/*" df2 = sqlContext.read.json(cat_input) dfcat = df2[df2['data_type'] == "TEXT"] payload2 = dfcat.select("payload.post_id", "payload.publish_time_stamp") postcat = payload2.distinct() cond = [postviews.postid == postcat.post_id] dfjoin = postviews.join(postcat, cond, "left_outer") dfdatetime = dfjoin.withColumn('datetime', F.from_unixtime(dfjoin['time_stamp'], format = "yyyy-MM-dd")) dffinal = dfdatetime.withColumn('pub_date', F.from_unixtime(dfdatetime['publish_time_stamp'], format = "yyyy-MM-dd")) sqlContext.registerDataFrameAsTable(dffinal, "dftable") dfgroupby = sqlContext.sql("select count(postid) as viewcounts, pub_date, author, datetime, post_id from dftable group by datetime, pub_date, author, post_id") data_output = "s3n://make-emr-data/output/" dfgroupby.write.mode("overwrite").json(data_output)