def assign_vids_uid_time(df_impressions, vid_assignment_table, demo_cols):
    select_cols = ["vid", "timestamp", "h1", "p1", "p2", "user_id"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        # no demo columns: join every impression against every assignment row
        df = df_impressions.join(F.broadcast(vid_assignment_table))
    df_vid_impressions = (
        df
        # h1: deterministic hash of (user_id, timestamp)
        .withColumn(
            "h1",
            F.hash(
                F.concat(
                    F.col("user_id").astype("string"),
                    F.col("timestamp").astype("string"))))
        #.withColumn("h1", F.hash("user_id"))
        # p1: map the 32-bit hash into [0, 1) and keep rows that fall in this
        # assignment row's probability band
        .withColumn("p1", F.col("h1") / (2**32) + 0.5)
        .where(F.col("p1") >= F.col("prob_>="))
        .where(F.col("p1") < F.col("prob_<"))
        # p2: second hash used to pick a VID within the assigned range
        .withColumn(
            "p2",
            F.hash(
                F.concat(
                    F.col("h1").astype("string"),
                    F.col("timestamp").astype("string"))) / (2**32) + 0.5)
        .withColumn(
            "vid",
            (F.col("p2") * F.col("total_VID")).astype("int") + F.col("start_VID"))
        .select(*select_cols))
    return df_vid_impressions
def build_inference_df(data):
    inference_df = (
        data.select("customerID", F.col("churnString").alias("Churn"))
        .withColumn("Churn", F.col("Churn") == "Yes")
        # Synthesize a LastCallEscalated flag keyed off a hash of customerID,
        # flagging churned customers at a higher rate than retained ones
        .withColumn(
            "LastCallEscalated",
            F.when(F.col("Churn"), F.hash(F.col("customerID")) % 100 < 35)
            .otherwise(F.hash(F.col("customerID")) % 100 < 15)))
    return inference_df
def assign_vids_weighted(df_impressions, vid_assignment_table, demo_cols=None):
    select_cols = ["vid", "timestamp", "h1", "p1"]
    if demo_cols is not None:
        df = df_impressions.join(F.broadcast(vid_assignment_table), demo_cols)
        select_cols = demo_cols + select_cols
    else:
        df = df_impressions.crossJoin(F.broadcast(vid_assignment_table))
    df_vid_impressions = (
        df
        # h1: deterministic hash of user_id, mapped to p1 in [0, 1)
        .withColumn("h1", F.hash(F.col("user_id").astype("string")))
        #.withColumn("h1", F.hash("user_id"))
        .withColumn("p1", F.col("h1") / (2**32) + 0.5)
        # n: number of VIDs to assign, from the rounded weight
        .withColumn("n", F.round("weight").cast("integer"))
        .where(F.col("p1") >= F.col("prob_>="))
        .where(F.col("p1") < F.col("prob_<"))
        # pick n VIDs within [start_VID, start_VID + total_VID) and explode to
        # one output row per VID
        .withColumn(
            "vids",
            udf_find_n_vids(
                F.col("n"), F.col("start_VID"), F.col("total_VID"), F.col("h1")))
        .withColumn("vid", F.explode(F.col("vids")))
        .select(*select_cols))
    return df_vid_impressions
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    column_spec = hash(*[
        col.get_column_spec(source_df=source_df, current_column=current_column)
        for col in self.value
    ]).cast("string")
    return column_spec
def test_hash_repartition_exact(gen, num_parts):
    data_gen = gen[0]
    part_on = gen[1]
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, data_gen, length=1024)\
            .repartition(num_parts, *part_on)\
            .withColumn('id', f.spark_partition_id())\
            .withColumn('hashed', f.hash(*part_on))\
            .selectExpr('*', 'pmod(hashed, {})'.format(num_parts)))
def add_hash_column(obj, cols=True, hash_colname='_hash', exclude_cols=[]):
    # hash the selected (non-excluded) columns into a single hash column
    if isinstance(cols, bool) and cols:
        cols = obj.columns
    colnames = (set(obj.columns) & set(cols)) - set(exclude_cols)
    cols = [x for x in cols if x in colnames]
    obj = obj.withColumn(hash_colname, F.hash(*cols))
    return obj
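# Minimal usage sketch for add_hash_column, assuming an existing SparkSession
# named `spark` and pyspark.sql.functions imported as F; the DataFrame and
# column names below are illustrative only.
df = spark.createDataFrame([(1, "a", "x"), (2, "b", "y")], ["id", "val", "_updated"])
hashed = add_hash_column(df, cols=["id", "val"], exclude_cols=["_updated"])
hashed.show()  # adds an integer `_hash` column computed from id and val only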
def test_auto_mapper_hash(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "54"),
            (2, "Vidal", "67"),
            (3, "Vidal", None),
            (4, None, None),
        ],
        ["member_id", "last_name", "my_age"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    source_df = source_df.withColumn("my_age", col("my_age").cast("int"))

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).columns(age=A.hash(A.column("my_age"), A.column("last_name")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        hash(col("b.my_age"), col("b.last_name")).cast("string").alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("age").collect()[0][0] ==
            "-543157534")
    assert (result_df.where("member_id == 2").select("age").collect()[0][0] ==
            "2048196121")
    assert (result_df.where("member_id == 3").select("age").collect()[0][0] ==
            "-80001407")
    assert result_df.where("member_id == 4").select(
        "age").collect()[0][0] == "42"

    assert dict(result_df.dtypes)["age"] == "string"
def _expode_field_arrays(self) -> DataFrame:
    timestamp_conditionals = [
        F.col("field.field") == "timestamp",
        F.col("field.field") == "time_created",
    ]
    # raw values count seconds since 1989-12-31 (the FIT epoch); adding the
    # offset converts them to Unix timestamps
    ts_case = F.when(
        self.bit_any(*timestamp_conditionals),
        (F.col("field.value").cast("long") + 631065600).cast("timestamp"),
    ).otherwise(None)
    ts_col = (F.first(ts_case, ignorenulls=True)
              .over(Window.partitionBy("message_id"))
              .alias("timestamp"))
    field_col = (F.when(F.col("field") == "type", F.col("message"))
                 .otherwise(F.col("field"))
                 .alias("field"))
    self._spark.conf.set("spark.sql.shuffle.partitions", "8")
    return (self.csv_df.select(
        "*",
        F.explode("fields").alias("field"),
        F.hash("fields").alias("message_id"),
    ).drop("fields").select(
        "record_type",
        "local number",
        "message",
        "message_id",
        ts_col,
        "field.*",
    ).replace("file_id", "file", "message").select(
        "record_type",
        "local number",
        "message",
        "message_id",
        "timestamp",
        field_col,
        "value",
        "units",
    ).join(PROFILE, on=["field", "value"], how="left").select(
        "record_type",
        "local number",
        "message",
        "message_id",
        "timestamp",
        "field",
        F.coalesce("enum_value", "value").alias("value"),
        "units",
    ))
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 / 4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice *
                 ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR)) +
                (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise(
                (SENIOR_CUTOFF * DAYS_IN_YEAR) +
                (DAYS_IN_YEAR *
                 (-F.log1p(-customerMetaRaw.choice) * EXPONENTIAL_DIST_SCALE)))
        ).cast("int"),
    )
    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))
    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
def _expand_message_records(self, message: str) -> DataFrame:
    message_df = (self.csv_df
                  .where(F.col("message") == message)
                  .select(
                      "record_type",
                      "message",
                      F.hash("fields").alias("record_id"),
                      F.explode("fields").alias("field_struct"),
                  )
                  .select("*", "field_struct.*")
                  .drop("field_struct")
                  .groupBy("record_type", "message", "record_id")
                  .pivot("field")
                  .agg(F.max(F.concat_ws("-", F.col("value"), F.col("units")))))
    if "type" in message_df.columns:
        message_type = message.replace("_id", "")
        message_df = message_df.withColumnRenamed("type", message_type)
    enum_cols = set(message_df.columns).intersection(set(PROFILE.keys()))
    for col in enum_cols:
        message_df = message_df.replace(**PROFILE[col], subset=col)
    return message_df
def input_data(path, filename, schemaString):
    fields = [
        StructField(field_name, StringType(), True)
        for field_name in schemaString.split()
    ]
    schema = StructType(fields)
    dataType = StructType([
        StructField("Class", IntegerType(), True),
        StructField("Timeseries", ArrayType(FloatType()), True)
    ])
    '''
    # when separator is space ' '
    df = spark.read.csv(path + filename, schema=schema)

    def split_cols(array):
        Class = int(float(array[1]))
        timeseries = array[2:len(array)]
        timeseries = [float(value) for value in timeseries]
        return (Class, timeseries)

    split_cols = udf(split_cols, dataType)
    df = df.withColumn('text', split_cols(split('class_timeseries', '\\s+'))).select(
        hash(col('text.Timeseries')).alias('id'), col('text.*'))
    return df
    '''
    # when separator is ','
    df = spark.read.csv(path + filename, sep=';', schema=schema)

    def split_cols(array):
        Class = int(float(array[0]))
        timeseries = array[1:len(array)]
        timeseries = [float(value) for value in timeseries]
        return (Class, timeseries)

    split_cols = udf(split_cols, dataType)
    df = df.withColumn('text', split_cols(split('class_timeseries', ','))).select(
        hash(col('text.Timeseries')).alias('id'), col('text.*'))
    '''df_csv = df.select('id', 'Class', ArrayToString(col('Timeseries')))
    return df, df_csv'''
    print("df.first() is {}".format(df.first()))
    return df
"reply_timestamp", "retweet_timestamp", "retweet_with_comment_timestamp", "like_timestamp" ] from pyspark.sql import functions as f #for feature in text_features: # text_feature_split = f.split(df[feature], '\t') # df = df.withColumn(feature, f.when(f.col(feature).isNotNull(), text_feature_split).otherwise(f.array().cast("array<string>"))) from pyspark.sql.types import IntegerType for feature in numeric_features: df = df.withColumn(feature, f.col(feature).cast(IntegerType())) for feature in id_features: output_col = feature + "_hashed" df = df.withColumn(output_col, (f.hash(f.col(feature)))) df = df.withColumn( output_col, f.when(f.col(output_col) < 0, f.col(output_col) * -1 % 50).otherwise(f.col(output_col) % 50)) for col in label_columns: df = df.withColumn(col, f.when(f.col(col).isNotNull(), 1).otherwise(0)) ##### Same preprocessing for validation (without label_columns transformation) #for feature in text_features: # text_feature_split = f.split(df_val[feature], '\t') # df_val = df_val.withColumn(feature, f.when(f.col(feature).isNotNull(), text_feature_split).otherwise(f.array().cast("array<string>"))) for feature in numeric_features: df_val = df_val.withColumn(feature, f.col(feature).cast(IntegerType()))
def loadRefine(self):
    s3 = boto3.resource('s3')
    discoveryBucketNode = s3.Bucket(name=self.discoveryBucket)
    lastUpdatedDealerFile = self.findLastModifiedFile(
        discoveryBucketNode, self.prefixDealerDiscoveryPath, self.discoveryBucket)
    dfDealerCode = self.sparkSession.read.parquet(lastUpdatedDealerFile)
    dfDealerCode.withColumnRenamed("Company", "CompanyCode").\
        withColumnRenamed("DF", "AssociationType").\
        withColumnRenamed("DCstatus", "AssociationStatus").\
        registerTempTable("Dealer")
    dfStoreAss = self.sparkSession.sql(
        "select a.DealerCode,a.CompanyCode,a.AssociationType, a.AssociationStatus,"
        "a.TBLoc,a.SMFMapping,a.RankDescription from Dealer a where "
        "a.RankDescription='Open Standard' or a.RankDescription='SMF Only' or "
        "a.RankDescription='Bulk' or a.RankDescription Like '%Close%' or "
        "a.RankDescription Like 'Close%'")
    dfStoreAss = dfStoreAss.where(col('CompanyCode').like("Spring%"))
    dfStoreAsstemp = dfStoreAss
    dfOpenFil = dfStoreAss.filter(dfStoreAss.RankDescription != 'SMF Only')

    #####################################################################
    #                       Transformation starts                       #
    #####################################################################
    dfStoreAssOpen = dfStoreAsstemp.\
        withColumn('StoreNumber',
                   when((dfStoreAsstemp.RankDescription != 'SMF Only'),
                        dfStoreAsstemp.TBLoc).otherwise(dfStoreAsstemp.SMFMapping)).\
        withColumn('dealercode1', dfStoreAsstemp.DealerCode).\
        withColumn('AssociationType1',
                   when((dfStoreAsstemp.RankDescription != 'SMF Only'),
                        'Retail').otherwise('SMF')).\
        withColumn('AssociationStatus1',
                   when((dfStoreAsstemp.RankDescription == 'SMF Only') |
                        (dfStoreAsstemp.RankDescription == 'Open Standard') |
                        (dfStoreAsstemp.RankDescription == 'Bulk'),
                        'Active').otherwise('Closed')).\
        drop(dfStoreAsstemp.DealerCode).\
        drop(dfStoreAsstemp.AssociationType).\
        drop(dfStoreAsstemp.AssociationStatus).\
        select(col('StoreNumber'),
               col('dealercode1').alias('DealerCode'),
               col('AssociationType1').alias('AssociationType'),
               col('AssociationStatus1').alias('AssociationStatus'),
               col('TBLoc'), col('SMFMapping'), col('RankDescription'),
               col('CompanyCode'))
    dfStoreAssOpen.registerTempTable("storeAss1")

    #####################################################################
    #            Code for new entry fields for Open Standard            #
    #####################################################################
    dfOpenFilNewField = dfOpenFil.\
        withColumn('StoreNumber', dfOpenFil.SMFMapping).\
        withColumn('dealercode1', dfOpenFil.DealerCode).\
        withColumn('AssociationType1', lit('SMF')).\
        withColumn('AssociationStatus1', lit('Active')).\
        drop(dfOpenFil.DealerCode).\
        drop(dfOpenFil.AssociationType).\
        drop(dfOpenFil.AssociationStatus).\
        select(col('StoreNumber'),
               col('dealercode1').alias('DealerCode'),
               col('AssociationType1').alias('AssociationType'),
               col('AssociationStatus1').alias('AssociationStatus'),
               col('TBLoc'), col('SMFMapping'), col('RankDescription'),
               col('CompanyCode'))
    dfOpenFilNewField.registerTempTable("storeAss2")

    #####################################################################
    #                  Code for union of two dataframes                 #
    #####################################################################
    joined_DF = self.sparkSession.sql(
        "select cast(StoreNumber as integer),DealerCode, 4 as CompanyCode,"
        "AssociationType,AssociationStatus from storeAss1 union select StoreNumber"
        ",DealerCode,4 as CompanyCode,AssociationType,AssociationStatus "
        "from storeAss2")
    joined_DF.registerTempTable("store_assoc_source")

    # Hash_Column drives change-data-capture against the previous refined file
    self.sparkSession.sql(
        "select StoreNumber, DealerCode, CompanyCode, AssociationType, "
        "AssociationStatus from store_assoc_source").\
        withColumn("Hash_Column",
                   hash("StoreNumber", "DealerCode", "CompanyCode",
                        "AssociationType", "AssociationStatus")).\
        registerTempTable("store_assoc_curr")

    refinedBucketNode = s3.Bucket(name=self.refinedBucket)
    storeAssocPrevRefinedPath = self.findLastModifiedFile(
        refinedBucketNode, self.prefixStoreAssocPath, self.refinedBucket)
    if storeAssocPrevRefinedPath != '':
        self.sparkSession.read.parquet(storeAssocPrevRefinedPath).\
            withColumn("Hash_Column",
                       hash("StoreNumber", "DealerCode", "CompanyCode",
                            "AssociationType", "AssociationStatus")).\
            registerTempTable("store_assoc_prev")
        self.sparkSession.sql(
            "select a.StoreNumber, a.DealerCode, a.CompanyCode, a.AssociationType, "
            "a.AssociationStatus from store_assoc_prev a left join store_assoc_curr b on "
            "a.StoreNumber = b.StoreNumber where a.Hash_Column = b.Hash_Column").\
            registerTempTable("store_assoc_no_change_data")
        dfStoreUpdated = self.sparkSession.sql(
            "select a.StoreNumber, a.DealerCode, a.CompanyCode, "
            "a.AssociationType, a.AssociationStatus from store_assoc_curr a"
            " left join store_assoc_prev b on a.StoreNumber = b.StoreNumber"
            " where a.Hash_Column <> b.Hash_Column")
        updateRowsCount = dfStoreUpdated.count()
        dfStoreUpdated.registerTempTable("store_assoc_updated_data")
        dfStoreNew = self.sparkSession.sql(
            "select a.StoreNumber, a.DealerCode, a.CompanyCode, a.AssociationType,"
            " a.AssociationStatus from store_assoc_curr a left join "
            "store_assoc_prev b on a.StoreNumber = b.StoreNumber where "
            "b.StoreNumber is null")
        newRowsCount = dfStoreNew.count()
        dfStoreNew.registerTempTable("store_assoc_new_data")
        if updateRowsCount > 0 or newRowsCount > 0:
            dfStoreWithCDC = self.sparkSession.sql(
                "select StoreNumber, DealerCode, CompanyCode, AssociationType, "
                "AssociationStatus from store_assoc_no_change_data union "
                "select StoreNumber, DealerCode, CompanyCode, AssociationType,"
                " AssociationStatus from store_assoc_updated_data union "
                "select StoreNumber, DealerCode, CompanyCode, AssociationType,"
                " AssociationStatus from store_assoc_new_data")
            self.log.info("Updated file has arrived..")
            dfStoreWithCDC.coalesce(1).write.mode("overwrite").parquet(
                self.storeAssociationWorkingPath)
            dfStoreWithCDC.coalesce(1).\
                withColumn("year", year(from_unixtime(unix_timestamp()))).\
                withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
                write.mode("append").partitionBy('year', 'month').format('parquet').\
                save(self.storeAssociationPartitonPath)
        else:
            self.log.info(
                "The previous and current files are the same, so no file will be "
                "generated in the refined bucket.")
    else:
        self.log.info(
            "This is the first transformation call, so keeping the file in the "
            "refined bucket.")
        joined_DF.coalesce(1).write.mode("overwrite").parquet(
            self.storeAssociationWorkingPath)
        joined_DF.coalesce(1).\
            withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            write.mode('append').partitionBy('year', 'month').format('parquet').\
            save(self.storeAssociationPartitonPath)
    self.sparkSession.stop()
def loadRefined(self):
    self.sparkSession.read.parquet(
        self.dealerCodeIn).registerTempTable("AttDealerCode")
    dfDealerCode = self.sparkSession.sql(
        "select a.dealercode,'4' as companycode,a.dcorigin as dealercodeorigin,"
        "a.dfcode,a.dcstatus,case when a.df = 'No' then '0' when a.df = 'Yes' then '1'"
        " when a.df = 'Off' then '0' when a.df = 'False' then '0' when a.df = 'On' then '1' "
        "when a.df = 'True' then '1' when a.df = 'DF' then '1' end as dfindicator,a.candc,a.opendate,a.closedate,"
        "case when a.ws = 'No' then '0' when a.ws = 'Yes' then '1'"
        " when a.ws = 'Off' then '0' when a.ws = 'False' then '0' when a.ws = 'On' then '1' when"
        " a.ws = 'True' then '1' end as whitestoreindicator,a.wsexpires as whitestoreexpirationdate,"
        "a.sortingrank as sortrank,a.rankdescription,a.storeorigin,"
        "a.acquisitionorigin as origin,a.businessexpert,a.footprintlevel,"
        "a.location as attlocationextendedname,a.attlocationid,a.attlocationname,a.disputemkt as attdisputemarket,"
        "a.attmktabbrev as attmarketcode, a.attmarketname as attmarket,"
        "a.oldcode as olddealercode, "
        "a.oldcode2 as olddealercode2,"
        "a.attregion, a.notes, a.notes2 from AttDealerCode a")

    # replace missing company codes with -1
    FinalDF1 = dfDealerCode.na.fill({
        'companycode': -1,
    })
    # filter out records with an empty dealercode
    FinalDF2 = FinalDF1.where(FinalDF1.dealercode != '')
    # drop duplicate dealer codes
    FinalDF = FinalDF2.dropDuplicates(['dealercode'])

    previousAttDealerCodeFile = self.findLastModifiedFile(
        self.refinedBucketNode, self.prefixAttDealerCodeRefinePath,
        self.refinedBucket)
    if previousAttDealerCodeFile != '':
        FinalDF.withColumn(
            "Hash_Column",
            hash("dealercode", "companycode", "dealercodeorigin", "dfcode",
                 "dcstatus", "dfindicator", "candc", "opendate", "closedate",
                 "whitestoreindicator", "whitestoreexpirationdate", "sortrank",
                 "rankdescription", "storeorigin", "origin", "businessexpert",
                 "footprintlevel", "attlocationextendedname", "attlocationid",
                 "attlocationname", "attdisputemarket", "attmarketcode",
                 "attmarket", "olddealercode", "olddealercode2", "attregion",
                 "notes", "notes2")).registerTempTable("att_dealer_code_curr")
        self.sparkSession.read.parquet(previousAttDealerCodeFile).withColumn(
            "Hash_Column",
            hash("dealercode", "companycode", "dealercodeorigin", "dfcode",
                 "dcstatus", "dfindicator", "candc", "opendate", "closedate",
                 "whitestoreindicator", "whitestoreexpirationdate", "sortrank",
                 "rankdescription", "storeorigin", "origin", "businessexpert",
                 "footprintlevel", "attlocationextendedname", "attlocationid",
                 "attlocationname", "attdisputemarket", "attmarketcode",
                 "attmarket", "olddealercode", "olddealercode2", "attregion",
                 "notes", "notes2")).registerTempTable("att_dealer_code_prev")
        self.sparkSession.sql(
            "select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, "
            "a.dfindicator, a.candc, a.opendate, a.closedate, a.whitestoreindicator, "
            "a.whitestoreexpirationdate, a.sortrank, a.rankdescription, a.storeorigin, a.origin, "
            "a.businessexpert, a.footprintlevel, a.attlocationextendedname, a.attlocationid, "
            "a.attlocationname, a.attdisputemarket, a.attmarketcode, a.attmarket, "
            "a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
            "att_dealer_code_prev a left join att_dealer_code_curr b on "
            "a.dealercode = b.dealercode where a.Hash_Column = b.Hash_Column").\
            registerTempTable("att_dealer_no_change_data")
        dfAttDealerCodeUpdated = self.sparkSession.sql(
            "select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, a.dfindicator, a.candc,"
            "a.opendate, a.closedate, a.whitestoreindicator, a.whitestoreexpirationdate, a.sortrank, "
            "a.rankdescription, a.storeorigin, a.origin, a.businessexpert, a.footprintlevel, "
            "a.attlocationextendedname, a.attlocationid, a.attlocationname, a.attdisputemarket, a.attmarketcode,"
            "a.attmarket, a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
            "att_dealer_code_curr a left join att_dealer_code_prev b on a.dealercode = b.dealercode "
            "where a.Hash_Column <> b.Hash_Column")
        updateRowsCount = dfAttDealerCodeUpdated.count()
        dfAttDealerCodeUpdated.registerTempTable("att_dealer_updated_data")
        dfAttDealerCodeNew = self.sparkSession.sql(
            "select a.dealercode, a.companycode, a.dealercodeorigin, a.dfcode, a.dcstatus, a.dfindicator, a.candc,"
            "a.opendate, a.closedate, a.whitestoreindicator, a.whitestoreexpirationdate, a.sortrank, "
            "a.rankdescription, a.storeorigin, a.origin, a.businessexpert, a.footprintlevel, "
            "a.attlocationextendedname, a.attlocationid, a.attlocationname, a.attdisputemarket, a.attmarketcode,"
            "a.attmarket, a.olddealercode, a.olddealercode2, a.attregion, a.notes, a.notes2 from "
            "att_dealer_code_curr a left join att_dealer_code_prev b on a.dealercode = b.dealercode "
            "where b.dealercode is null")
        newRowsCount = dfAttDealerCodeNew.count()
        dfAttDealerCodeNew.registerTempTable("att_dealer_new_data")
        if updateRowsCount > 0 or newRowsCount > 0:
            self.sparkSession.sql(
                "select * from att_dealer_no_change_data union "
                "select * from att_dealer_updated_data union "
                "select * from att_dealer_new_data").registerTempTable(
                    "att_dealer_cdc")
            self.log.info("Updated file has arrived..")
            FinalDF = self.sparkSession.sql(
                "select dealercode, companycode, dealercodeorigin, dfcode, dcstatus,"
                " dfindicator, candc, opendate, closedate, whitestoreindicator, "
                "whitestoreexpirationdate, sortrank, rankdescription, storeorigin, "
                "origin, businessexpert, footprintlevel, attlocationextendedname, "
                "attlocationid, attlocationname, attdisputemarket, attmarketcode, "
                "attmarket, olddealercode, olddealercode2, attregion, notes, notes2"
                " from att_dealer_cdc")
            FinalDF.coalesce(1).select("*").write.mode("overwrite").parquet(
                self.dealerCodeOutput + '/' + 'Working')
            FinalDF.coalesce(1).select("*").write.mode("overwrite").csv(
                self.dealerCodeOutput + '/csv', header=True)
            FinalDF.coalesce(1).\
                withColumn("year", year(from_unixtime(unix_timestamp()))).\
                withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
                select("*").write.mode("append").partitionBy('year', 'month').\
                format('parquet').save(self.dealerCodeOutput)
        else:
            self.sparkSession.createDataFrame(
                self.sparkSession.sparkContext.emptyRDD()).write.mode("overwrite")\
                .csv(self.dealerCodeOutput + '/' + 'Working')
            self.log.info(
                "The previous and current files are the same, so a zero-size delta "
                "file was generated in the delivery bucket.")
    else:
        #####################################################################
        #                    write output in parquet file                   #
        #####################################################################
        self.log.info(
            "This is the first transformation call, so keeping the file in the "
            "refined bucket.")
        FinalDF.coalesce(1).select("*").write.mode("overwrite").parquet(
            self.dealerCodeOutput + '/' + 'Working')
        FinalDF.coalesce(1).select("*").write.mode("overwrite").csv(
            self.dealerCodeOutput + '/csv', header=True)
        FinalDF.coalesce(1).\
            withColumn("year", year(from_unixtime(unix_timestamp()))).\
            withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)).\
            select("*").write.mode("append").partitionBy('year', 'month').\
            format('parquet').save(self.dealerCodeOutput)
    self.sparkSession.stop()
def _assert_df_equal(df_1, df_2):
    df_1 = df_1.select(F.hash(*df_1.columns).alias("hash")).orderBy("hash")
    df_2 = df_2.select(F.hash(*df_2.columns).alias("hash")).orderBy("hash")
    assert df_1.subtract(df_2).count() == 0
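# Minimal usage sketch for _assert_df_equal, assuming an existing SparkSession
# named `spark`; the sample DataFrames below are illustrative only.
left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
right = spark.createDataFrame([(2, "b"), (1, "a")], ["id", "val"])
_assert_df_equal(left, right)  # passes: same rows, compared order-insensitively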
def main():
    spark = SparkSession \
        .builder \
        .appName("RedditRecommender") \
        .getOrCreate()
    output_file = open('als-output.txt', 'w')

    data = spark.read.json('./sample_data.json')
    cols_to_keep = [data.author, data.id, data.subreddit]
    data = data.select(*cols_to_keep)
    data = data.filter(data.author != "[deleted]")

    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = [
            "Art", "AskReddit", "DIY", "Documentaries", "EarthPorn",
            "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful",
            "Jokes", "LifeProTips", "Music", "OldSchoolCool", "Showerthoughts",
            "UpliftingNews", "announcements", "askscience", "aww", "blog",
            "books", "creepy", "dataisbeautiful", "explainlikeimfive", "food",
            "funny", "gadgets", "gaming", "gifs", "history", "listentothis",
            "mildlyinteresting", "movies", "news", "nosleep", "nottheonion",
            "personalfinance", "philosophy", "photoshopbattles", "pics",
            "science", "space", "sports", "television", "tifu",
            "todayilearned", "videos", "worldnews"
        ]
        return x not in defaultSubs

    data = data.filter(isNotDefault(data.subreddit))
    data = data.groupBy([data.author, data.subreddit]).count().orderBy(data.author)
    # `hash` here is pyspark.sql.functions.hash (a row-wise column hash),
    # not the Python builtin
    data = data.withColumn('author_id', hash(data.author))
    data = data.withColumn('subreddit_id', hash(data.subreddit))

    (training, test) = data.randomSplit([0.8, 0.2])
    als = ALS(maxIter=5, rank=70, regParam=0.01, userCol="author_id",
              itemCol="subreddit_id", ratingCol="count",
              coldStartStrategy="drop", implicitPrefs=True)
    model = als.fit(training)

    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    output_file.write('Root mean squared error: ' + str(rmse) + '\n\n')

    users = data.select(als.getUserCol()).distinct().limit(30)
    user_subset_recs = model.recommendForUserSubset(users, 10)

    subreddit_recs = {}
    for row in user_subset_recs.collect():
        author = get_author_from_id(data, row['author_id'])
        subreddit_recs[author] = []
        for rec in row['recommendations']:
            subreddit_recs[author].append(get_subreddit_from_id(data, rec[0]))

    for author in subreddit_recs.keys():
        output_file.write('Top 10 recommendations for user ' + author + ':\n')
        for rec in subreddit_recs[author]:
            output_file.write(rec)
            output_file.write('\n')
        output_file.write('\n')
import os

common_path = '/data/MobodMovieLens/train'
ratings = spark.read.csv(os.path.join(common_path, 'ratings.csv'), header=True).cache()
movies = spark.read.csv(os.path.join(common_path, 'movies.csv'), header=True).cache()

ratings_tmp = (
    ratings
    # order: each user's ratings ranked by timestamp, normalized to (0, 1]
    .withColumn(
        'order',
        f.row_number().over(Window.partitionBy('userId').orderBy('timestamp'))
        / f.count('*').over(Window.partitionBy('userId'))
    )
    # hash: deterministic user bucket in 0..210, used to split users between
    # the A and B training sets
    .withColumn('hash', f.abs(f.hash('userId')) % 211)
)

ratings_train_A = (
    ratings_tmp
    .filter((f.col('hash') > 0) & (f.col('hash') <= 105) & (f.col('order') < 0.905))
    .drop('order', 'hash')
    .cache()
)

ratings_train_B = (
    ratings_tmp
    .filter((f.col('hash') > 106) & (f.col('order') < 0.905))
    .drop('order', 'hash')
    .cache()
)

ratings_dev = (
    ratings_tmp