def process_country_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + COUNTRY_FILE)
    df_country = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_country = df_country.withColumnRenamed(
        "_c0", "country_code").withColumnRenamed("_c1", "country_name")
    df_country = df_country.withColumn(
        "country_name", F.regexp_replace(df_country.country_name, "'", ""))
    df_country = df_country.withColumn(
        "country_name", F.ltrim(F.rtrim(df_country.country_name)))
    df_country = df_country.withColumn(
        "country_code", F.ltrim(F.rtrim(df_country.country_code)))
    df_country = df_country.withColumn(
        "country_name",
        F.regexp_replace(df_country.country_name,
                         r"^INVALID.*|Collapsed.*|No Country.*", "INVALID"))
    df_country.write.mode("overwrite").parquet(s3 + 'data/processed/codes/country')
    return df_country
def remove_space(df, col_name, position):
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
def preprocess_data_table(
        df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
    for col_name in df.columns:
        if isinstance(df.schema[col_name].dataType, StringType):
            df = df.withColumn(col_name, rtrim(col(col_name)))
        if isinstance(df.schema[col_name].dataType, TimestampType):
            df = df.withColumn(col_name, col(col_name).cast('string'))
    return df
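# A minimal, hedged sketch of how preprocess_data_table above could be exercised;
# the session, sample rows, and column names are assumptions for illustration only.
from datetime import datetime

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rtrim
from pyspark.sql.types import StringType, TimestampType

spark = SparkSession.builder.appName("preprocess-demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("alice  ", datetime(2020, 1, 1, 12, 0, 0))],
    ["name", "created_at"])
# string columns lose trailing spaces, timestamp columns become strings
preprocess_data_table(demo_df).show(truncate=False)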
def process_state_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + STATE_FILE)
    df_state = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_state = df_state.withColumnRenamed("_c0", "state_code").withColumnRenamed(
        "_c1", "state_name")
    df_state = df_state.withColumn(
        "state_code", F.regexp_replace(df_state.state_code, "[^A-Z]", ""))
    df_state = df_state.withColumn(
        "state_name", F.regexp_replace(df_state.state_name, "'", ""))
    df_state = df_state.withColumn("state_name",
                                   F.ltrim(F.rtrim(df_state.state_name)))
    df_state.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_state')
    return df_state
def canonicaltokens(df, inputColumn, outputColumn):
    """
    Turn the input column of strings into canonical form and return the dataframe
    with the result added as an output column of tokens.
    """
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             r" (\w) (\w) ", "$1$2"),
            r"(\w) (\w) (\w)$", "$1$2$3"))
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")
    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")
    return canonicalname
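# Hedged usage sketch for canonicaltokens; the imports, aliases (f, mlf), and the
# sample name are assumptions for illustration, not part of the original code.
import pyspark.sql.functions as f
import pyspark.ml.feature as mlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
names = spark.createDataFrame([("  Acme Holdings of America Inc ",)], ["raw_name"])
# adds a "tokens" column with lowercased, stop-word-free tokens of the cleaned name
canonicaltokens(names, "raw_name", "tokens").show(truncate=False)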
def remove_space(df, col_name, position):
    if position not in ["l", "r", "a"]:
        raise ValueError("The position value must be l, r or a")
    # keep the original column order
    columns = df.columns
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
    # remove space on both sides
    elif position == "a":
        return df.withColumn("tmp", trim(sql_fun.col(col_name))).drop(
            col_name).withColumnRenamed("tmp", col_name).select(*columns)
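# Minimal sketch of calling the validating remove_space above; sql_fun is assumed
# to alias pyspark.sql.functions, and the sample row is invented for illustration.
import pyspark.sql.functions as sql_fun
from pyspark.sql import SparkSession
from pyspark.sql.functions import ltrim, rtrim, trim

spark = SparkSession.builder.getOrCreate()
padded = spark.createDataFrame([(1, "  both sides  ")], ["id", "text"])
remove_space(padded, "text", "a").show()  # trims both sides, keeps column order
# remove_space(padded, "text", "x") would raise ValueError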
def parse_message(col, eol="\n"):
    """
    Generate the expressions that parse the email message into From, Subject, Body, etc.
    Args:
        col - sqlf.col() column object
        eol - end of line character to use when parsing the email
    Returns:
        List of pyspark.sql.functions expressions to be passed to select()
    """
    out_dict = [
        "Message-ID", "Date", "From", "To", "Subject", "Mime-Version",
        "Content-Type", "Content-Transfer-Encoding", "X-From", "X-To", "X-cc",
        "X-bcc", "X-Folder", "X-Origin", "X-FileName", eol,
    ]
    expr = []
    for i in range(0, len(out_dict) - 1):
        expr.append(
            sqlf.ltrim(
                sqlf.rtrim(sqlf.split(sqlf.split(col, out_dict[i] + ":")[1], eol)[0])
            ).alias(out_dict[i]))
    expr.append(sqlf.split(sqlf.split(col, "X-FileName:")[1], "nsf")[1].alias("Body"))
    return expr
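# Hedged example of applying parse_message to a raw Enron-style message column;
# the "raw" column name and the sample text are hypothetical stand-ins.
import pyspark.sql.functions as sqlf
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw_mail = (
    "Message-ID: <1.JavaMail.evans@thyme>\n"
    "Date: Mon, 14 May 2001 16:39:00 -0700\n"
    "From: a@example.com\nTo: b@example.com\nSubject: hello\n"
    "Mime-Version: 1.0\nContent-Type: text/plain\nContent-Transfer-Encoding: 7bit\n"
    "X-From: A\nX-To: B\nX-cc: \nX-bcc: \nX-Folder: inbox\nX-Origin: A\n"
    "X-FileName: a.nsf\n\nHi there."
)
emails = spark.createDataFrame([(raw_mail,)], ["raw"])
# one output column per header field, plus the Body text after the .nsf marker
emails.select(parse_message(sqlf.col("raw"))).show(truncate=False)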
def process_airport_codes():
    # transform airport codes
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + AIRPORT_FILE)
    df_airport = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_airport = df_airport.withColumn(
        "_c0", F.regexp_replace(df_airport._c0, "'", "")).withColumn(
            "_c1", F.regexp_replace(df_airport._c1, "'", ""))
    split_col = F.split(df_airport._c1, ",")
    df_airport = df_airport.withColumn("city", split_col.getItem(0))
    df_airport = df_airport.withColumn("state_code", split_col.getItem(1))
    df_airport = df_airport.withColumnRenamed("_c0", "port_code")
    df_airport = df_airport.drop("_c1")
    df_airport = df_airport.withColumn(
        "port_code", F.regexp_replace(df_airport.port_code, "[^A-Z]", "")).withColumn(
            "city", F.ltrim(F.rtrim(df_airport.city))).withColumn(
                "state_code", F.regexp_replace(df_airport.state_code, "[^A-Z]", ""))
    df_state = process_state_codes()
    df_airport = df_airport.join(df_state, "state_code")
    df_airport.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_ports')
from pyspark.sql.functions import col, lit, monotonically_increasing_id

df.select(monotonically_increasing_id()).show(10)

# working with strings
# perform case conversions
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad

df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = " hello "
df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

# regular expressions

# working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()
dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()
YEAR_PATTERN = r".*\(\d{4}\)$"

spark = SparkSession.builder.appName("hw9").getOrCreate()
keyspace = sys.argv[1]

df = (spark.read.format("csv").options(
    header=True, inferSchema=True, sep=",").load("hdfs:///data/movielens/movies.csv"))
df = df.withColumn("title", regexp_replace(col("title"), "\xa0", ""))
df = df.withColumn("title", regexp_replace(col("title"), r"\(\D+\)$", ""))
df = df.withColumn("title", regexp_replace(col("title"), '"+', '"'))
df = df.withColumn("title", regexp_replace(col("title"), r"\)+", ")"))
df = df.withColumn("title", regexp_replace(col("title"), '^"', ""))
df = df.withColumn("title", regexp_replace(col("title"), '"$', ""))
df = df.withColumn("title", rtrim(col("title"))).filter(
    col("title").rlike(YEAR_PATTERN))
df = df.withColumn("year", substring(col("title"), -5, 4).cast(IntegerType()))
df = df.withColumn("title", expr("substring(title, 1, length(title)-6)"))
df = df.filter(rtrim(col("title")).rlike(".+"))
df = df.filter(col("title").isNotNull())
df = df.filter(col("genres") != "(no genres listed)")
df = df.withColumn("genres", split(df["genres"], r"\|"))
df = df.select(
    col("movieId").alias("movieid"), col("title"), col("year"), col("genres"))
(df.write.format("org.apache.spark.sql.cassandra").options(
    table="movies_with_genre_index", keyspace=keyspace).mode("append").save())
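# Small hedged check of the title/year cleanup above on an in-memory sample;
# the sample rows are invented and the Cassandra write is not exercised here.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(1, "Toy Story (1995)", "Adventure|Comedy"), (2, "Unreleased (TBD)", "Drama")],
    ["movieId", "title", "genres"])
sample = sample.filter(col("title").rlike(YEAR_PATTERN))  # keeps only "... (yyyy)" titles
sample = sample.withColumn("year", substring(col("title"), -5, 4).cast(IntegerType()))
sample.show(truncate=False)  # only the Toy Story row survives, with year = 1995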
def calculate_score(self):
    # transform and filter data
    mentions_df = self.spark.read.parquet(self.mentions_path).select(
        'GLOBALEVENTID', 'MentionTimeDate', 'MentionIdentifier', 'Confidence')
    gkg_df = self.spark.read.parquet(self.gkg_path).select(
        'DocumentIdentifier', 'Date', 'V2Tone')
    # filter rows on mention date in 2019
    mentions_df = mentions_df.filter(mentions_df.MentionTimeDate.like('2019%'))
    gkg_df = gkg_df.filter(gkg_df.Date.like('2019%'))
    gkg_df = gkg_df.drop('Date')
    # type casting for mentions and gkg df
    mentions_df = mentions_df.withColumn('GLOBALEVENTID', mentions_df.GLOBALEVENTID.cast('INT'))
    mentions_df = mentions_df.withColumn('Confidence', mentions_df.Confidence.cast('INT'))
    mentions_df = mentions_df.withColumn(
        'mDate',
        F.to_date(mentions_df.MentionTimeDate, format='yyyyMMddHHmmss')).drop('MentionTimeDate')
    mentions_df.printSchema()
    print(mentions_df.first())
    gkg_df = gkg_df.withColumn(
        'Tone', F.split(gkg_df.V2Tone, ',')[0].cast('FLOAT')).drop('V2Tone')
    gkg_df.printSchema()
    print(gkg_df.first())
    # register the DataFrames as SQL temporary views
    mentions_df.createOrReplaceTempView('mentions_table')
    gkg_df.createOrReplaceTempView('gkg_table')
    # join mentions and gkg tables to calculate per-event sentiment
    temp_df = self.spark.sql(
        'SELECT GLOBALEVENTID, mDate, avg(Confidence*0.01*Tone) as sentiment, count(*) as numOfMentions \
        FROM mentions_table inner join gkg_table on mentions_table.MentionIdentifier = gkg_table.DocumentIdentifier \
        GROUP BY GLOBALEVENTID, mDate')
    temp_df.explain()
    temp_df.printSchema()
    print(temp_df.first())
    temp_df.createOrReplaceTempView('temp_table')
    # clear cache of mentions and gkg df & table, read in event data
    self.spark.catalog.dropTempView('mentions_table')
    self.spark.catalog.dropTempView('gkg_table')
    mentions_df.unpersist()
    gkg_df.unpersist()
    # load event data and perform join and aggregation
    event_df = self.spark.read.parquet(self.event_path).select(
        'GLOBALEVENTID', 'GoldsteinScale', 'ActionGeo_FullName')
    event_df = event_df.withColumn('GLOBALEVENTID', event_df.GLOBALEVENTID.cast('INT'))
    event_df = event_df.withColumn('GoldsteinScale', event_df.GoldsteinScale.cast('FLOAT'))
    event_df = event_df.withColumn(
        'country', F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[2])))
    event_df = event_df.withColumn(
        'state', F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[1])))
    event_df = event_df.withColumn(
        'city',
        F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[0]))).drop('ActionGeo_FullName')
    event_df.printSchema()
    print(event_df.first())
    event_df.createOrReplaceTempView('event_table')
    # compute final safety score
    result_df = self.spark.sql(
        'SELECT event_table.GLOBALEVENTID, mDate, 0.5*(GoldsteinScale*10+temp_table.sentiment) as SafetyScore, numOfMentions, \
        country, state, city \
        FROM event_table inner join temp_table on event_table.GLOBALEVENTID = temp_table.GLOBALEVENTID')
    result_df.explain()
    result_df.printSchema()
    print(result_df.first())
    # free up memory and disk
    self.spark.catalog.dropTempView('temp_table')
    self.spark.catalog.dropTempView('event_table')
    temp_df.unpersist()
    event_df.unpersist()
    return result_df
if(i != " "): reformat_list = [rid, i] final_result.append(reformat_list) return final_result #Preprocess the rdd from stage 3 for both positive and negative music_p_small_preprocess = music_p_small_rdd.mapPartitions(review_encode_preprocess).cache() # music_p_small_preprocess.take(2) music_n_small_preprocess = music_n_small_rdd.mapPartitions(review_encode_preprocess).cache() # music_n_small_preprocess.take(2) #Re-formate the preprocessed rdd to formatted dataframe music_p_preprocess_reformat = spark.createDataFrame(music_p_small_preprocess) music_p_preprocess_reformat = music_p_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body') music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.ltrim(music_p_preprocess_reformat.review_body)) music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.rtrim(music_p_preprocess_reformat.review_body)) # music_p_preprocess_reformat.show(5) music_n_preprocess_reformat = spark.createDataFrame(music_n_small_preprocess) music_n_preprocess_reformat = music_n_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body') music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.ltrim(music_n_preprocess_reformat.review_body)) music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.rtrim(music_n_preprocess_reformat.review_body)) # music_n_preprocess_reformat.show(5) #Doing tokenizer with regex to separate every word in review body and filter if empty list regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'review_body', outputCol = 'review_token') music_p_preprocess_reformat_token = regexTokenizer.transform(music_p_preprocess_reformat) music_p_preprocess_reformat_token_filter = music_p_preprocess_reformat_token.filter(f.size('review_token') > 1) # music_p_preprocess_reformat_token_filter.show(5) music_n_preprocess_reformat_token = regexTokenizer.transform(music_n_preprocess_reformat) music_n_preprocess_reformat_token_filter = music_n_preprocess_reformat_token.filter(f.size('review_token') > 1) # music_n_preprocess_reformat_token_filter.show(5)
def loadParquet(self):
    print("-------------------Fetching the dates------------------------------------")
    print(self.AttSalesActualsInput)
    s3keys = self.AttSalesActualsInput.split(':')[1].split('//')[1].split('/')
    key = ""
    filefolderPrefix = s3keys[1] + "/" + s3keys[2]
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(s3keys[0])
    listofObj = list(bucket.objects.filter(Prefix=filefolderPrefix))
    schema = StructType([
        StructField('Location', StringType(), False),
        StructField('Loc Id', StringType(), False),
        StructField('Dlr1 Code', StringType(), False),
        StructField('KPI', StringType(), False),
        StructField('KPI ID', StringType(), False),
        StructField('Dec 2017 Actual', StringType(), True),
        StructField('Dec 2017 Projected', StringType(), True),
        StructField('Dec 2017 Target', StringType(), True)
    ])
    dfATTSalesActuals = self.sparkSession.createDataFrame(
        self.sparkSession.sparkContext.emptyRDD(), schema)
    bucketWithPath = urlparse(self.AttSalesActualsInput)
    bucket = bucketWithPath.netloc
    spark = SparkSession.builder.appName("ATTSalesActuals").getOrCreate()
    for obj in listofObj:
        key = obj.key
        file = "s3://" + bucket + "/" + obj.key
        df1 = self.sparkSession.read.format("com.databricks.spark.csv").\
            option("header", "true").\
            option("treatEmptyValuesAsNulls", "true").\
            option("inferSchema", "false").\
            load(file, schema=schema)
        df1 = df1.filter(~(df1.Location.like('Histori%%')))
        df1 = df1.filter(~(df1.Location.like('AT&T MyRe%%')))
        df1 = df1.filter(~(df1.Location.like('AR : SPRI%%')))
        df1 = df1.filter(~(df1.Location.like('Generated%%')))
        df1 = df1.filter(~(df1.Location.like('Locat%%')))
        dfATTSalesActuals = df1.unionAll(dfATTSalesActuals)
    dfATTSalesActuals = dfATTSalesActuals.withColumnRenamed("Location", "attlocationname").\
        withColumnRenamed("Loc Id", "locid").\
        withColumnRenamed("Dlr1 Code", "dealercode").\
        withColumnRenamed("KPI", "kpiname").\
        withColumnRenamed("KPI ID", "kpiid").\
        withColumnRenamed("Dec 2017 Actual", "actual_value").\
        withColumnRenamed("Dec 2017 Projected", "projected_value").\
        withColumnRenamed("Dec 2017 Target", "target_value")
    filedate = key.split("/")[2].split("_")[3].split(".")[1]
    filedate = filedate[0:8]
    filedateS = datetime.strptime(filedate, '%Y%m%d').strftime('%m/%d/%Y')
    #########################################################################
    # Reading the source data files                                         #
    #########################################################################
    FinalHistDF1 = dfATTSalesActuals
    FinalHistDF1 = FinalHistDF1.withColumn(
        "actualvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("actual_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("actual_value", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn(
        "projectedvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("projected_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("projected_value", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn(
        "targetvalue",
        when(FinalHistDF1.actual_value.contains('%'),
             rtrim(regexp_replace("target_value", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("target_value", '\\$|\\%|\\,', ''))))
    dfATTSalesActualsRPT = spark.read.format("com.databricks.spark.csv").\
        option("header", "true").\
        option("treatEmptyValuesAsNulls", "true").\
        option("inferSchema", "false").\
        load(self.ATTMyResultsInp2, schema=schema)
    dfATTSalesActualsRPT1 = dfATTSalesActualsRPT.filter(
        ~(dfATTSalesActualsRPT.Location.like('Histori%%')))
    dfATTSalesActualsRPT2 = dfATTSalesActualsRPT1.filter(
        ~(dfATTSalesActualsRPT.Location.like('AT&T MyRe%%')))
    dfATTSalesActualsRPT3 = dfATTSalesActualsRPT2.filter(
        ~(dfATTSalesActualsRPT.Location.like('AR : SPRI%%')))
    dfATTSalesActualsRPT4 = dfATTSalesActualsRPT3.filter(
        ~(dfATTSalesActualsRPT.Location.like('Generated%%')))
    dfAttSalesActualsInputRPT = dfATTSalesActualsRPT4.filter(
        ~(dfATTSalesActualsRPT.Location.like('Locat%%')))
    FinalRPTDF1 = dfAttSalesActualsInputRPT.withColumnRenamed("Location", "attlocationname2").\
        withColumnRenamed("Loc Id", "locid2").\
        withColumnRenamed("Dlr1 Code", "dealercode2").\
        withColumnRenamed("KPI", "kpiname2").\
        withColumnRenamed("KPI ID", "kpiid2").\
        withColumnRenamed("Dec 2017 Actual", "actual_value2").\
        withColumnRenamed("Dec 2017 Projected", "projected_value2").\
        withColumnRenamed("Dec 2017 Target", "target_value2")
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "actualvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("actual_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("actual_value2", '\\$|\\%|\\,', ''))))
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "projectedvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("projected_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("projected_value2", '\\$|\\%|\\,', ''))))
    FinalRPTDF1 = FinalRPTDF1.withColumn(
        "targetvalue2",
        when(FinalRPTDF1.actual_value2.contains('%'),
             rtrim(regexp_replace("target_value2", '\\%|\\,', '')).cast(DecimalType()) / 100
             ).otherwise(rtrim(regexp_replace("target_value2", '\\$|\\%|\\,', ''))))
    FinalHistDF1 = FinalHistDF1.withColumn('reportdate', lit(filedateS))
    FinalHistDF1.registerTempTable("HIST")
    FinalRPTDF1 = FinalRPTDF1.withColumn('reportdate2', lit(filedateS))
    FinalRPTDF1.registerTempTable("RPT")
    FinalHistDF = self.sparkSession.sql(
        "select attlocationname, locid, dealercode, kpiname, kpiid, actualvalue, projectedvalue, targetvalue, reportdate, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from HIST"
    )
    FinalRPTDF = self.sparkSession.sql(
        "select attlocationname2, locid2, dealercode2, kpiname2, kpiid2, actualvalue2, projectedvalue2, targetvalue2, reportdate2, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month from RPT"
    )
    '''
    FinalHistDF = spark.sql("select attlocationname,locid,dealercode,kpiname,kpiid,actualvalue ,projectedvalue,"
                            "targetvalue,reportdate,"
                            "SUBSTR(reportdate,7,4) as year,SUBSTR(reportdate,1,2) as month from HIST")
    FinalRPTDF = spark.sql("select attlocationname2,locid2,dealercode2,kpiname2,kpiid2,"
                           "actualvalue2 ,projectedvalue2,targetvalue2,reportdate2,"
                           "SUBSTR(reportdate2,7,4) as year,SUBSTR(reportdate2,1,2) as month from RPT")
    '''
    FinalHistDF.show(100, False)
    FinalRPTDF.show(100, False)
    FinalHistDF.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.AttSalesActualsOutput + '/' + 'Working1')
    FinalRPTDF.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.AttSalesActualsOutput + '/' + 'Working2')
    FinalHistDF.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').format('parquet').save(self.AttSalesActualsOutput)
    FinalRPTDF.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').format('parquet').save(self.AttSalesActualsOutput)
    spark.stop()
banner_phone = hive_context.sql(banner_phone_query)

# Clean up Advance data
# The data coming in from Advance has double quotes around each field. We need to
# clean them up. We use Pandas for this part because it has a handy applymap
# method that works elementwise.
advance_phone_pd = advance_phone.toPandas()
advance_phone_pd_clean = advance_phone_pd.applymap(lambda x: clean_quotes(x))
adv_phone_clean = hive_context.createDataFrame(advance_phone_pd_clean)

banner_full_phone = (banner_phone
    .withColumn('Full_Phone', sf.rtrim(
        sf.concat(
            safe_concat(banner_phone, 'Telephone_Area_Code', ''),
            safe_concat(banner_phone, 'Telephone_Number', ''),
        )))
    .withColumn('Data_Source', sf.lit('Banner'))
    .select(
        sf.col('NetID'),
        sf.col('PIDM').alias('Source_ID'),
        sf.col('Telephone_Description').alias('Phone_Type'),
        sf.col('Full_Phone'),
        sf.col('Data_Source')))

hub_full_phone = (hub_phone
    .withColumn('Full_Phone', sf.rtrim(
        sf.concat(
            safe_concat(hub_phone, 'Country_Code', ''),
snewdf = sdf.withColumn(
    "comment_text",
    F.regexp_replace(F.col("comment_text"), "[\$#,?.!@#$%^&*()0123456789:-=\+]", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\n", " "))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\[", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\]", ""))
snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"+", ""))
snewdf = snewdf.withColumn('comment_text', F.lower(F.col('comment_text')))
snewdf = snewdf.withColumn('comment_text', F.rtrim(snewdf.comment_text))
snewdf = snewdf.withColumn('comment_text', F.ltrim(snewdf.comment_text))

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized")
tokenized_df = tokenizer.transform(snewdf)
tokenized_df.select("tokenized").show()

stopwordsremoved = StopWordsRemover(inputCol="tokenized", outputCol="comment_txt")
swr_df = stopwordsremoved.transform(tokenized_df)
swr_df.select("comment_txt").show()

from pyspark.ml.feature import HashingTF, IDF
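# The import above suggests a TF-IDF step follows; this is a minimal hedged sketch
# of that continuation, not the original author's code. It reuses "comment_txt"
# from swr_df above, and numFeatures is an assumed value.
hashing_tf = HashingTF(inputCol="comment_txt", outputCol="raw_features", numFeatures=1 << 16)
tf_df = hashing_tf.transform(swr_df)
idf = IDF(inputCol="raw_features", outputCol="features")
tfidf_df = idf.fit(tf_df).transform(tf_df)
tfidf_df.select("features").show(5)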
def compile_rstrip(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.rtrim(src_column)
def loadParquet(self):
    self.log.info('Exception Handling starts')
    validSourceFormat = self.isValidFormatInSource()
    if not validSourceFormat:
        self.log.error(" Source files not in csv format.")
    validSourceSchema = self.isValidSchemaInSource()
    if not validSourceSchema:
        self.log.error(" Source schema does not have all the required columns.")
    if not validSourceFormat or not validSourceSchema:
        self.log.error("Copy the source files to data processing error path and return.")
        self.copyFile(
            self.EmpOprEffMasterListFile,
            self.dataProcessingErrorPath + self.EmpOprEffName +
            datetime.now().strftime('%Y%m%d%H%M') + self.fileFormat)
        return
    self.log.info('Source format and schema validation successful.')
    self.log.info('Reading the input parquet file')
    schema = StructType([
        StructField('Market', StringType(), True),
        StructField('Region', StringType(), True),
        StructField('District', StringType(), True),
        StructField('Location', StringType(), True),
        StructField('Sales Person', StringType(), True),
        StructField('WorkDay ID', StringType(), False),
        StructField('Total Loss', StringType(), False),
        StructField('Total Issues', StringType(), False),
        StructField('Action Taken', StringType(), False),
        StructField('HR Consulted Before Termination', StringType(), False),
        StructField('Transaction Errors', StringType(), False),
        StructField('Total Errors', StringType(), False),
        StructField('NEXT Trades', StringType(), False),
        StructField('Total Devices12', StringType(), False),
        StructField('Hyla Loss', StringType(), False),
        StructField('Total Devices14', StringType(), False),
        StructField('Denied RMA Devices', StringType(), False),
        StructField('Total Devices16', StringType(), False),
        StructField('Cash Deposits', StringType(), False),
        StructField('Total Missing Deposits', StringType(), False),
        StructField('Total Short Deposits', StringType(), False),
        StructField('Shrinkage', StringType(), False),
        StructField('Comments', StringType(), False),
        StructField('Date', StringType(), False)
    ])
    df1 = self.sparkSession.read.format("com.databricks.spark.csv").option(
        "header", "true").option("multiLine", "true").option(
        "delimiter", ",").option("quotechar", '"').option(
        "treatEmptyValuesAsNulls", "true").option("escape", ",").option(
        "escape", '"').option("encoding", "UTF-8").option(
        "inferSchema", "false").load(
        self.OperationalEfficiencyScoreCard, schema=schema)
    df1 = df1.filter(~(df1.Market.like('%%Marke%%')))
    df1 = df1.filter(~(df1.Market.like('%%Operational%%')))
    df1 = df1.filter(~(df1.Market.like(' Regio%%')))
    df1 = df1.where(df1.Market != '')
    df1 = df1.where(df1.Region != '')
    df1 = df1.where(df1.District != '')
    df1 = df1.withColumnRenamed("Market", "market") \
        .withColumnRenamed("Region", "region") \
        .withColumnRenamed("District", "district") \
        .withColumnRenamed("Location", "location") \
        .withColumnRenamed("Sales Person", "salesperson") \
        .withColumnRenamed("WorkDay ID", "workdayid") \
        .withColumnRenamed("Total Loss", "total_loss") \
        .withColumnRenamed("Total Issues", "total_issues") \
        .withColumnRenamed("Action Taken", "actiontaken") \
        .withColumnRenamed("HR Consulted Before Termination", "hrconsultedbefore_termination") \
        .withColumnRenamed("Transaction Errors", "transaction_errors") \
        .withColumnRenamed("Total Errors", "total_errors") \
        .withColumnRenamed("NEXT Trades", "next_trades") \
        .withColumnRenamed("Total Devices12", "total_devices12") \
        .withColumnRenamed("Hyla Loss", "hyla_loss") \
        .withColumnRenamed("Total Devices14", "total_devices14") \
        .withColumnRenamed("Denied RMA Devices", "denied_rmadevices") \
        .withColumnRenamed("Total Devices16", "total_device16") \
        .withColumnRenamed("Cash Deposits", "cash_deposits") \
        .withColumnRenamed("Total Missing Deposits", "totalmissing_deposits") \
        .withColumnRenamed("Total Short Deposits", "totalshort_deposits") \
        .withColumnRenamed("Shrinkage", "Shrinkage") \
        .withColumnRenamed("Comments", "comments") \
        .withColumnRenamed("Date", "reportdate")
    df1 = df1.withColumn(
        "totalloss",
        rtrim(regexp_replace("total_loss", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalissues", df1["total_issues"].cast(IntegerType()))
    df1 = df1.withColumn(
        "hrconsultedbeforetermination",
        df1["hrconsultedbefore_termination"].cast(BooleanType()))
    df1 = df1.withColumn(
        "transactionerrors",
        rtrim(regexp_replace("transaction_errors", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalerrors", df1["total_errors"].cast(IntegerType()))
    df1 = df1.withColumn(
        "nexttrades",
        rtrim(regexp_replace("next_trades", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevices12", df1["total_devices12"].cast(IntegerType()))
    df1 = df1.withColumn(
        "hylaloss",
        rtrim(regexp_replace("hyla_loss", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevices14", df1["total_devices14"].cast(IntegerType()))
    df1 = df1.withColumn(
        "deniedrmadevices",
        rtrim(regexp_replace("denied_rmadevices", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totaldevice16", df1["total_device16"].cast(IntegerType()))
    df1 = df1.withColumn(
        "cashdeposits",
        rtrim(regexp_replace("cash_deposits", '\\(|\\)|\\$', '')).cast(DecimalType(15, 2)))
    df1 = df1.withColumn("totalmissingdeposits", df1["totalmissing_deposits"].cast(IntegerType()))
    df1 = df1.withColumn("totalshortdeposits", df1["totalshort_deposits"].cast(IntegerType()))
    df1 = df1.withColumn(
        "shrinkage",
        rtrim(regexp_replace("Shrinkage", '\\(|\\)|\\$|\\,', '')).cast(DecimalType(15, 2)))
    df1.registerTempTable("empopreff")
    df2 = self.sparkSession.sql(
        "select e.market, e.region, e.district, e.location, e.salesperson, e.workdayid, e.totalloss, e.totalissues, e.actiontaken, e.hrconsultedbeforetermination, e.transactionerrors, e.totalerrors, e.nexttrades, e.totaldevices12, e.hylaloss, e.totaldevices14, e.deniedrmadevices, e.totaldevice16, e.cashdeposits, e.totalmissingdeposits, e.totalshortdeposits, e.shrinkage, e.comments, YEAR(FROM_UNIXTIME(UNIX_TIMESTAMP())) as year, SUBSTR(FROM_UNIXTIME(UNIX_TIMESTAMP()),6,2) as month, e.reportdate from empopreff e"
    )
    df2.coalesce(1).select("*").write.mode("append").partitionBy(
        'year', 'month').parquet(self.OperationalEfficiencyScoreCardOutputArg)
    df2.coalesce(1).select("*").write.mode("overwrite").parquet(
        self.OperationalEfficiencyScoreCardOutputArg + '/' + 'Working')
    self.sparkSession.stop()
# COMMAND ----------

from pyspark.sql.functions import lower, upper

df.select(col("Description"), lower(col("Description")),
          upper(lower(col("Description")))).show(2)

# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

# COMMAND ----------

from pyspark.sql.functions import regexp_replace

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)

# COMMAND ----------
"favorite_count", "community_owned_date", "closed_date", "content_license")\ .withColumn("creation_date", f.to_timestamp(f.col("creation_date")))\ .withColumn("last_edit_date", f.to_timestamp(f.col("last_edit_date")))\ .withColumn("last_activity_date", f.to_timestamp(f.col("last_activity_date")))\ .withColumn("community_owned_date", f.to_timestamp(f.col("community_owned_date")))\ .withColumn("closed_date", f.to_timestamp(f.col("closed_date")))\ .withColumn("tags", f.split(f.rtrim(f.regexp_replace(f.regexp_replace(f.col("tags"), "<", ""), ">", " ")), " "))\ .withColumn("creation_year", f.year("creation_date")) save_table(posts, "posts", ["creation_year"]) # tags tags = spark.read.format('xml')\ .options(rowTag='row')\ .load(f's3://{bucket_name}/{input_dir}/tags')\ .withColumnRenamed("_Count", "count")\ .withColumnRenamed("_ExcerptPostId", "excerpt_post_id")\ .withColumnRenamed("_Id", "id")\ .withColumnRenamed("_TagName", "tag_name")\ .withColumnRenamed("_WikiPostId", "wiki_post_id")\ .select("id", "tag_name",
StructField("country", StringType(), True), StructField("salary", StringType(), True)]); df = spark.read.csv(filepath, schema=strct); dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, df.occupation, df.relationship , df.race , df.gender, df.capitalgain, df.capitalloss, df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary')); dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus, dfregex1.occupation , dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek, dfregex1.country , regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary')); dfbtrim = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus , dfregex1.occupation, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss , dfregex1.hoursperweek, dfregex1.country, ltrim(rtrim(dfregex1.salary)).alias('salary') ); dfcast = dfbtrim.select(dfbtrim.workclass, dfbtrim.finalweight, dfbtrim.education, dfbtrim.educationnum, dfbtrim.maritalstatus, dfbtrim.occupation , dfbtrim.relationship, dfbtrim.race , dfbtrim.gender, dfbtrim.capitalgain, dfbtrim.capitalloss, dfbtrim.hoursperweek, dfbtrim.country , dfbtrim.salary.cast(IntegerType()).alias('intSal')); dfcast.createOrReplaceTempView("employees"); query = "select workclass, education, maritalstatus, occupation, relationship, hoursperweek, country, avg(intSal) as sal_avg from employees " + "group by workclass, education, maritalstatus, occupation, relationship, hoursperweek, country"; sqldf = spark.sql(query); sqldf.dropna().show(10); spark.stop();
def get_latest_yyyymm_table_name(self, table):
    # TODO fix import here
    import pyspark.sql.functions as F
    df = self.get_all_yyyymm_tables(table)
    return df.agg(F.max(F.rtrim(F.col('TableName'))).alias('max_partition')) \
        .collect()[0].max_partition.lower()
    delta = (start - end).total_seconds()
    return delta

# register as a UDF
f1 = f.udf(time_delta)

def time_sub(y, x):
    z = y - x
    return z

f2 = f.udf(time_sub)

sc = SparkContext('local', 'MySQL_UC1')
sqlContext = SQLContext(sc)
device = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "device").option("user", "root").option("password", "root").load()
emp = sqlContext.read.format("jdbc").option("url", "jdbc:mysql://localhost:3306/husain?useSSL=false").option("driver", "com.mysql.jdbc.Driver").option("dbtable", "employee").option("user", "root").option("password", "root").load()

emp = emp.withColumn('Timings', f.rtrim(emp.timings))
emp = emp.join(device, device.device_id == emp.device_id, "inner").select(emp.id, emp.name, emp.device_id, device.status, emp.Timings)
split_col = f.split(emp['Timings'], ' ')
emp = emp.withColumn('Date', split_col.getItem(0).cast('date'))
#emp = emp.orderBy('Date','name')
#emp.show()

emp1 = emp.filter(emp['status'] == 'in')
emp1 = emp1.orderBy('Date', 'name', 'Timings')
emp2 = emp.filter(emp.status == 'out')
emp2 = emp2.orderBy('Date', 'name', 'Timings')
#
emp1 = emp1.withColumnRenamed('Timings', 'IN_TIME')
emp2 = emp2.withColumnRenamed('Timings', 'OUT_TIME')
#
#emp1=emp1.join(emp2,emp1.id == emp2.id,"full_outer").select(emp1.id,emp1.Date,emp1.name,emp1.device_id,emp1.status,emp1.IN_TIME,emp2.OUT_TIME)
#emp1 = emp1.
    lower(col('Description')),
    upper(col('Description'))).show(2)

df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# select description, lower(Description), upper(lower(Description)) from dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
    'ltrim( "HELLO" ) as ltrim',
    'rtrim( "HELLO" ) as rtrim',
    'trim( "HELLO" ) as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 3, " ") as rp').show(2)
# select
#   ltrim(' HELLO '),
#   rtrim(' HELLO '),
#   trim(' HELLO '),
df = spark.read.format("csv")\ .option("header","true")\ .option("inferSchema","true")\ .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv') #data schema 확인 df.printSchema() #initcap : 주어진 문자열에서 공백을 나눠 첫글자를 대문자로 반환 df.select(initcap(col("Description"))).show(2, False) #lower // upper df.select(lower(col("StockCode"))).show(2) #공백 추가 및 제거 (lit,ltrim,rtrim,rpad,lpad,trim) from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lpad"), rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2) ##정규 표현식 #description컬럼의 값을 COLOR 값으로 치환 from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) #주어진 문자를 다른 문자로 치환 from pyspark.sql.functions import translate df.select(translate(col("Description"), "WHI", "123")).show(2)
def lsg_omni(self):
    start_date, end_date = date_period(self.period, self.start_date)
    table_name = 'datalake_omni.omni_hit_data'
    dt_col_name = 'hit_time_gmt_dt_key'
    _, bound_end_date = date_period(-1, end_date)
    bound_date_check(table_name, dt_col_name, start_date, bound_end_date, self.env, 'YYYYMMDD', 'LSG')

    query = 'SELECT ' \
            'VS.visit_session_key AS session_key, ' \
            'HIT.post_visid_combined AS visit_id, ' \
            'HIT.visit_return_count AS visit_number, ' \
            'UPPER(TRIM(prod_list)) AS prod_list, ' \
            'HIT.hit_time_gmt_ts AS time_stamp, ' \
            "TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) AS " \
            "account_no " \
            'FROM datalake_omni.omni_hit_data HIT ' \
            'LEFT JOIN CDWDS.D_OMNI_VISIT_SESSION VS ON ' \
            ' VS.VISIT_RETURN_COUNT=HIT.VISIT_RETURN_COUNT AND VS.POST_VISID_COMBINED=HIT.POST_VISID_COMBINED ' \
            f'WHERE HIT.hit_time_gmt_dt_key<{start_date} AND HIT.hit_time_gmt_dt_key>={end_date} ' \
            'AND HIT.post_visid_combined IS NOT NULL ' \
            "AND prod_list IS NOT NULL AND prod_list NOT LIKE '%shipping-handling%' " \
            "AND TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) <> '' "
    schema = StructType([
        StructField('session_key', IntegerType(), True),
        StructField('visit_id', StringType(), True),
        StructField('visit_number', IntegerType(), True),
        StructField('time_stamp', StringType(), True),
        StructField('prod_list', StringType(), True),
        StructField('account_no', StringType(), True),
    ])
    df = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env, schema=schema). \
        withColumn('prod_id_untrimmed', explode(split('prod_list', ','))). \
        withColumn('prod_id', ltrim(rtrim(col('prod_id_untrimmed')))). \
        drop('prod_id_untrimmed'). \
        drop('prod_list'). \
        filter(col('prod_id').isNotNull()). \
        filter(col('prod_id') != ''). \
        distinct()
    if self.debug:
        print(f'row count for df = {df.count()}')

    # find active products
    query = 'SELECT sku as prod_id, stk_type_cd ' \
            'FROM cdwds.lsg_prod_v ' \
            "WHERE stk_type_cd = 'D'"
    discontinued_prods = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    df = df.join(discontinued_prods, ['prod_id'], how='left'). \
        filter(col('stk_type_cd').isNull()). \
        drop('stk_type_cd')
    if self.debug:
        print(f'After filtering out discontinued SKUs, row count for df = {df.count()}')

    query = 'SELECT UPPER(sku_nbr) AS prod_id, size_grp AS coupon ' \
            'FROM cdwds.f_web_prod_feature ' \
            "WHERE size_grp IS NOT NULL AND size_grp <> 'T' " \
            'GROUP BY sku_nbr, size_grp'
    coupons = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    if coupons.count() == 0:
        raise DataValidityError(
            'No coupon information. Please check the validity of size_grp column '
            'on cdwds.f_web_prod_feature.')
    df = df.join(broadcast(coupons), ['prod_id'], how='left'). \
        withColumn('coupon', coalesce('coupon', 'prod_id'))
    prod_list = df.select('prod_id').distinct()
    coupons = coupons.union(df.select('prod_id', 'coupon')). \
        filter(col('prod_id').isNotNull()). \
        distinct(). \
        withColumn("coupon_key", func.dense_rank().over(Window.orderBy('coupon')))
    df = df.join(coupons, ['prod_id', 'coupon'], how='left')
    if self.debug:
        coupons.show()
        df.show()
        print(f'row count for coupons = {coupons.select(col("coupon_key")).distinct().count()}')
    return df, prod_list, coupons
posts = spark.read.format("xml").option("rowTag", "row").load(xml_file_path) # select only questions questions = posts.filter(posts._PostTypeId == 1) # drop irrelvant columns and clean up strings questions = questions.select( [c for c in questions.columns if c in ['_Id', '_Title', '_Body', '_Tags']]) questions = questions.withColumn( 'full_question', sf.concat(sf.col('_Title'), sf.lit(' '), sf.col('_Body'))) questions = questions.select([ c for c in questions.columns if c in ['_Id', 'full_question', '_Tags'] ]).withColumn("full_question", regexp_replace("full_question", "[\\n,]", " ")) questions = questions.withColumn("_Tags", regexp_replace("_Tags", "><", " ")) questions = questions.withColumn("_Tags", regexp_replace("_Tags", "(>|<)", "")) questions = questions.withColumn('_Tags', rtrim(questions._Tags)) questions = questions.withColumn('_Tags', split(questions._Tags, " ")) # filter out to single tags in following list tags_of_interest = [ 'azure-devops', 'azure-functions', 'azure-web-app-service', 'azure-storage', 'azure-virtual-machine' ] def intersect(xs): xs = set(xs) @udf("array<string>") def _(ys): return list(xs.intersection(ys))
def trimStrings(dataframe, columns):
    for col in columns:
        dataframe = dataframe.withColumn(col, F.ltrim(F.rtrim(dataframe[col])))
    return dataframe
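# Hedged usage sketch for trimStrings; the session, alias F, and sample rows are
# assumptions for illustration only.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([(" Ada ", " Lovelace ")], ["first_name", "last_name"])
trimStrings(people, ["first_name", "last_name"]).show()  # both columns trimmed on both sides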