def process_country_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + COUNTRY_FILE)
    df_country = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_country = df_country.withColumnRenamed(
        "_c0", "country_code").withColumnRenamed("_c1", "country_name")
    df_country = df_country.withColumn(
        "country_name", F.regexp_replace(df_country.country_name, "'", ""))
    df_country = df_country.withColumn(
        "country_name", F.ltrim(F.rtrim(df_country.country_name)))
    df_country = df_country.withColumn(
        "country_code", F.ltrim(F.rtrim(df_country.country_code)))
    df_country = df_country.withColumn(
        "country_name",
        F.regexp_replace(df_country.country_name,
                         "^INVALID.*|Collapsed.*|No Country.*", "INVALID"))
    df_country.write.mode("overwrite").parquet(s3 + 'data/processed/codes/country')
    return df_country
def remove_space(df, col_name, position):
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
    # remove all side space
    elif position == "a":
        return df.withColumn("tmp", trim(f.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name)
def joinDataSet():
    spark = SparkSession.builder.appName('csv_parse').getOrCreate()

    # Load xml
    xml_df = spark.read.format('com.databricks.spark.xml'). \
        option("rootTag", "feed"). \
        option("rowTag", "doc"). \
        load(xml_s3_path). \
        withColumn("title", f.ltrim(f.split(f.col("title"), ":").getItem(1))). \
        withColumn("shortUrl", f.split(f.col("url"), "/"))

    selectedData = xml_df.select(
        "title", "url",
        f.element_at(f.col('shortUrl'), -1).alias('shortUrl'), "abstract")
    selectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_2, header='false')
    selectedData.createOrReplaceTempView("wiki_pages")

    # Load csv
    json_schema = ArrayType(
        StructType([
            StructField('name', StringType(), nullable=False),
            StructField('id', IntegerType(), nullable=False)
        ]))
    df = spark.read.option("header", True). \
        option("quote", "\""). \
        option("escape", "\""). \
        option("multiLine", True). \
        csv(csv_s3_path). \
        withColumn("sanitizedTitle", f.regexp_replace(f.col("title"), "\\s+", "_")). \
        withColumn("year", f.split(f.col("release_date"), "-").getItem(0)). \
        withColumn("companiesList", f.from_json(f.col("production_companies"), json_schema)). \
        withColumn("companiesList", f.concat_ws("-", f.col("companiesList.name")))

    csvSelectedData = df.select("title", "sanitizedTitle")
    csvSelectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_3, header='false')
    df.createOrReplaceTempView("movies_metadata")

    # Join datasets
    q = spark.sql(join_sql_query)

    # Write output to s3
    q.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path, header='false')
def process_state_codes():
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + STATE_FILE)
    df_state = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_state = df_state.withColumnRenamed("_c0", "state_code").withColumnRenamed(
        "_c1", "state_name")
    df_state = df_state.withColumn(
        "state_code", F.regexp_replace(df_state.state_code, "[^A-Z]", ""))
    df_state = df_state.withColumn(
        "state_name", F.regexp_replace(df_state.state_name, "'", ""))
    df_state = df_state.withColumn("state_name",
                                   F.ltrim(F.rtrim(df_state.state_name)))
    df_state.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_state')
    return df_state
def canonicaltokens(df, inputColumn, outputColumn):
    """
    turn input column of strings into canonical format as output column of tokens
    return as output column added to the dataframe
    """
    newname = df.withColumn(
        "cleanname",
        f.regexp_replace(
            f.regexp_replace(f.rtrim(f.ltrim(f.col(inputColumn))),
                             " (\w) (\w) ", "$1$2"),
            "(\w) (\w) (\w)$", "$1$2$3"))
    newtokenizer = mlf.Tokenizer(inputCol="cleanname", outputCol="words")
    chtokenized = newtokenizer.transform(newname).drop("cleanname")
    stopwordremover = mlf.StopWordsRemover(inputCol="words", outputCol=outputColumn)
    canonicalname = stopwordremover.transform(chtokenized).drop("words")
    return canonicalname
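# A minimal usage sketch for canonicaltokens (assumes a SparkSession named `spark`,
# `import pyspark.sql.functions as f`, and `import pyspark.ml.feature as mlf` as in the
# function above; the sample names are hypothetical).
names_df = spark.createDataFrame(
    [("  The J P Morgan Chase Company  ",), ("Acme Widgets Inc ",)], ["raw_name"])
tokens_df = canonicaltokens(names_df, "raw_name", "name_tokens")
tokens_df.show(truncate=False)  # adds a column of lower-cased, stop-word-free tokens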
def remove_space(df, col_name, position):
    if position not in ["l", "r", "a"]:
        raise ValueError("The position value must be l, r or a")
    # keep the original column order
    columns = df.columns
    # remove left side space
    if position == "l":
        return df.withColumn("tmp", ltrim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name).select(
            *columns)
    # remove right side space
    elif position == "r":
        return df.withColumn("tmp", rtrim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name).select(
            *columns)
    # remove all side space
    elif position == "a":
        return df.withColumn("tmp", trim(sql_fun.col(col_name))).drop(col_name).withColumnRenamed("tmp", col_name).select(
            *columns)
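# A small usage sketch for remove_space (assumes a SparkSession named `spark` and that
# ltrim/rtrim/trim and `sql_fun` are imported as in the function above; the sample frame
# is hypothetical).
raw_df = spark.createDataFrame([("  alice  ", 1), ("bob  ", 2)], ["name", "id"])
left_trimmed = remove_space(raw_df, "name", "l")   # strips leading spaces only
both_trimmed = remove_space(raw_df, "name", "a")   # strips both sides
both_trimmed.show()  # original column order is preserved by the final select(*columns)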
def parse_message(col, eol="\n"):
    """
    Generate the expression that parses the email message into From, Subject, Body etc.
    Args:
        col - sqlf.col() column object
        eol - end of line character to use when parsing the email
    Returns:
        List of pyspark.sql.functions expressions to be passed to select()
    """
    out_dict = [
        "Message-ID",
        "Date",
        "From",
        "To",
        "Subject",
        "Mime-Version",
        "Content-Type",
        "Content-Transfer-Encoding",
        "X-From",
        "X-To",
        "X-cc",
        "X-bcc",
        "X-Folder",
        "X-Origin",
        "X-FileName",
        eol,
    ]
    expr = []
    for i in range(0, len(out_dict) - 1):
        expr.append(
            sqlf.ltrim(
                sqlf.rtrim(sqlf.split(sqlf.split(col, out_dict[i] + ":")[1], eol)[0])
            ).alias(out_dict[i])
        )
    expr.append(sqlf.split(sqlf.split(col, "X-FileName:")[1], "nsf")[1].alias("Body"))
    return expr
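# A minimal usage sketch for parse_message (assumptions: `from pyspark.sql import functions as sqlf`,
# a SparkSession named `spark`, and a hypothetical path with one raw email per file; none of these
# appear in the snippet above).
raw_emails = spark.read.text("emails/*.txt", wholetext=True) \
    .withColumnRenamed("value", "message")
parsed = raw_emails.select(*parse_message(sqlf.col("message")))
parsed.select("From", "To", "Subject", "Body").show(5, truncate=False)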
def process_airport_codes():
    # transform airport codes
    input_data_file = os.path.join(s3, I94_CODES_DATA_PATH + AIRPORT_FILE)
    df_airport = spark.read.format("csv").option("delimiter", "=").option(
        "header", "False").load(input_data_file)
    df_airport = df_airport.withColumn(
        "_c0", F.regexp_replace(df_airport._c0, "'", "")).withColumn(
            "_c1", F.regexp_replace(df_airport._c1, "'", ""))
    split_col = F.split(df_airport._c1, ",")
    df_airport = df_airport.withColumn("city", split_col.getItem(0))
    df_airport = df_airport.withColumn("state_code", split_col.getItem(1))
    df_airport = df_airport.withColumnRenamed("_c0", "port_code")
    df_airport = df_airport.drop("_c1")
    df_airport = df_airport.withColumn(
        "port_code", F.regexp_replace(df_airport.port_code, "[^A-Z]", "")).withColumn(
            "city", F.ltrim(F.rtrim(df_airport.city))).withColumn(
                "state_code", F.regexp_replace(df_airport.state_code, "[^A-Z]", ""))
    df_state = process_state_codes()
    df_airport = df_airport.join(df_state, "state_code")
    df_airport.write.mode("overwrite").parquet(s3 + 'data/processed/codes/us_ports')
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(10)

# working with strings
# perform case conversions
from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"),
          initcap(col("Description")),
          lower(col("Description")),
          upper(col("Description"))).show(5)

string_with_space = " hello "
df.select(ltrim(lit(string_with_space)),
          rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

# regular expressions

# working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp
dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()
dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()
def calculate_score(self):
    # transform and filter data
    mentions_df = self.spark.read.parquet(self.mentions_path).select(
        'GLOBALEVENTID', 'MentionTimeDate', 'MentionIdentifier', 'Confidence')
    gkg_df = self.spark.read.parquet(self.gkg_path).select(
        'DocumentIdentifier', 'Date', 'V2Tone')

    # filter rows on mention date in 2019
    mentions_df = mentions_df.filter(
        mentions_df.MentionTimeDate.like('2019%'))
    gkg_df = gkg_df.filter(gkg_df.Date.like('2019%'))
    gkg_df = gkg_df.drop('Date')

    # type casting for mentions and gkg df
    mentions_df = mentions_df.withColumn(
        'GLOBALEVENTID', mentions_df.GLOBALEVENTID.cast('INT'))
    mentions_df = mentions_df.withColumn(
        'Confidence', mentions_df.Confidence.cast('INT'))
    mentions_df = mentions_df.withColumn(
        'mDate',
        F.to_date(mentions_df.MentionTimeDate,
                  format='yyyyMMddHHmmss')).drop('MentionTimeDate')
    mentions_df.printSchema()
    print(mentions_df.first())

    gkg_df = gkg_df.withColumn(
        'Tone', F.split(gkg_df.V2Tone, ',')[0].cast('FLOAT')).drop('V2Tone')
    gkg_df.printSchema()
    print(gkg_df.first())

    # register the DataFrames as SQL temporary views
    mentions_df.createOrReplaceTempView('mentions_table')
    gkg_df.createOrReplaceTempView('gkg_table')

    # run sql query to calculate sentiment per event
    temp_df = self.spark.sql(
        'SELECT GLOBALEVENTID, mDate, avg(Confidence*0.01*Tone) as sentiment, count(*) as numOfMentions \
        FROM mentions_table inner join gkg_table on mentions_table.MentionIdentifier = gkg_table.DocumentIdentifier \
        GROUP BY GLOBALEVENTID, mDate')
    temp_df.explain()
    temp_df.printSchema()
    print(temp_df.first())
    temp_df.createOrReplaceTempView('temp_table')

    # clear cache of mentions and gkg df & table, read in event data
    self.spark.catalog.dropTempView('mentions_table')
    self.spark.catalog.dropTempView('gkg_table')
    mentions_df.unpersist()
    gkg_df.unpersist()

    # load event data and perform join and aggregation
    event_df = self.spark.read.parquet(self.event_path).select(
        'GLOBALEVENTID', 'GoldsteinScale', 'ActionGeo_FullName')
    event_df = event_df.withColumn('GLOBALEVENTID',
                                   event_df.GLOBALEVENTID.cast('INT'))
    event_df = event_df.withColumn('GoldsteinScale',
                                   event_df.GoldsteinScale.cast('FLOAT'))
    event_df = event_df.withColumn(
        'country',
        F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[2])))
    event_df = event_df.withColumn(
        'state',
        F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[1])))
    event_df = event_df.withColumn(
        'city',
        F.rtrim(F.ltrim(F.split(event_df.ActionGeo_FullName, ',')[0]))).drop('ActionGeo_FullName')
    event_df.printSchema()
    print(event_df.first())
    event_df.createOrReplaceTempView('event_table')

    # compute final safety score
    result_df = self.spark.sql(
        'SELECT event_table.GLOBALEVENTID, mDate, 0.5*(GoldsteinScale*10+temp_table.sentiment) as SafetyScore, numOfMentions, \
        country, state, city \
        FROM event_table inner join temp_table on event_table.GLOBALEVENTID = temp_table.GLOBALEVENTID'
    )
    result_df.explain()
    result_df.printSchema()
    print(result_df.first())

    # free up memory and disk
    self.spark.catalog.dropTempView('temp_table')
    self.spark.catalog.dropTempView('event_table')
    temp_df.unpersist()
    event_df.unpersist()

    return result_df
    for i in r_body:
        if i != " ":
            reformat_list = [rid, i]
            final_result.append(reformat_list)
    return final_result

# Preprocess the rdd from stage 3 for both positive and negative reviews
music_p_small_preprocess = music_p_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_p_small_preprocess.take(2)
music_n_small_preprocess = music_n_small_rdd.mapPartitions(review_encode_preprocess).cache()
# music_n_small_preprocess.take(2)

# Re-format the preprocessed rdd into a dataframe
music_p_preprocess_reformat = spark.createDataFrame(music_p_small_preprocess)
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.ltrim(music_p_preprocess_reformat.review_body))
music_p_preprocess_reformat = music_p_preprocess_reformat.withColumn('review_body', f.rtrim(music_p_preprocess_reformat.review_body))
# music_p_preprocess_reformat.show(5)

music_n_preprocess_reformat = spark.createDataFrame(music_n_small_preprocess)
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumnRenamed('_1', 'review_id').withColumnRenamed('_2', 'review_body')
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.ltrim(music_n_preprocess_reformat.review_body))
music_n_preprocess_reformat = music_n_preprocess_reformat.withColumn('review_body', f.rtrim(music_n_preprocess_reformat.review_body))
# music_n_preprocess_reformat.show(5)

# Tokenize the review body with a regex and drop rows whose token list is empty or too short
regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol='review_body', outputCol='review_token')
music_p_preprocess_reformat_token = regexTokenizer.transform(music_p_preprocess_reformat)
music_p_preprocess_reformat_token_filter = music_p_preprocess_reformat_token.filter(f.size('review_token') > 1)
# music_p_preprocess_reformat_token_filter.show(5)
music_n_preprocess_reformat_token = regexTokenizer.transform(music_n_preprocess_reformat)
music_n_preprocess_reformat_token_filter = music_n_preprocess_reformat_token.filter(f.size('review_token') > 1)
df.printSchema()
df.show(3)
print(df.columns)
#header = df.first()
#print header
#header.show()
#print([str.strip(column) for column in df.columns])
#print(map(str.strip,df.columns))
#df.withColumnRenamed(" DATE1",str.strip(" DATE1")).columns

df_strip_spaces = df.toDF(*map(str.strip, df.columns))
df_strip_spaces.show(3)
print(df_strip_spaces.columns)

df_strip_spaces.select(ltrim(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(lower(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(upper(df_strip_spaces["SYMBOL"])).show()
df_strip_spaces.select(lpad(df_strip_spaces["SYMBOL"], 20, '0')).show()
df_strip_spaces.DATE1
#df_strip_spaces.select(to_date(df_strip_spaces.DATE1),'dd-mmm-yyyy').show()
df_strip_spaces.select(col('SYMBOL')).show(3)

# filtering, selection
df_strip_spaces.select('SYMBOL').show()
df_strip_spaces.select(col('SYMBOL')).show()
df.where(col("SYMBOL").startswith("KOTAK")).show()
df.where("SYMBOL like '%BANK%'").show()
df_strip_spaces.where(col('SYMBOL').like('KOTAKBANK') | col('SYMBOL').like("%YES%")) \
    .select('SYMBOL', 'DATE1', 'OPEN_PRICE', 'CLOSE_PRICE').show(3)
StructField("country", StringType(), True), StructField("salary", StringType(), True)]); df = spark.read.csv(filepath, schema=strct); dfregex1 = df.select(df.workclass, df.finalweight, df.education, df.educationnum, df.maritalstatus, df.occupation, df.relationship , df.race , df.gender, df.capitalgain, df.capitalloss, df.hoursperweek, df.country, regexp_replace(df.salary, '(<=50K)', '50').alias('salary')); dfregex = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus, dfregex1.occupation , dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss, dfregex1.hoursperweek, dfregex1.country , regexp_replace(dfregex1.salary, '(>50K)', '51').alias('salary')); dfbtrim = dfregex1.select(dfregex1.workclass, dfregex1.finalweight, dfregex1.education, dfregex1.educationnum, dfregex1.maritalstatus , dfregex1.occupation, dfregex1.relationship, dfregex1.race , dfregex1.gender, dfregex1.capitalgain, dfregex1.capitalloss , dfregex1.hoursperweek, dfregex1.country, ltrim(rtrim(dfregex1.salary)).alias('salary') ); dfcast = dfbtrim.select(dfbtrim.workclass, dfbtrim.finalweight, dfbtrim.education, dfbtrim.educationnum, dfbtrim.maritalstatus, dfbtrim.occupation , dfbtrim.relationship, dfbtrim.race , dfbtrim.gender, dfbtrim.capitalgain, dfbtrim.capitalloss, dfbtrim.hoursperweek, dfbtrim.country , dfbtrim.salary.cast(IntegerType()).alias('intSal')); dfcast.createOrReplaceTempView("employees"); query = "select workclass, education, maritalstatus, occupation, relationship, hoursperweek, country, avg(intSal) as sal_avg from employees " + "group by workclass, education, maritalstatus, occupation, relationship, hoursperweek, country"; sqldf = spark.sql(query); sqldf.dropna().show(10); spark.stop();
Person("Pratik", "Solanki", 22, 176.7, None), Person("Ashok ", "Pradhan", 62, None, None), Person(" ashok", "Pradhan", 42, 125.3, "Chemical Engineer"), Person("Pratik", "Solanki", 22, 222.2, "Teacher") ]) people_df.show() people_df.groupBy("firstName").agg(first("weightInLbs")).show() people_df.groupBy(trim(lower(col('firstName')))).agg(first("weightInLbs")).show() people_df.groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() people_df.sort(col("weightInLbs").desc()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() people_df.sort(col("weightInLbs").asc_nulls_last()).groupBy(trim(lower(col("firstName")))).agg(first("weightInLbs", True)).show() corrected_people_df = people_df\ .withColumn("firstName", initcap("firstName"))\ .withColumn("firstName", ltrim(initcap("firstName")))\ .withColumn("firstName", trim(initcap("firstName")))\ corrected_people_df.groupBy("firstName").agg(first("weightInLbs")).show() corrected_people_df = corrected_people_df\ .withColumn("fullName", format_string("%s %s", "firstName", "lastName"))\ corrected_people_df.show() corrected_people_df = corrected_people_df\ .withColumn("weightInLbs", coalesce("weightInLbs", lit(0)))\ corrected_people_df.show() corrected_people_df\
df.select(col('Description'),
          lower(col('Description')),
          upper(col('Description'))).show(2)

df.selectExpr(
    'Description',
    'lower(Description)',
    'upper(lower(Description))').show(2)
# select description, lower(Description), upper(lower(Description)) from dfTable

from pyspark.sql.functions import ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit(' HELLO ')).alias('ltrim'),
    rtrim(lit(' HELLO ')).alias('rtrim'),
    trim(lit(' HELLO ')).alias('trim'),
    lpad(lit('HELLO'), 3, ' ').alias('lp'),
    rpad(lit('HELLO'), 10, ' ').alias('rp')).show(2)

df.selectExpr(
    'ltrim( "HELLO" ) as ltrim',
    'rtrim( "HELLO" ) as rtrim',
    'trim( "HELLO" ) as trim',
    'lpad("HELLO", 3, " ") as lp',
    'rpad("HELLO", 3, " ") as rp').show(2)
# select
#   ltrim(' HELLO '),
#   rtrim(' HELLO '),
"comment_text", F.regexp_replace(F.col("comment_text"), "[\$#,?." "!@#$%^&*()0123456789:-=\+]", "")) snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"", "")) snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\n", " ")) snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\[", "")) snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\]", "")) snewdf = snewdf.withColumn('comment_text', F.regexp_replace(F.col("comment_text"), "\"+", "")) snewdf = snewdf.withColumn('comment_text', F.lower(F.col('comment_text'))) snewdf = snewdf.withColumn('comment_text', F.rtrim(snewdf.comment_text)) snewdf = snewdf.withColumn('comment_text', F.ltrim(snewdf.comment_text)) from pyspark.ml.feature import Tokenizer tokenizer = Tokenizer(inputCol="comment_text", outputCol="tokenized") tokenized_df = tokenizer.transform(snewdf) tokenized_df.select("tokenized").show() stopwordsremoved = StopWordsRemover(inputCol="tokenized", outputCol="comment_txt") swr_df = stopwordsremoved.transform(tokenized_df) swr_df.select("comment_txt").show() from pyspark.ml.feature import HashingTF, IDF hashingTF = HashingTF().setNumFeatures(50).setInputCol(
# data load
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load('/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv')

# check the data schema
df.printSchema()

# initcap: split the given string on whitespace and capitalize the first letter of each word
df.select(initcap(col("Description"))).show(2, False)

# lower // upper
df.select(lower(col("StockCode"))).show(2)

# add and remove whitespace (lit, ltrim, rtrim, rpad, lpad, trim)
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit(" HELLO ")).alias("ltrim"),
    rtrim(lit(" HELLO ")).alias("rtrim"),
    trim(lit(" HELLO ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLP"), 10, " ").alias("rpad")).show(2)

## regular expressions
# replace matching values in the Description column with the literal COLOR
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)

# translate given characters into other characters
from pyspark.sql.functions import translate
df = spark.read \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
    .option("dbtable", "airbnb") \
    .option("user", "test") \
    .option("password", "test") \
    .option("driver", "org.postgresql.Driver") \
    .load()

df = df.withColumn('bedrooms', F.round(df['bedrooms'], 0))
df = df.filter(~F.col('city').contains("/"))
df = df.filter(~F.col('city').contains(","))
df = df.filter(~F.col('city').contains("-"))
df = df.filter(~F.col('city').contains("^[0-9]*$"))
df = df.filter(~F.col('city').contains("*"))
df = df.withColumn('city', F.ltrim(df.city))
df = df.withColumn("city", F.initcap(F.col("city")))
df = df.filter(~df.city.rlike("[ ,;{}()\n\t=]"))
df = df.filter(~df.city.rlike("[^0-9A-Za-z]"))
df = df.filter(~F.col('city').contains("("))
df = df.groupBy('city', 'bedrooms').agg(
    F.avg('average').alias('average'), F.first('state'))
df = df.withColumnRenamed('first(state)', 'state')
df = df.sort('city')
df1 = df.withColumn('average', F.round(df['average'], 0))

df1.write \
    .format("jdbc") \
    .option("url", "jdbc:postgresql://10.0.0.8:5432/my_db") \
    .option("dbtable", "bnbclean") \
    .option("user", "test") \
def Validate(ngrams \
        , sampleSizes \
        , ctxSize \
        , sqc \
        , seqs \
        , outFile \
        , minval \
        , maxval \
        , avg \
        , nlines):
    accuracy = []
    gramSize = GramSize(ctxSize, lookahead)
    c1 = (((maxval - minval) * 1.0) / nlines) / avg
    c2 = ((minval * 1.0) / nlines) / avg
    print(seqs.count())

    ngrams = ngrams.repartition(1 << nPartLog)
    ngrams.cache()

    # we will validate separately for each vector size
    for vecSize in vecSizes:
        print('======TESTING FOR VECTOR SIZE', vecSize)
        # start fresh
        old_ngrams = ngrams
        ngrams = ngrams.withColumn('correct', lit(0))

        # use models from each sample
        modelId = 0
        for sampleSize in sampleSizes:
            w2v = Word2VecModel.load(w2vFile(outDir, ctxSize, sampleSize, vecSize))
            lrmodels = []
            for dim in range(0, vecSize):
                lrmodels.append(LinearRegressionModel.load(lrmFile(outDir, ctxSize, sampleSize, vecSize, dim)))

            success = 0
            fail = 0
            unopt = 0

            # add columns to store model success and failure
            modelSucc = 'succ_' + str(modelId)
            modelFail = 'fail_' + str(modelId)
            modelUnopt = 'unopt_' + str(modelId)
            seqs = seqs.withColumn(modelSucc, lit(0)) \
                       .withColumn(modelFail, lit(0)) \
                       .withColumn(modelUnopt, lit(0))
            modelId = modelId + 1
            ngrams = ngrams \
                .withColumn('predSeq', lit(''))

            # create initial feature vector
            # transform each word into a cluster center
            words, d, centers = ClusterWords(w2v, seqs)

            # record correctness for this model only
            old_ngrams = ngrams
            ngrams = ngrams.withColumn('sample_correct', lit(0)).withColumn('sample_confi', lit(1.0))
            for nextPos in range(0, lookahead):
                # build the feature vector
                ngrams = BuildSubstringFeature(ngrams, w2v, nextPos, nextPos + ctxSize, ctxSize, lookahead,)

                # build the prediction vector
                ngrams = BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize)

                # now assign a cluster id to each prediction vector
                old_ngrams = ngrams
                ngrams = centers.transform(ngrams).withColumnRenamed('cluster', 'predWord').withColumnRenamed('vector', 'predictionVector')

                # get the predicted word
                ngrams = ngrams.join(broadcast(words), words.cluster == ngrams.predWord, 'inner') \
                               .drop('cluster')

                # calculate the cosine similarity between prediction vector and center vector
                epsilon = 0.0001

                def CosineSimi(v1, v2):
                    d1 = DenseVector(v1)
                    d2 = DenseVector(v2)
                    n1 = d1.norm(2)
                    n2 = d2.norm(2)
                    return float(d1.dot(d2) / (n1 * n2))
                cossim = udf(lambda v1, v2: CosineSimi(v1, v2), DoubleType())
                ngrams = ngrams.withColumn('simi', cossim('centerVector', 'predictionVector'))
                ngrams = ngrams.drop('centerVector').drop('predictionVector')

                # update predicted sequence
                ngrams = ngrams.withColumn('predSeq', concat_ws(' ', 'predSeq', 'word'))
                ngrams = ngrams.withColumn('predSeq', ltrim(ngrams.predSeq))

                # get actual sequence
                ngrams = CreateSubstring(ngrams, 'sentence', 'actualSeq', gramSize, ' ', ctxSize, ctxSize + nextPos + 1)

                # now get the cluster id for the predicted word in the sentence
                ngrams = BuildLabelVector(ngrams, w2v, ctxSize, lookahead, nextPos).withColumnRenamed('labelVec', 'vector').drop('ngrams')
                ngrams = centers.transform(ngrams).drop('vector')

                # and host latency for actual word
                ngrams = ngrams.join(broadcast(words), 'cluster', 'inner') \
                               .drop('word') \
                               .drop('centerVector')

                # record correctness
                ngrams = ngrams.withColumn('round_correct', when((ngrams.predWord != ngrams.cluster) | (ngrams.simi < confidence), 0).otherwise(nextPos + 1)).drop('predWord').drop('cluster')
                ngrams = ngrams.withColumn('sample_correct', when(ngrams.sample_correct + 1 == ngrams.round_correct, ngrams.round_correct).otherwise(ngrams.sample_correct))

                # get overall correctness
                ngrams = ngrams.withColumn('correct', greatest('sample_correct', 'correct'))

                # get binary correctness
                ngrams = ngrams.withColumn('binary_correct', when(ngrams.correct >= nextPos + 1, 1).otherwise(0))
                ngrams = ngrams.withColumn('sample_confi', when(ngrams.binary_correct == 1, 1.0).otherwise(least(ngrams.simi, ngrams.sample_confi)))
                ngrams = ngrams.withColumn('simi', when(ngrams.binary_correct == 1, ngrams.simi).otherwise(ngrams.sample_confi))
                ngrams = ngrams.withColumn('predSeq', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), ngrams.actualSeq).otherwise(ngrams.predSeq))
                ngrams = ngrams.withColumn('succ_wt', when(ngrams.binary_correct == 1, ngrams.wt).otherwise(0))
                ngrams = ngrams.withColumn('fail_wt', when((ngrams.binary_correct == 1) | (ngrams.simi < confidence), 0).otherwise(ngrams.wt))
                ngrams = ngrams.withColumn('unopt_wt', when((ngrams.binary_correct == 0) & (ngrams.simi < confidence), ngrams.wt).otherwise(0))
                ngrams = ngrams.drop('simi')

                # now summarize success and failure rates by predicted sequence
                seqWts = ngrams.groupBy('predSeq').agg(sum('succ_wt').alias('succ_wt'),
                                                       sum('fail_wt').alias('fail_wt'),
                                                       sum('unopt_wt').alias('unopt_wt'))

                # update sequences table
                seqs = seqWts.join(broadcast(seqs), seqWts.predSeq == seqs.word, 'right_outer').drop('predSeq').fillna(-c2 / c1, ['succ_wt', 'fail_wt', 'unopt_wt'])
                scaleback = udf(lambda s: float(s * c1 + c2), DoubleType())
                seqs = seqs.withColumn(modelSucc, col(modelSucc) + scaleback(seqs.succ_wt)).drop('succ_wt')
                seqs = seqs.withColumn(modelFail, col(modelFail) + scaleback(seqs.fail_wt)).drop('fail_wt')
                seqs = seqs.withColumn(modelUnopt, col(modelUnopt) + scaleback(seqs.unopt_wt)).drop('unopt_wt')
                seqs.cache()
                aggregated = seqs.agg(sum(modelSucc), sum(modelFail), sum(modelUnopt))
                aggregated.cache()
                new_success = aggregated.head()['sum(' + modelSucc + ')']
                new_fail = aggregated.head()['sum(' + modelFail + ')']
                new_unopt = aggregated.head()['sum(' + modelUnopt + ')']
                print(nextPos, new_success - success, new_fail - fail, new_unopt - unopt)
                success = new_success
                fail = new_fail
                unopt = new_unopt
            # end for testing for each model for a particular vector size
        # end for each vector size

    seqs.orderBy('succ_0', ascending=False).write.mode('overwrite').csv(outputFile(outDir, ctxSize, vecSize, sampleSizes))
    return accuracy
def fix_null(x):
    return F.when(
        F.col(x).isNotNull() &
        (F.lower(F.col(x)) != "null") &
        (F.ltrim(F.col(x)) != ""),
        F.col(x)).otherwise(None)
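# A minimal sketch applying fix_null to every column of a frame (assumes
# `from pyspark.sql import functions as F` as above and a SparkSession named `spark`;
# the sample rows are hypothetical).
dirty_df = spark.createDataFrame(
    [("NULL", "x"), ("   ", "y"), ("ok", None)], ["a", "b"])
clean_df = dirty_df.select([fix_null(c).alias(c) for c in dirty_df.columns])
clean_df.show()  # literal "NULL" strings and whitespace-only values become real nulls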
df.select(initcap(col("Description"))).show() # COMMAND ---------- from pyspark.sql.functions import lower, upper df.select(col("Description"), lower(col("Description")), upper(lower(col("Description")))).show(2) # COMMAND ---------- from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim df.select( ltrim(lit(" HELLO ")).alias("ltrim"), rtrim(lit(" HELLO ")).alias("rtrim"), trim(lit(" HELLO ")).alias("trim"), lpad(lit("HELLO"), 3, " ").alias("lp"), rpad(lit("HELLO"), 10, " ").alias("rp")).show(2) # COMMAND ---------- from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2)
def lsg_omni(self):
    start_date, end_date = date_period(self.period, self.start_date)
    table_name = 'datalake_omni.omni_hit_data'
    dt_col_name = 'hit_time_gmt_dt_key'
    _, bound_end_date = date_period(-1, end_date)
    bound_date_check(table_name, dt_col_name, start_date, bound_end_date, self.env, 'YYYYMMDD', 'LSG')

    query = 'SELECT ' \
            'VS.visit_session_key AS session_key, ' \
            'HIT.post_visid_combined AS visit_id, ' \
            'HIT.visit_return_count AS visit_number, ' \
            'UPPER(TRIM(prod_list)) AS prod_list, ' \
            'HIT.hit_time_gmt_ts AS time_stamp, ' \
            "TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) AS " \
            "account_no " \
            'FROM datalake_omni.omni_hit_data HIT ' \
            'LEFT JOIN CDWDS.D_OMNI_VISIT_SESSION VS ON ' \
            '    VS.VISIT_RETURN_COUNT=HIT.VISIT_RETURN_COUNT AND VS.POST_VISID_COMBINED=HIT.POST_VISID_COMBINED ' \
            f'WHERE HIT.hit_time_gmt_dt_key<{start_date} AND HIT.hit_time_gmt_dt_key>={end_date} ' \
            'AND HIT.post_visid_combined IS NOT NULL ' \
            "AND prod_list IS NOT NULL AND prod_list NOT LIKE '%shipping-handling%' " \
            "AND TRIM(SUBSTRING(TRIM(DEMANDBASE), 0, POSITION('|' IN TRIM(DEMANDBASE)))) <> '' "

    schema = StructType([
        StructField('session_key', IntegerType(), True),
        StructField('visit_id', StringType(), True),
        StructField('visit_number', IntegerType(), True),
        StructField('time_stamp', StringType(), True),
        StructField('prod_list', StringType(), True),
        StructField('account_no', StringType(), True),
    ])

    df = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env, schema=schema). \
        withColumn('prod_id_untrimmed', explode(split('prod_list', ','))). \
        withColumn('prod_id', ltrim(rtrim(col('prod_id_untrimmed')))). \
        drop('prod_id_untrimmed'). \
        drop('prod_list'). \
        filter(col('prod_id').isNotNull()). \
        filter(col('prod_id') != ''). \
        distinct()

    if self.debug:
        print(f'row count for df = {df.count()}')

    # find active products
    query = 'SELECT sku as prod_id, stk_type_cd ' \
            'FROM cdwds.lsg_prod_v ' \
            "WHERE stk_type_cd = 'D'"
    discontinued_prods = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    df = df.join(discontinued_prods, ['prod_id'], how='left'). \
        filter(col('stk_type_cd').isNull()). \
        drop('stk_type_cd')

    if self.debug:
        print(
            f'After filtering out discontinued SKUs, row count for df = {df.count()}'
        )

    query = 'SELECT UPPER(sku_nbr) AS prod_id, size_grp AS coupon ' \
            'FROM cdwds.f_web_prod_feature ' \
            "WHERE size_grp IS NOT NULL AND size_grp <> 'T' " \
            'GROUP BY sku_nbr, size_grp'
    coupons = redshift_cdw_read(query, db_type='RS', database='CDWDS', env=self.env)
    if coupons.count() == 0:
        raise DataValidityError(
            'No coupon information. Please check the validity of size_grp column '
            'on cdwds.f_web_prod_feature.')

    df = df.join(broadcast(coupons), ['prod_id'], how='left'). \
        withColumn('coupon', coalesce('coupon', 'prod_id'))

    prod_list = df.select('prod_id').distinct()

    coupons = coupons.union(df.select('prod_id', 'coupon')). \
        filter(col('prod_id').isNotNull()). \
        distinct(). \
        withColumn("coupon_key", func.dense_rank().over(Window.orderBy('coupon')))

    df = df.join(coupons, ['prod_id', 'coupon'], how='left')

    if self.debug:
        coupons.show()
        df.show()
        print(
            f'row count for coupons = {coupons.select(col("coupon_key")).distinct().count()}'
        )

    return df, prod_list, coupons
from pyspark.sql import functions as F

df = spark.read.text("s3://wagal/bigdata/shakespeare.txt")
textLowerDf = df.select(F.lower(F.col("value")).alias("words_lower"))
textSplitDf = textLowerDf.select(
    F.split(F.col("words_lower"), " ").alias("words_split"))
textExplodedDf = textSplitDf.select(
    F.explode(F.col("words_split")).alias("word"))
textExplodedDf = textExplodedDf.where(F.ltrim(F.col("word")) != "")
textExplodedDf = textExplodedDf.select(
    F.regexp_extract(F.col("word"), "[a-z]+", 0).alias("word"))
textWordCounts = textExplodedDf.groupBy("word").count().orderBy(
    F.col("count").desc())
textWordCounts.show()
def compile_lstrip(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.ltrim(src_column)
def trimStrings(dataframe, columns):
    for col in columns:
        dataframe = dataframe.withColumn(col, F.ltrim(F.rtrim(dataframe[col])))
    return dataframe
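# A minimal usage sketch for trimStrings (assumes a SparkSession named `spark` and
# `from pyspark.sql import functions as F` as above; the sample columns are hypothetical).
addresses = spark.createDataFrame(
    [("  12 Main St  ", "   Springfield")], ["street", "city"])
trimmed = trimStrings(addresses, ["street", "city"])
trimmed.show(truncate=False)  # both columns stripped of leading and trailing spaces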