def toProcessTransData():
    df1 = rawdata[0].withColumn('yymmref', F.col("year") * 100 + F.lit(7)) \
        .withColumn("ed_model_id", F.col("ed_model_id").cast(T.StringType())) \
        .withColumn("yymminitial", F.when(F.col("yymmref") < F.col("yymminitial"), F.col("yymmref")).otherwise(F.col("yymminitial"))) \
        .withColumn('iyear', F.col('yymminitial').substr(1, 4).cast("integer")) \
        .withColumn('imonth', F.col('yymminitial').substr(5, 6).cast("integer")) \
        .withColumn("idate", F.to_date(F.concat_ws("-", "iyear", "imonth", F.lit(1)))) \
        .withColumn("adate", F.to_date(F.concat_ws("-", "saleyear", "salemonth", F.lit(1)))) \
        .withColumn("fdate", F.lit(fdate)) \
        .withColumn("age", F.months_between(F.col("adate"), F.col('idate')) + 1) \
        .withColumn("age", F.when(F.col('age') < 1, 1).otherwise(F.col('age'))) \
        .withColumn("agef", F.months_between(F.col("fdate"), F.col('idate')) + 1) \
        .withColumn("agef", F.when(F.col('agef') < 1, 1).otherwise(F.col('agef'))) \
        .withColumn("pdate", F.expr("add_months(fdate,0)")) \
        .withColumn("tdate", F.expr("add_months(fdate,-3)")) \
        .withColumn("syrmmt", F.year('tdate') * 100 + F.month('tdate')) \
        .withColumn("psaleyear", F.year('pdate')) \
        .withColumn("psalemonth", F.month('pdate')) \
        .cache()
    extraagelist = df1.filter("age<0 and trans_count>=3").groupBy(
        'year', 'make', 'model').agg(F.min(F.col("syrmm")).alias('age1'))
    df1 = df1.filter("age>0")
    # extraagelist.write.format("delta").option("overwriteSchema", "true").mode("overwrite").saveAsTable("stats.nn_shortterm_extraagelist")
    print(extraagelist.count())
    return df1
def join_df(df1, df2):
    df_joined = df1.join(df2, "title") \
        .withColumn("active time",
                    f.round(months_between(col("last_edit_date"), col("creation_date_of_article")), -1)) \
        .withColumn("time_since_last_edit",
                    f.round(months_between(to_timestamp(f.lit("2018-05-21")), col("last_edit_date"))))
    return df_joined
def flat_work_year(df):
    # Talent that graduated within the last 5 years
    # df = df.filter(df.edu_end_year >= CURRENT_YEAR - 5)
    # The minimum salary of the current job must exceed MIN_SALARY,
    # and the upper bound of the salary range must be at least the lower bound
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min) \
        .filter(df.avg_salary.isNotNull())
    # Compute how long the person has worked since graduation
    df = df.withColumn("work_month_min",
                       F.months_between(df.work_start_date, df.edu_end_date))
    df = df.withColumn("work_month_max",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_month_duration",
                       F.months_between(df.work_end_date, df.work_start_date))
    # Convert months of work into whole years
    df = df.withColumn("work_year_min",
                       F.udf(get_work_year, IntegerType())(df.work_month_min))
    df = df.withColumn("work_year_max",
                       F.udf(get_work_year, IntegerType())(df.work_month_max))
    # First job started within 5 years of graduation (0-1.5 years counts as 1 year);
    # total work duration must be non-negative and under 10 years
    df = df.filter(df.work_year_min > 0).filter(df.work_year_min <= 5) \
        .filter(df.work_month_duration >= 0).filter(df.work_month_duration < 120)
    # Spread the salary across the years of work experience
    df = df.withColumn(
        "work_years",
        F.udf(flat_work_year_salary, ArrayType(work_year_schema))(
            df.work_year_min, df.work_year_max, df.salary_min, df.salary_max))
    # One row per year of experience, with its corresponding salary
    df = df.withColumn("work_years", F.explode(df.work_years))
    df = df.withColumn("work_year", F.col("work_years")["work_year"]) \
        .withColumn("avg_salary", F.col("work_years")["salary"])
    return df
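# The helpers get_work_year and flat_work_year_salary (and work_year_schema)
# are not shown in this snippet. Below is a hypothetical sketch of what they
# might look like, under the assumptions spelled out in the comments above;
# the real implementations may differ.
import math
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType

work_year_schema = StructType([
    StructField("work_year", IntegerType()),
    StructField("salary", FloatType()),
])


def get_work_year(months):
    # Months -> whole years; 0-1.5 years counts as 1 year, 1.5-2.5 as 2, etc.
    if months is None:
        return None
    if months < 0:
        return 0
    return max(1, int(math.ceil(months / 12.0 - 0.5)))


def flat_work_year_salary(year_min, year_max, salary_min, salary_max):
    # Spread the salary range linearly across the span of working years.
    if year_max <= year_min:
        return [{"work_year": int(year_min), "salary": float(salary_min)}]
    step = (salary_max - salary_min) / float(year_max - year_min)
    return [{"work_year": y, "salary": float(salary_min + step * (y - year_min))}
            for y in range(int(year_min), int(year_max) + 1)]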
def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Method transforms periods "start_date - end_date" to year and month number

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|     month|
    +---+----------+----------+
    |  5|      2018|         1|
    +---+----------+----------+
    """
    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("month", F.expr("add_months(start_date, date)")) \
        .drop('start_date', 'end_date', 'monthsDiff', 'repeat', 'val', 'date') \
        .dropDuplicates()
    return transformed_data
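# A minimal standalone sketch of the month-expansion trick used above (assumes
# an active SparkSession named `spark`); each period becomes one row per month
# it covers, which here yields (5, 2018, 1), (5, 2018, 2) and (5, 2018, 3).
from pyspark.sql import functions as F

periods = spark.createDataFrame(
    [(5, "2018-01-01", "2018-03-09")], ["key", "start_date", "end_date"]) \
    .withColumn("start_date", F.to_date("start_date")) \
    .withColumn("end_date", F.to_date("end_date"))

expanded = periods \
    .withColumn("start_date", F.trunc("start_date", "month")) \
    .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
    .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
    .select("key", "start_date", F.posexplode("repeat").alias("pos", "val")) \
    .withColumn("month_start", F.expr("add_months(start_date, pos)")) \
    .select("key",
            F.year("month_start").alias("year"),
            F.month("month_start").alias("month"))

expanded.show()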
def ageCalculate(self, column, dateFormat, nameColAge):
    """
    This method computes the age of clients based on their birth dates.
    :param column      Name of the column that holds the birth dates.
    :param dateFormat  String format of the dates in the column provided.
    :param nameColAge  Name of the new column; it holds the resulting ages.
    """
    # Check if column argument is a string datatype:
    self.__assertTypeStr(column, "column")
    # Check if dateFormat argument is a string datatype:
    self.__assertTypeStr(dateFormat, "dateFormat")
    # Assert that column exists in the dataFrame:
    assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"
    # Output date format
    Format = "yyyy-MM-dd"  # Some SimpleDateFormat string
    exprs = format_number(
        abs(
            months_between(
                date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format),
                current_date()) / 12), 4).alias(nameColAge)
    self.__df = self.__df.withColumn(nameColAge, exprs)
    self.__addTransformation()  # checkpoint in case
    return self
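# A minimal standalone sketch of the same age computation outside the class
# (assumes a DataFrame `people` with a `born_date` string column in
# "dd/MM/yyyy" format; adjust the names and format to your data).
from pyspark.sql import functions as F

people_with_age = people.withColumn(
    "age",
    F.format_number(
        F.abs(
            F.months_between(
                F.date_format(
                    F.unix_timestamp("born_date", "dd/MM/yyyy").cast("timestamp"),
                    "yyyy-MM-dd"),
                F.current_date()) / 12),
        4))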
def creation_date_of_article(df):
    columns_to_drop = ['redirect', 'ns', 'revision', 'date']
    df = df.withColumn("author", df["revision"].getItem(0).contributor.username) \
        .withColumn("authorID", df["revision"].getItem(0).contributor.id) \
        .withColumn("date", df["revision"].getItem(0).timestamp) \
        .withColumn("creation_date_of_article",
                    from_unixtime('date', 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn("time_since_creation",
                    f.round(months_between(col("creation_date_of_article"),
                                           to_timestamp(f.lit("2001-01-15")))))
    df_res = df.drop(*columns_to_drop)
    return df_res
def postgraduate(profile_df, education_df, work_df, mysql_url):
    """
    Analysis of studying abroad, postgraduate study and employment
    :param profile_df: basic resume information
    :param education_df: education history from resumes
    :param work_df: work history from resumes
    :param mysql_url: database URL
    :return:
    """
    education_df = education_df.filter(education_df.school_name.isNotNull()) \
        .filter(education_df.major.isNotNull()) \
        .filter(education_df.degree.isNotNull())
    education_df = education_df.select("resume_id", "school_name", "school_area",
                                       "major", "degree", "edu_index", "edu_end_date")
    education_df = education_df.groupby("resume_id").apply(cal_postgraduate)
    education_df = education_df.filter(
        education_df.postgraduate.isNotNull()).filter(
        education_df.flag.isNull())
    education_df.persist()
    # Compute the average salary of each job
    work_df = work_df.withColumn(
        "avg_salary",
        F.udf(cal_avg_salary, FloatType())(work_df.salary_min, work_df.salary_max))
    work_df = work_df.filter(work_df.avg_salary > MIN_SALARY)
    # # Filter out records that only contain resume summary information
    # # (work experience index greater than or equal to 2)
    # wc_df = work_df.filter(work_df.work_index == 2).select("resume_id")
    # edu_df = wc_df.join(education_df, "resume_id")
    # Combine education history and work history
    df = education_df.join(work_df, "resume_id")
    df = df.withColumn("work_month",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_year",
                       F.udf(get_years, IntegerType())(df.work_month))
    df = df.filter(df.work_year >= 0).filter(df.work_year <= 40)
    # Analysis result tables
    result_tables = dict()
    result_tables[
        "school__major__postgraduate__ratio__v1"] = postgraduate_ratio(education_df)
    result_tables[
        "school__major__postgraduate__work_year__v1"] = postgraduate_work_year(df)
    # Release the persisted data
    education_df.unpersist()
    # Write the major-related analysis results to the database
    write_mysql(mysql_url, result_tables)
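# The helpers cal_avg_salary and get_years are referenced above but not shown.
# A hypothetical sketch of what they might look like (the real implementations
# may differ):
def cal_avg_salary(salary_min, salary_max):
    # Midpoint of the advertised salary range, or None if either bound is missing.
    if salary_min is None or salary_max is None:
        return None
    return (float(salary_min) + float(salary_max)) / 2.0


def get_years(months):
    # Whole years of experience derived from a months_between result.
    if months is None:
        return None
    return int(months // 12)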
def _years_between(_new_col_name, attr):
    _date_format = attr[0]
    _col_name = attr[1]
    return F.format_number(
        F.abs(
            F.months_between(
                F.date_format(
                    F.unix_timestamp(_col_name, _date_format).cast("timestamp"),
                    format_dt),
                F.current_date()) / 12), 4) \
        .alias(_new_col_name)
def combine_resume(profile_path, education_path, work_path, output_path):
    # Initialize the Spark environment
    conf = SparkConf().set("spark.ui.port", "44040")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # Load basic resume information, education history and work history
    profile_df = sqlContext.read.parquet(profile_path)
    education_df = sqlContext.read.parquet(education_path)
    work_df = sqlContext.read.parquet(work_path)
    df = profile_df.join(education_df, "resume_id").join(work_df, "resume_id")
    # Compute the duration of each individual job
    df = df.withColumn(
        "work_end_date",
        F.when(
            F.lower(F.trim(df.work_end_date)).isin("至今", "今", "present"),
            df.update_date).otherwise(df.work_end_date))
    df = df.withColumn("work_duration_month",
                       F.months_between(df.work_end_date, df.work_start_date))
    df = df.withColumn(
        "work_duration_year",
        F.udf(lambda x: x / 12 if x else -1, FloatType())(df.work_duration_month))
    # Compute how long the person has worked since graduation
    df = df.withColumn("work_month_min",
                       F.months_between(df.work_start_date, df.edu_end_date))
    df = df.withColumn("work_month_max",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_month_duration",
                       F.months_between(df.work_end_date, df.work_start_date))
    # Convert months of work into whole years
    df = df.withColumn("work_year_min",
                       F.udf(get_work_year, IntegerType())(df.work_month_min))
    df = df.withColumn("work_year_max",
                       F.udf(get_work_year, IntegerType())(df.work_month_max))
    df.write.mode('overwrite').save(output_path)
def createAgeDateColumns(self, df):
    """Create client age and order date columns.

    Args:
        df: Dataframe

    Returns:
        Dataframe.
    """
    log(
        "INFO",
        f"[createAgeDateColumns] Creating the client age and order date columns."
    )
    return (df.withColumn(
        "data_pedido_date",
        date_format(from_unixtime(col('data_pedido')), "yyyy-MM-dd")).withColumn(
            'idade',
            (months_between(current_date(), col('data_nascimento_cliente')) / 12).cast(
                IntegerType())))
# COMMAND ----------

# Get the number of different months (alias month) between the 2 following date columns. Round the column "month". Show all 3 columns
df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])
# Expected:
# +-------------------+----------+------+
# |              date1|     date2|months|
# +-------------------+----------+------+
# |1997-02-28 10:30:00|1996-10-30|   4.0|
# +-------------------+----------+------+

# Answer
df.select('date1', 'date2', F.round(F.months_between('date1', 'date2'), 0).alias('months')).show()

# COMMAND ----------

# Using spark.sql and function make_date, from this df, build the associated date
spark.createDataFrame([(2020, 6, 26), (1000, 2, 29), (-44, 1, 1)], ['Y', 'M', 'D']).createOrReplaceTempView('YMD')
# Expected:
# +-----------+
# |       date|
# +-----------+
# | 2020-06-26|
# |       null|
# |-0044-01-01|
# +-----------+
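# A possible answer for the make_date exercise above (make_date is available in
# Spark SQL 3.0+); this is a sketch rather than the notebook's original solution.

# Answer
spark.sql("SELECT make_date(Y, M, D) AS date FROM YMD").show()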
'''Now we drop year,month,day,hour,minute,date,time columns as we will again try to create these from timestamp column that we created''' df_nycflights = df_nycflights. \ drop('year'). \ drop('month'). \ drop('day'). \ drop('hour'). \ drop('minute'). \ drop('date'). \ drop('time') df_nycflights.show() '''Now we extract the fields back''' df_nycflights = df_nycflights. \ withColumn('year',year(df_nycflights.timestamp)). \ withColumn('month',month(df_nycflights.timestamp)). \ withColumn('day',dayofmonth(df_nycflights.timestamp)). \ withColumn('hour',hour(df_nycflights.timestamp)). \ withColumn('minute',minute(df_nycflights.timestamp)) df_nycflights.show() '''Now few operations on timestamp ''' df_nycflights = df_nycflights.\ withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \ withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \ withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp)) df_nycflights.show()
def add_age_information(self, date: datetime) -> None:
    self._subjects = self.subjects.withColumn(
        "age",
        floor(months_between(lit(date), col("birthDate")) / 12)).withColumn(
            "ageBucket", floor(col("age") / 5))
from ( select last_day(period_end_dte) as period_end_dte , last_day(add_months(period_end_dte,-3)) as period_start_dte , id from (select distinct issue_dte as period_end_dte from work_db.inp_data where extract(month from issue_dte) in (3,6,9,12) ) as dt cross join (select distinct id from work_db.inp_data) ) as id inner join work_db.inp_data as bse on bse.id = id.id where bse.issue_dte <= id.period_end_dte and coalesce(bse.Default_dte, Maturity_Dte,cast('9999-12-31' as date)) > period_start_dte """) #Checks and other s***e ABT = ABT.withColumn( 'TOB', floor(months_between(col('period_end_dte'), col('issue_dte')) / 3)) print('2.3.2 Pushing ABT to Hive DB') ABT.createOrReplaceTempView("ABT") sqlContext.sql("""create table if not exists work_db.LC_DR_Periodic as (select * from ABT)""") sc.stop()
def create_extra_last_feature2(months): from pyspark.sql import functions as f from pyspark.sql.functions import date_format, to_date, to_timestamp import pyspark.sql.functions as func from pyspark.sql.functions import expr from pyspark.sql.window import Window download_df = spark.read.parquet(data_paths['download_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) browse_df = spark.read.parquet(data_paths['browse_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) share_df = spark.read.parquet(data_paths['share_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) search_df = spark.read.parquet(data_paths['search_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) fav_df = spark.read.parquet(data_paths['fav_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) fav_df2 = fav_df.withColumn("date", to_date(fav_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) fav_df2 = fav_df2.withColumn( "MONTH_tmp", f.from_unixtime(f.unix_timestamp(fav_df2.date, "yyyyMM"))) fav_df2 = fav_df2.withColumn( "MONTH", f.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) fav_df3 = fav_df2.withColumn( "ADJ_USERID", expr("substring(USERNAME, 1, length(USERNAME)-2)")) fav_df3 = fav_df3.withColumn( "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))")) fav = fav_df3.withColumn("ADJ_USERID", regexp_replace(f.col("ADJ_USERID"), "^0*", "")) fav = fav.groupby(['ADJ_USERID', 'MONTH']).count() fav = fav.withColumnRenamed("count", "num_fav") download_df2 = download_df.withColumn( "date", to_date(download_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) download_df2 = download_df2.withColumn( "MONTH_tmp", f.from_unixtime(f.unix_timestamp(download_df2.date, "yyyyMM"))) download_df2 = download_df2.withColumn( "MONTH", f.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) download_df3 = download_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) download_df4 = download_df3.withColumn( "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", "")) download = download_df4.groupby(['ADJ_USERID', 'MONTH']).count() download = download.withColumnRenamed("count", "num_" + "download") browse_df2 = browse_df.withColumn( "date", to_date(browse_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) browse_df2 = browse_df2.withColumn( "MONTH_tmp", f.from_unixtime(f.unix_timestamp(browse_df2.date, "yyyyMM"))) browse_df2 = browse_df2.withColumn( "MONTH", f.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) browse_df3 = browse_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) browse_df4 = browse_df3.withColumn( "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", "")) browse = browse_df4.groupby(['ADJ_USERID', 'MONTH']).count() browse = browse.withColumnRenamed("count", "num_" + "browse") share_df2 = share_df.withColumn( "date", to_date(share_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) share_df2 = share_df2.withColumn( "MONTH_tmp", f.from_unixtime(f.unix_timestamp(share_df2.date, "yyyyMM"))) share_df2 = share_df2.withColumn( "MONTH", f.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) share_df3 = share_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) share_df4 = share_df3.withColumn( "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", "")) share = share_df4.groupby(['ADJ_USERID', 'MONTH']).count() share = share.withColumnRenamed("count", "num_" + "share") search_df2 = search_df.withColumn( "date", to_date(search_df.CRTIME, 'yyyy/MM/dd 
HH:mm:ss')) search_df2 = search_df2.withColumn( "MONTH_tmp", f.from_unixtime(f.unix_timestamp(search_df2.date, "yyyyMM"))) search_df2 = search_df2.withColumn( "MONTH", f.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) search_df3 = search_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) search_df4 = search_df3.withColumn( "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", "")) search = search_df4.groupby(['ADJ_USERID', 'MONTH']).count() search = search.withColumnRenamed("count", "num_" + "search") data = [("2013-01-01", str(datetime.date.today()))] df = spark.createDataFrame(data, ["minDate", "maxDate"]) df = df.withColumn("monthsDiff", f.months_between("maxDate", "minDate")) \ .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')")) \ .select("*", f.posexplode("repeat").alias("date", "val")) \ .withColumn("date", f.expr("add_months(minDate, date)")) \ .select('date') df = df.withColumn( "MONTH", f.from_unixtime(f.unix_timestamp(f.col("date")), "yyyyMM")).select('MONTH') unique_id = download.select('ADJ_USERID').distinct() \ .union(browse.select('ADJ_USERID').distinct()) \ .union(share.select('ADJ_USERID').distinct()) \ .union(search.select('ADJ_USERID').distinct()) \ .union(fav.select('ADJ_USERID').distinct()) unique_id = unique_id.distinct() all_abo_month = unique_id.crossJoin(df) combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \ .union(share.select(['ADJ_USERID', 'MONTH'])) \ .union(search.select(['ADJ_USERID', 'MONTH'])) \ .union(fav.select(['ADJ_USERID', 'MONTH'])) min_max_date = combine.groupby("ADJ_USERID").agg(f.min("MONTH"), f.max("MONTH")) all_abo_month = all_abo_month.join( min_max_date, all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID, how='left').drop(min_max_date.ADJ_USERID) all_abo_month = all_abo_month.filter(f.col("MONTH") >= f.col("min(MONTH)")) all_abo_month = all_abo_month.filter(f.col("MONTH") <= f.col("max(MONTH)")) all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"]) download = all_abo_month.join(download, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, n_max(12, n_max1(months))): download = download.withColumn('num_' + "download" + str(n), func.lag(download['num_' + "download"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) for i in months: if i in [1, 3, 6, 9, 12]: continue download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_{month}m".format(month=i), download['num_' + "download"]) for iter in range(1, n_max(12, n_max1(months))): download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_{month}m".format(month=i), download["n_lag_currentyr_" + "download" + "_sum_{month}m".format(month=i)] + download['num_' + "download" + str(iter)]) droplist = [] for n in range(1, n_max(12, n_max1(months))): droplist = droplist + ['num_' + "download" + str(n)] download = download.drop(*droplist) browse = all_abo_month.join(browse, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, n_max(12, n_max1(months))): browse = browse.withColumn('num_' + "browse" + str(n), func.lag(browse['num_' + "browse"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) for i in months: if i in [1, 3, 6, 9, 12]: continue browse = browse.withColumn( "n_lag_currentyr_" + "browse" + "_sum_{month}m".format(month=i), browse['num_' + "browse"]) for iter in range(1, n_max(12, n_max1(months))): browse = browse.withColumn( "n_lag_currentyr_" + "browse" + "_sum_{month}m".format(month=i), 
browse["n_lag_currentyr_" + "browse" + "_sum_{month}m".format(month=i)] + browse['num_' + "browse" + str(iter)]) droplist = [] for n in range(1, n_max(12, n_max1(months))): droplist = droplist + ['num_' + "browse" + str(n)] browse = browse.drop(*droplist) share = all_abo_month.join(share, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, n_max(12, n_max1(months))): share = share.withColumn('num_' + "share" + str(n), func.lag(share['num_' + "share"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) for i in months: if i in [1, 3, 6, 9, 12]: continue share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_{month}m".format(month=i), share['num_' + "share"]) for iter in range(1, n_max(12, n_max1(months))): share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_{month}m".format(month=i), share["n_lag_currentyr_" + "share" + "_sum_{month}m".format(month=i)] + share['num_' + "share" + str(iter)]) droplist = [] for n in range(1, n_max(12, n_max1(months))): droplist = droplist + ['num_' + "share" + str(n)] share = share.drop(*droplist) search = all_abo_month.join(search, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, n_max(12, n_max1(months))): search = search.withColumn('num_' + "search" + str(n), func.lag(search['num_' + "search"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) for i in months: if i in [1, 3, 6, 9, 12]: continue search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_{month}m".format(month=i), search['num_' + "search"]) for iter in range(1, n_max(12, n_max1(months))): search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_{month}m".format(month=i), search["n_lag_currentyr_" + "search" + "_sum_{month}m".format(month=i)] + search['num_' + "search" + str(iter)]) droplist = [] for n in range(1, n_max(12, n_max1(months))): droplist = droplist + ['num_' + "search" + str(n)] search = search.drop(*droplist) fav = all_abo_month.join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, n_max(12, n_max1(months))): fav = fav.withColumn('num_' + "fav" + str(n), func.lag(fav['num_' + "fav"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) for i in months: if i in [1, 3, 6, 9, 12]: continue fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_{month}m".format(month=i), fav['num_' + "fav"]) for iter in range(1, n_max(12, n_max1(months))): fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_{month}m".format(month=i), fav["n_lag_currentyr_" + "fav" + "_sum_{month}m".format(month=i)] + fav['num_' + "fav" + str(iter)]) droplist = [] for n in range(1, n_max(12, n_max1(months))): droplist = droplist + ['num_' + "fav" + str(n)] fav = fav.drop(*droplist) classroom_data = all_abo_month.join( download, ['ADJ_USERID', 'MONTH'], 'left').join(browse, ['ADJ_USERID', 'MONTH'], 'left').join(share, ['ADJ_USERID', 'MONTH'], 'left').join( search, ['ADJ_USERID', 'MONTH'], 'left').join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no") classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no") classroom_data = classroom_data.withColumn( 'mo_yr_key_no', classroom_data.mo_yr_key_no.cast('string')) classroom_data = classroom_data.withColumn( 'mo_yr_key_no', to_timestamp(classroom_data.mo_yr_key_no, 'yyyyMM')) classroom_data = classroom_data.withColumn( 'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd')) classroom_data = classroom_data.drop( *['num_download', 'num_browse', 'num_share', 
'num_search', 'num_fav']) from pyspark.sql.window import Window import pyspark.sql.functions as f # from pyspark.sql.functions import * from pyspark.sql.types import IntegerType from pyspark.sql.functions import date_format, to_date, to_timestamp wechat_mini = spark.read.option("delimiter", "\t").option( "header", "true").option("encoding", "UTF-8").csv(data_paths['wechat_miniprogram_input']) wechat_mini = wechat_mini.withColumn('time', to_timestamp('时间戳', 'yyyy-MM-dd')) wechat_mini = wechat_mini.withColumn('month', to_timestamp('时间戳', 'yyyy-MM')) wechat_mini = wechat_mini.withColumn('month', date_format('month', 'yyyyMM')) wechat_mini2 = wechat_mini.withColumnRenamed('事件类型', 'event_type') \ .withColumnRenamed('时间戳', 'timestamp') \ .withColumnRenamed('诸葛id', 'trip_id') \ .withColumnRenamed('事件id', 'event_id') \ .withColumnRenamed('事件名', 'event_name') \ .withColumnRenamed('商品id', 'product_id') \ .withColumnRenamed('商品名称', 'product_name') \ .withColumnRenamed('搜索词', 'search_word') # clean up imc_no wechat_mini3 = wechat_mini2.withColumn("leading360", expr("substring(amwayid, 1, 3)")) wechat_mini4 = wechat_mini3.withColumn( "ADJ_USERID", when( f.col("leading360") == "360", expr("substring(amwayid, 4, length(amwayid)-2)")).otherwise( f.col("amwayid"))) wechat_mini5 = wechat_mini4.withColumn( "imc_no", regexp_replace(col("ADJ_USERID"), "^0*", "")) wechat_mini_all = wechat_mini5.withColumn( "imc_no", when( f.col("leading360") == "360", expr("substring(imc_no, 1, length(imc_no)-2)")).otherwise( f.col("imc_no"))) # browse wechat_mini_browse = wechat_mini_all.where((f.col("event_type") == '页面浏览')) wechat_mini_browse2 = wechat_mini_browse.groupBy('imc_no', 'month').agg( f.count("event_id").alias("n_num_cloudcommerce_browse")) # search wechat_mini_search = wechat_mini_all.where((f.col("event_type") == '站内搜索')) wechat_mini_search2 = wechat_mini_search.groupBy('imc_no', 'month').agg( f.count("event_id").alias("n_num_cloudcommerce_search")) # order wechat_mini_order = wechat_mini_all.where( (f.col("event_name") == '小程序_订单确认')) wechat_mini_order2 = wechat_mini_order.groupBy('imc_no', 'month').agg( f.count("event_id").alias("n_num_cloudcommerce_order")) # cart purchase_trip = wechat_mini_order.select('trip_id').distinct() wechat_mini_cart = wechat_mini_all.join( purchase_trip, 'trip_id', 'inner').where( (f.col("event_type") == '商品加购')) wechat_mini_cart2 = wechat_mini_cart.groupBy( 'imc_no', 'month', 'trip_id').agg( f.count("product_id").alias( "n_num_cloudcommerce_product_per_cart")) wechat_mini_cart3 = wechat_mini_cart2.groupBy('imc_no', 'month').agg( f.avg("n_num_cloudcommerce_product_per_cart").alias( "n_num_cloudcommerce_product_per_cart")) # all abo and month combination unique_id = wechat_mini_all.select('imc_no').distinct() month = wechat_mini_all.select('month').distinct() all_abo_month = unique_id.crossJoin(month) min_max_date = wechat_mini_all.groupby("imc_no").agg( f.min("month"), f.max("month")) all_abo_month = all_abo_month.join( min_max_date, all_abo_month.imc_no == min_max_date.imc_no, how='left').drop(min_max_date.imc_no) all_abo_month = all_abo_month.filter(f.col("month") >= f.col("min(month)")) all_abo_month = all_abo_month.filter(f.col("month") <= f.col("max(month)")) # join everything together combine1 = all_abo_month.join(wechat_mini_browse2, ['imc_no', 'month'], 'left').na.fill(0) combine2 = combine1.join(wechat_mini_search2, ['imc_no', 'month'], 'left').na.fill(0) combine3 = combine2.join(wechat_mini_order2, ['imc_no', 'month'], 'left').na.fill(0) combine4 = 
combine3.join(wechat_mini_cart3, ['imc_no', 'month'], 'left').na.fill(0) # create lag features combine = combine4.withColumnRenamed("month", "mo_yr_key_no") feature_list = [ 'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search', 'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart' ] lag_features = months for feature in feature_list: for lag_mo in lag_features: if lag_mo in [1, 3, 6, 9, 12]: continue for lag in range(0, lag_mo): colname = feature + "_" + str(lag) feature_col = feature + "_sum_" + str(lag_mo) + "m" combine = combine.withColumn( colname, f.lag(combine[feature], lag).over( Window.partitionBy("imc_no").orderBy("mo_yr_key_no"))) if lag == 0: combine = combine.withColumn(feature_col, combine[colname]) else: combine = combine.withColumn( feature_col, combine[feature_col] + combine[colname]) main_col = ['imc_no', 'mo_yr_key_no'] selected_feature = [] for feature in feature_list: for lag_mo in lag_features: if lag_mo in [1, 3, 6, 9, 12]: continue feature_col = feature + "_sum_" + str(lag_mo) + "m" selected_feature.append(feature_col) selected_feature = main_col + feature_list + selected_feature wechat_cloudcommerce = combine.select(selected_feature) wechat_formatting = wechat_cloudcommerce wechat_formatting = wechat_formatting.withColumn( 'mo_yr_key_no', wechat_formatting.mo_yr_key_no.cast('string')) # wechat_formatting = wechat_formatting.withColumn('imc_no',wechat_formatting.imc_no.cast('string')) wechat_formatting = wechat_formatting.withColumn( 'mo_yr_key_no', to_timestamp(wechat_formatting.mo_yr_key_no, 'yyyyMM')) wechat_formatting = wechat_formatting.withColumn( 'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd')) wechat_cloudcommerce = wechat_formatting wechat_cloudcommerce = wechat_cloudcommerce.drop(*[ 'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search', 'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart' ]) return classroom_data, wechat_cloudcommerce
    'transactions_w_reg_init_date_after_2015.csv', header=True, inferSchema=True)

transactions.count()
transactions.printSchema()

transactions = transactions.select(
    ['msno', 'is_churn_final', 'transaction_date'])

my_window = Window.partitionBy('msno').orderBy('transaction_date')
transactions = transactions.withColumn(
    'prev_transaction_date',
    F.lag(transactions.transaction_date).over(my_window))
transactions = transactions.withColumn(
    'months_since_prev_transaction',
    F.months_between(transactions.transaction_date, transactions.prev_transaction_date))
transactions.printSchema()

transactions = transactions.where(
    F.col('months_since_prev_transaction').isNotNull())

transactions.createOrReplaceTempView('trans')
q = 'SELECT first(msno) as msno, first(is_churn_final) as is_churn_final, round(sum(months_since_prev_transaction), 0) as month ' \
    'FROM trans ' \
    'GROUP BY msno '
joined_2015_or_later = spark.sql(q)
joined_2015_or_later.show(50)

joined_2015_or_later = joined_2015_or_later.toPandas()
joined_2015_or_later.head()
joined_2015_or_later.is_churn_final.mean()
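# A toy illustration of the lag + months_between pattern used above (assumes an
# active SparkSession named `spark`); consecutive same-day-of-month dates give
# whole-month gaps.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

toy = spark.createDataFrame(
    [("u1", "2017-01-15"), ("u1", "2017-03-15"), ("u1", "2017-07-15")],
    ["msno", "transaction_date"]).withColumn(
        "transaction_date", F.to_date("transaction_date"))

w = Window.partitionBy("msno").orderBy("transaction_date")
toy = toy.withColumn("prev_transaction_date",
                     F.lag("transaction_date").over(w)) \
         .withColumn("months_since_prev_transaction",
                     F.months_between("transaction_date", "prev_transaction_date"))
toy.show()  # gaps: null, 2.0, 4.0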
date_sub(col("today"), 5).alias('5 days ago'), date_add(col("today"), 5).alias('5 days ahead')).show(1) date_df.selectExpr('(date_sub(today, 5)) as `5 days ago`', 'date_add(today, 5) as `5 days togo`').show(1, False) #%% "number of days and months between 2 dates" from pyspark.sql.functions import datediff, months_between, to_date, lit, expr print( date_df.withColumn("week_ago", date_sub(col("today"), 7)).select( datediff(col("week_ago"), col("today"))).show(1)) print( date_df.select( to_date(lit("2016-01-01")).alias("start"), to_date(lit("2017-05-22")).alias("end")).select( months_between(col("start"), col("end"))).show(1)) print( date_df.withColumn('start', to_date(lit("2014-01-01"))).withColumn( 'end', to_date(lit("2017-05-22"))).select( months_between(col("start"), col("end"))).show(1)) #%% "providing date format" from pyspark.sql.functions import to_date dateFormat = "yyyy-dd-MM" cleanDateDF = spark.range(1).select( to_date(lit("2017-12-11"), dateFormat).alias("date"), to_date(lit("2017-20-12"), dateFormat).alias("date2")) cleanDateDF.show()
for i in months:
    df = df.withColumn(
        'issue_d',
        F.regexp_replace('issue_d', i, months[i]).alias('issue_d'))
    df = df.withColumn(
        'last_pymnt_d',
        F.regexp_replace('last_pymnt_d', i, months[i]).alias('last_pymnt_d'))

df = df.withColumn('issue_d',
                   (F.to_date('issue_d', 'MM-dd-yyyy')).alias('issue_d'))
df = df.withColumn('last_pymnt_d',
                   (F.to_date('last_pymnt_d', 'MM-dd-yyyy')).alias('last_pymnt_d'))
df = df.withColumn(
    'mnth_start2last',
    F.months_between(df.last_pymnt_d, df.issue_d).alias('mnth_start2last'))
df = df.withColumn('fracNumPmts', F.col('mnth_start2last') / F.col('term2'))

# Optional
## start and end dates based on last payment date: only when using large samples
#start_date = '2011-01-01'
#end_date = '2013-12-31'
# Early repayments between certain dates
#df = df.filter((df.last_pymnt_d >= start_date)&(df.last_pymnt_d < end_date))
#df.count()

print('\n\n\nWord2vec...\n')
# for tokenizer change null to none
df = df.na.fill({'desc': 'none'})  # perhaps doesn't matter?
def toGetRawData(maxmm, syymm, syear): new_df = spark.sql( """select * from stats.nn_transstyleprice where year_of_sale*100+month_of_sale<={maxmm} and year_of_sale*100+month_of_sale>={syymm} \ and vehicle_code>0 and year>={syear} and make!='Tesla'""". format(syear=syear, maxmm=maxmm, syymm=syymm)) #catch all updated data from May 18 for daily run new_df = new_df.select([F.col(x).alias(x.lower()) for x in new_df.columns ]).withColumn('year', F.col('year').cast("integer")) tmvlist = spark.sql( "select ed_style_id, year,make,model,tmv_category,model_year_link_code,model_level_link_code,msr_price from \ stats.try_nn_newvehiclesf where tmv_category!='R' and make!='Tesla' and make!='Polestar'" ) tmvlistextra = spark.sql( "select ed_style_id, year,make,model,tmv_category,null as model_year_link_code, null as model_level_link_code,total_original_msrp as msr_price from \ stats.try_nn_newvehiclesextraf where tmv_category!='R' and make!='Tesla' and make!='Polestar'" ) tmvlist = tmvlist.union(tmvlistextra).withColumn( 'year', F.col('year').cast("integer")) print(tmvlist.count()) tmvlist.write.format("delta").option( "overwriteSchema", "true").mode("overwrite").saveAsTable("stats.nn_shortterm_tmvlist") tmvlist = spark.sql("select * from stats.nn_shortterm_tmvlist") agemaster = spark.sql("select * from stats.nn_vehicleagemaster") extralist = spark.sql("select * from stats.nn_shortterm_extraagelist") agemaster=agemaster.join(extralist, ['year','make','model'],how='left').withColumn("yymminitial",F.when(F.col("age1")>0,F.col('age1')).otherwise(F.col('yymminitial'))).drop('age1') \ .withColumn("earliestmm",(F.col('year')-1)*100+2) \ .withColumn("yymminitial",F.when(F.col("yymminitial")<F.col("earliestmm"),F.col('earliestmm')) \ .otherwise(F.col('yymminitial'))).drop('earliestmm') catfinal = spark.sql("select * from stats.try_catfinal").cache() ed4list=spark.sql("select link_make as make,link_model as model, max(ed4) as ed4 from stats.try_yrmdcatorigin where saleyear>2016 group by make,model") \ .withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model"))).dropDuplicates(subset=["make","model"]) \ .join(catfinal,['ed4'],how='left').cache() originlist=spark.sql("select distinct link_make,origin from stats.try_yrmdcatorigin where saleyear>2016") \ .withColumn("make",F.lower(F.col("link_make"))).drop('link_make').dropDuplicates(subset=["make"]).cache() transdata=new_df \ .withColumnRenamed("vehicle_code","ed_style_id") \ .withColumnRenamed("year_of_sale","saleyear") \ .withColumnRenamed("month_of_sale","salemonth") \ .withColumn("syrmm",F.col('saleyear')*100+F.col('salemonth'))\ .withColumn("ageyear",age_year(F.col("saleyear"),F.col("year"))) \ .withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model")))\ .fillna(100,subset=['dtt']) \ .withColumn("item",F.col("ed_style_id").cast(T.StringType())) \ .join(agemaster,['year','make','model'],how='left') \ .withColumn("ptg",F.round(F.col("trans_price")/F.col("trans_msrp"),5)) \ .join(ed4list,["make","model"],how="left") \ .join(originlist,["make"],how="left") \ .join(tmvlist.select("ed_style_id", "tmv_category", "model_year_link_code","model_level_link_code","msr_price"),["ed_style_id"],how="left")\ .withColumn("msrpf", F.when(F.col("trans_msrp")<F.col("msr_price"),F.col("msr_price")).otherwise(F.col("trans_msrp"))) \ .withColumn("ptgf", F.when(F.col("ptg")>1.05,1.05).otherwise(F.col("ptg"))) \ .withColumn("pricef", F.round(F.col("ptgf")*F.col("msrpf")))\ .cache() vehiclelist=tmvlist \ 
.withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model")))\ .withColumn("item",F.col("ed_style_id").cast(T.StringType())) \ .join(agemaster,['year','make','model'],how='left') \ .join(ed4list,["make","model"],how="left") \ .join(originlist,["make"],how="left") \ .withColumn('yymmref',F.col("year")*100+F.lit(7)) \ .withColumn("yymminitial",F.when(F.col("yymmref")<F.col("yymminitial"),F.col("yymmref")).otherwise(F.col("yymminitial"))) \ .withColumn('iyear',F.col('yymminitial').substr(1,4).cast("integer")) \ .withColumn('imonth',F.col('yymminitial').substr(5,6).cast("integer")) \ .withColumn("idate",F.to_date(F.concat_ws("-","iyear","imonth",F.lit(1)))) \ .withColumn("fdate",F.lit(fdate)) \ .withColumn("age",F.months_between(F.col("fdate"),F.col('idate'))+1) \ .withColumn("age",F.when(F.col('age')<1,1).otherwise(F.col('age')))\ .withColumn("pdate",F.expr("add_months(fdate,0)")) \ .withColumn("saleyear", F.year('pdate')) \ .withColumn("salemonth", F.month('pdate'))\ .withColumn("syrmm",F.col('saleyear')*100+F.col('salemonth'))\ .withColumn("ageyear",age_year(F.col("saleyear"),F.col("year"))) \ .cache() print(transdata.count()) print(vehiclelist.count()) return [transdata, vehiclelist]
df = df.withColumn('dti', df['dti'].cast(DoubleType()).alias('dti'))
df = df.withColumn('int_rate', df['int_rate'].cast(DoubleType()).alias('int_rate'))
df = df.withColumn('term2', df['term'].cast(DoubleType()).alias('term2'))

# and handle date strings which are in Mon-YYYY format
months = {'Jan': '01-01', 'Feb': '02-15', 'Mar': '03-15', 'Apr': '04-15', 'May': '05-15', 'Jun': '06-15',
          'Jul': '07-15', 'Aug': '08-15', 'Sep': '09-15', 'Oct': '10-15', 'Nov': '11-15', 'Dec': '12-31'}
for i in months:
    df = df.withColumn('issue_d', F.regexp_replace('issue_d', i, months[i]).alias('issue_d'))
    df = df.withColumn('last_pymnt_d', F.regexp_replace('last_pymnt_d', i, months[i]).alias('last_pymnt_d'))
df = df.withColumn('issue_d', (F.to_date('issue_d', 'MM-dd-yyyy')).alias('issue_d'))
df = df.withColumn('last_pymnt_d', (F.to_date('last_pymnt_d', 'MM-dd-yyyy')).alias('last_pymnt_d'))
df = df.withColumn('mnth_start2last',
                   F.months_between(df.last_pymnt_d, df.issue_d).alias('mnth_start2last'))
df = df.withColumn('fracNumPmts', F.col('mnth_start2last') / F.col('term2'))

# Indicators for default, early repayment
df2 = df  # keep a backup
# then drop rows with leftover na's
df = df.na.drop(how='any')
# for large sample size, use a subsample
df = df.sample(fraction=0.1)
#if df.count()>0:
#    del(df2)  # if df is okay (did not lose all data!)
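# An alternative sketch (not the approach used above): let to_date parse the
# "Dec-2015"-style strings directly with a 'MMM-yyyy' pattern. Note this pins
# every date to the 1st of the month, unlike the mid-month/end-of-month
# convention encoded in the months dict, and 'MMM' parsing assumes an English
# locale. (df_raw here stands for the dataframe before the regexp_replace loop.)
df_alt = df_raw.withColumn('issue_d', F.to_date('issue_d', 'MMM-yyyy')) \
               .withColumn('last_pymnt_d', F.to_date('last_pymnt_d', 'MMM-yyyy'))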
def create_classroom_feature(pargs, params): """ :inputs: download_df, browse_df, share_df, search_df, fav_df :outputs: classroom_data """ download_df = spark.read.parquet(data_paths['download_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) browse_df = spark.read.parquet(data_paths['browse_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) share_df = spark.read.parquet(data_paths['share_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) search_df = spark.read.parquet(data_paths['search_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) fav_df = spark.read.parquet(data_paths['fav_df'].format( run_mode=run['run_mode'], run_id=run['run_id'])) fav_df2 = fav_df.withColumn("date", to_date(fav_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) fav_df2 = fav_df2.withColumn( "MONTH_tmp", F.from_unixtime(F.unix_timestamp(fav_df2.date, "yyyyMM"))) fav_df2 = fav_df2.withColumn( "MONTH", F.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) fav_df3 = fav_df2.withColumn( "ADJ_USERID", expr("substring(USERNAME, 1, length(USERNAME)-2)")) fav_df3 = fav_df3.withColumn( "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))")) fav = fav_df3.withColumn("ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", "")) fav = fav.groupby(['ADJ_USERID', 'MONTH']).count() fav = fav.withColumnRenamed("count", "num_fav") download_df2 = download_df.withColumn( "date", to_date(download_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) download_df2 = download_df2.withColumn( "MONTH_tmp", F.from_unixtime(F.unix_timestamp(download_df2.date, "yyyyMM"))) download_df2 = download_df2.withColumn( "MONTH", F.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) download_df3 = download_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) download_df4 = download_df3.withColumn( "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", "")) download = download_df4.groupby(['ADJ_USERID', 'MONTH']).count() download = download.withColumnRenamed("count", "num_" + "download") browse_df2 = browse_df.withColumn( "date", to_date(browse_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) browse_df2 = browse_df2.withColumn( "MONTH_tmp", F.from_unixtime(F.unix_timestamp(browse_df2.date, "yyyyMM"))) browse_df2 = browse_df2.withColumn( "MONTH", F.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) browse_df3 = browse_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) browse_df4 = browse_df3.withColumn( "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", "")) browse = browse_df4.groupby(['ADJ_USERID', 'MONTH']).count() browse = browse.withColumnRenamed("count", "num_" + "browse") share_df2 = share_df.withColumn( "date", to_date(share_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) share_df2 = share_df2.withColumn( "MONTH_tmp", F.from_unixtime(F.unix_timestamp(share_df2.date, "yyyyMM"))) share_df2 = share_df2.withColumn( "MONTH", F.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) share_df3 = share_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) share_df4 = share_df3.withColumn( "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", "")) share = share_df4.groupby(['ADJ_USERID', 'MONTH']).count() share = share.withColumnRenamed("count", "num_" + "share") search_df2 = search_df.withColumn( "date", to_date(search_df.CRTIME, 'yyyy/MM/dd HH:mm:ss')) search_df2 = search_df2.withColumn( "MONTH_tmp", F.from_unixtime(F.unix_timestamp(search_df2.date, "yyyyMM"))) 
search_df2 = search_df2.withColumn( "MONTH", F.concat(expr("substring(MONTH_tmp, 1, 4)"), expr("substring(MONTH_tmp, 6, 2)"))) search_df3 = search_df2.withColumn( "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)")) search_df4 = search_df3.withColumn( "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", "")) search = search_df4.groupby(['ADJ_USERID', 'MONTH']).count() search = search.withColumnRenamed("count", "num_" + "search") data = [("2013-01-01", str(datetime.date.today()))] df = spark.createDataFrame(data, ["minDate", "maxDate"]) df = df.withColumn("monthsDiff", F.months_between("maxDate", "minDate")) \ .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \ .select("*", F.posexplode("repeat").alias("date", "val")) \ .withColumn("date", F.expr("add_months(minDate, date)")) \ .select('date') df = df.withColumn( "MONTH", F.from_unixtime(F.unix_timestamp(F.col("date")), "yyyyMM")).select('MONTH') unique_id = download.select('ADJ_USERID').distinct() \ .union(browse.select('ADJ_USERID').distinct()) \ .union(share.select('ADJ_USERID').distinct()) \ .union(search.select('ADJ_USERID').distinct()) \ .union(fav.select('ADJ_USERID').distinct()) unique_id = unique_id.distinct() all_abo_month = unique_id.crossJoin(df) combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \ .union(share.select(['ADJ_USERID', 'MONTH'])) \ .union(search.select(['ADJ_USERID', 'MONTH'])) \ .union(fav.select(['ADJ_USERID', 'MONTH'])) min_max_date = combine.groupby("ADJ_USERID").agg(F.min("MONTH"), F.max("MONTH")) all_abo_month = all_abo_month.join( min_max_date, all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID, how='left').drop(min_max_date.ADJ_USERID) all_abo_month = all_abo_month.filter(F.col("MONTH") >= F.col("min(MONTH)")) all_abo_month = all_abo_month.filter(F.col("MONTH") <= F.col("max(MONTH)")) all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"]) download = all_abo_month.join(download, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, 12): download = download.withColumn('num_' + "download" + str(n), F.lag(download['num_' + "download"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_3m", download['num_' + "download"] + download['num_' + "download" + "1"] + download['num_' + "download" + "2"]) download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_6m", download["n_lag_currentyr_" + "download" + "_sum_3m"] + download['num_' + "download" + "3"] + download['num_' + "download" + "4"] + download['num_' + "download" + "5"]) download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_9m", download["n_lag_currentyr_" + "download" + "_sum_6m"] + download['num_' + "download" + "6"] + download['num_' + "download" + "7"] + download['num_' + "download" + "8"]) download = download.withColumn( "n_lag_currentyr_" + "download" + "_sum_12m", download["n_lag_currentyr_" + "download" + "_sum_9m"] + download['num_' + "download" + "9"] + download['num_' + "download" + "10"] + download['num_' + "download" + "11"]) droplist = [] for n in range(1, 12): droplist = droplist + ['num_' + "download" + str(n)] download = download.drop(*droplist) browse = all_abo_month.join(browse, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, 12): browse = browse.withColumn('num_' + "browse" + str(n), F.lag(browse['num_' + "browse"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) browse = browse.withColumn( 
"n_lag_currentyr_" + "browse" + "_sum_3m", browse['num_' + "browse"] + browse['num_' + "browse" + "1"] + browse['num_' + "browse" + "2"]) browse = browse.withColumn( "n_lag_currentyr_" + "browse" + "_sum_6m", browse["n_lag_currentyr_" + "browse" + "_sum_3m"] + browse['num_' + "browse" + "3"] + browse['num_' + "browse" + "4"] + browse['num_' + "browse" + "5"]) browse = browse.withColumn( "n_lag_currentyr_" + "browse" + "_sum_9m", browse["n_lag_currentyr_" + "browse" + "_sum_6m"] + browse['num_' + "browse" + "6"] + browse['num_' + "browse" + "7"] + browse['num_' + "browse" + "8"]) browse = browse.withColumn( "n_lag_currentyr_" + "browse" + "_sum_12m", browse["n_lag_currentyr_" + "browse" + "_sum_9m"] + browse['num_' + "browse" + "9"] + browse['num_' + "browse" + "10"] + browse['num_' + "browse" + "11"]) droplist = [] for n in range(1, 12): droplist = droplist + ['num_' + "browse" + str(n)] browse = browse.drop(*droplist) share = all_abo_month.join(share, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, 12): share = share.withColumn('num_' + "share" + str(n), F.lag(share['num_' + "share"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_3m", share['num_' + "share"] + share['num_' + "share" + "1"] + share['num_' + "share" + "2"]) share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_6m", share["n_lag_currentyr_" + "share" + "_sum_3m"] + share['num_' + "share" + "3"] + share['num_' + "share" + "4"] + share['num_' + "share" + "5"]) share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_9m", share["n_lag_currentyr_" + "share" + "_sum_6m"] + share['num_' + "share" + "6"] + share['num_' + "share" + "7"] + share['num_' + "share" + "8"]) share = share.withColumn( "n_lag_currentyr_" + "share" + "_sum_12m", share["n_lag_currentyr_" + "share" + "_sum_9m"] + share['num_' + "share" + "9"] + share['num_' + "share" + "10"] + share['num_' + "share" + "11"]) droplist = [] for n in range(1, 12): droplist = droplist + ['num_' + "share" + str(n)] share = share.drop(*droplist) search = all_abo_month.join(search, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, 12): search = search.withColumn('num_' + "search" + str(n), F.lag(search['num_' + "search"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_3m", search['num_' + "search"] + search['num_' + "search" + "1"] + search['num_' + "search" + "2"]) search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_6m", search["n_lag_currentyr_" + "search" + "_sum_3m"] + search['num_' + "search" + "3"] + search['num_' + "search" + "4"] + search['num_' + "search" + "5"]) search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_9m", search["n_lag_currentyr_" + "search" + "_sum_6m"] + search['num_' + "search" + "6"] + search['num_' + "search" + "7"] + search['num_' + "search" + "8"]) search = search.withColumn( "n_lag_currentyr_" + "search" + "_sum_12m", search["n_lag_currentyr_" + "search" + "_sum_9m"] + search['num_' + "search" + "9"] + search['num_' + "search" + "10"] + search['num_' + "search" + "11"]) droplist = [] for n in range(1, 12): droplist = droplist + ['num_' + "search" + str(n)] search = search.drop(*droplist) fav = all_abo_month.join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) for n in range(1, 12): fav = fav.withColumn('num_' + "fav" + str(n), F.lag(fav['num_' + "fav"], n, 0) \ .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH"))) 
fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_3m", fav['num_' + "fav"] + fav['num_' + "fav" + "1"] + fav['num_' + "fav" + "2"]) fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_6m", fav["n_lag_currentyr_" + "fav" + "_sum_3m"] + fav['num_' + "fav" + "3"] + fav['num_' + "fav" + "4"] + fav['num_' + "fav" + "5"]) fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_9m", fav["n_lag_currentyr_" + "fav" + "_sum_6m"] + fav['num_' + "fav" + "6"] + fav['num_' + "fav" + "7"] + fav['num_' + "fav" + "8"]) fav = fav.withColumn( "n_lag_currentyr_" + "fav" + "_sum_12m", fav["n_lag_currentyr_" + "fav" + "_sum_9m"] + fav['num_' + "fav" + "9"] + fav['num_' + "fav" + "10"] + fav['num_' + "fav" + "11"]) droplist = [] for n in range(1, 12): droplist = droplist + ['num_' + "fav" + str(n)] fav = fav.drop(*droplist) classroom_data = all_abo_month.join( download, ['ADJ_USERID', 'MONTH'], 'left').join(browse, ['ADJ_USERID', 'MONTH'], 'left').join(share, ['ADJ_USERID', 'MONTH'], 'left').join( search, ['ADJ_USERID', 'MONTH'], 'left').join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0) classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no") classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no") df = classroom_data df = df.withColumn('mo_yr_key_no', df.mo_yr_key_no.cast('string')) df = df.withColumn('mo_yr_key_no', to_timestamp(df.mo_yr_key_no, 'yyyyMM')) df = df.withColumn('mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd')) classroom_data = df print("now saving the data") classroom_data.write.parquet(data_paths['classroom_data'].format( run_mode=run['run_mode'], run_id=run['run_id']), mode='overwrite')
# ### Age: from date of birth 'dob'

# In[34]:

users_raw.select('userId', 'timestamp', 'dob').show(5)

# In[35]:

# users_raw.select('timestamp', to_date('timestamp')).show(5)

# In[36]:

# compute user's age
users_age = users_raw.select(
    'userId',
    round(months_between(users_raw['timestamp'], users_raw['dob']) / 12, 0).alias("age"))

# In[37]:

# check few results:
users_age.show(3)

# In[38]:

users_age.describe().toPandas().transpose()

# # BuyClicks
# purchase behavior of users
# * based on buyId, price
# * amount spent: total, yearly, monthly?
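# A hypothetical sketch of the BuyClicks aggregation hinted at above (a
# `buy_clicks` DataFrame with `userId`, `timestamp` and `price` columns is
# assumed): total and per-month spend for each user.
from pyspark.sql.functions import sum as sum_, year, month

total_spend = buy_clicks.groupBy('userId').agg(sum_('price').alias('total_spent'))

monthly_spend = buy_clicks \
    .withColumn('year', year('timestamp')) \
    .withColumn('month', month('timestamp')) \
    .groupBy('userId', 'year', 'month') \
    .agg(sum_('price').alias('monthly_spent'))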
def _compute_longitudinal_age_groups( self, cohort: Cohort, col_offset: int) -> Tuple[DataFrame, List[str]]: """ Parameters ---------- cohort: Cohort cohort on which the age groups should be computed col_offset: int number of columns used by lagged exposure features Returns ------- (age_features, mapping): Tuple(DataFrame, List(str)) a dataframe containing the age features in aij format and a mapping giving the correspondence between column number and age group. """ # This implementation is suboptimal, but we need to have something # working with inconsistent python versions across the cluster. assert (cohort.has_subject_information( )), "Cohort subjects should have gender and birthdate information" subjects = cohort.subjects.select("patientID", "gender", "birthDate") bucket_ids = sf.array([sf.lit(i) for i in range(self.n_buckets)]) subjects = (subjects.withColumn("bucketID", bucket_ids).select( "PatientID", "gender", "birthDate", sf.explode("bucketID").alias("bucket"), ).withColumn("dateShift", sf.col("bucket") * self.bucket_size).withColumn( "referenceDate", sf.lit(self.age_reference_date))) # Longitudinal age is based on referenceDate instead of minDate to # be consistent with cohort definition. time_references = sf.expr("date_add(referenceDate, dateShift)") longitudinal_age = sf.floor( sf.months_between(time_references, sf.col("birthdate")) / 12) subjects = subjects.withColumn("longitudinalAge", longitudinal_age).select( "patientID", "gender", "birthDate", "bucket", "longitudinalAge") subjects, n_age_groups, mapping = self._bucketize_age_column( subjects, "longitudinalAge", "longitudinalAgeBucket") assert n_age_groups == self.n_age_groups, ( "Computed number of age groups is different from the number of specified" " age groups at initialization. There might be empty age_groups," " you should investigate this.") age_features = subjects.select( sf.col("patientID"), sf.col("bucket").alias("rowIndex"), (sf.col("longitudinalAgeBucket") + col_offset).alias("colIndex"), ) # Remove "age events" which are not in follow-up fup_events = self.followups.intersection(self.final_cohort).events fup_events = self._discretize_start_end(fup_events) fup_events = rename_df_columns(fup_events, prefix="fup_") age_features_columns = age_features.columns age_features = age_features.join(fup_events, on="patientID") age_features = age_features.where( sf.col("rowIndex").between(sf.col("fup_startBucket"), sf.col("fup_endBucket"))) age_features = age_features.select(*age_features_columns) return age_features, mapping
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()
    spark.read.jdbc
    spark.range(5)\
        .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
        .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()
    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
        .select(datediff(col('date2'), col('date1')),
                months_between(col('date2'), col('date1'))).show()
    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  # Does not raise an exception
    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
        .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()
    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('timestamp')))  # the aliased column is "timestamp", not "date"
# COMMAND ---------- from pyspark.sql.functions import date_add, date_sub dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1) # COMMAND ---------- from pyspark.sql.functions import datediff, months_between, to_date dateDF.withColumn("week_ago", date_sub(col("today"), 7))\ .select(datediff(col("week_ago"), col("today"))).show(1) dateDF.select( to_date(lit("2016-01-01")).alias("start"), to_date(lit("2017-05-22")).alias("end"))\ .select(months_between(col("start"), col("end"))).show(1) # COMMAND ---------- from pyspark.sql.functions import to_date, lit spark.range(5).withColumn("date", lit("2017-01-01"))\ .select(to_date(col("date"))).show(1) # COMMAND ---------- from pyspark.sql.functions import to_date dateFormat = "yyyy-dd-MM" cleanDateDF = spark.range(1).select( to_date(lit("2017-12-11"), dateFormat).alias("date"),
#df_1.select(substring_index("value", "_", 1)).show() # COMMAND ---------- #Date from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit orders_new_col.select( current_timestamp(), current_date(), date_format(current_timestamp(), "yyyy-MM-dd a hh:HH:mm").alias("dat"), dayofyear(current_timestamp()).alias("dayofyear_val"), month(current_timestamp()).alias("mon"), date_sub(current_timestamp(), 1).alias("date_d")).select( datediff(current_timestamp(), "date_d")).show(1, truncate=False) orders_new_col.select(add_months( current_timestamp(), -1).alias("prev_mon")).select( months_between(current_timestamp(), "prev_mon").alias("no_of_mon")).select( next_day(current_timestamp(), "Mon")).select( last_day(current_timestamp())).show(1) orders_new_col.select(date_trunc('year', lit('2020-04-01'))).show(1) # COMMAND ---------- from pyspark.sql.functions import concat_ws #orders_new_col.printSchema() #spark.sql("create database if not exists newdb") #spark.sql("show databases").show() ordersText = orders_new_col.select( concat_ws("~", "order_id", "order_date", "order_customer_id", "order_status").alias("concal_col")) #ordersText.write.mode("overwrite").text("/FileStore/tables/cca175/test_data/retail_db/order_items/text_format") #read_text_file = spark.read.text("/FileStore/tables/cca175/test_data/retail_db/order_items/text_format")
## Create a new column in the Borrow table that takes the value 1 if the loan
## duration is longer than 3 months, otherwise 0

## SQL
print("**** Loan status")
spark.sql("""
    select *,
           case when (months_between(to_date(`return_time`, 'dd-MM-yyyy'),
                                     to_date(`checkout_time`, 'dd-MM-yyyy')) > 3)
                then 1 else 0 end as `Plus de 3 mois`
    from Borrow_SQL
""").show()

## DSL
Borrow_export = Borrow\
    .withColumn("format", F.lit("dd-MM-yyyy"))\
    .withColumn("start", F.expr("to_date(`checkout_time`, format)"))\
    .withColumn("end", F.expr("to_date(`return_time`, format)"))\
    .withColumn("Plus de 3 mois",
                F.when(F.months_between(F.col('end'), F.col('start')) > 3, 1)\
                .otherwise(0))\
    .drop('format', 'start', 'end')

## Path configuration
import configparser
config = configparser.ConfigParser()
config.read('properties.conf')
path_to_output_data = config['BDA']['path']

## Export to csv
Borrow_export.toPandas().to_csv(path_to_output_data + "Borrow.csv")

## Determine the books that have never been borrowed
## SQL
print("*** Books that were never borrowed")
print("Number of edits per month over all articles: ") df_monthly.select("title", "yearmonth", "count").show() min_date, max_date = df_monthly_ts.select( min_("yearmonth").cast("long"), max_("yearmonth").cast("long")).first() data = [(min_date, max_date)] df_dates = spark.createDataFrame(data, ["minDate", "maxDate"]) df_min_max_date = df_dates.withColumn( "minDate", col("minDate").cast("timestamp")).withColumn( "maxDate", col("maxDate").cast("timestamp")) df_formatted_ts = df_min_max_date.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))\ .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\ .select("*", f.posexplode("repeat").alias("date", "val"))\ .withColumn("date", f.expr("add_months(minDate, date)"))\ .withColumn("yearmonth", f.concat(f.year("date"), f.lit('-'), format_string("%02d", f.month("date"))))\ .select('yearmonth') df_group_ts = df_groups.crossJoin(df_formatted_ts) df_allts = df_group_ts.join(df_monthly, ['title', 'yearmonth'], how='left') \ .orderBy('title', 'yearmonth').select('title', 'yearmonth', 'count') df_allts.orderBy(desc('count')).show(100) print('Calculate average edits per month for each article :') window = Window.partitionBy("title").orderBy('yearmonth').rowsBetween(-1, 1)
# select date_sub(today, 5), date_add(today, 5) from dateTable # Another task is to take a look at the difference between two dates. We can do this with datediff function dateDF.withColumn('week_ago', date_sub(col('today'), 7))\ .select(datediff(col('week_ago'), col('today'))).show(1) dateDF.withColumn('week_ago', date_sub(col('today'), 7))\ .selectExpr('datediff(week_ago, today)').show(1) spark.sql('select datediff(week_ago, today) from (select *, date_sub(today, 7) as week_ago from dateTable)') dateDF.select( to_date(lit('2016-01-01')).alias('start'), to_date(lit('2017-05-22')).alias('end'))\ .select(months_between(col('start'), col('end'))).show(1) dateDF.selectExpr( 'to_date("2016-01-01") as start', 'to_date("2017-05-22") as end')\ .selectExpr( 'months_between(start, end)', 'datediff("2016-01-01", "2017-01-01")' ).show(1) # We introduces the to_date function. The to_date function allows you to convert a string to a date, optionally with a specified format # We specify our format in the Java SimpleDateFormat spark.range(5).withColumn('date', lit('2017-01-01'))\ .select(to_date(col('date'))).show(1)
# Alternatively, use the `to_timestamp` function:

from pyspark.sql.functions import to_timestamp

rides \
    .withColumn("date_time_fixed", to_timestamp("date_time", format="yyyy-MM-dd HH:mm")) \
    .select("date_time", "date_time_fixed") \
    .show(5)


# ### Example 8: Computing the age of each rider

# Use the `current_date` and `months_between` functions to compute the age of
# each rider:

from pyspark.sql.functions import current_date, months_between, floor

riders \
    .withColumn("today", current_date()) \
    .withColumn("age", floor(months_between("today", "birth_date") / 12)) \
    .select("birth_date", "today", "age") \
    .show(5)

# **Note:** Spark implicitly casts `birth_date` or `today` as necessary. It is
# probably safer to explicitly cast one of these columns before computing the
# number of months between (see the sketch at the end of this section).


# ## Working with Boolean columns

# ### Example 10: Predefining a Boolean column expression

# You can predefine a Boolean column expression:

studentFilter = col("student") == 1
type(studentFilter)
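# Following up on the note in Example 8: a sketch of the same age computation
# with an explicit cast of `birth_date` to a date column (either column could
# be cast instead).
from pyspark.sql.functions import col, current_date, months_between, floor

riders \
    .withColumn("birth_date_d", col("birth_date").cast("date")) \
    .withColumn("today", current_date()) \
    .withColumn("age", floor(months_between("today", "birth_date_d") / 12)) \
    .select("birth_date", "today", "age") \
    .show(5)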