# Shared imports for the snippets below; the snippets also rely on
# project-specific helpers (BPModel, date_generator, get_source_data,
# get_toddler_data, get_test_data, extract_time_features, make_embedding,
# df_to_table, ...) that are assumed to be defined elsewhere.
from datetime import datetime, timedelta
from pathlib import Path
import sys

import databricks.koalas as ks
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.utils import ParseException


def get_gender_feature():
    train_user = ks.read_csv("data/train_preliminary/user.csv")
    train_click_log = ks.read_csv("data/train_preliminary/click_log.csv")
    train_data = train_user.merge(train_click_log, on="user_id", how='inner')

    # total clicks per (creative_id, gender)
    sql = '''
        select creative_id, gender, sum(nvl(click_times, 0)) click_times
        from {train_data}
        group by creative_id, gender
    '''
    gender_data = ks.sql(sql, train_data=train_data)
    gender_data.cache()

    # each gender's share of a creative's total clicks
    sql = '''
        SELECT creative_id, gender,
               click_times / sum(click_times) OVER (
                   PARTITION BY creative_id
                   ORDER BY click_times DESC
                   ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) gender_dist
        FROM {gender_data}
    '''
    gender_dist_data = ks.sql(sql, gender_data=gender_data)
    gender_dist_data.cache()

    gender_dist_pivot = gender_dist_data.pivot(
        index='creative_id', columns='gender', values='gender_dist')
    gender_dist_pivot.columns = ['gender_' + str(ele) for ele in range(1, 3)]
    gender_dist_pivot = gender_dist_pivot.reset_index()
    gender_dist_pivot.fillna(0, inplace=True)
    gender_dist_pivot.to_csv('./data/gender_dist', num_files=1)

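# A minimal sketch (not from the original pipeline) of the same
# distribution-then-pivot pattern on toy data; it assumes a running Spark
# session. Each creative's gender shares should sum to 1.
def _gender_dist_sketch():
    toy = ks.DataFrame({'creative_id': [1, 1, 2],
                        'gender': [1, 2, 1],
                        'click_times': [3, 1, 5]})
    dist = ks.sql('''
        SELECT creative_id, gender,
               click_times / sum(click_times) OVER (PARTITION BY creative_id) gender_dist
        FROM {toy}
    ''', toy=toy)
    pivot = dist.pivot(index='creative_id', columns='gender',
                       values='gender_dist')
    print(pivot.fillna(0))  # creative 1 -> 0.75 / 0.25, creative 2 -> 1.0 / 0.0
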
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = "data/*.json"

    # read log data file
    df = ks.read_json(log_data)

    # extract columns for users table
    users_table = ks.sql('''
        SELECT DISTINCT userId, firstName, lastName, gender, level
        FROM {df}''')

    # write users table to parquet files
    users_table.to_spark().write.mode('overwrite').parquet('users/')

    # build the time table from the full original timestamp column so every
    # event's timestamp lands in the time table
    unix_time_series = df.ts.copy()
    get_datetime_timestamp = ks.DataFrame(data=unix_time_series)

    # extract columns to create time table (extract_time_features must add
    # the year/month columns used by the partitioned write below)
    get_datetime_timestamp.pipe(extract_time_features)

    # write time table to parquet files partitioned by year and month
    get_datetime_timestamp.to_spark().write.mode('overwrite').partitionBy(
        "year", "month").parquet('time/')

    # read in song data to use for songplays table
    song_df = ks.read_json("data/song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = ks.sql('''
        SELECT e.ts, t.month, t.year, e.userId, e.level, s.song_id,
               s.artist_id, e.sessionId, e.location, e.userAgent
        FROM {df} e
        JOIN {song_df} s ON (e.song = s.title AND e.artist = s.artist_name)
        JOIN {get_datetime_timestamp} t ON (e.ts = t.ts)''')

    # write songplays table to parquet files partitioned by year and month
    songplays_table.to_spark().write.mode('overwrite').partitionBy(
        "year", "month").parquet('songplays/')

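# `extract_time_features` is not defined in this snippet; a plausible
# (hypothetical) implementation would add the columns the partitioned write
# above expects, mutating the frame in place so pipe() works:
def extract_time_features(kdf):
    # the log's ts field is an epoch-millisecond timestamp
    kdf['datetime'] = ks.to_datetime(kdf['ts'], unit='ms')
    kdf['hour'] = kdf['datetime'].dt.hour
    kdf['day'] = kdf['datetime'].dt.day
    kdf['week'] = kdf['datetime'].dt.week
    kdf['month'] = kdf['datetime'].dt.month
    kdf['year'] = kdf['datetime'].dt.year
    kdf['weekday'] = kdf['datetime'].dt.weekday
    return kdf
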
def main(args):
    print("main start!")
    print(args)
    formatted_target_date = datetime.strptime(args.target_date, '%Y-%m-%d')
    begin_date = formatted_target_date - timedelta(days=int(args.days))
    end_date = formatted_target_date
    date_list = date_generator(begin_date.strftime('%Y%m%d'),
                               end_date.strftime('%Y%m%d'))
    str_date_list = "('" + "','".join(date_list) + "')"
    print("date list : {}".format(str_date_list))

    model = BPModel()
    spark_session = SparkSession.builder \
        .appName('Yeondys') \
        .config("spark.driver.memory", "15g") \
        .enableHiveSupport() \
        .getOrCreate()

    corpus = ks.sql(get_source_data(str_date_list))
    if len(corpus) > 0:  # a DataFrame's truth value is ambiguous, so test length
        model.threshold_likes(corpus.to_pandas(), int(args.brand_visit_cnt),
                              int(args.user_visit_cnt))
        # For TEST runs; production uses the 5000 and 5 configured in Airflow
        # model.threshold_likes(corpus.to_pandas(), 1, 1)
    else:
        print("data collect fail!")
        sys.exit()

    corpus = ks.sql(get_toddler_data())
    if len(corpus) > 0:
        model.get_toddler_brand(corpus.to_pandas())
    else:
        print("toddler_data collect fail!")
        sys.exit()

    print("start prepare")
    model.prepare(end_date.strftime('%Y%m%d'))
    model.process()
    # begin_date = formatted_target_date - timedelta(days=14)
    # end_date = formatted_target_date
    # corpus = ks.sql(get_order_data(begin_date.strftime('%Y%m%d'),
    #                                end_date.strftime('%Y%m%d')))
    # print("koalas order data size : {}".format(corpus.size))
    # model.post_process(corpus.to_pandas())
    model.post_process()
    result_df = ks.from_pandas(model.final_df)
    result_df.to_table(name='members_kangcj.brand_preference', format='orc',
                       mode='overwrite')

def process_song_data(spark, input_data, output_data):
    """Process song_data to create the songs and artists tables."""
    # get filepath to song data file
    song_data = 'data/song_data/A/B/C/*.json'

    # read song data file
    kdf = ks.read_json(song_data)

    # extract columns to create songs table:
    # song_id, title, artist_id, year, duration
    songs_table = ks.sql("""
        SELECT DISTINCT row_number() over (ORDER BY year, title, artist_id) id,
               song_id, title, artist_id, year, duration
        FROM {kdf}
    """)

    # write songs table to parquet files partitioned by year and artist
    (songs_table.to_spark().write.partitionBy("year", "artist_id").parquet(
        f'{output_data}/songs', mode="overwrite"))

    # extract columns to create artists table
    artists_table = ks.sql("""
        SELECT DISTINCT row_number() over (ORDER BY artist_name) id,
               artist_id, artist_name, artist_location,
               artist_latitude, artist_longitude
        FROM {kdf}
    """)

    # write artists table to parquet files
    (artists_table.to_spark().write.parquet(f'{output_data}/artists',
                                            mode="overwrite"))

def make_recall(self):
    df = ks.sql(
        Path('./brand_lterm_preference/get_recall.sql').read_text()
    ).to_pandas()
    brd_df = ks.sql(
        Path('./brand_lterm_preference/get_brand.sql').read_text()
    ).to_pandas()
    merge_df = df.merge(brd_df, on='brand', how='left')

    # preprocessing
    merge_df = merge_df.dropna()
    merge_df['cnt'] = merge_df['cnt'].astype(int).astype(str)
    merge_df['score'] = merge_df['score'].astype(str)
    merge_df = merge_df[['userid', 'brand', 'score', 'brd_nm', 'cnt']]

    final_df = merge_df.groupby('userid').agg(
        lambda x: '^'.join(list(x)) if x.name == 'brd_nm'
        else ','.join(list(x))).reset_index()
    final_df['prdid'] = ''
    self.df_to_table(final_df, 'members_bycho.brand_lterm_preference')

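# The agg lambda above picks a separator per column: '^' for brd_nm and ','
# for everything else. A toy pandas illustration (not from the pipeline):
def _separator_agg_sketch():
    toy = pd.DataFrame({'userid': ['u1', 'u1'],
                        'brand': ['10', '20'],
                        'brd_nm': ['Nike', 'Adidas']})
    out = toy.groupby('userid').agg(
        lambda x: '^'.join(list(x)) if x.name == 'brd_nm'
        else ','.join(list(x))).reset_index()
    print(out)  # brand -> "10,20", brd_nm -> "Nike^Adidas"
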
def process_song_data(spark, input_data, output_data):
    # get filepath to song data file
    song_data = "data/song_data/*/*/*/*.json"

    # read song data file
    df = ks.read_json(song_data)

    # extract columns to create songs table
    songs_table = ks.sql('''
        SELECT DISTINCT song_id, title, artist_id, year, duration
        FROM {df}''')

    # write songs table to parquet files partitioned by year and artist
    songs_table.to_spark().write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet('songs/')

    # extract columns to create artists table
    artists_table = ks.sql('''
        SELECT DISTINCT artist_id, artist_name, artist_location,
               artist_latitude, artist_longitude
        FROM {df}''')

    # write artists table to parquet files
    artists_table.to_spark().write.mode('overwrite').parquet('artists/')

def process_song_data(spark, input_data, output_data):
    # INPUTS:
    #   [spark]       : an instance of the Spark session
    #   [input_data]  : path to the data set
    #   [output_data] : path to where to save the processed data

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    df = ks.read_json(song_data)

    # extract columns to create songs table
    songs_table = ks.sql('''
        SELECT DISTINCT row_number() over (ORDER BY year, title, artist_id) id,
               song_id, title, artist_id, year, duration
        FROM {df}''')

    # write songs table to parquet files partitioned by year and artist
    songs_table.to_spark().write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + 'songs/')

    # extract columns to create artists table
    artists_table = ks.sql('''
        SELECT DISTINCT artist_id, artist_latitude, artist_location,
               artist_longitude, artist_name
        FROM {df}''')

    # write artists table to parquet files
    artists_table.to_spark().write.mode('overwrite').partitionBy(
        "artist_name").parquet(output_data + 'artists/')

def make_brand_index(self):
    large_cate_df = ks.sql(
        Path('./brand_lterm_preference/get_large_cate.sql').read_text()
    ).to_pandas()
    self.col_list = large_cate_df.cate1.unique().tolist()
    for cate1 in self.col_list:
        self.brand_feat_dict[cate1] = []
    self.feature_idx_dict = {
        'pcid_emb': 0,
        'brand_emb': len(self.col_list),
        'ismatch': len(self.col_list) * 2 + 1
    }
    tmp_df = pd.DataFrame([self.feature_idx_dict])
    self.df_to_table(tmp_df, "members_bycho.brand_feature_idx")

def make_pcid_feat(self):
    pcid_emb_df = ks.sql(
        Path('./brand_lterm_preference/get_pcid_bhv.sql').read_text()
    ).to_pandas()
    pcid_emb_df = pcid_emb_df.dropna()
    pcid_emb_df['cate1'] = pcid_emb_df['cate1'].astype(int)
    pcid_feat_df = self.make_embedding(pcid_emb_df, 'pcid')
    self.make_processing(pcid_feat_df, 'pcid')
    # sanity-check one known pcid
    print(self.pcid_feature_dict['mEVEBGZUD8KY93ykgHWIw'])
    tmp = [[pcid, feat, self.pcid_lcate_dict[pcid]]
           for pcid, feat in self.pcid_feature_dict.items()]
    tmp_df = pd.DataFrame(tmp, columns=['pcid', 'feat1', 'feat2'])
    self.df_to_table(tmp_df, "members_bycho.pcid_features")

def combine_feature(train=True):
    user_filename = ('data/train_preliminary/user.csv' if train
                     else './data/test/click_log.csv')
    result_filename = ('./data/combine_feature' if train
                       else './data/combine_feature_test')
    user_df = ks.read_csv(user_filename)
    if not train:
        user_df = ks.sql('select distinct user_id from {user_df}',
                         user_df=user_df)
    wv_feature = ks.read_csv('data/wv_features.csv')
    nn_feature = ks.read_csv('data/nn_features.csv')
    stats_data = ks.read_csv(
        "data/stats_features/part-00000-f6695da4-6d9f-4ba4-80b1-d370e636696b-c000.csv"
    )
    all_features = user_df.merge(wv_feature, on='user_id').merge(
        nn_feature, on='user_id').merge(stats_data, on='user_id')
    print(all_features.shape)
    all_features.to_csv(result_filename, num_files=1)

def make_brand_feat(self):
    brand_emb_df = ks.sql(
        Path('./brand_lterm_preference/get_brand_bhv.sql').read_text()
    ).to_pandas()
    brand_emb_df = brand_emb_df.dropna()
    brand_emb_df['cate1'] = brand_emb_df['cate1'].astype(int)
    brand_feat_df = self.make_embedding(brand_emb_df, 'brd_id')
    self.make_processing(brand_feat_df, 'brd_id')
    # sanity-check one known brand id
    print(self.brand_feature_dict['120402'])
    print(self.brand_lcate_dict['120402'])
    tmp = [[brd_id, feat, self.brand_lcate_dict[brd_id]]
           for brd_id, feat in self.brand_feature_dict.items()]
    tmp_df = pd.DataFrame(tmp, columns=['brd_id', 'feat1', 'feat2'])
    self.df_to_table(tmp_df, "members_bycho.brand_features")

def get_ad_dict():
    train_ad = ks.read_csv("../data/train_preliminary/ad.csv")
    test_ad = ks.read_csv("../data/test/ad.csv")
    ad_info = ks.concat([train_ad, test_ad], axis=0)
    ad_info = ad_info.drop_duplicates()
    ad_dict_sql = '''
        select creative_id, product_id, product_category, advertiser_id, industry,
               row_number() over (partition by product_id, product_category,
                                  advertiser_id, industry
                                  order by 1 desc) ad_rn
        from {ad_info}
    '''
    ad_info = ks.sql(ad_dict_sql, ad_info=ad_info)
    print(ad_info.nunique())
    ad_info.to_csv('../data/ad_info', index=False, num_files=1)

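# Hypothetical downstream use of ad_rn (not shown in the original): the
# window numbers creatives within each (product_id, product_category,
# advertiser_id, industry) group, so one representative creative per group
# could be kept with a simple filter:
#
#   deduped = ks.sql('select * from {ad_info} where ad_rn = 1',
#                    ad_info=ad_info)
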
def get_test_corpus():
    test_data = get_test_data()
    query = '''
        select user_id,
               concat_ws(' ', collect_list(b.time)) time1,
               concat_ws(' ', collect_list(b.creative_id)) creative_id,
               concat_ws(' ', collect_list(b.click_times)) click_times,
               concat_ws(' ', collect_list(b.ad_id)) ad_id,
               concat_ws(' ', collect_list(b.product_id)) product_id,
               concat_ws(' ', collect_list(b.product_category)) product_category,
               concat_ws(' ', collect_list(b.advertiser_id)) advertiser_id,
               concat_ws(' ', collect_list(b.industry)) industry
        from {test_data} b
        group by user_id
        order by time1 asc
    '''
    test_encode_result = ks.sql(query=query, test_data=test_data)
    test_encode_result.to_csv('../data/predict_corpus', index=False,
                              num_files=1)

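# Note: Spark's collect_list gives no ordering guarantee within a group;
# the ORDER BY time1 above only sorts the output rows. If each user's
# sequence must be time-ordered, sort inside the aggregation instead, e.g.
# sort_array(collect_list(struct(b.time, b.creative_id))) followed by
# transform(..., x -> x.creative_id) before concat_ws.
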
def make_train_validate_testset(self):
    dataset_df = ks.sql(
        Path('./brand_lterm_preference/get_dataset.sql').read_text()
    ).to_pandas()
    # shuffle, then split 60/20/20 into train/validate/test
    self.train, self.validate, self.test = np.split(
        dataset_df.sample(frac=1),
        [int(.6 * len(dataset_df)), int(.8 * len(dataset_df))])
    filename_list = ['train.txt', 'validate.txt', 'test.txt']
    df_list = [self.train, self.validate, self.test]
    try:
        for df, filename in zip(df_list, filename_list):
            txt_file = open(filename, 'w')
            for row in df.itertuples():
                vec = []
                label = row.istarget
                if (str(row.pcid) in self.pcid_feature_dict
                        and str(row.brand) in self.brand_feature_dict):
                    ismatch = len(
                        set(self.pcid_lcate_dict[str(row.pcid)])
                        & set(self.brand_lcate_dict[str(row.brand)]))
                    pcid_feat = self.pcid_feature_dict[str(row.pcid)]
                    brand_feat = self.brand_feature_dict[str(row.brand)]
                    vec.append(str(label))
                    vec.append(pcid_feat)
                    vec.append(brand_feat)
                    if ismatch > 0:
                        vec.append(
                            str(self.feature_idx_dict['ismatch']) + ":" + str(1))
                    txt_file.write("%s\n" % " ".join(vec))
            txt_file.close()
    except IOError as e:
        print("I/O error({0}) : {1}".format(e.errno, e.strerror))
    else:
        self.move_to_opt(filename_list)

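# A quick check of the 60/20/20 np.split pattern used above, on toy data:
def _split_sketch():
    toy = pd.DataFrame({'x': range(10)})
    train, validate, test = np.split(
        toy.sample(frac=1, random_state=0),
        [int(.6 * len(toy)), int(.8 * len(toy))])
    print(len(train), len(validate), len(test))  # 6 2 2
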
def test_error_bad_sql(self):
    with self.assertRaises(ParseException):
        ks.sql("this is not valid sql")

def test_error_unsupported_type(self):
    msg = "Unsupported variable type <class 'dict'>: {'a': 1}"
    with self.assertRaisesRegex(ValueError, msg):
        some_dict = {"a": 1}
        ks.sql("select * from {some_dict}")

def test_error_variable_not_exist(self):
    msg = "The key variable_foo in the SQL statement was not found.*"
    with self.assertRaisesRegex(ValueError, msg):
        ks.sql("select * from {variable_foo}")

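# Together these tests pin down ks.sql's {var} interpolation: koalas
# DataFrames and simple values (strings, numbers, lists) found in the
# caller's scope are substituted into the query, dicts are rejected, and
# unknown names raise. A sketch:
#
#   kdf = ks.DataFrame({'a': [1, 2, 3]})
#   low = 1
#   ks.sql('select * from {kdf} where a > {low}')  # rows with a = 2, 3
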
# MAGIC %md
# MAGIC ### Value Counts

# COMMAND ----------

# To get value counts of the different property types with PySpark
display(df.groupby("property_type").count().orderBy("count", ascending=False))

# COMMAND ----------

# Value counts in Koalas
kdf["property_type"].value_counts()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Visualizations with Koalas DataFrames

# COMMAND ----------

kdf.plot(kind="hist", x="bedrooms", y="price", bins=200)

# COMMAND ----------

# MAGIC %md
# MAGIC ### SQL on Koalas DataFrames

# COMMAND ----------

ks.sql("select distinct(property_type) from {kdf}")

def sterm_reranking(self):
    print("start reranking!!")
    brand_feat_df = ks.sql(
        Path('./brand_lterm_preference/get_brand_feat.sql').read_text()
    ).to_pandas()
    pcid_feat_df = ks.sql(
        Path('./brand_lterm_preference/get_pcid_feat.sql').read_text()
    ).to_pandas()
    sterm_df = ks.sql(
        Path('./brand_lterm_preference/get_sterm.sql').read_text()
    ).to_pandas()
    brd_df = ks.sql(
        Path('./brand_lterm_preference/get_brand.sql').read_text()
    ).to_pandas()
    feat_idx_df = ks.sql(
        Path('./brand_lterm_preference/get_feat_idx.sql').read_text()
    ).to_pandas()

    pcid_feat_dict = {}
    for row in pcid_feat_df.itertuples():
        pcid_feat_dict.setdefault(row.userid, {})['feat1'] = row.feat1
        pcid_feat_dict.setdefault(row.userid, {})['feat2'] = row.feat2
    brand_feat_dict = {}
    for row in brand_feat_df.itertuples():
        brand_feat_dict.setdefault(row.brd_id, {})['feat1'] = row.feat1
        brand_feat_dict.setdefault(row.brd_id, {})['feat2'] = row.feat2

    target_dict = {'userid': [], 'brand': []}
    file_name = "/".join([self.OPT_HOME, 'sterm.txt'])
    try:
        txt_file = open(file_name, 'w')
        for row in sterm_df.itertuples():
            vec = []
            if (str(row.userid) in pcid_feat_dict
                    and str(row.brand) in brand_feat_dict):
                ismatch = len(
                    set(pcid_feat_dict[str(row.userid)]['feat2'])
                    & set(brand_feat_dict[str(row.brand)]['feat2']))
                pcid_feat = pcid_feat_dict[str(row.userid)]['feat1']
                brand_feat = brand_feat_dict[str(row.brand)]['feat1']
                vec.append(pcid_feat)
                vec.append(brand_feat)
                if ismatch > 0:
                    vec.append(
                        str(feat_idx_df.iloc[0]['ismatch']) + ":" + str(1))
                txt_file.write("%s\n" % " ".join(vec))
                target_dict['userid'].append(row.userid)
                target_dict['brand'].append(row.brand)
        txt_file.close()
    except IOError as e:
        print("I/O error({0}) : {1}".format(e.errno, e.strerror))
    else:
        if self.run_model() == 0:
            out_file = "/".join([self.OPT_HOME, 'sterm.txt.out'])
            score_df = pd.read_csv(out_file, sep='\t', names=["score"])
            target_df = pd.DataFrame(target_dict)
            rerank_df = pd.concat([target_df, score_df], axis=1)
            rerank_df['brand'] = rerank_df['brand'].astype(int)
            brd_df['brand'] = brd_df['brand'].astype(int)
            rerank_df = rerank_df.merge(brd_df, on='brand', how='left')
            rerank_df = rerank_df[(rerank_df.brand.notnull())
                                  & (rerank_df.brd_nm.notnull())
                                  & (rerank_df.cnt.notnull())]
            # rank each user's brands by score and keep the top 8
            rerank_df['RN'] = rerank_df.sort_values(
                ['userid', 'score'], ascending=[True, False]) \
                .groupby(['userid']) \
                .cumcount() + 1
            rerank_df = rerank_df[rerank_df.RN < 9]
            rerank_df = rerank_df.sort_values(['userid', 'RN'],
                                              ascending=[True, True])
            rerank_df = rerank_df[[
                'userid', 'brand', 'score', 'brd_nm', 'cnt'
            ]]
            rerank_df['brand'] = rerank_df['brand'].astype(str)
            rerank_df['score'] = rerank_df['score'].astype(str)
            rerank_df['brd_nm'] = rerank_df['brd_nm'].astype(str)
            rerank_df['cnt'] = rerank_df['cnt'].astype(str)
            final_df = rerank_df.groupby('userid').agg(
                lambda x: '^'.join(list(x)) if x.name == 'brd_nm'
                else ','.join(list(x))).reset_index()
            final_df['prdid'] = ''
            self.df_to_table(final_df,
                             "members_bycho.brand_preference_reranking")
        else:
            print("fail to create reranking file!!")

def process_log_data(spark, input_data, output_data):
    """Process log_data to create the users, time, and songplays tables."""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays (boolean mask; koalas' filter()
    # selects labels, not rows)
    df = log_kdf[log_kdf.page == "NextSong"]

    # extract columns for users table
    users_table = ks.sql("""
        SELECT DISTINCT userId, firstName, lastName, gender, level
        FROM {df}""")

    # write users table to parquet files
    users_table.to_spark().write.parquet(f'{output_data}/users',
                                         mode="overwrite")

    # create timestamp and datetime columns from the original epoch
    # timestamp, which is in milliseconds
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ms')
    df['datetime'] = ks.to_datetime(df['ts'], unit='ms')

    # extract columns to create time table
    time_table = ks.sql("""
        SELECT DISTINCT datetime as start_time,
               extract(day from datetime) as day,
               extract(week from datetime) as week,
               extract(month from datetime) as month,
               extract(year from datetime) as year,
               extract(hour from datetime) as hour
        from {df}
    """)

    # enable operations across different DataFrames for the column add below
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday column
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    time_table.to_spark().write.partitionBy('year', 'month').parquet(
        f'{output_data}/time', mode="overwrite")

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'], unit='ms')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = ks.sql("""
        SELECT DISTINCT row_number() over (ORDER BY e.userId) songplay_id,
               e.ts AS start_time,
               extract(month from e.ts) as month,
               extract(year from e.ts) as year,
               e.userId AS user_id,
               e.level AS level,
               s.song_id AS song_id,
               s.artist_id AS artist_id,
               e.sessionId as session_id,
               e.location AS location,
               e.userAgent AS user_agent
        FROM {log_kdf} as e
        JOIN {song_df} as s
          ON (e.artist = s.artist_name AND e.song = s.title
              AND e.length = s.duration)
        WHERE e.page = 'NextSong'
    """)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.to_spark().write.partitionBy("year", "month").parquet(
        f'{output_data}/songplays', mode="overwrite")

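# Why unit='ms' above: the log's `ts` field is an epoch-millisecond value,
# so parsing it with the default nanosecond unit would collapse every event
# to January 1970. A quick pandas check (koalas mirrors this behavior):
#
#   pd.to_datetime(1541105830796, unit='ms')  # 2018-11-01 21:37:10.796
#   pd.to_datetime(1541105830796, unit='ns')  # 1970-01-01 00:25:41.105... (wrong)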