Example #1
def get_gender_feature():
    train_user = ks.read_csv("data/train_preliminary/user.csv")
    train_click_log = ks.read_csv("data/train_preliminary/click_log.csv")
    train_data = train_user.merge(train_click_log, on="user_id", how='inner')
    sql = '''
    select creative_id,
            gender,
            sum(nvl(click_times, 0)) click_times
    from {train_data}
    group by  creative_id, gender
    '''
    age_data = ks.sql(sql, train_data=train_data)
    age_data.cache()
    sql = '''
    SELECT creative_id,
           gender,
           click_times / sum(click_times)
               OVER (PARTITION BY creative_id ORDER BY click_times DESC
                     ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) gender_dist
    FROM {age_data}
    '''
    age_dist_data = ks.sql(sql, age_data=age_data)
    age_dist_data.head(10)
    age_dist_data.cache()
    age_dist_pivot = age_dist_data.pivot(index='creative_id',
                                         columns='gender',
                                         values='gender_dist')
    age_dist_pivot.columns = ['gender_' + str(ele) for ele in range(1, 3)]
    age_dist_pivot = age_dist_pivot.reset_index()
    age_dist_pivot.fillna(0, inplace=True)
    age_dist_pivot.to_csv('./data/gender_dist', num_files=1)
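All of these snippets rely on databricks.koalas.sql substituting Koalas DataFrames into the query wherever a {name} placeholder appears, taken either from the calling scope or from keyword arguments. A minimal, self-contained sketch of that mechanism (toy data, not part of the example above):

import databricks.koalas as ks

# toy stand-in for the merged click log; the values are purely illustrative
clicks = ks.DataFrame({'creative_id': [1, 1, 2],
                       'gender': [1, 2, 1],
                       'click_times': [3, 1, 2]})

# {clicks} is resolved from the keyword argument, exactly as {train_data} is above
per_gender = ks.sql('''
    select creative_id, gender, sum(click_times) click_times
    from {clicks}
    group by creative_id, gender
''', clicks=clicks)
print(per_gender.sort_values('creative_id').to_pandas())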
Example #2
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = "data/*.json"

    # read log data file
    df = ks.read_json(log_data)
    
    # extract columns for users table    
    users_table = (ks.sql('''
               SELECT 
               DISTINCT
               userId,
               firstName,
               lastName,
               gender,
               level
               FROM 
                   {df}''')
              )

    # write users table to parquet files
    users_table.to_spark().write.mode('overwrite').parquet('users/')


    # create timestamp and datetime column from original timestamp column
    unix_time_series = df.ts.copy()
    get_datetime_timestamp = ks.DataFrame(data=unix_time_series)
    # extract columns to create time table

    
    # extract_time_features returns a new frame (see the sketch after this example), so re-assign
    get_datetime_timestamp = get_datetime_timestamp.pipe(extract_time_features)
    
    # write time table to parquet files partitioned by year and month
    get_datetime_timestamp.to_spark().write.mode('overwrite').partitionBy("year", "month").parquet('time/')

    # read in song data to use for songplays table
    song_df = ks.read_json("data/song_data/*/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table 
    songplays_table = (ks.sql('''SELECT e.ts,
       t.month,
       t.year,
       e.userId,
       e.level,
       s.song_id,
       s.artist_id,
       e.sessionId,
       e.location,
       e.userAgent
       FROM 
       {df} e
       JOIN {song_df} s ON (e.song = s.title AND e.artist = s.artist_name)
       JOIN {get_datetime_timestamp} t ON (e.ts = t.ts)''')   
                  )
    # write songplays table to parquet files partitioned by year and month
    songplays_table.to_spark().write.mode('overwrite').partitionBy("year", "month").parquet('songplays/')
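extract_time_features is not defined in this example. A hypothetical sketch, assuming ts holds epoch milliseconds and that the year and month columns required by the partitioned write are produced here, could look like:

import databricks.koalas as ks

def extract_time_features(kdf):
    # hypothetical helper: derive calendar parts from the epoch-millisecond `ts` column
    kdf = kdf.copy()
    kdf['start_time'] = ks.to_datetime(kdf['ts'], unit='ms')
    kdf['hour'] = kdf['start_time'].dt.hour
    kdf['day'] = kdf['start_time'].dt.day
    kdf['week'] = kdf['start_time'].dt.week
    kdf['month'] = kdf['start_time'].dt.month
    kdf['year'] = kdf['start_time'].dt.year
    kdf['weekday'] = kdf['start_time'].dt.weekday
    return kdf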

    """
def main(args):
    print("main start1!")
    print(args)

    formatted_target_date = datetime.strptime(args.target_date, '%Y-%m-%d')
    begin_date = formatted_target_date - timedelta(days=int(args.days))
    end_date = formatted_target_date

    date_list = date_generator(begin_date.strftime('%Y%m%d'),
                               end_date.strftime('%Y%m%d'))
    str_date_list = "('" + "','".join(date_list) + "')"
    print("date list : {}".format(str_date_list))

    model = BPModel()

    spark_session = SparkSession.builder \
     .appName('Yeondys') \
     .config("spark.driver.memory", "15g") \
     .enableHiveSupport() \
     .getOrCreate()

    corpus = ks.sql(get_source_data(str_date_list))
    if len(corpus) > 0:
        model.threshold_likes(corpus.to_pandas(), int(args.brand_visit_cnt),
                              int(args.user_visit_cnt))
    # for TEST runs only; production uses the 5000, 5 values configured in airflow
    # model.threshold_likes(corpus.to_pandas(), 1, 1)
    else:
        print("data collect fail!")
        sys.exit()

    corpus = ks.sql(get_toddler_data())
    if len(corpus) > 0:
        model.get_toddler_brand(corpus.to_pandas())
    # for TEST runs only; production uses the 5000, 5 values configured in airflow
    # model.threshold_likes(corpus.to_pandas(), 1, 1)
    else:
        print("toddler_data collect fail!")
        sys.exit()

    print("start prepare")
    model.prepare(end_date.strftime('%Y%m%d'))

    model.process()

    # begin_date = formatted_target_date - timedelta(days=14)
    # end_date = formatted_target_date
    # corpus = ks.sql(get_order_data(begin_date.strftime('%Y%m%d'), end_date.strftime('%Y%m%d')))
    # print("koalas order data size : {}".format(corpus.size))
    # model.post_process(corpus.to_pandas())
    model.post_process()

    result_df = ks.from_pandas(model.final_df)
    result_df.to_table(name='members_kangcj.brand_preference',
                       format='orc',
                       mode='overwrite')
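BPModel, get_source_data, get_toddler_data and date_generator are project-specific helpers that are not shown. As an assumption about its contract, date_generator appears to return every date between the two bounds as 'YYYYMMDD' strings; a hypothetical sketch:

from datetime import datetime, timedelta

def date_generator(begin_yyyymmdd, end_yyyymmdd):
    # hypothetical helper: inclusive list of 'YYYYMMDD' strings from begin to end
    begin = datetime.strptime(begin_yyyymmdd, '%Y%m%d')
    end = datetime.strptime(end_yyyymmdd, '%Y%m%d')
    return [(begin + timedelta(days=i)).strftime('%Y%m%d')
            for i in range((end - begin).days + 1)]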
Example #4
def process_song_data(spark, input_data, output_data):
    """ process song_data to ctreate songs, artist tables """
    # get filepath to song data file
    song_data = 'data/song_data/A/B/C/*.json'
    kdf = ks.read_json(song_data)

    # read song data file with Spark as well; the ks.sql queries below run against kdf
    df = spark.read.json(song_data)

    # extract columns to create songs table
    #song_id, title, artist_id, year, duration
    songs_table = (ks.sql("""select
                                DISTINCT
                                row_number() over (ORDER BY year,title,artist_id) id,
                                song_id,
                                title,
                                artist_id,
                                year,
                                duration
                                FROM {kdf}
        """))

    # write songs table to parquet files partitioned by year and artist
    (songs_table.to_spark().write.partitionBy("year", "artist_id").parquet(
        f'{output_data}/songs', mode="overwrite"))

    # extract columns to create artists table
    artists_table = (ks.sql(""" SELECT
                            DISTINCT
                            row_number() over (ORDER BY artist_name) id,
                            artist_id,
                            artist_name,
                            artist_location,
                            artist_latitude,
                            artist_longitude
                            FROM {kdf}
                            
                            
"""))

    # write artists table to parquet files
    (artists_table.to_spark().write.parquet(f'{output_data}/artists',
                                            mode="overwrite"))
Example #5
    def make_recall(self):
        df = ks.sql(
            Path('./brand_lterm_preference/get_recall.sql').read_text()
        ).to_pandas()
        brd_df = ks.sql(
            Path('./brand_lterm_preference/get_brand.sql').read_text()
        ).to_pandas()
        merge_df = df.merge(brd_df, on='brand', how='left')

        # preprocessing
        merge_df = merge_df.dropna()
        merge_df['cnt'] = merge_df['cnt'].astype(int).astype(str)
        merge_df['score'] = merge_df['score'].astype(str)
        merge_df = merge_df[['userid', 'brand', 'score', 'brd_nm', 'cnt']]

        final_df = merge_df.groupby('userid').agg(lambda x: '^'.join(list(
            x)) if x.name == 'brd_nm' else ','.join(list(x))).reset_index()
        final_df['prdid'] = ''
        self.df_to_table(final_df, 'members_bycho.brand_lterm_preference')
Example #6
def process_song_data(spark, input_data, output_data):
    
    # get filepath to song data file
    song_data = "data/song_data/*/*/*/*.json"
    
    # read song data file
    df = ks.read_json(song_data)

    # extract columns to create songs table
    songs_table = (ks.sql('''
               SELECT 
               DISTINCT
               song_id,
               title,
               artist_id,
               year,
               duration
               FROM 
                   {df}''')
              )

    # write songs table to parquet files partitioned by year and artist
    songs_table.to_spark().write.mode('overwrite').partitionBy("year", "artist_id").parquet('songs/')

    # extract columns to create artists table
    artists_table = (ks.sql('''
               SELECT 
               DISTINCT
               artist_id,
               artist_name,
               artist_location,
               artist_latitude,
               artist_longitude
               FROM 
                   {df}''')
              )
    
    # write artists table to parquet files
    artists_table.to_spark().write.mode('overwrite').parquet('artists/')

    """
Example #7
def process_song_data(spark, input_data, output_data):
    # INPUTS:
    #   spark       : an instance of the Spark session
    #   input_data  : path to the data set
    #   output_data : path where the processed data is saved

    # get filepath to song data file
    song_data = input_data + 'song_data/*/*/*/*.json'

    # read song data file
    df = ks.read_json("s3a://udacity-dend/song_data/*/*/*/*.json")

    # extract columns to create songs table
    songs_table = (ks.sql('''
               SELECT 
               DISTINCT
               row_number() over (ORDER BY year,title,artist_id) id,
               title,
               artist_id,
               year,
               duration
               FROM 
                   {df}'''))

    # write songs table to parquet files partitioned by year and artist
    songs_table.to_spark().write.mode('overwrite').partitionBy(
        "year", "artist_id").parquet(output_data + 'songs/')

    # extract columns to create artists table
    artists_table = (ks.sql('''
               SELECT 
               DISTINCT
               artist_id,
               artist_latitude,
               artist_location,
               artist_longitude,
               artist_name
               FROM 
                   {df}'''))

    # write artists table to parquet files
    artists_table.to_spark().write.mode('overwrite').partitionBy(
        "artist_name").parquet(output_data + 'artists/')
Example #8
    def make_brand_index(self):
        large_cate_df = ks.sql(
            Path('./brand_lterm_preference/get_large_cate.sql').read_text()
        ).to_pandas()
        self.col_list = large_cate_df.cate1.unique().tolist()
        for cate1 in self.col_list:
            self.brand_feat_dict[cate1] = []

        self.feature_idx_dict = {
            'pcid_emb': 0,
            'brand_emb': len(self.col_list),
            'ismatch': len(self.col_list) * 2 + 1
        }
        tmp_df = pd.DataFrame([self.feature_idx_dict])
        self.df_to_table(tmp_df, "members_bycho.brand_feature_idx")
Example #9
    def make_pcid_feat(self):
        pcid_emb_df = ks.sql(
            Path('./brand_lterm_preference/get_pcid_bhv.sql').read_text()
        ).to_pandas()
        pcid_emb_df = pcid_emb_df.dropna()
        pcid_emb_df['cate1'] = pcid_emb_df['cate1'].astype(int)

        pcid_feat_df = self.make_embedding(pcid_emb_df, 'pcid')
        self.make_processing(pcid_feat_df, 'pcid')

        print(self.pcid_feature_dict['mEVEBGZUD8KY93ykgHWIw'])

        tmp = [[pcid, feat, self.pcid_lcate_dict[pcid]]
               for pcid, feat in self.pcid_feature_dict.items()]
        tmp_df = pd.DataFrame(tmp, columns=['pcid', 'feat1', 'feat2'])
        self.df_to_table(tmp_df, "members_bycho.pcid_features")
Example #10
def combine_feature(train=True):
    user_filename = 'data/train_preliminary/user.csv' if train else './data/test/click_log.csv'
    result_filename = './data/combine_feature' if train else './data/combine_feature_test'
    user_df = ks.read_csv(user_filename)
    if not train:
        user_df = ks.sql('select distinct user_id from {user_df}',
                         user_df=user_df)
    wv_feature = ks.read_csv('data/wv_features.csv')
    nn_feature = ks.read_csv('data/nn_features.csv')
    stats_data = ks.read_csv(
        "data/stats_features/part-00000-f6695da4-6d9f-4ba4-80b1-d370e636696b-c000.csv"
    )
    all_features = user_df.merge(wv_feature, on='user_id').merge(
        nn_feature, on='user_id').merge(stats_data, on='user_id')
    print(all_features.shape)
    all_features.to_csv(result_filename, num_files=1)
Example #11
    def make_brand_feat(self):
        brand_emb_df = ks.sql(
            Path('./brand_lterm_preference/get_brand_bhv.sql').read_text()
        ).to_pandas()
        brand_emb_df = brand_emb_df.dropna()
        brand_emb_df['cate1'] = brand_emb_df['cate1'].astype(int)

        brand_feat_df = self.make_embedding(brand_emb_df, 'brd_id')
        self.make_processing(brand_feat_df, 'brd_id')

        print(self.brand_feature_dict['120402'])
        print(self.brand_lcate_dict['120402'])

        tmp = [[brd_id, feat, self.brand_lcate_dict[brd_id]]
               for brd_id, feat in self.brand_feature_dict.items()]
        tmp_df = pd.DataFrame(tmp, columns=['brd_id', 'feat1', 'feat2'])
        self.df_to_table(tmp_df, "members_bycho.brand_features")
Example #12
def get_ad_dict():
    train_ad = ks.read_csv("../data/train_preliminary/ad.csv")
    test_ad = ks.read_csv("../data/test/ad.csv")
    ad_info = ks.concat([train_ad, test_ad], axis=0)
    ad_info = ad_info.drop_duplicates()
    ad_dict_sql = '''
     select 
       creative_id,
       product_id,
       product_category,
       advertiser_id,
       industry,
       row_number()
       over (partition by product_id, product_category,advertiser_id,industry order by 1 desc) ad_rn
       from {ad_info}
    '''
    ad_info = ks.sql(ad_dict_sql, ad_info=ad_info)
    print(ad_info.nunique())
    ad_info.to_csv('../data/ad_info', index=False, num_files=1)
Example #13
def get_test_corpus():
    test_data = get_test_data()
    query = '''
    select user_id,
           concat_ws(' ', collect_list(b.time))             time1,
           concat_ws(' ', collect_list(b.creative_id))      creative_id,
           concat_ws(' ', collect_list(b.click_times))      click_times,
           concat_ws(' ', collect_list(b.ad_id))            ad_id,
           concat_ws(' ', collect_list(b.product_id))       product_id,
           concat_ws(' ', collect_list(b.product_category)) product_category,
           concat_ws(' ', collect_list(b.advertiser_id))    advertiser_id,
           concat_ws(' ', collect_list(b.industry))         industry
    from {test_data} b
    group by user_id
    order by time1 asc
    '''

    test_encode_result = ks.sql(query=query, test_data=test_data)
    test_encode_result.to_csv('../data/predict_corpus',
                              index=False,
                              num_files=1)
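concat_ws(' ', collect_list(...)) flattens each user's click history into one space-separated string per column. On a toy frame (values purely illustrative), the effect is:

import databricks.koalas as ks

toy = ks.DataFrame({'user_id': [1, 1, 2],
                    'creative_id': ['10', '11', '12']})
corpus = ks.sql('''
    select user_id,
           concat_ws(' ', collect_list(creative_id)) creative_id
    from {toy}
    group by user_id
''', toy=toy)
# user 1 becomes something like "10 11"; collect_list does not guarantee element order
print(corpus.to_pandas())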
Example #14
    def make_train_validate_testset(self):
        dataset_df = ks.sql(
            Path('./brand_lterm_preference/get_dataset.sql').read_text()
        ).to_pandas()
        self.train, self.validate, self.test = np.split(
            dataset_df.sample(frac=1),
            [int(.6 * len(dataset_df)),
             int(.8 * len(dataset_df))])
        filename_list = ['train.txt', 'validate.txt', 'test.txt']
        df_list = [self.train, self.validate, self.test]

        try:
            for df, filename in zip(df_list, filename_list):
                txt_file = open(filename, 'w')
                for row in df.itertuples():
                    vec = []
                    label = row.istarget
                    if (str(row.pcid) in self.pcid_feature_dict) & (str(
                            row.brand) in self.brand_feature_dict):
                        ismatch = len(
                            set(self.pcid_lcate_dict[str(row.pcid)])
                            & set(self.brand_lcate_dict[str(row.brand)]))
                        pcid_feat = self.pcid_feature_dict[str(row.pcid)]
                        brand_feat = self.brand_feature_dict[str(row.brand)]
                        vec.append(str(label))
                        vec.append(pcid_feat)
                        vec.append(brand_feat)
                        if ismatch > 0:
                            vec.append(
                                str(self.feature_idx_dict['ismatch']) + ":" +
                                str(1))
                        txt_file.write("%s\n" % " ".join(vec))
                txt_file.close()
        except IOError as e:
            print("I/O error({0}) : {1}]".format(e.errno, e.strerror))
        else:
            self.move_to_opt(filename_list)
Example #15
 def test_error_bad_sql(self):
     with self.assertRaises(ParseException):
         ks.sql("this is not valid sql")
Example #16
 def test_error_unsupported_type(self):
     msg = "Unsupported variable type <class 'dict'>: {'a': 1}"
     with self.assertRaisesRegex(ValueError, msg):
         some_dict = {"a": 1}
         ks.sql("select * from {some_dict}")
Example #17
 def test_error_variable_not_exist(self):
     msg = "The key variable_foo in the SQL statement was not found.*"
     with self.assertRaisesRegex(ValueError, msg):
         ks.sql("select * from {variable_foo}")
Example #18
# MAGIC %md
# MAGIC ### Value Counts

# COMMAND ----------

# To get value counts of the different property types with PySpark
display(df.groupby("property_type").count().orderBy("count", ascending=False))

# COMMAND ----------

# Value counts in Koalas
kdf["property_type"].value_counts()

# COMMAND ----------

# MAGIC %md
# MAGIC ### Visualizations with Koalas DataFrames

# COMMAND ----------

kdf.plot(kind="hist", x="bedrooms", y="price", bins=200)

# COMMAND ----------

# MAGIC %md
# MAGIC ### SQL on Koalas DataFrames

# COMMAND ----------

ks.sql("select distinct(property_type) from {kdf}")
Example #19
    def sterm_reranking(self):
        print("start reranking!!")
        brand_feat_df = ks.sql(
            Path('./brand_lterm_preference/get_brand_feat.sql').read_text()
        ).to_pandas()
        pcid_feat_df = ks.sql(
            Path('./brand_lterm_preference/get_pcid_feat.sql').read_text()
        ).to_pandas()
        sterm_df = ks.sql(
            Path('./brand_lterm_preference/get_sterm.sql').read_text()
        ).to_pandas()
        brd_df = ks.sql(
            Path('./brand_lterm_preference/get_brand.sql').read_text()
        ).to_pandas()
        feat_idx_df = ks.sql(
            Path('./brand_lterm_preference/get_feat_idx.sql').read_text()
        ).to_pandas()

        pcid_feat_dict = {}
        for row in pcid_feat_df.itertuples():
            pcid_feat_dict.setdefault(row.userid, {})['feat1'] = row.feat1
            pcid_feat_dict.setdefault(row.userid, {})['feat2'] = row.feat2

        brand_feat_dict = {}
        for row in brand_feat_df.itertuples():
            brand_feat_dict.setdefault(row.brd_id, {})['feat1'] = row.feat1
            brand_feat_dict.setdefault(row.brd_id, {})['feat2'] = row.feat2

        target_dict = {'userid': [], 'brand': []}
        file_name = "/".join([self.OPT_HOME, 'sterm.txt'])
        try:
            txt_file = open(file_name, 'w')
            for row in sterm_df.itertuples():
                vec = []
                if (str(row.userid) in pcid_feat_dict) & (str(row.brand)
                                                          in brand_feat_dict):
                    ismatch = len(
                        set(pcid_feat_dict[str(row.userid)]['feat2'])
                        & set(brand_feat_dict[str(row.brand)]['feat2']))
                    pcid_feat = pcid_feat_dict[str(row.userid)]['feat1']
                    brand_feat = brand_feat_dict[str(row.brand)]['feat1']
                    vec.append(pcid_feat)
                    vec.append(brand_feat)
                    if ismatch > 0:
                        vec.append(
                            str(feat_idx_df.iloc[0]['ismatch']) + ":" + str(1))
                    txt_file.write("%s\n" % " ".join(vec))
                    target_dict['userid'].append(row.userid)
                    target_dict['brand'].append(row.brand)

            txt_file.close()
        except IOError as e:
            print("I/O error({0}) : {1}]".format(e.errno, e.strerror))
        else:
            if self.run_model() == 0:
                out_file = "/".join([self.OPT_HOME, 'sterm.txt.out'])
                score_df = pd.read_csv(out_file, sep='\t', names=["score"])

                target_df = pd.DataFrame(target_dict)
                rerank_df = pd.concat([target_df, score_df], axis=1)

                rerank_df['brand'] = rerank_df['brand'].astype(int)
                brd_df['brand'] = brd_df['brand'].astype(int)

                rerank_df = rerank_df.merge(brd_df, on='brand', how='left')
                rerank_df = rerank_df[(rerank_df.brand.notnull())
                                      & (rerank_df.brd_nm.notnull()) &
                                      (rerank_df.cnt.notnull())]

                rerank_df['RN'] = rerank_df.sort_values(['userid', 'score'], ascending=[True, False]) \
                                   .groupby(['userid']) \
                                   .cumcount() + 1
                rerank_df = rerank_df[rerank_df.RN < 9]
                rerank_df = rerank_df.sort_values(['userid', 'RN'],
                                                  ascending=[True, True])
                rerank_df = rerank_df[[
                    'userid', 'brand', 'score', 'brd_nm', 'cnt'
                ]]
                rerank_df['brand'] = rerank_df['brand'].astype(str)
                rerank_df['score'] = rerank_df['score'].astype(str)
                rerank_df['brd_nm'] = rerank_df['brd_nm'].astype(str)
                rerank_df['cnt'] = rerank_df['cnt'].astype(str)
                final_df = rerank_df.groupby('userid').agg(
                    lambda x: '^'.join(list(x)) if x.name == 'brd_nm' else ','.
                    join(list(x))).reset_index()
                final_df['prdid'] = ''

                self.df_to_table(final_df,
                                 "members_bycho.brand_preference_reranking")
            else:
                print("fail to create reranking file!!")
Example #20
def process_log_data(spark, input_data, output_data):
    """process log_data to create users, time ,songsplay table"""
    # get filepath to log data file
    log_data = 'data/*.json'

    # read log data file
    log_kdf = ks.read_json(log_data)

    # filter by actions for song plays
    df = log_kdf.filter(log_kdf.page == "NextSong")

    # extract columns for users table
    users_table = ks.sql(""" SELECT 
                           DISTINCT
                           userId,
                           firstName,
                           lastName,
                           gender,
                           level 
                           FROM {df}""")

    # write users table to parquet files
    (users_table.to_spark().write.parquet(f'{output_data}/users',
                                          mode="overwrite"))

    # create timestamp column from original timestamp column
    df['timestamp'] = ks.to_datetime(df['ts'], unit='ns')

    # create datetime column from original timestamp column
    df['datetime'] = ks.to_datetime(df['ts'])

    # extract columns to create time table
    time_table = (ks.sql("""
            SELECT
            DISTINCT
           datetime as start_time,
           extract(day from datetime) as day,
           extract(week from datetime) as week,
           extract(month from datetime) as month,
           extract(year from datetime) as year,
           extract(hour from datetime) as hour
           from {df}
                        """))

    # to enable join on table
    ks.set_option('compute.ops_on_diff_frames', True)

    # add weekday columns
    time_table['weekday'] = df.datetime.dt.weekday

    # write time table to parquet files partitioned by year and month
    (time_table.to_spark().write.partitionBy('year', 'month').parquet('time/'))

    # read in song data to use for songplays table
    song_df = ks.read_json('data/song_data/*/*/*/*.json')

    # convert ts to datetime
    log_kdf["ts"] = ks.to_datetime(log_kdf['ts'])

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = (ks.sql(""" SELECT 
                             DISTINCT
                             row_number() over (ORDER BY e.userId) songplay_id,
                             e.ts AS start_time,
                             extract(month from e.ts) as month,
                             extract(year from e.ts) as year,
                             e.userId AS user_id,
                             e.level AS level,
                             s.song_id AS song_id,
                             s.artist_id AS artist_id,
                             e.sessionId as session_id,
                             e.location AS location,
                             e.userAgent AS user_agent
                             FROM {log_kdf} as e join {song_df} as s ON
                             (e.artist = s.artist_name AND 
                             e.song = s.title AND 
                             e.length= s.duration)
                             WHERE e.page='NextSong'

             """))

    # write songplays table to parquet files partitioned by year and month
    (songplays_table.to_spark().write.partitionBy("year", "month").parquet(
        f'{output_data}/songplays', mode="overwrite"))