def match_all(df, data_path):
    """Match Youku viewing records with iQiyi tag scores into ``yk2iqytag``.

    Each ``mediadata`` value of ``df`` is passed to ``find`` together with
    the iQiyi title list; records with a non-empty result are joined with
    the iQiyi tag-score frame and appended to the Hive table ``yk2iqytag``.

    NOTE(review): this file also contains ``match_all(data_path)``; if both
    definitions live in the same module the later one rebinds the name.

    :param df: DataFrame of Youku media data; must expose a ``mediadata``
        column.
    :param data_path: forwarded unchanged to ``iqiyi2tag``.
    :return: None. The result is written to Hive as a side effect.
    """
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")

    # Keep only records for which the title extraction yields something.
    df = df.withColumn(
        "yk_movie", translate('title')("mediadata")
    ).filter(col('yk_movie') != '')

    iqiyi_tags = [
        'entertainment', 'technology', 'shopping', 'lifestyle', 'business',
        'fashion', 'tourism', 'game', 'finance', 'female', 'sports',
        'photography', 'car'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)

    # Extra columns are zero-filled here, presumably so the frame lines up
    # with the target table's columns -- TODO confirm against yk2iqytag.
    addcols = [
        'beauty', 'childcare', 'movie', 'funny', 'health', 'education',
        'music', 'news'
    ]
    for name in addcols:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(name, lit(0))
    # Cache the finished frame once instead of every intermediate step.
    iqiyi2tag_df = iqiyi2tag_df.cache()

    yk_rdd = df.select('mediadata').rdd.map(
        lambda row: [row[0], find(row[0], iqy)])

    # The frame built here only has 'yktag' and 'ykmovie'; the original
    # filter referenced a non-existent 'movie' column.
    matched = yk_rdd.toDF(['yktag', 'ykmovie']).filter(col('ykmovie') != '')
    matched = matched.join(
        iqiyi2tag_df, matched.ykmovie == iqiyi2tag_df.iqytag
    ).drop('ykmovie')

    matched.registerTempTable('tab_name')
    sqlContext.sql("insert into table yk2iqytag select * from tab_name")
def getContentRecoms(u_id, sim_bus_limit=5):
    """Recommend businesses similar to ones the user rated 3 stars or more.

    A sample of up to five businesses the user reviewed with ``stars >= 3``
    is taken, similar businesses are looked up for that sample via
    ``getSimilarBusinesses``, the sampled businesses themselves are removed,
    and the ``sim_bus_limit`` best-scoring candidates are returned.

    :param u_id: user identifier, compared against ``reviews.user_id``.
    :param sim_bus_limit: number of similar businesses requested per sampled
        business and maximum number of rows returned.
    :return: DataFrame produced by ``getBusinessDetails`` for the selected
        ``business_id``/``score`` rows.
    """
    sample_size = 5

    # The user id is applied as a column literal rather than formatted into
    # the SQL text, so quotes inside u_id cannot alter the query.
    query = """
        SELECT r.business_id, b.business_name, r.user_id
        FROM reviews r join business b on r.business_id = b.business_id
        where r.stars >= 3.0
    """
    liked = spark.sql(query) \
        .where(col("user_id") == lit(str(u_id))) \
        .select("business_id", "business_name") \
        .distinct()

    # limit() without an ordering may return different rows on every
    # evaluation, so the sample is collected once and reused everywhere.
    sample_rows = liked.limit(sample_size).collect()
    usr_rev_bus = spark.createDataFrame(sample_rows, liked.schema)
    print(usr_rev_bus.select('business_name').collect())

    bus_list = [row.business_id for row in sample_rows]

    # Businesses similar to the sampled ones.
    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_limit)

    # Anti-join: drop candidates that are part of the sample itself.
    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    unseen = s.join(r, col("s.business_id") == col("r.business_id"),
                    'left_outer') \
        .where(col("r.business_id").isNull()) \
        .select([col('s.business_id'), col('s.score')])

    best = unseen.orderBy("score", ascending=False).limit(sim_bus_limit)
    return getBusinessDetails(best)
def getBusinessDetails(in_bus):
    """Append descriptive business columns to the rows of ``in_bus``.

    ``in_bus`` is inner-joined with the module-level ``business`` frame on
    ``business_id``. The result keeps every column of ``in_bus`` followed by
    the name, categories, stars, review count, latitude and longitude taken
    from ``business``.

    :param in_bus: DataFrame with at least a ``business_id`` column.
    :return: the joined DataFrame.
    """
    left = in_bus.alias("a")
    right = business.alias("b")

    # Input columns first, then the fixed set of detail columns.
    wanted = []
    for name in left.columns:
        wanted.append(col('a.' + name))
    wanted.append(col('b.business_name'))
    wanted.append(col('b.categories'))
    wanted.append(col('b.stars'))
    wanted.append(col('b.review_count'))
    wanted.append(col('b.latitude'))
    wanted.append(col('b.longitude'))

    same_business = col("a.business_id") == col("b.business_id")
    joined = left.join(right, same_business, 'inner')
    return joined.select(wanted)
def getSimilarBusinesses(b_ids, sim_bus_limit=10):
    """Find the businesses most similar to each of the given businesses.

    For every id in ``b_ids`` the ``CosineSim`` score between its
    ``word_vec`` and the vector of every other business in
    ``reviews_by_business_trf_df`` is computed, and the ``sim_bus_limit``
    highest-scoring businesses (excluding the input itself) are kept.

    :param b_ids: iterable of business ids to find neighbours for.
    :param sim_bus_limit: number of neighbours kept per input business.
    :return: DataFrame with columns ``business_id``, ``score`` and
        ``input_business_id``; empty when ``b_ids`` is empty. Ids that have
        no vector are skipped.
    """
    from pyspark.sql.types import DoubleType

    all_business_vecs = reviews_by_business_trf_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    # One lookup table instead of a linear scan per input id. setdefault
    # keeps the first vector per id, like the scan it replaces.
    vec_by_id = {}
    for bus_id, vec in all_business_vecs:
        vec_by_id.setdefault(bus_id, vec)

    # Scores are floats, so the seed frame declares a double column; that
    # keeps the schema identical whether or not any id is processed.
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", DoubleType(), True),
        StructField("input_business_id", StringType(), True)
    ])
    similar_businesses_df = spark.createDataFrame([], schema)

    for b_id in b_ids:
        if b_id not in vec_by_id:
            # No vector for this business: nothing to compare against.
            continue
        input_vec = vec_by_id[b_id]

        scored = [(other_id, float(CosineSim(input_vec, other_vec)))
                  for other_id, other_vec in all_business_vecs]
        similar_business_df = spark.createDataFrame(
            spark.sparkContext.parallelize(scored)) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending=False)

        # Drop the input itself, then keep the top matches.
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id).limit(sim_bus_limit)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id))

        similar_businesses_df = similar_businesses_df \
            .union(similar_business_df)

    return similar_businesses_df
def match_all_df(data_path):
    """Tag Youku records with the iQiyi title they contain, into ``youku_iqy``.

    Reads ``youku_mediadata``, then for each entry of the list returned by
    ``iqiyi2tag`` inserts the still-untagged records whose ``yk_movie``
    contains that entry into the Hive table ``youku_iqy`` and carries the
    non-matching records over to the next entry.

    NOTE(review): the collapsed source does not show whether the insert sat
    inside the loop; it is executed per entry here, since otherwise only the
    matches of the last entry would be stored -- confirm.

    :param data_path: forwarded unchanged to ``iqiyi2tag``.
    :return: None. Results are written to Hive as a side effect.
    """
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    df = sqlContext.sql("select * from youku_mediadata")

    # 'movie' starts empty and is filled in once a record matches.
    remaining = df.withColumn(
        "yk_movie", translate('title')("mediadata")
    ).filter(col('yk_movie') != '').withColumn(
        'movie', lit('')
    ).persist(StorageLevel.DISK_ONLY)

    iqiyi_tags = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经',
        '女性', '体育', '摄影', '汽车'
    ]
    _, iqy = iqiyi2tag(data_path, iqiyi_tags)

    stale = None
    for ele in iqy:
        # Build the tagging expression once and derive both halves from it.
        tagged = remaining.withColumn(
            'movie',
            when(col('yk_movie').like('%' + ele + '%'), ele)
            .otherwise(col('movie')))
        hits = tagged.filter(col('movie') != '').select('mediadata', 'movie')
        leftover = tagged.filter(
            col('movie') == '').persist(StorageLevel.DISK_ONLY)

        hits.registerTempTable('tab_name')
        sqlContext.sql("insert into table youku_iqy select * from tab_name ")

        # The insert scanned all of `remaining`, so it is now materialised
        # and the frame it was derived from can be released.
        if stale is not None:
            stale.unpersist()
        stale = remaining
        remaining = leftover

    # Nothing is returned, so no persisted frame is needed past this point.
    if stale is not None:
        stale.unpersist()
    remaining.unpersist()
def match_all(data_path):
    """Join Youku records with iQiyi tag data and append to ``yk_iqytag``.

    Reads ``youku_mediadata``, keeps records with a non-empty extracted
    title, pairs every ``mediadata`` value with the result of ``find``
    against the iQiyi list, left-joins the pairs with the iQiyi tag frame on
    ``movie`` and inserts the result into the Hive table ``yk_iqytag``.

    :param data_path: forwarded unchanged to ``iqiyi2tag``.
    :return: None. The result is written to Hive as a side effect.
    """
    hive = HiveContext(sc)
    hive.sql("use usercenter_dw")

    source = hive.sql("select * from youku_mediadata")
    with_title = source.withColumn(
        "yk_movie", translate('title')("mediadata"))
    titled = with_title.filter(
        col('yk_movie') != '').persist(StorageLevel.DISK_ONLY)

    iqiyi_tags = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经',
        '女性', '体育', '摄影', '汽车'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)

    def pair_with_match(row):
        # [mediadata, find() result for that mediadata]
        values = list(row)
        return [values[0], find(values[0], iqy)]

    pairs = titled.select('mediadata').rdd.map(pair_with_match)
    tagged = pairs.toDF(['mediadata', 'movie'])
    tagged = tagged.join(iqiyi2tag_df, "movie", "left")

    tagged.registerTempTable('tab_name')
    hive.sql("insert into table yk_iqytag select * from tab_name ")