コード例 #1
0
ファイル: youku_user_tag.py プロジェクト: yyangcg/bbak
def match_all(df, data_path):
    """Match Youku viewing records to iQiyi tag scores and append to yk2iqytag.

    Each ``mediadata`` value of ``df`` is mapped to an iQiyi title by the
    project helper ``find``; rows with a match are joined with the iQiyi tag
    score table and inserted into the Hive table ``yk2iqytag``.

    :param df: DataFrame with a ``mediadata`` column (Youku viewing records).
    :param data_path: path forwarded to ``iqiyi2tag`` to load the iQiyi data.
    :return: None; the result is written to Hive.
    """
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")

    # Keep only records from which a non-empty title can be extracted.
    df = df.withColumn(
        "yk_movie",
        translate('title')("mediadata")).filter(col('yk_movie') != '')

    iqiyi_tags = [
        'entertainment', 'technology', 'shopping', 'lifestyle', 'business',
        'fashion', 'tourism', 'game', 'finance', 'female', 'sports',
        'photography', 'car'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)

    # Tag columns that are added here as constant-zero columns.
    addcols = [
        'beauty', 'childcare', 'movie', 'funny', 'health', 'education',
        'music', 'news'
    ]
    for name in addcols:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(name, lit(0))
    # Cache only the final frame instead of every intermediate one.
    iqiyi2tag_df = iqiyi2tag_df.cache()

    yk_rdd = df.select('mediadata').rdd.map(list)
    yk_rdd = yk_rdd.map(lambda xs: [xs[0], find(xs[0], iqy)])

    # The frame only has the columns 'yktag' and 'ykmovie'; filtering on a
    # non-existent 'movie' column (as before) cannot be resolved by Spark.
    # NOTE(review): assumes `find` returns '' when nothing matches -- confirm.
    matched = yk_rdd.toDF(['yktag', 'ykmovie']).filter(col('ykmovie') != '')
    matched = matched.join(
        iqiyi2tag_df,
        matched.ykmovie == iqiyi2tag_df.iqytag).drop('ykmovie')

    matched.registerTempTable('tab_name')
    sqlContext.sql("insert into table yk2iqytag select * from tab_name")
コード例 #2
0
def getContentRecoms(u_id, sim_bus_limit=5, sample_size=5):
    """Recommend businesses similar to ones the user rated 3 stars or more.

    A sample of the user's positively reviewed businesses is used as the
    seed for ``getSimilarBusinesses``; candidates the user has already
    reviewed positively are removed and the best-scoring ones are returned
    with their details.

    :param u_id: id of the user to build recommendations for.
    :param sim_bus_limit: number of similar businesses requested per seed
        business and maximum number of recommendations returned.
    :param sample_size: number of reviewed businesses used as seeds.
    :return: DataFrame produced by ``getBusinessDetails`` for the
        recommended businesses.
    """
    # Businesses the user reviewed with 3+ stars. Built with the DataFrame
    # API so that u_id is passed as a literal value, not spliced into SQL text.
    rev = spark.table("reviews").alias("rev")
    bus = spark.table("business").alias("bus")
    usr_rev_bus = rev \
        .join(bus, col("rev.business_id") == col("bus.business_id")) \
        .where((col("rev.stars") >= 3.0) & (col("rev.user_id") == u_id)) \
        .select(col("rev.business_id"), col("bus.business_name")) \
        .distinct()

    # Collect the seed sample once: limit() without an ordering may return
    # different rows on each action, so the rows are reused below.
    sample_rows = usr_rev_bus.limit(sample_size).collect()
    print([row.business_name for row in sample_rows])
    bus_list = [row.business_id for row in sample_rows]

    # Get businesses similar to the sample.
    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_limit)

    # Drop every candidate the user has already reviewed (3+), using the
    # full reviewed set rather than only the sampled seeds.
    s = sim_bus_df.alias("s")
    seen = usr_rev_bus.alias("seen")
    unseen = s \
        .join(seen, col("s.business_id") == col("seen.business_id"),
              'left_anti') \
        .select([col('s.business_id'), col('s.score')])

    best = unseen.orderBy("score", ascending=False).limit(sim_bus_limit)

    return getBusinessDetails(best)
コード例 #3
0
def getBusinessDetails(in_bus):
    """Return ``in_bus`` enriched with descriptive business columns.

    ``in_bus`` is inner-joined with the module-level ``business`` DataFrame
    on ``business_id``. The result keeps every column of ``in_bus`` followed
    by name, categories, stars, review count, latitude and longitude.

    :param in_bus: DataFrame containing a ``business_id`` column.
    :return: the joined DataFrame.
    """
    left = in_bus.alias("a")
    right = business.alias("b")

    same_business = col("a.business_id") == col("b.business_id")
    joined = left.join(right, same_business, 'inner')

    # All input columns first, then the detail columns from `business`.
    selected = [col('a.' + name) for name in left.columns]
    selected += [
        col('b.business_name'),
        col('b.categories'),
        col('b.stars'),
        col('b.review_count'),
        col('b.latitude'),
        col('b.longitude'),
    ]
    return joined.select(selected)
コード例 #4
0
def getSimilarBusinesses(b_ids, sim_bus_limit=10):
    """Find, for each input business, the businesses with the closest vectors.

    The ``word_vec`` of every business in ``reviews_by_business_trf_df`` is
    collected to the driver and compared with ``CosineSim`` against the
    vector of each id in ``b_ids``.

    :param b_ids: iterable of business ids to find neighbours for.
    :param sim_bus_limit: maximum number of neighbours kept per input id.
    :return: DataFrame with columns ``business_id``, ``score`` and
        ``input_business_id``; empty when ``b_ids`` is empty or none of the
        ids has a vector.
    """
    import heapq

    all_business_vecs = reviews_by_business_trf_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    # Index vectors by id once instead of scanning the list for every input.
    # setdefault keeps the first vector when an id occurs more than once.
    vec_by_id = {}
    for bus_id, vec in all_business_vecs:
        vec_by_id.setdefault(bus_id, vec)

    rows = []
    for b_id in b_ids:
        input_vec = vec_by_id.get(b_id)
        if input_vec is None:
            # No vector for this business: nothing to compare against.
            continue

        scored = (
            (float(CosineSim(input_vec, vec)), bus_id)
            for bus_id, vec in all_business_vecs
            if bus_id != b_id
        )
        # The scores are computed on the driver anyway, so pick the top
        # entries here rather than launching a Spark sort per input id.
        best = heapq.nlargest(sim_bus_limit, scored, key=lambda pair: pair[0])
        rows.extend((bus_id, score, b_id) for score, bus_id in best)

    # Scores are floating point, so the column is declared as double.
    return spark.createDataFrame(
        rows,
        "business_id string, score double, input_business_id string")
コード例 #5
0
ファイル: youku_user_tag.py プロジェクト: yyangcg/...
def match_all_df(data_path):
    """Tag Youku records with the first matching iQiyi title, title by title.

    For every title in ``iqy`` (second value returned by ``iqiyi2tag``), the
    records whose ``yk_movie`` contains that title are appended to the Hive
    table ``youku_iqy`` as ``(mediadata, movie)``; those records are then
    removed from the working set so that each record is written at most once.

    :param data_path: path forwarded to ``iqiyi2tag`` to load the iQiyi data.
    :return: None; results are written to Hive.
    """
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    df = sqlContext.sql("select * from youku_mediadata")
    # Working set: records with a non-empty title and an empty 'movie' tag.
    df = df.withColumn("yk_movie", translate('title')("mediadata")) \
        .filter(col('yk_movie') != '') \
        .withColumn('movie', lit('')) \
        .persist(StorageLevel.DISK_ONLY)
    iqiyi_tags = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
        '摄影', '汽车'
    ]
    _iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)

    # Persisted frame from the previous round. It may only be released once
    # the current working set has been materialized from it.
    stale = None
    for ele in iqy:
        tagged = df.withColumn(
            'movie',
            when(col('yk_movie').like('%' + ele + '%'),
                 ele).otherwise(col('movie')))

        matched = tagged.filter(col('movie') != '').select(
            'mediadata', 'movie')
        matched.registerTempTable('tab_name')
        # The insert scans `df` completely, which materializes its persisted
        # copy; after that the frame it was derived from is not needed.
        sqlContext.sql("insert into table youku_iqy select * from tab_name ")
        if stale is not None:
            stale.unpersist()

        stale = df
        df = tagged.filter(col('movie') == '').persist(StorageLevel.DISK_ONLY)

    # Release whatever is still persisted; nothing is returned to callers.
    if stale is not None:
        stale.unpersist()
    df.unpersist()
コード例 #6
0
ファイル: youku_user_tag.py プロジェクト: yyangcg/...
def match_all(data_path):
    """Join Youku records with iQiyi tag data and append them to yk_iqytag.

    Reads ``youku_mediadata`` from the ``usercenter_dw`` database, maps each
    ``mediadata`` value to a ``movie`` value with the project helper
    ``find``, left-joins the result with the frame returned by ``iqiyi2tag``
    on ``movie`` and inserts it into the Hive table ``yk_iqytag``.

    :param data_path: path forwarded to ``iqiyi2tag`` to load the iQiyi data.
    :return: None; the result is written to Hive.
    """
    hive_ctx = HiveContext(sc)
    hive_ctx.sql("use usercenter_dw")

    records = hive_ctx.sql("select * from youku_mediadata")
    records = records.withColumn("yk_movie", translate('title')("mediadata"))
    # Keep only records with a non-empty extracted title.
    records = records.filter(col('yk_movie') != '')
    records = records.persist(StorageLevel.DISK_ONLY)

    iqiyi_tags = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
        '摄影', '汽车'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)

    def pair_with_movie(xs):
        # Pair the raw mediadata value with what `find` returns for it.
        return [xs[0], find(xs[0], iqy)]

    pairs = records.select('mediadata').rdd.map(list).map(pair_with_movie)
    tagged = pairs.toDF(['mediadata', 'movie'])
    tagged = tagged.join(iqiyi2tag_df, "movie", "left")

    tagged.registerTempTable('tab_name')
    hive_ctx.sql("insert into table yk_iqytag select * from tab_name ")