Exemple #1
0
# Size of the input: number of rows and length of the first row of `ori`.
M = len(ori)
V = len(ori[0])
# Both dimensions must divide evenly into `d` blocks.
assert M % d == 0
assert V % d == 0

# Rows / columns per block. Floor division keeps these integers (they are
# used as array shape arguments in sgd) even when true division is active;
# the asserts above guarantee the division is exact.
m = M // d
v = V // d

# Tuning constants. Their use is outside this chunk -- presumably the
# regularisation weight, learning rate and step decay; TODO confirm.
GAMMA = 0.02
LAMBDA = 0.1
STEP = 0.9

# Shared stores for the per-block factor matrices: sgd() fills W[i] with an
# (m, k) array and H[j] with a (v, k) array on first access.
W = MutableDict(d)
H = MutableDict(d)

# Ship the input to workers once; tasks read it back through ori_b.value.
ori_b = dpark.broadcast(ori)


def sgd(i_j):
    """Process the block addressed by the index pair ``i_j``.

    Only the set-up part of this function is visible here: it makes sure the
    factor blocks ``W[i]`` and ``H[j]`` exist and fetches the broadcast input.
    NOTE(review): the body appears to be truncated after the last line shown;
    the actual update step and the return value are not visible.

    ``i_j`` is a 2-tuple ``(i, j)``; ``i`` keys into ``W`` and ``j`` into ``H``.
    """
    (i, j) = i_j
    # Lazily create the (m, k) block for index i with uniform random values
    # in [0, 1). `k` is defined outside this chunk.
    Wi = W.get(i)
    if Wi is None:
        Wi = numpy.random.rand(m, k)
        W.put(i, Wi)

    # Same lazy initialisation for the (v, k) block for index j.
    Hj = H.get(j)
    if Hj is None:
        Hj = numpy.random.rand(v, k)
        H.put(j, Hj)

    # Local handle on the broadcast input (shadows the module-level `ori`).
    ori = ori_b.value
Exemple #2
0
# Input locations.
IBIAS_PATH = '/nfs/wuhong/offline_use/ibias_0/'
RATING_PATH = '/nfs/wuhong/fm_data/user_music_factor_model/user_track_rating_for_training/'
ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_0/'

# Output locations.
NEW_RATING_PATH = '/nfs/wuhong/offline_use/rating_new/'
NEW_ITEM_FACTOR_PATH = '/nfs/wuhong/offline_use/H_new/'

dpark = DparkContext()

# `mu` is the second tab-separated field of the LAST line of MU_PATH.
# The `with` block closes the file even if reading raises.
with open(MU_PATH) as f_global:
    line = ''
    for l in f_global:
        line = l
mu = float(line.strip().split('\t')[1])
# From here on `mu` is a broadcast handle; read the float via `mu.value`.
mu = dpark.broadcast(mu)

def local_mapper(line):
    """Parse one tab-separated record into an ``(id, value)`` pair.

    The line is stripped of surrounding whitespace and must then contain
    exactly three tab-separated fields; the first is returned unchanged, the
    second is converted to ``float`` and the third is ignored.

    Raises ``ValueError`` when the field count is not three or the second
    field is not a valid float.
    """
    fields = line.strip().split('\t')
    item_id, raw_value, _ignored = fields
    return item_id, float(raw_value)

# Read every record under IBIAS_PATH, turn each into an (id, float) pair
# with local_mapper, and gather the pairs on the driver via collectAsMap().
ibias = dpark.textFile(glob.glob(IBIAS_PATH)).map(
        local_mapper
    ).collectAsMap()
# From here on `ibias` is a broadcast handle; read the mapping via
# `ibias.value`.
ibias = dpark.broadcast(ibias)

def local_mapper2(line):
    """Rewrite one rating record with ``mu`` and the item's bias removed.

    ``line`` must hold exactly four tab-separated fields
    ``uid, iid, aid, v`` after stripping; ``aid`` is not used.

    Returns the text ``"uid,iid,residual\\n"`` where
    ``residual = float(v) - mu - ibias[iid]``.

    Raises ``ValueError`` on a malformed line and ``KeyError`` when ``iid``
    has no entry in ``ibias``.
    """
    uid, iid, aid, v = line.strip().split('\t')
    # `mu` and `ibias` are rebound to dpark broadcast handles at module
    # level, so the underlying float / mapping is read through `.value`.
    residual = float(v) - mu.value - ibias.value[iid]
    return '%s,%s,%s\n' % (uid, iid, residual)
            .filter(lambda x:x)\
            .map(lambda l: general_map.value.parse(l, spec))\
            .filter(lambda x:x)\
            .filter(lambda line: (not is_spider(line) and (line['uid'] or line['bid'])))\
            .filter(lambda l: l['bid'] not in fraud.value and l['uid'] not in fraud.value)


    # Field names handed to common_gen for the records it produces.
    spec = set(['url', 'uid', 'bid', 'unit_id', 'ad_id', 'status_code', 'user_agent', 'region', 'page_tags', 'hour', 'group'])
    features = common_gen(spec)

    # Run feature_extract on each record, drop falsy results, and cache the
    # resulting RDD (presumably because it is traversed more than once).
    features = features.map(feature_extract)\
        .filter(lambda x: x)\
        .cache()

    # Distinct first elements of the feature records, excluding the literal
    # string 'None'; collected on the driver and broadcast to the workers.
    user_list = set(features.map(lambda x: x[0]).filter(lambda x: x != 'None').collect())
    user_list_b = dp.broadcast(user_list)

    # Start from an empty RDD.
    user_feature = dp.makeRDD([])

    def _parse_list(line):
        uid, features = line.split('\t')
        features = [x.split(':') for x in features.split('|')]
        features = [(x[0], float(x[1])) for x in features]
        features = sorted(features, key=lambda x: x[1], reverse=True)
        return (uid, features)

    for name in ['book_cluster', 'movie_cluster', 'group_cluster', 'text_cluster']:
        fn = '/home2/alg/user_profile/%s/%s' % (current_date, name)
        if not os.path.exists(fn):
            continue
        rdd = dp.textFile(fn, splitSize=16<<20)\