def test_update(self):
    """A single observed event yields one user, one item, and frequency [1]."""
    user, item = User(0), Item(0)
    self.recommender.register(user)
    self.recommender.register(item)
    self.recommender.update(Event(user, item, 1))
    self.assertEqual(self.recommender.n_user, 1)
    self.assertEqual(self.recommender.n_item, 1)
    assert_array_equal(self.recommender.freq, np.array([1]))
def test_score(self):
    """Score for an observed user-item pair must lie in [-1, 1].

    Fix: removed a leftover debug ``print(score)`` that polluted test
    output; the bound check is now a single chained comparison.
    """
    self.recommender.register(User(0))
    self.recommender.register(Item(0))
    self.recommender.update(Event(User(0), Item(0), 1))
    score = self.recommender.score(User(0), np.array([0]))
    self.assertTrue(-1. <= score <= 1.)
def test_user(self):
    """User.encode concatenates an index one-hot and/or the raw feature."""
    user = User(1, np.arange(5))
    self.assertEqual(user.index, 1)

    # index one-hot followed by the feature vector, horizontal layout
    encoded = user.encode(dim=None, index=True, feature=True, vertical=False)
    assert_array_equal(encoded, np.array([0, 1, 0, 1, 2, 3, 4]))

    # index one-hot only, padded to dim=3
    encoded = user.encode(dim=3, index=True, feature=False, vertical=False)
    assert_array_equal(encoded, np.array([0, 1, 0]))

    # feature only, emitted as a column vector
    encoded = user.encode(dim=None, index=False, feature=True, vertical=True)
    assert_array_equal(encoded, np.array([[0], [1], [2], [3], [4]]))
def setUp(self):
    """Wrap a fresh Popular recommender in a non-repeating Evaluator."""
    recommender = Popular()
    recommender.initialize()
    self.evaluator = Evaluator(recommender=recommender, repeat=False)

    # (user, item) interaction stream; every event carries value 1
    pairs = [(0, 0), (0, 1), (1, 2), (0, 3), (2, 4),
             (1, 4), (0, 5), (2, 1), (0, 6), (2, 0)]
    self.samples = [Event(User(u), Item(i), 1) for u, i in pairs]
def test_event(self):
    """Event.encode concatenates the requested user/item/context parts."""
    user = User(1, np.arange(3))
    item = Item(1, np.arange(3))
    event = Event(user, item, 5.0, np.arange(5))
    self.assertEqual(event.value, 5.0)

    # features plus context, horizontal layout
    vec = event.encode(index=False, feature=True, context=True,
                       vertical=False)
    assert_array_equal(vec, np.array([0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 2]))

    # index one-hots plus features, context dropped
    vec = event.encode(index=True, feature=True, context=False,
                       vertical=False)
    assert_array_equal(vec, np.array([0, 1, 0, 1, 2, 0, 1, 0, 1, 2]))

    # index one-hots only, padded to 3 users/items, column layout
    vec = event.encode(n_user=3, n_item=3, index=True, feature=False,
                       context=False, vertical=True)
    assert_array_equal(vec, np.array([[0], [1], [0], [0], [1], [0]]))
async def process(stream):
    """Consume a JSON event stream; recommend then update on positive ratings."""
    async for message in stream:
        event = json.loads(message)
        # skip neutral/negative feedback
        if event['rating'] < 3:
            continue
        user = User(event['user'] - 1)
        item = Item(event['item'] - 1)
        print(recommender.recommend(user, np.arange(0, n_item)))
        recommender.update(Event(user, item))
def convert(self):
    """Create a list of samples and count number of users/items.

    Reads ``../data/click.tsv`` (columns: ad_id, birth year, geo, sex)
    and turns each row into an ``Event`` whose user feature concatenates
    normalized age, sex, and a one-hot US-state vector, and whose item
    feature is a one-hot ad-category vector.
    """
    clicks = []
    with open(os.path.join(os.path.dirname(__file__),
                           '../data/click.tsv')) as f:
        clicks = list(
            map(lambda l: list(map(int, l.rstrip().split('\t'))),
                f.readlines()))
    self.samples = []
    u_index = 0  # each sample indicates different visitors
    n_geo = 50  # 50 states in US
    ad_ids = []
    ad_categories = []
    for ad_id, year, geo, sex in clicks:
        # first-seen registration of each ad and its category
        if ad_id not in ad_ids:
            ad_ids.append(ad_id)
            ad_categories.append(self.categories[ad_id])
        i_index = ad_ids.index(ad_id)
        # one-hot state vector (geo appears to be 1-based)
        geo_vec = np.zeros(n_geo)
        geo_vec[geo - 1] = 1.
        # normalized age in [0, 1]
        # clickgenerator.jl generates a birth year in [1930, 2000]
        age = 1. - ((2000 - year) / 70.)
        # NOTE(review): the user index is fixed at 0 even though u_index
        # counts visitors and n_user is set from it below — confirm this
        # is intentional (i.e. visitors distinguished by features only).
        user = User(
            0, np.concatenate((np.array([age]), np.array([sex]), geo_vec)))
        # category vector
        category = np.zeros(3)
        category[ad_categories[i_index]] = 1
        item = Item(i_index, category)
        sample = Event(user, item, 1.)
        self.samples.append(sample)
        u_index += 1
    self.n_user = u_index
    self.n_item = 5  # 5 ad variants
    self.n_sample = len(self.samples)
    self.n_batch_train = int(
        self.n_sample * 0.2)  # 20% for pre-training to avoid cold-start
    self.n_batch_test = int(self.n_sample *
                            0.1)  # 10% for evaluation of pre-training
    self.n_test = self.n_sample - (self.n_batch_train + self.n_batch_test)
def recommend_service(chosen):
    """Update the pickled recommender with the chosen items and print top-10.

    `chosen` is an iterable of raw movie ids. State (evaluator, per-user
    last-rated features, tf-idf item features, id->index map) is loaded
    from pickle files in the working directory and re-saved afterwards.
    NOTE(review): `user_id` is a hard-coded single-user key — presumably a
    demo account; verify before multi-user use.
    """
    evaluator = pickle.load(open('evaluator.pckl', 'rb'))
    user_id = 273882713
    last = pickle.load(open('last.pckl', 'rb'))
    tfidfs = pickle.load(open('movies.pckl', 'rb'))
    item_ids_keyed = pickle.load(open('item_ids.pckl', 'rb'))
    # 70% incremental evaluation and updating
    logging.info('incrementally predict, evaluate and update the recommender')
    movie_names = pickle.load(open('movies_names.pckl', 'rb'))
    items = []
    # new user gets the next free index; empty feature vector
    user = User(len(evaluator.rec.users), np.zeros(0))
    if evaluator.rec.is_new_user(user.index):
        evaluator.rec.register_user(user)
    # dict insertion order defines the dense item index mapping
    items_in_order = list(item_ids_keyed)
    for item_id in chosen:
        index = items_in_order.index(int(item_id))
        item = Item(index, tfidfs[int(item_id)])
        if evaluator.rec.is_new_item(item.index):
            evaluator.rec.register_item(item)
        items.append(item)
    events = []
    # Calculate time of the week
    date = datetime.now()
    weekday_vec = np.zeros(7)
    weekday_vec[date.weekday()] = 1
    # context from the user's previous interaction; zeros if none recorded
    if user_id in last:
        last_item_vec = last[user_id]['item']
        last_weekday_vec = last[user_id]['weekday']
    else:
        last_item_vec = np.zeros(49)
        last_weekday_vec = np.zeros(7)
    for item in items:
        others = np.concatenate(
            (weekday_vec, last_item_vec, last_weekday_vec))
        events.append(Event(user, item, 1, others))
        # the most recently processed item becomes the "last" context
        last[user_id] = {'item': item.feature, 'weekday': weekday_vec}
    for e in events:
        evaluator.rec.update(e)
    # Re save pickles
    pickle.dump(evaluator, open('evaluator.pckl', 'wb'))
    pickle.dump(last, open('last.pckl', 'wb'))
    candidates = list(set(evaluator.item_buffer))
    recommendations = evaluator.rec.recommend(user, np.array(candidates),
                                              [0 for x in range(0, 63)])
    # recommendations[0] is sorted ascending; take and reverse the last 10
    top_rec = recommendations[0][-10:]
    for top in reversed(top_rec):
        print(movie_names[list(item_ids_keyed)[top]])
def convert(self):
    """Create a list of samples and count number of users/items. """
    path = os.path.join(os.path.dirname(__file__), '../data/lastfm.tsv')
    df_lastfm = pd.read_csv(path, delimiter='\t')

    self.samples = []
    self.dts = []

    # number of artists will be dimension of item contexts
    n_artist = len(set(df_lastfm['artist_index']))
    self.contexts['item'] = n_artist

    countries = list(set(df_lastfm['country']))
    n_country = len(countries)  # 16 in total
    d_country = dict(zip(countries, range(n_country)))

    for _, row in df_lastfm.iterrows():
        # one-hot country indicator appended to age and gender
        country_vec = np.zeros(n_country)
        country_vec[d_country[row['country']]] = 1.
        user_feature = np.concatenate((np.array([row['age']]),
                                       np.array([row['gender']]),
                                       country_vec))
        user = User(row['u_index'], user_feature)

        # one-hot artist indicator as the item feature
        artist_vec = np.zeros(n_artist)
        artist_vec[row['artist_index']] = 1
        item = Item(row['i_index'], artist_vec)

        self.samples.append(Event(user, item, 1., np.array([row['time']])))
        self.dts.append(row['dt'])

    self.n_user = len(set(df_lastfm['userid']))
    self.n_item = len(set(df_lastfm['track-id']))
    self.n_sample = len(self.samples)
    # 20% for pre-training to avoid cold-start
    self.n_batch_train = int(self.n_sample * 0.2)
    # 10% for evaluation of pre-training
    self.n_batch_test = int(self.n_sample * 0.1)
    self.n_test = self.n_sample - (self.n_batch_train + self.n_batch_test)
def test_add_user(self):
    """Adding a single user brings the user count to one."""
    new_user = User(0)
    self.recommender.add_user(new_user)
    self.assertEqual(self.recommender.n_user, 1)
def test_score(self):
    """The single observed item scores exactly 1 for its user."""
    user, item = User(0), Item(0)
    self.recommender.register(user)
    self.recommender.register(item)
    self.recommender.update(Event(user, item, 1))
    self.assertEqual(self.recommender.score(user, np.array([0])), 1)
def test_register_user(self):
    """Registering one user brings the user count to exactly one."""
    new_user = User(0)
    self.recommender.register(new_user)
    self.assertEqual(self.recommender.n_user, 1)
def create_user(self):
    """Allocate the next user index, record it, and return a fresh User."""
    next_index = self.users[-1] + 1
    self.users.append(next_index)
    # map the new external index to its internal (dense) position
    self.user_ids[next_index] = len(self.user_ids) - 1
    return User(next_index, np.zeros(1))
def test_update(self):
    """Updating with one event keeps user and item counts at one each."""
    user, item = User(0), Item(0)
    self.recommender.add_user(user)
    self.recommender.add_item(item)
    self.recommender.update(Event(user, item, 1))
    self.assertEqual(self.recommender.n_user, 1)
    self.assertEqual(self.recommender.n_item, 1)
app = faust.App(
    'flurs-recommender',
    broker='kafka://localhost:9092',
    value_serializer='raw',
)
topic = app.topic('flurs-events', value_type=bytes)

# Matrix-factorization recommender pre-registered with every MovieLens
# 100k user and item (0-based indices).
recommender = MFRecommender(k=40)
recommender.initialize()

n_user, n_item = 943, 1682
for index in range(n_user):
    recommender.register(User(index))
for index in range(n_item):
    recommender.register(Item(index))


@app.agent(topic)
async def process(stream):
    """Recommend, then update, on each positively rated incoming event."""
    async for message in stream:
        event = json.loads(message)
        if event['rating'] < 3:  # ignore low ratings
            continue
        user = User(event['user'] - 1)
        item = Item(event['item'] - 1)
        print(recommender.recommend(user, np.arange(0, n_item)))
        recommender.update(Event(user, item))
map_user = {}
map_item = {}
user_idx = 0
item_idx = 0

print('load')
for raw_user, raw_item in tqdm(df):
    # assign dense first-seen indices to raw user/item ids
    if raw_user not in map_user:
        map_user[raw_user] = user_idx
        user_idx += 1
    if raw_item not in map_item:
        map_item[raw_item] = item_idx
        item_idx += 1

    user = User(map_user[raw_user])
    recommender.add_user(user)
    item = Item(map_item[raw_item])
    recommender.add_item(item)
    recommender.update(Event(user, item, 1))

with open('recommend.pkl', 'wb') as f:
    pickle.dump(recommender, f, -1)

# specify target user and list of item candidates
#recommender.recommend(user, [0]) # => (sorted candidates, scores)
def convert(self):
    """Create a list of samples and count number of users/items.

    Each rating becomes an ``Event`` whose context concatenates a one-hot
    day-of-week vector with the previously rated movie's feature vector
    and that rating's day-of-week. Day deltas from the first rating are
    collected in ``self.dts``.
    """
    self.__load_ratings()
    users = self.__load_users()
    movies, movie_titles = self.__load_movies()
    user_ids = []
    item_ids = []
    self.samples = []
    # timestamp of the very first rating; all deltas are relative to it
    head_date = datetime(*time.localtime(self.ratings[0, 3])[:6])
    self.dts = []
    # per-user features of the most recent rating (item vector + weekday)
    last = {}
    for user_id, item_id, rating, timestamp in self.ratings:
        # give an unique user index
        if user_id not in user_ids:
            user_ids.append(user_id)
        u_index = user_ids.index(user_id)
        # give an unique item index
        if item_id not in item_ids:
            item_ids.append(item_id)
        i_index = item_ids.index(item_id)
        # delta days
        date = datetime(*time.localtime(timestamp)[:6])
        dt = self.__delta(head_date, date)
        self.dts.append(dt)
        # one-hot day of week
        weekday_vec = np.zeros(7)
        weekday_vec[date.weekday()] = 1
        # context from the user's previous rating; zeros for first-timers
        # (18 presumably matches the movie feature length — verify)
        if user_id in last:
            last_item_vec = last[user_id]['item']
            last_weekday_vec = last[user_id]['weekday']
        else:
            last_item_vec = np.zeros(18)
            last_weekday_vec = np.zeros(7)
        others = np.concatenate(
            (weekday_vec, last_item_vec, last_weekday_vec))
        user = User(u_index, users[user_id])
        item = Item(i_index, movies[item_id])
        sample = Event(user, item, 1., others)
        self.samples.append(sample)
        # record users' last rated movie features
        last[user_id] = {'item': movies[item_id], 'weekday': weekday_vec}
    self.n_user = len(user_ids)
    self.n_item = len(item_ids)
    self.n_sample = len(self.samples)
    self.n_batch_train = int(
        self.n_sample * 0.2)  # 20% for pre-training to avoid cold-start
    self.n_batch_test = int(self.n_sample *
                            0.1)  # 10% for evaluation of pre-training
    self.n_test = self.n_sample - (self.n_batch_train + self.n_batch_test)
def fetch_movielens(self):
    """Load MovieLens ratings/movies and build a Bunch of event samples.

    Each rating becomes an ``Event`` whose context concatenates a one-hot
    day-of-week vector with the feature vector of the user's previously
    rated movie (zeros for a first rating). Raw user/item ids are remapped
    to dense first-seen indices, kept in ``self.user_ids``/``self.item_ids``.
    """
    self.seen_movies = set()
    print('Getting ratings...')
    users, ratings = self.load_ratings()
    print('Getting movies...')
    movies = self.load_movies()
    samples = []
    user_ids = {}
    item_ids = {}
    # date of the first rating; `dts` holds per-rating deltas from it
    head_date = datetime(*time.localtime(ratings[0, 3])[:6])
    dts = []
    last = {}
    cnt = 0
    print('Processing ratings...')
    for user_id, item_id, rating, timestamp in tqdm(ratings):
        # Remap user indices
        if user_id in user_ids:
            u_index = user_ids[user_id]
        else:
            u_index = len(user_ids)
            user_ids[user_id] = u_index
        # Remap item indices
        if item_id in item_ids:
            i_index = item_ids[item_id]
        else:
            i_index = len(item_ids)
            item_ids[item_id] = i_index
        self.seen_movies.add(item_id)
        date = datetime(*time.localtime(timestamp)[:6])
        dt = delta(head_date, date)
        dts.append(dt)
        # one-hot day-of-week context
        weekday_vec = np.zeros(7)
        weekday_vec[date.weekday()] = 1
        # previously rated movie vector; 20 presumably matches the movie
        # feature length — verify against load_movies
        if user_id in last:
            last_item_vec = last[user_id]
        else:
            last_item_vec = np.zeros(20)
        others = np.concatenate((weekday_vec, last_item_vec))
        # Dummy feature to prevent errors
        user = User(u_index, np.zeros(1))
        item = Item(i_index, movies[item_id])
        sample = Event(user, item, 1., others)
        samples.append(sample)
        last[user_id] = movies[item_id]
    self.user_ids = user_ids
    self.item_ids = item_ids
    # reverse map: dense index -> original item id
    self.rev_item_ids = {y: x for (x, y) in item_ids.items()}
    print('Done loading!')
    return Bunch(
        samples=samples,
        can_repeat=False,
        # 7 days of the week + 20 genres
        # Dummy feature for user
        contexts={
            'others': 7 + 20,
            'item': 20,
            'user': 1
        },
        n_user=len(user_ids),
        n_item=len(item_ids),
        n_sample=len(samples))
def test_score(self):
    """Score for an observed interaction falls within [0, 1]."""
    self.recommender.add_user(User(0))
    self.recommender.add_item(Item(0))
    self.recommender.update(Event(User(0), Item(0), 1))
    result = self.recommender.score(User(0), np.array([0]), np.array([0]))
    self.assertTrue(0. <= result <= 1.0)
def fetch_movielens(data_home=None, size='100k'):
    """Load MovieLens ratings/movies and build contextualized event samples.

    Parameters
    ----------
    data_home : str
        Root directory of the raw MovieLens data (required).
    size : str
        Dataset size identifier forwarded to the loaders (default '100k').

    Returns
    -------
    Bunch
        samples, can_repeat, context dimensions, n_user, n_item, n_sample.

    Side effects: caches ratings/movies as ``ratings.pkl``/``movies.pkl``,
    and persists ``last.pckl`` (per-user last-rated features) plus
    ``item_ids.pkl``/``user_ids.pkl`` (raw id -> dense index maps).

    Fixes vs. previous version: ``user_ids.pkl`` previously received an
    always-empty list instead of the populated id map; dead per-iteration
    ``datetime.now()`` timing variables and unused locals were removed.
    """
    assert data_home is not None

    print('Loading ratings.')
    try:
        ratings = pickle.load(open('ratings.pkl', 'rb'))
    except FileNotFoundError:
        ratings = load_ratings(data_home, size)
        with open('ratings.pkl', 'wb') as f:
            pickle.dump(ratings, f)

    print('Loading movies.')
    try:
        movies = pickle.load(open('movies.pkl', 'rb'))
    except FileNotFoundError:
        movies = load_movies(data_home, size)
        with open('movies.pkl', 'wb') as f:
            pickle.dump(movies, f)

    samples = []
    # first rating's date anchors all day-deltas
    head_date = datetime(*time.localtime(int(ratings[0, 3]))[:6])
    dts = []
    user_ids_keyed = {}  # raw user id -> dense first-seen index
    item_ids_keyed = {}  # raw item id -> dense first-seen index
    last = {}  # per-user features of the most recent rating

    print('creating dataset')
    i = 1
    for user_id, item_id, rating, timestamp in ratings:
        item_id = int(item_id)

        # give an unique user index
        if user_id in user_ids_keyed:
            u_index = user_ids_keyed[user_id]
        else:
            u_index = len(user_ids_keyed)
            user_ids_keyed[user_id] = u_index

        # give an unique item index
        if item_id in item_ids_keyed:
            i_index = item_ids_keyed[item_id]
        else:
            i_index = len(item_ids_keyed)
            item_ids_keyed[item_id] = i_index

        # delta days since the first rating
        date = datetime(*time.localtime(int(timestamp))[:6])
        dts.append(delta(head_date, date))

        # one-hot day-of-week
        weekday_vec = np.zeros(7)
        weekday_vec[date.weekday()] = 1

        # context from the user's previous rating; zeros the first time
        if user_id in last:
            last_item_vec = last[user_id]['item']
            last_weekday_vec = last[user_id]['weekday']
        else:
            last_item_vec = np.zeros(49)
            last_weekday_vec = np.zeros(7)

        others = np.concatenate(
            (weekday_vec, last_item_vec, last_weekday_vec))

        user = User(u_index, np.zeros(0))
        item = Item(i_index, movies[item_id])
        samples.append(Event(user, item, 1., others))

        # progress indicator
        if i % 100000 == 0:
            print(i)
            print(len(item_ids_keyed))

        # record users' last rated movie features
        last[user_id] = {'item': movies[item_id], 'weekday': weekday_vec}
        i = i + 1

    pickle.dump(last, open('last.pckl', 'wb'))
    with open('item_ids.pkl', 'wb') as f:
        pickle.dump(item_ids_keyed, f)
    # FIX: persist the populated mapping (was dumping an always-empty list)
    with open('user_ids.pkl', 'wb') as f:
        pickle.dump(user_ids_keyed, f)

    # contexts in this dataset
    # 1 delta time, 18 genres, and 23 demographics (1 for M/F, 1 for age, 21 for occupation(0-20))
    # 7 for day of week, 18 for the last rated item genres, 7 for the last day of week, 28 for tf-idf
    return Bunch(samples=samples,
                 can_repeat=False,
                 contexts={'others': 7 + 21 + 28 + 7, 'item': 49, 'user': 0},
                 n_user=len(user_ids_keyed),
                 n_item=len(item_ids_keyed),
                 n_sample=len(samples))