def test_predict_not_fitted():
    """Every inference entry point must raise ValueError on an unfitted model."""
    unfitted = LightFM()
    inference_calls = (
        lambda: unfitted.predict(np.arange(10), np.arange(10)),
        lambda: unfitted.predict_rank(1),
        lambda: unfitted.get_user_representations(),
        lambda: unfitted.get_item_representations(),
    )
    for call in inference_calls:
        with pytest.raises(ValueError):
            call()
def test_get_representations():
    """Predictions rebuilt from the reported representations must match predict().

    Checked both without features and with random (identity-augmented)
    user/item feature matrices.
    """
    model = LightFM(random_state=SEED)
    model.fit_partial(train, epochs=10)
    num_users, num_items = train.shape

    feature_pairs = (
        (None, None),
        (sp.identity(num_items) + sp.random(num_items, num_items),
         sp.identity(num_users) + sp.random(num_users, num_users)),
    )
    for item_features, user_features in feature_pairs:
        expected = model.predict(test.row, test.col,
                                 user_features=user_features,
                                 item_features=item_features)
        item_biases, item_latent = model.get_item_representations(item_features)
        user_biases, user_latent = model.get_user_representations(user_features)

        assert item_latent.dtype == np.float32
        assert user_latent.dtype == np.float32

        dot_products = (user_latent[test.row] * item_latent[test.col]).sum(axis=1)
        reconstructed = dot_products + user_biases[test.row] + item_biases[test.col]
        assert np.allclose(expected, reconstructed, atol=0.000001)
class RepresentationLearner:
    """Thin wrapper around a LightFM model that learns and exposes
    user/item latent representations, with pickle-based persistence.

    The ``_merge_*_features`` hooks are placeholders for incremental
    feature accumulation and currently do nothing.
    """

    def __init__(self, n_components=30):
        # Incrementally merged feature matrices (not yet implemented).
        self.user_features = None
        self.item_features = None
        # Pass by keyword: the original positional call only worked because
        # `no_components` happens to be LightFM's first parameter.
        self.model = LightFM(no_components=n_components)

    def _merge_user_features(self, new_features):
        # Placeholder: user-feature merging not implemented yet.
        pass

    def _merge_item_features(self, new_features):
        # Placeholder: item-feature merging not implemented yet.
        pass

    def fit_partial(self, interactions, user_features=None, item_features=None):
        """Run one incremental fit on an interactions matrix."""
        self._merge_user_features(user_features)
        self._merge_item_features(item_features)
        self.model.fit_partial(interactions,
                               user_features=user_features,
                               item_features=item_features)

    def user_representations(self):
        """Return the user embedding matrix (biases are discarded)."""
        _, user_repr = self.model.get_user_representations()
        return user_repr

    def item_representations(self):
        """Return the item embedding matrix (biases are discarded)."""
        _, item_repr = self.model.get_item_representations()
        return item_repr

    def save(self, path):
        """Pickle this learner (including the fitted model) to `path`."""
        with open(path, 'wb') as output:
            pickle.dump(self, output)

    @classmethod
    def load(cls, path):
        """Load a pickled learner from `path`.

        NOTE(security): pickle.load executes arbitrary code — only load
        files you trust.
        """
        # Renamed the handle: the original shadowed the builtin `input`.
        with open(path, 'rb') as infile:
            return pickle.load(infile)

    def train(self, interaction_path, user_features_path=None, item_features_path=None):
        """Read a CSV of (user, item, liked) rows and fit on it.

        The feature-path arguments are currently unused placeholders.
        """
        def read_fake_data(n_users, n_items, path):
            # CSV columns are positional: user id, item id, liked flag
            # — TODO confirm against the data files.
            data = pd.read_csv(path)
            mat = scipy.sparse.lil_matrix((n_users, n_items), dtype=np.int32)
            for _, row in data.iterrows():
                user_id, item_id, is_liked = row[0], row[1], row[2]
                mat[user_id, item_id] = is_liked
            return mat

        # Hard-coded matrix dimensions from the original script.
        n_users = 10000
        n_items = 10000
        interactions = read_fake_data(n_users, n_items, interaction_path)
        self.fit_partial(interactions)
def main():
    """Fit a BPR LightFM model and pull out user/item representations.

    `sparse_training_matrix` is the user-item interaction matrix and is
    expected to be defined at module level.
    """
    model = LightFM(
        no_components=30,
        learning_rate=0.05,
        loss='bpr',
        item_pretrain=True,
        item_pretrain_file='item_embeddings.txt',
    )
    model.fit(sparse_training_matrix, epochs=100)
    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
def train(impl_train_data, config, user_ids, item_ids, model_folder, save_res=True):
    """Train a WARP-loss LightFM matrix-factorisation model and export factors.

    Parameters
    ----------
    impl_train_data : sparse matrix of implicit user-item interactions.
    config : dict with 'dims' (embedding size) and 'lr' (learning rate).
    user_ids, item_ids : id lists aligned with the embedding rows.
    model_folder : directory for the exported feature files.
    save_res : when truthy, write the factor files to disk.

    Returns
    -------
    (item_ids, item_vecs_reg, user_ids, user_vecs_reg)
    """
    model = LightFM(loss='warp', no_components=config['dims'],
                    learning_rate=config['lr'])
    model = model.fit(impl_train_data, epochs=50, num_threads=8)

    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    # Append the item bias as an extra column; the matching user column is
    # all ones, so user·item dot products include the item bias term.
    # (The user biases are deliberately dropped.)
    item_vecs_reg = np.concatenate(
        (item_embeddings, np.reshape(item_biases, (1, -1)).T), axis=1)
    user_vecs_reg = np.concatenate(
        (user_embeddings, np.ones((1, user_biases.shape[0])).T), axis=1)
    print("USER FEAT:", user_vecs_reg.shape)
    print("ITEM FEAT:", item_vecs_reg.shape)
    if save_res:  # idiom fix: was `save_res==True`
        save(item_ids, item_vecs_reg,
             os.path.join(model_folder, 'out_item_features.feats'))
        save(user_ids, user_vecs_reg,
             os.path.join(model_folder, 'out_user_features.feats'))
    return item_ids, item_vecs_reg, user_ids, user_vecs_reg
def train_mf(impl_train_data, item_ids, user_ids, item_features_file,
             user_features_file, dims=200, epochs=50, max_sampled=10, lr=0.05):
    """Fit a WARP LightFM model and persist bias-augmented factor matrices.

    Item vectors carry their bias as a trailing column; user vectors carry a
    matching column of ones so that dot products include the item bias.
    Returns (user_vec, item_vec).
    """
    mf_model = LightFM(loss='warp', no_components=dims,
                       max_sampled=max_sampled, learning_rate=lr,
                       random_state=42)
    mf_model = mf_model.fit(impl_train_data, epochs=epochs, num_threads=24)

    user_biases, user_embeddings = mf_model.get_user_representations()
    item_biases, item_embeddings = mf_model.get_item_representations()

    bias_column = np.reshape(item_biases, (1, -1)).T
    ones_column = np.ones((1, user_biases.shape[0])).T
    item_vec = np.concatenate((item_embeddings, bias_column), axis=1)
    user_vec = np.concatenate((user_embeddings, ones_column), axis=1)

    print("USER FEAT:", user_vec.shape)
    print("ITEM FEAT:", item_vec.shape)
    save(item_ids, item_vec, item_features_file)
    save(user_ids, user_vec, user_features_file)
    return user_vec, item_vec
def train_warp(impl_train_data, dims, user_ids, item_ids, user_features_filem,
               item_features_file, save_res=True):
    """Fit a WARP LightFM model and optionally persist the factor matrices.

    Note: `user_features_filem` keeps its (misspelled) original name for
    backward compatibility with existing callers.

    Returns (item_ids, item_vecs_reg, user_ids, user_vecs_reg).
    """
    model = LightFM(loss='warp', no_components=dims, max_sampled=30)
    model = model.fit(impl_train_data, epochs=50, num_threads=8)
    user_biases, user_embeddings = model.get_user_representations()
    item_biases, item_embeddings = model.get_item_representations()
    # Item vectors carry their bias as a trailing column; user vectors carry
    # a matching column of ones so dot products include the item bias.
    item_vecs_reg = np.concatenate(
        (item_embeddings, np.reshape(item_biases, (1, -1)).T), axis=1)
    user_vecs_reg = np.concatenate(
        (user_embeddings, np.ones((1, user_biases.shape[0])).T), axis=1)
    if save_res:  # idiom fix: was `save_res == True`
        save(item_ids, item_vecs_reg, item_features_file)
        # BUG FIX: original saved to the undefined name `user_features_file`
        # (the parameter is `user_features_filem`), a NameError at runtime.
        save(user_ids, user_vecs_reg, user_features_filem)
    return item_ids, item_vecs_reg, user_ids, user_vecs_reg
# load data sparse_mat = sparse.load_npz('./user_item_click_sparse_matrix_' + today + '.npz') sparse_mat_view = sparse.load_npz('./user_item_view_sparse_matrix_' + today + '.npz') sparse_prod_fea = sparse.load_npz('./products_feature_sparse_' + today + '.npz') prod_fea_concat = concatProductFeature(sparse_prod_fea, id_weight=1, sparse_weight=0.01) model = LightFM(no_components=150, loss='warp', max_sampled=20, random_state=0) model.fit(sparse_mat, epochs=20, item_features=prod_fea_concat) item_fea = model.get_item_representations(features=prod_fea_concat) total_dist = cdist(item_fea[1], item_fea[1], 'cosine') productID = 3088 similarClickProducts = np.argsort(total_dist[productID - 1])[1:21] + 1 # view results from DataLoader import DataLoader import config as cfg DL = DataLoader(cfg) DL._loadProductData_stream(todayDate) DL.productData[[x in [productID] for x in DL.productData.id]][['_source.name', '_source.tags']] DL.productData[[x in similarClickProducts for x in DL.productData.id]][['_source.name', '_source.tags']]
class LightFMRecommender(BaseFactorizationRecommender):
    """LightFM-backed factorization recommender.

    Supports optional per-interaction sample weights, external item
    features (augmented with an identity matrix by default), and warm
    initialisation of the embeddings from another fitted factorization
    model. Relies on BaseFactorizationRecommender (defined elsewhere) for
    data handling (`_set_data`, `_set_fit_params`, `train_mat`, ...).
    """

    # Defaults passed to the LightFM constructor.
    default_model_params = {
        'loss': 'warp',
        'learning_schedule': 'adadelta',
        'no_components': 30,
        'max_sampled': 10,
        'item_alpha': 0,
        'user_alpha': 0,
    }

    # Defaults passed to LightFM.fit_partial.
    default_fit_params = {
        'epochs': 100,
        'item_features': None,
        'num_threads': N_CPUS,
        'verbose': True,
    }

    default_external_features_params = dict(add_identity_mat=True)

    def __init__(self,
                 use_sample_weight=False,
                 external_features=None,
                 external_features_params=None,
                 initialiser_model=None,
                 initialiser_scale=0.1,
                 **kwargs):
        """
        :param use_sample_weight: if True, use the training matrix values
            as per-interaction sample weights.
        :param external_features: optional external item-features object
            (must expose fit_transform_ids_df_to_mat).
        :param external_features_params: params for the features transform;
            defaults to a copy of default_external_features_params.
        :param initialiser_model: optional model used to warm-start the
            embeddings before LightFM training.
        :param initialiser_scale: target mean absolute value for the
            transplanted embeddings.
        """
        self.use_sample_weight = use_sample_weight
        self.external_features = external_features
        self.external_features_params = external_features_params or \
            self.default_external_features_params.copy()
        self.initialiser_model = initialiser_model
        self.initialiser_scale = initialiser_scale
        super().__init__(**kwargs)

    def _prep_for_fit(self, train_obs, **fit_params):
        """Prepare data, fit params, features and a fresh LightFM instance."""
        # self.toggle_mkl_blas_1_thread(True)
        # assign all observation data
        self._set_data(train_obs)
        # COO matrix doubles as the sample-weight matrix when enabled.
        fit_params['sample_weight'] = self.train_mat.tocoo() \
            if self.use_sample_weight else None
        self._set_fit_params(fit_params)
        self._add_external_features()
        # init model and set params
        self.model = LightFM(**self.model_params)
        if self.initialiser_model is not None:
            self._initialise_from_model(train_obs)

    def _initialise_from_model(self, train_obs):
        """Warm-start LightFM embeddings from a freshly fitted initialiser model."""
        # fit initialiser model (this is done here to prevent any data leaks
        # from passing fitted models)
        simple_logger.info('Training %s model to initialise LightFM model.'
                           % str(self.initialiser_model))
        self.initialiser_model.fit(train_obs)
        self._reuse_data(self.initialiser_model)
        # have the internals initialised (epochs=0 allocates arrays without training)
        self.model.fit_partial(self.train_mat, epochs=0)
        # transplant factors from inititialiser model
        self.model.item_embeddings = self.initialiser_model._get_item_factors()[1]
        self.model.user_embeddings = self.initialiser_model._get_user_factors()[1]
        # scale the factors to be of similar scale
        scale = self.initialiser_scale
        self.model.item_embeddings *= scale / np.mean(np.abs(self.model.item_embeddings))
        self.model.user_embeddings *= scale / np.mean(np.abs(self.model.user_embeddings))

    def _add_external_features(self):
        """Build the external item-features matrix and plug it into fit params."""
        if self.external_features is not None:
            self.external_features_mat = self.external_features.\
                fit_transform_ids_df_to_mat(
                    items_encoder=self.sparse_mat_builder.iid_encoder,
                    **self.external_features_params)
            simple_logger.info('External item features matrix: %s' %
                               str(self.external_features_mat.shape))
        # add external features if specified
        # NOTE(review): assumes self.external_features_mat exists even when
        # external_features is None (presumably set by the base class) — confirm.
        self.fit_params['item_features'] = self.external_features_mat
        if self.external_features_mat is not None:
            simple_logger.info('Fitting using external features mat: %s' %
                               str(self.external_features_mat.shape))

    def fit(self, train_obs, **fit_params):
        """Full (re)fit on the observations; returns self."""
        self._prep_for_fit(train_obs, **fit_params)
        self.model.fit_partial(self.train_mat, **self.fit_params)
        return self

    def fit_partial(self, train_obs, epochs=1):
        """Incremental fit; falls back to a full fit when no model exists yet."""
        self._set_epochs(epochs)
        if self.model is None:
            self.fit(train_obs)
        else:
            self.model.fit_partial(self.train_mat)
        return self

    def fit_batches(self, train_obs, train_dfs, epochs_per_batch=None, **fit_params):
        """Fit incrementally over a sequence of interaction dataframes."""
        self._prep_for_fit(train_obs)
        for i, df in enumerate(train_dfs):
            batch_train_mat = self.sparse_mat_builder.build_sparse_interaction_matrix(df)
            if epochs_per_batch is not None:
                fit_params['epochs'] = epochs_per_batch
            fit_params['sample_weight'] = batch_train_mat.tocoo() \
                if self.use_sample_weight else None
            self._set_fit_params(fit_params)
            simple_logger.info('Fitting batch %d (%d interactions)' % (i, len(df)))
            self.model.fit_partial(batch_train_mat, **self.fit_params)

    def _set_epochs(self, epochs):
        self.set_params(epochs=epochs)

    def set_params(self, **params):
        """Set params, routing this class's own params away from the base class."""
        params = self._pop_set_params(
            params, ['use_sample_weight', 'external_features',
                     'external_features_params',
                     'initialiser_model', 'initialiser_scale'])
        super().set_params(**params)

    def _get_item_factors(self, mode=None):
        """Return (biases, representations) for items.

        mode=None: LightFM representations (with item features if used);
        mode='external_features': the raw external feature matrix;
        mode='no_features': only the trailing identity-matrix part of the
        embeddings (requires the ID matrix to have been appended last).
        """
        n_items = len(self.sparse_mat_builder.iid_encoder.classes_)
        biases, representations = self.model.get_item_representations(self.fit_params['item_features'])
        if mode is None:
            pass  # default mode
        elif mode == 'external_features':
            external_features_mat = self.external_features_mat
            assert external_features_mat is not None, \
                'Must define and add a feature matrix for "external_features" similarity.'
            representations = external_features_mat
        elif (mode == 'no_features') and (self.fit_params['item_features'] is not None):
            simple_logger.info('LightFM recommender: get_similar_items: "no_features" mode '
                               'assumes ID mat was added and is the last part of the feature matrix.')
            assert self.model.item_embeddings.shape[0] > n_items, \
                'Either no ID matrix was added, or no features added'
            representations = self.model.item_embeddings[-n_items:, :]
        else:
            raise ValueError('Uknown representation mode: %s' % mode)
        return biases, representations

    def _get_user_factors(self, mode=None):
        # Users have no external-features modes; always the plain representations.
        return self.model.get_user_representations()

    def _predict_on_inds(self, user_inds, item_inds):
        """Raw LightFM scores for aligned arrays of user/item indices."""
        return self.model.predict(user_inds, item_inds,
                                  item_features=self.fit_params['item_features'],
                                  num_threads=N_CPUS)

    def _predict_rank(self, test_mat, train_mat=None):
        """Per-interaction predicted ranks; train interactions can be excluded."""
        return self.model.predict_rank(
            test_interactions=test_mat,
            train_interactions=train_mat,
            item_features=self.fit_params['item_features'],
            num_threads=N_CPUS)

    def reduce_memory_for_serving(self):
        """Shrink the model for inference-only use.

        would be best to set those to None, but then LightFM will complain,
        and more importantly the Cython code expects the right data format
        and will crash if its predict() will be used, so I just point the
        optimizer state to the embeddings (which doesn't add memory).
        The danger in this is that I don't know what will be the damage if
        someone calls one of the fit methods — for this reason it's in an
        explicit method "for_serving" and not in a __getstate__() method.
        """
        self.model.item_embedding_gradients = self.model.item_embeddings
        self.model.item_embedding_momentum = self.model.item_embeddings
        self.model.user_embedding_gradients = self.model.user_embeddings
        self.model.user_embedding_momentum = self.model.user_embeddings
        self.model.item_bias_gradients = self.model.item_biases
        self.model.item_bias_momentum = self.model.item_biases
        self.model.user_bias_gradients = self.model.user_biases
        self.model.user_bias_momentum = self.model.user_biases
        self.fit_params['sample_weight'] = None
        super().reduce_memory_for_serving()
class GRLightFMRecommender:
    """Hybrid LightFM recommender over programmer/repository/tag matrices
    produced by GRSparseMatrixGenerator (defined elsewhere)."""

    def __init__(self, path_to_dataset='data', use_test_tags=False,
                 num_threads=1, num_components=40, num_epochs=100,
                 item_alpha=1e-6, loss='warp', debug=False):
        self._matrix_generator = GRSparseMatrixGenerator(
            path_to_dataset=path_to_dataset, use_test_tags=use_test_tags)
        # Interactions: programmers (items) x repositories (users).
        self.item_user = self._matrix_generator.getCOOProgRepMatrix()
        self.user = self._matrix_generator.getCOORepoTags()
        self.item = self._matrix_generator.getCOOProgTags()
        self._item_tags = self.item.todense()
        self.num_threads = num_threads
        self.num_components = num_components
        self.num_epochs = num_epochs
        self.item_alpha = item_alpha
        self.loss = loss
        self._debug = debug
        if self._debug:
            print(self.num_threads, self.num_components, self.num_epochs,
                  self.item_alpha, self.loss)

    @staticmethod
    def _with_identity(features):
        # Prepend an identity matrix so every entity keeps its own latent
        # factor in addition to the shared tag features.
        eye = sp.eye(features.shape[0], features.shape[0]).tocsr()
        return sp.hstack((eye, features)).tocsr().astype(np.float32)

    def fit(self):
        """Train the hybrid model on the interaction matrix."""
        # BUG FIX: original hard-coded loss='warp', ignoring self.loss, and
        # referenced the non-existent attribute `self.items` (AttributeError;
        # the tag matrix is `self.item`).
        self.model = LightFM(loss=self.loss, item_alpha=self.item_alpha,
                             no_components=self.num_components, random_state=0)
        # Keep the exact (identity + tags) feature matrices used for fitting:
        # all later predictions/evaluations must use the same feature space.
        self._item_features = self._with_identity(self.item)
        self._user_features = self._with_identity(self.user)
        self.model = self.model.fit(self.item_user,
                                    item_features=self._item_features,
                                    user_features=self._user_features,
                                    epochs=self.num_epochs,
                                    num_threads=self.num_threads)
        self.trained = True

    def testAUC(self):
        """Mean AUC on the (training) interaction matrix."""
        # BUG FIX: original passed the raw tag matrices here, which do not
        # match the identity-concatenated feature space the model was fitted
        # with (a dimension mismatch at evaluation time).
        self.train_auc = auc_score(self.model, self.item_user,
                                   item_features=self._item_features,
                                   user_features=self._user_features,
                                   num_threads=self.num_threads).mean()
        print('Hybrid testing set AUC: %s' % self.train_auc)

    #TODO check if its already an np array
    def predict(self, repo_id, prog_ids):
        """Score the given programmers for a repository."""
        users = np.ones(len(prog_ids)) * repo_id
        items = np.array(prog_ids)
        # BUG FIX: use the same feature matrices the model was fitted with.
        return self.model.predict(users, items,
                                  item_features=self._item_features,
                                  user_features=self._user_features,
                                  num_threads=self.num_threads)

    def getLatentVectors(self):
        """Return ((item_biases, item_embeddings), (user_biases, user_embeddings))."""
        return (self.model.get_item_representations(features=self._item_features),
                self.model.get_user_representations(features=self._user_features))

    def getProgTopSkills(self, prog_id, num_rec=10):
        """Top `num_rec` tag indices and values for a programmer, best first."""
        values = self._item_tags[prog_id]
        tags = np.argsort(values)
        values = np.sort(values)
        limit = -1 * (num_rec + 1)
        return (tags[:, -1:limit:-1], values[:, -1:limit:-1])

    #TODO decide whether to generate the numpy array or the matrix generator
    def getSuggestionsForRepository(self, repo_id, num_suggestions=10, get_tags=False):
        """Suggest programmers for a repository, best-scored first."""
        progs = self._matrix_generator.getProgrammersNotInRepo(repo_id)
        scores = self.predict([repo_id], progs)
        # BUG FIX: argsort is ascending, so the original returned the
        # WORST-scored programmers first; negate to rank best first.
        order = np.argsort(-scores)
        suggs = []
        if not get_tags:
            if repo_id == 14:  # HACK: hard-coded special case kept from original
                suggs.append('fchollet')
            for i in range(0, num_suggestions):
                suggs.append(
                    self._matrix_generator.getProgrammerFromID(progs[order[i]]))
        return suggs
def process_mpd(playlists_path, target_playlists, output_file, prev_songs_window):
    """Build co-occurrence statistics from MPD slices, train a LightFM model,
    and write a fused (model + popularity) submission file.

    :param playlists_path: directory of 'mpd.slice.*.json' files.
    :param target_playlists: path to the challenge-set JSON.
    :param output_file: CSV submission output path.
    :param prev_songs_window: sliding-window size for track co-occurrence.
    """
    max_prev_song = 0
    # previous_tracks[a][b]: position-weighted co-occurrence of b near a.
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}
    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)
                tracks = defaultdict(int)
                sorted_tracks = sorted(playlist['tracks'], key=lambda k: k['pos'])
                prev_track = []
                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1
                    curr_prev_tracks = len(prev_track)
                    # Symmetric, recency-weighted co-occurrence counts within
                    # the sliding window ((i+1)/len weights recent songs more).
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][track['track_uri']] += (i+1)/curr_prev_tracks
                        previous_tracks[track['track_uri']][song_in_window] += (i+1)/curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1
                        # Track the global max for later normalisation.
                        max_prev_song = max(max_prev_song, previous_tracks[track['track_uri']][song_in_window])
                        max_prev_song = max(max_prev_song, previous_tracks[song_in_window][track['track_uri']])
                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])
                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))
    # Global popularity fallback: top 10000 tracks by summed co-occurrence.
    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x: x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on testing set
    test_playlists = []
    test_playlists_tracks = []
    target = json.load(open(target_playlists))
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []
    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))
        # Cold-start playlists (no seed tracks) get the popularity fallback.
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            test_playlists_tracks.append({})
            continue
        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1
        #playlists_tracks.append(tracks)
        test_playlists_tracks.append(tracks)
        # Normalised co-occurrence scores of candidate tracks not already
        # in the playlist.
        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt]/max_prev_song)
        recommended_pop_sum = [(t, np.sum(recommended_pop[t])) for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum, key=lambda x: x[1], reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")
    dv = DictVectorizer()
    # 10000 empty rows reserve user slots for the challenge-set playlists.
    interaction_matrix = dv.fit_transform(playlists_tracks+[{}]*10000)
    lb = LabelBinarizer(sparse_output=True)
    # Fit names on the training playlists (first 1M) plus 10000 blanks,
    # then transform all playlist names.
    pfeat_train = lb.fit_transform(playlists_extra['name'][:1000000]+[""]*10000)
    pfeat_test = lb.transform(playlists_extra['name'])
    print("pfeat_train", pfeat_train.shape)
    print("pfeat_test", pfeat_test.shape)
    playlist_features = pfeat_train
    # Need to hstack playlist_features with an identity matrix so every
    # playlist keeps its own latent factor.
    eye = sparse.eye(playlist_features.shape[0], playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))
    print("Features matrix created. Training model")
    model = LightFM(loss='warp', no_components=200, max_sampled=30,
                    item_alpha=1e-06, user_alpha=1e-06, random_state=SEED)
    model = model.fit(interaction_matrix,
                      user_features=playlist_features_concat,
                      epochs=150, num_threads=32)

    # freeze the gradient and optimize held-out users:
    # setting adadelta gradient accumulators to float32 max makes updates
    # vanish, so only the challenge-set user embeddings keep learning.
    model.item_embedding_gradients = np.finfo(np.float32).max * np.ones_like(model.item_embedding_gradients)
    model.item_bias_gradients = np.finfo(np.float32).max * np.ones_like(model.item_bias_gradients)
    model.item_alpha = 0.0
    model.user_alpha = 0.0
    model.user_embedding_gradients[:1000000, :] = np.finfo(np.float32).max * np.ones_like(model.user_embedding_gradients[:1000000, :])
    model.user_bias_gradients[:1000000] = np.finfo(np.float32).max * np.ones_like(model.user_bias_gradients[:1000000])

    # Use the trained model to get a representation of the playlists on
    # challenge set.
    interaction_matrix = dv.transform(playlists_tracks+test_playlists_tracks)
    playlist_features = pfeat_test
    playlist_features_concat = sparse.hstack((eye, playlist_features))
    # Re-randomise the 10000 challenge-set user embeddings before fitting them.
    model.user_embeddings[-10000:] = ((model.random_state.rand(10000, model.no_components) - 0.5) / model.no_components).astype(np.float32)
    model = model.fit_partial(interaction_matrix,
                              user_features=playlist_features_concat,
                              epochs=150, num_threads=32)
    print("Model Trained")
    user_biases, user_embeddings = model.get_user_representations(playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations()
    # Blend weight between model ranking and co-occurrence popularity ranking.
    fuse_perc = 0.7
    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,main,[email protected]', file=fout)
        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count+i
            # Score all tracks for this playlist; user bias omitted since it
            # is constant across items and does not change the ranking.
            y_pred = user_embeddings[playlist_pos].dot(item_embeddings.T) + item_biases
            # Over-fetch so seed tracks can be filtered out below.
            topn = np.argsort(-y_pred)[:len(test_playlists_tracks[i])+4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]
            songids = [s for s, _ in rets if s not in test_playlists_tracks[i]]
            songids_dict = {s: 1 for s in songids}
            # Rank-based fusion: convert each ranking to (max_score - rank)
            # scores and mix them with weight fuse_perc.
            max_score = max(len(songids), len(test_playlists_recommended_sum[i]))
            pop_sum = {s: (max_score - p) for p, s in enumerate(test_playlists_recommended_sum[i])}
            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append((s, ((max_score - p)*fuse_perc + pop_val_sum*(1-fuse_perc)) / 2))
            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s, (pop_sum[s]*(1-fuse_perc)) / 2))
            fuse_sum = sorted(fuse_sum, key=lambda x: x[1], reverse=True)
            # Submission row: playlist id followed by the top 500 track uris.
            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]), file=fout)
def process_mpd(playlists_path, target_playlists, output_file, prev_songs_window):
    """Creative-track variant: like the main-track pipeline but additionally
    trains with per-track audio 'highlevel' features as item features.

    :param playlists_path: directory of 'mpd.slice.*.json' files.
    :param target_playlists: path to the challenge-set JSON.
    :param output_file: CSV submission output path.
    :param prev_songs_window: sliding-window size for track co-occurrence.
    """
    max_prev_song = 0
    # previous_tracks[a][b]: position-weighted co-occurrence of b near a.
    previous_tracks = defaultdict(lambda: defaultdict(int))
    playlists_tracks = []
    playlists = []
    playlists_extra = {'name': []}
    filenames = os.listdir(playlists_path)
    for filename in sorted(filenames):
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((playlists_path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            mpd_slice = json.loads(js)
            for playlist in mpd_slice['playlists']:
                nname = normalize_name(playlist['name'])
                playlists_extra['name'].append(nname)
                tracks = defaultdict(int)
                sorted_tracks = sorted(playlist['tracks'], key=lambda k: k['pos'])
                prev_track = []
                for track in sorted_tracks:
                    tracks[track['track_uri']] += 1
                    curr_prev_tracks = len(prev_track)
                    # Symmetric, recency-weighted co-occurrence within the window.
                    for i, song_in_window in enumerate(prev_track):
                        previous_tracks[song_in_window][
                            track['track_uri']] += (i + 1) / curr_prev_tracks
                        previous_tracks[track['track_uri']][
                            song_in_window] += (i + 1) / curr_prev_tracks
                        #previous_tracks[song_in_window][track['track_uri']] += 1
                        #previous_tracks[track['track_uri']][song_in_window] += 1
                        # Track the global max for later normalisation.
                        max_prev_song = max(
                            max_prev_song,
                            previous_tracks[track['track_uri']][song_in_window])
                        max_prev_song = max(
                            max_prev_song,
                            previous_tracks[song_in_window][track['track_uri']])
                    if len(prev_track) == prev_songs_window:
                        prev_track.pop(0)
                    prev_track.append(track['track_uri'])
                playlists_tracks.append(tracks)
                playlists.append(str(playlist['pid']))
    # Global popularity fallback: top 10000 tracks by summed co-occurrence.
    top_pop = []
    for i in previous_tracks.keys():
        top_pop.append((i, np.sum(list(previous_tracks[i].values()))))
    top_pop = sorted(top_pop, key=lambda x: x[1], reverse=True)[:10000]
    top_pop = [t[0] for t in top_pop]

    # Add playlists on testing set
    test_playlists = []
    target = json.load(open(target_playlists))
    train_playlists_count = len(playlists)
    test_playlists_recommended_sum = []
    for playlist in target["playlists"]:
        nname = ""
        if 'name' in playlist:
            nname = normalize_name(playlist['name'])
        playlists_extra['name'].append(nname)
        playlists.append(str(playlist['pid']))
        test_playlists.append(str(playlist['pid']))
        # Cold-start playlists (no seed tracks) get the popularity fallback.
        if len(playlist['tracks']) == 0:
            test_playlists_recommended_sum.append(top_pop)
            playlists_tracks.append({})
            continue
        tracks = defaultdict(int)
        for track in playlist['tracks']:
            tracks[track['track_uri']] += 1
        playlists_tracks.append(tracks)
        # Normalised co-occurrence scores of candidates not already present.
        recommended_pop = defaultdict(list)
        for t in tracks.keys():
            for pt in previous_tracks[t].keys():
                if pt not in tracks:
                    recommended_pop[pt].append(previous_tracks[t][pt] / max_prev_song)
        recommended_pop_sum = [(t, np.sum(recommended_pop[t]))
                               for t in recommended_pop.keys()]
        recommended_pop_sum = sorted(recommended_pop_sum, key=lambda x: x[1],
                                     reverse=True)
        recommended_pop_sum = [t[0] for t in recommended_pop_sum]
        test_playlists_recommended_sum.append(recommended_pop_sum)

    print("Data loaded. Creating features matrix")
    dv = DictVectorizer()
    interaction_matrix = dv.fit_transform(playlists_tracks)
    lb = LabelBinarizer(sparse_output=True)
    pfeat = lb.fit_transform(playlists_extra['name'])
    playlist_features = pfeat
    # Need to hstack playlist_features with an identity matrix so every
    # playlist keeps its own latent factor.
    eye = sparse.eye(playlist_features.shape[0], playlist_features.shape[0]).tocsr()
    playlist_features_concat = sparse.hstack((eye, playlist_features))

    # Per-track audio features: keep only the 'class_f' highlevel keys.
    item_prev = []
    highlevel = []
    for track in dv.feature_names_:
        try:
            f = get_audio_features_dict(track.replace('spotify:track:', ''), False)
        except ValueError:
            print("Failed loading json", track)
            f = None
        curr_highlevel = {}
        if f is not None:
            curr_highlevel = {k: v for k, v in f.items() if 'class_f' in k}
        highlevel.append(curr_highlevel)
    ifeat_highlevel = DictVectorizer().fit_transform(highlevel)
    item_prev = ifeat_highlevel
    # Identity + audio features for items, mirroring the playlist features.
    eye = sparse.eye(item_prev.shape[0], item_prev.shape[0]).tocsr()
    item_feat = sparse.hstack((eye, item_prev))

    print("Features matrix created. Training model")
    model = LightFM(loss='warp', no_components=200, max_sampled=30,
                    item_alpha=1e-06, user_alpha=1e-06, random_state=SEED)
    model = model.fit(interaction_matrix,
                      user_features=playlist_features_concat,
                      item_features=item_feat, epochs=150, num_threads=32)
    print("Model Trained")
    user_biases, user_embeddings = model.get_user_representations(
        playlist_features_concat)
    item_biases, item_embeddings = model.get_item_representations(item_feat)
    # Blend weight between model ranking and co-occurrence popularity ranking.
    fuse_perc = 0.7
    with open(output_file, 'w') as fout:
        print('team_info,cocoplaya,creative,[email protected]', file=fout)
        for i, playlist in enumerate(test_playlists):
            playlist_pos = train_playlists_count + i
            # Score all tracks; user bias omitted (constant per row, does not
            # affect ranking).
            y_pred = user_embeddings[playlist_pos].dot(
                item_embeddings.T) + item_biases
            # Over-fetch so seed tracks can be filtered out below.
            topn = np.argsort(-y_pred)[:len(playlists_tracks[playlist_pos]) + 4000]
            rets = [(dv.feature_names_[t], float(y_pred[t])) for t in topn]
            songids = [
                s for s, _ in rets if s not in playlists_tracks[playlist_pos]
            ]
            songids_dict = {s: 1 for s in songids}
            # Rank-based fusion of the two orderings with weight fuse_perc.
            max_score = max(len(songids), len(test_playlists_recommended_sum[i]))
            pop_sum = {
                s: (max_score - p)
                for p, s in enumerate(test_playlists_recommended_sum[i])
            }
            fuse_sum = []
            for p, s in enumerate(songids):
                pop_val_sum = 0
                if s in pop_sum:
                    pop_val_sum = pop_sum[s]
                fuse_sum.append(
                    (s, ((max_score - p) * fuse_perc + pop_val_sum *
                         (1 - fuse_perc)) / 2))
            for s in pop_sum.keys():
                if s not in songids_dict:
                    fuse_sum.append((s, (pop_sum[s] * (1 - fuse_perc)) / 2))
            fuse_sum = sorted(fuse_sum, key=lambda x: x[1], reverse=True)
            # Submission row: playlist id followed by the top 500 track uris.
            print(' , '.join([playlist] + [x[0] for x in fuse_sum[:500]]),
                  file=fout)
# Train a WARP LightFM model with item features and per-interaction weights
# (`no_components`, `train`, `item_features`, `train_weights` are defined
# elsewhere in this script).
model = LightFM(no_components=no_components, learning_schedule='adagrad',
                loss='warp', learning_rate=0.05, random_state=0)
model.fit(interactions=train, item_features=item_features,
          sample_weight=train_weights, epochs=10, verbose=True)

# Find Similar Items
item_biases, item_embeddings = model.get_item_representations(features=item_features)


def make_best_items_report(item_embeddings, book_id, num_search_items=10):
    """Rank the items most similar (by cosine) to a given 1-based book id."""
    # Book ids are 1-based; embedding rows are 0-based.
    item_id = book_id - 1
    # Cosine similarity
    scores = item_embeddings.dot(item_embeddings[item_id])  # (10000, )
    item_norms = np.linalg.norm(item_embeddings, axis=1)  # (10000, )
    # Guard against division by zero for all-zero embeddings.
    item_norms[item_norms == 0] = 1e-10
    scores /= item_norms
    # best: take the ids of the num_search_items items with the highest scores.
    best = np.argpartition(scores, -num_search_items)[-num_search_items:]
    # Dividing by the query item's norm completes the cosine normalisation.
    similar_item_id_and_scores = sorted(zip(best, scores[best] / item_norms[item_id]),
                                        key=lambda x: -x[1])
    # pandas DataFrame for the report
    # NOTE(review): the function body appears truncated here (no return yet)
    # — the rest presumably builds and returns the report DataFrame.