def get_results(dataset_name, coldstart, cs_type='none', n_entries=0):
    """Run cross-validated evaluation on a dataset for the hybrid model and two baselines.

    Args:
        dataset_name: Name of the dataset to load via ``get_dataset``.
        coldstart: Whether to evaluate under a cold-start split.
        cs_type: Cold-start type passed through to the evaluator (default 'none').
        n_entries: Number of entries passed through to the evaluator (default 0).

    Returns:
        Whatever ``evaluate_models_xval`` returns for the three configured models.
    """
    # Local imports, kept in the same order as the original module loads them.
    from hybrid_model.hybrid import HybridModel
    from hybrid_model.config import hybrid_config
    from hybrid_model.models import BiasEstimator, SVD

    dataset = get_dataset(dataset_name)

    # (model class, config) pairs: the hybrid model plus two simple baselines.
    candidates = [
        (HybridModel, hybrid_config),
        (BiasEstimator, {}),
        (SVD, {}),
    ]
    models = [EvalModel(cls.__name__, cls, cfg) for cls, cfg in candidates]

    return evaluate_models_xval(dataset, models, coldstart=coldstart,
                                cs_type=cs_type, n_entries=n_entries)
def analyze(ds_name):
    """Print summary statistics for a dataset: sparsity, per-user/item counts,
    and feature/rating interaction tables.

    Args:
        ds_name: Name of the dataset, resolved through ``dataset.get_dataset``
            and ``dataset.get_dataset_desc``.

    Side effects:
        Prints all statistics to stdout; returns nothing.
    """
    ds = dataset.get_dataset(ds_name)
    (inds_u, inds_i, y, users_features, items_features) = ds.data
    # users_desc/items_desc are unpacked for tuple arity but not used below.
    (users_desc, items_desc, users_features_desc, items_features_desc) = dataset.get_dataset_desc(ds_name)

    n_users = ds.n_users
    n_items = ds.n_items
    n_users_features = len(users_features_desc)
    n_items_features = len(items_features_desc)

    # Sanity checks: feature matrices must match the declared dataset sizes.
    assert (n_users, n_users_features) == users_features.shape
    assert (n_items, n_items_features) == items_features.shape

    # Dense user x item rating matrix; zero means "no rating".
    matrix = np.zeros((n_users, n_items))
    for u, i, r in zip(inds_u, inds_i, y):
        matrix[u, i] = r

    entries = len(matrix.nonzero()[0])
    sparsity = float(entries)
    sparsity /= (matrix.shape[0] * matrix.shape[1])

    print('Number of users {}'.format(n_users))
    print('Number of Items {}'.format(n_items))
    print('Total valid entries {}'.format(entries))
    print('Sparsity {:4.4f}%'.format(sparsity * 100))

    # BUGFIX: np.int was deprecated in NumPy 1.20 and removed in 1.24;
    # plain int is the equivalent cast.
    items_per_user_avg = np.mean(np.sum((matrix != 0).astype(int), 1))
    users_per_item_avg = np.mean(np.sum((matrix != 0).astype(int), 0))
    print('Average number of items per user {}'.format(items_per_user_avg))
    print('Average number of users per item {}'.format(users_per_item_avg))

    users_without_items = np.sum(np.sum((matrix != 0), 1) == 0)
    print('Users without any items {}'.format(users_without_items))

    items_without_users = np.sum(np.sum((matrix != 0), 0) == 0)
    print('Items without any users {}'.format(items_without_users))

    # Center ratings on the global average for the correlation tables below.
    ga = np.mean(y)
    y_shift = y - ga
    print('Global Average: {:4.4f}'.format(ga))

    # Per-user-feature stats: counts and mean (centered) rating per feature.
    user_stats = pd.DataFrame([], index=users_features_desc)
    user_stats['# users'] = np.sum(users_features, 0)
    user_stats['# interactions'] = np.sum(users_features[inds_u, :], 0)
    user_stats['avg rating'] = users_features[inds_u, :].T @ y_shift / np.sum(
        users_features[inds_u, :], 0)
    print(user_stats)

    # Per-item-feature stats, mirroring the user table.
    item_stats = pd.DataFrame([], index=items_features_desc)
    item_stats['# items'] = np.sum(items_features, 0)
    item_stats['# interactions'] = np.sum(items_features[inds_i, :], 0)
    item_stats['avg rating'] = items_features[inds_i, :].T @ y_shift / np.sum(
        items_features[inds_i, :], 0)
    print(item_stats)

    # Feature co-occurrence counts. NOTE(review): the [7:, :9] slice looks like
    # dataset-specific column groups (e.g. demographic vs. other user features)
    # — confirm against the feature layout of the dataset descriptions.
    user_feature_stats = pd.DataFrame(
        (users_features.T @ users_features)[7:, :9],
        index=users_features_desc[7:],
        columns=users_features_desc[:9])
    print(user_feature_stats)

    item_feature_stats = pd.DataFrame((items_features.T @ items_features),
                                      index=items_features_desc,
                                      columns=items_features_desc)
    print(item_feature_stats)

    # Joint user+item feature matrix per interaction, for cross-feature stats.
    concat_features_desc = users_features_desc + items_features_desc
    concat_features_interactions = np.concatenate(
        (users_features[inds_u, :], items_features[inds_i, :]), 1)

    feature_interactions = pd.DataFrame(
        concat_features_interactions.T @ concat_features_interactions,
        index=concat_features_desc,
        columns=concat_features_desc)
    print(feature_interactions)

    # Mean centered rating per feature pair (sum of y_shift over co-occurrences,
    # divided by the co-occurrence counts above).
    feature_corr = pd.DataFrame(
        (concat_features_interactions.T * y_shift) @ concat_features_interactions
        / feature_interactions.values,
        index=concat_features_desc,
        columns=concat_features_desc)
    print(feature_corr)
import script_chdir
import numpy as np
import results.plots as lplot
import matplotlib.pyplot as plt
from hybrid_model.dataset import get_dataset
from hybrid_model.index_sampler import IndexSamplerUserItembased as IndexSampler

dataset = get_dataset('ml100k')
(inds_u, inds_i, y, users_features, items_features) = dataset.data

# Interaction counts per user and per item.
user_dist = np.bincount(inds_u, minlength=dataset.n_users)
item_dist = np.bincount(inds_i, minlength=dataset.n_items)

# Order users/items by descending popularity (most interactions first).
order_users = np.argsort(-user_dist)
order_items = np.argsort(-item_dist)

# Counts re-sorted into popularity order.
dist_users = user_dist[order_users]
dist_items = item_dist[order_items]

# Inverse permutations map an original id to its popularity rank; applying
# them relabels the interaction indices into popularity-rank space.
rank_of_user = np.argsort(order_users)
rank_of_item = np.argsort(order_items)
inds_u = rank_of_user[inds_u]
inds_i = rank_of_item[inds_i]