def get_group_ratings():
    """
    Returns a [p, m, r] array containing the ratings given by each group to each item
        p = number of groups
        m = number of items
        r = number of rating values
    """
    cachefile = group_ratings_cache_file
    if os.path.isfile(cachefile):
        with msg(f'Reading group ratings from "{cachefile}"'):
            return np.load(cachefile)
    with msg('Getting group ratings'):
        R = get_R()
        P = get_groups()
        item_count = R.shape[1]
        rating_count = rating_value_count
        ratings = np.zeros((number_of_groups(), item_count, rating_count), dtype=np.float32)
        with msg('Calculating rating counts'):
            for group_n, group in enumerate(P):
                for rating_index in range(rating_count):
                    rating = rating_index + 1
                    ratings[group_n, :, rating_index] = (R[group] == rating).sum(axis=0)
        with msg(f'Saving group ratings to "{cachefile}"'):
            np.save(cachefile, ratings)
        return ratings
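
# Illustrative usage (a sketch, not part of the pipeline): how the cached
# [p, m, r] count array returned above might be queried. Assumes the rating
# values are the integers 1..r, as in get_group_ratings(); example_* names
# below are hypothetical helpers, not existing code.
def example_group_rating_counts():
    counts = get_group_ratings()              # shape (p, m, r)
    group, item = 0, 0                        # arbitrary indices for illustration
    for rating_index, n in enumerate(counts[group, item]):
        print(f'group {group}, item {item}: {int(n)} ratings of value {rating_index + 1}')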
def total_rmse():
    group_count = DataReader.nym_count()
    item_count = R.shape[1]
    total_rmse = 0
    item_lam = lam.sum(axis=0)
    highest_n = 500
    large_items = np.argpartition(item_lam, -highest_n)[-highest_n:]
    with msg('Splitting group ratings'):
        group_ratings = []
        for group in range(group_count):
            group_ratings.append(R[P[group]])
    with msg('Getting rmse(s)'):
        count = 0
        for nth_item, item in enumerate(large_items):
            for group in range(group_count):
                mean = Rtilde[group, item]
                # if mean < 3.5 and mean > 2.5:
                # if mean > 4:
                if True:
                    count += 1
                    data = group_ratings[group][:, item].data
                    var = Rvar[group, item]
                    if var == 0:
                        var = 0.01
                    total_rmse += get_rmse(data, mean, var)
            if nth_item % 10 == 0:
                mean_rmse = total_rmse / count
                print(f'[{nth_item}, {count}] Mean RMSE: {mean_rmse}')
def get_group_rating_distributions():
    cachefile = DataReader.group_rating_dists_cache_file
    if os.path.isfile(cachefile):
        with msg(f'Reading distribution of ratings for each item per group from "{cachefile}"'):
            return np.load(cachefile)
    with msg('Getting distribution of ratings for each item and group'):
        R = DataReader.get_ratings()
        P = DataReader.get_nyms()
        group_count = DataReader.nym_count()
        item_count = R.shape[1]
        rating_count = DataReader.rating_value_count
        dists = np.zeros((group_count, item_count, rating_count), dtype=np.float32)
        for group_n, group in enumerate(P):
            with msg(f'Calculating group {group_n} distributions'):
                R_g = R[group].tocoo()
                for item, rating in zip(R_g.col, R_g.data):
                    dists[group_n, item, int(rating - 1)] += 1
        with msg('Normalising distributions'):
            dists /= dists.sum(axis=2, keepdims=True)
            # nan's imply 0 ratings by group on item, so give equal distribution
            dists[np.isnan(dists)] = 1.0 / rating_count
        with msg(f'Saving distribution of ratings to "{cachefile}"'):
            np.save(cachefile, dists)
        return dists
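
# Illustrative check (a sketch, not part of the pipeline): the distributions
# returned above are normalised over the rating axis, so every (group, item)
# slice should sum to 1, including items a group never rated (which are given
# a uniform distribution).
def example_check_dists():
    dists = get_group_rating_distributions()   # shape (groups, items, ratings)
    assert np.allclose(dists.sum(axis=2), 1.0)
    print('all group/item rating distributions sum to 1')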
def barplot_rating_dist(item, single=False, group=None, savefig=None):
    with msg("plotting rating distribution"):
        ratings = Data.get_ratings()[:, item]
        nyms = Data.get_nyms()
        plt.xlabel('rating')
        plt.ylabel('no. ratings')
        step = 1
        bins = np.arange(step/2, 5 + 1.5*step, step)
        hist = lambda d, **kwargs: plt.hist(d, bins=bins, rwidth=step*0.75, **kwargs)
        if group is not None:
            plt.title(f'Item {item}, group {group} rating distribution')
            hist(ratings[nyms[group]].data)
        elif single:
            plt.title(f'Item {item} rating distribution')
            hist(ratings.data)
        else:
            plt.title(f'Item {item}, all groups rating distributions')
            for nym_n, nym in enumerate(nyms):
                hist(ratings[nym].data, histtype='step', linewidth=2, label=f'group {nym_n}')
            plt.legend()
        if savefig is None:
            plt.show()
        else:
            with msg(f'Saving figure to "{savefig}"'):
                plt.savefig(savefig, dpi=150)
                plt.clf()
def save_data(ratings, user_ids, movie_ids):
    with msg(f'Saving ratings to "{ratings_file}"'):
        sp.save_npz(ratings_file, ratings)
    with msg(f'Saving original user ids to "{users_file}"'):
        np.save(users_file, user_ids)
    with msg(f'Saving original movie ids to "{movies_file}"'):
        np.save(movies_file, movie_ids)
def get_nym_stats():
    """
    Returns statistics about rating distributions of all items for each nym,
    as a 3d numpy array [nym number, item number, <stat>] (type np.float32), where <stat>
        index 0 : item index
        index 1 : distribution mean
        index 2 : distribution variance
        index 3 : number of ratings
    Cached result to allow single load on multiple calls.
    """
    filename = DataReader.nym_stats_cache_file
    if os.path.isfile(filename):
        with msg(f'Reading nym stats from "{filename}"'):
            stats = np.load(filename)
    else:
        ratings = DataReader.get_ratings()
        nyms = DataReader.get_nyms()
        stats = np.zeros((len(nyms), ratings.shape[1], 4), dtype=np.float32)
        for nym_n, nym in enumerate(nyms):
            with msg(f'Getting nym #{nym_n} stats'):
                for i, items in enumerate(ratings[nym].T):
                    data = items.data
                    stats[nym_n, i, 0] = i
                    stats[nym_n, i, 1] = data.mean() if len(data) > 0 else 0
                    stats[nym_n, i, 2] = data.var() if len(data) > 0 else 0
                    stats[nym_n, i, 3] = len(data)
        with msg(f'Saving nym stats to "{filename}"'):
            np.save(filename, stats)
    return stats
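
# Illustrative usage (a sketch, not part of the pipeline): pulling the per-item
# mean and rating count for one nym out of the [nym, item, <stat>] array
# described in the docstring above. example_nym_stats is a hypothetical helper.
def example_nym_stats(nym_n=0, min_ratings=50):
    stats = get_nym_stats()                              # shape (nyms, items, 4)
    rated_enough = stats[nym_n, :, 3] >= min_ratings     # stat index 3: number of ratings
    means = stats[nym_n, rated_enough, 1]                # stat index 1: distribution mean
    print(f'nym {nym_n}: {rated_enough.sum()} items with >= {min_ratings} ratings, '
          f'mean of means = {means.mean():.3f}')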
def get_P():
    filename = P_file
    if os.path.isfile(filename + '.npz'):
        with msg(f'Reading "{filename}.npz"'):
            return sp.load_npz(filename + '.npz').toarray()
    elif os.path.isfile(filename + '.npy'):
        with msg(f'Reading "{filename}.npy"'):
            return np.load(filename + '.npy')
def prepare_ratings(ratings):
    with msg('Preparing ratings'):
        with msg('Converting to csc matrix'):
            ratings = ratings.tocsc(copy=False)
        with msg('Removing empty cols'):
            non_zero_cols = ratings.getnnz(0) > 0
            ratings = ratings[:, non_zero_cols]
        with msg('Converting to csr matrix'):
            ratings = ratings.tocsr(copy=False)
        with msg('Removing empty rows'):
            non_zero_rows = ratings.getnnz(1) > 0
            ratings = ratings[non_zero_rows]
        return ratings, np.where(non_zero_rows)[0], np.where(non_zero_cols)[0]
def read_numpy_file(filename, dtype=np.float32):
    """ Read from numpy file if it exists, otherwise from raw text file """
    with msg(f'Reading "{filename}"'):
        if os.path.isfile(filename + ".npy"):
            return np.load(filename + ".npy")
        else:
            return np.loadtxt(open(filename + ".txt", "r"), dtype=dtype)
def get_groups():
    """
    Returns the groups as a list of numpy arrays.
    Cached result to allow single load on multiple calls.
    """
    P = get_P()
    with msg('Converting to list of user indexes by group'):
        indexes = np.arange(P.shape[1])
        return [indexes[group] for group in P.astype(bool)]
def save_fig(f, label, tag=None):
    if tag is None:
        tag = ''
    else:
        tag = f'-{tag}'
    plot_name = f"{label.replace(' ', '-')}{tag}.png"
    fname = f + plot_name
    with msg("saving", fname):
        plt.savefig(fname, bbox_inches='tight')
        plt.clf()
def plot_nym_stat(thresh=thresh_default, inv=False, savefig=False, outfile=outfile_default,
                  begin=None, num=None, stat_option=stat_option_default):
    stat_name = stat_options[stat_option]
    if inv:
        stat_name = f'inverse {stat_name}'
    fig, ax = plt.subplots()
    ax.set(
        # ylim=(0, None),
        title=f'{stat_name} of each group by item number (thresh no. ratings >= {thresh})',
        xlabel='item number',
        ylabel=stat_name)
    cm = plt.get_cmap('gist_rainbow')
    colors = [cm(1.*i/Data.nym_count()) for i in range(Data.nym_count())]
    begin = 0 if begin is None else begin
    end = None if num is None else begin + num
    nym_stats = Data.get_nym_stats()[:, begin:end, :]
    for nym_n in range(Data.nym_count()):
        nym_n_stats = nym_stats[nym_n]
        with msg(f'plotting nym #{nym_n} {stat_name}'):
            valids = (nym_n_stats[:, 3] >= thresh)
            print(f'{valids.sum()} of {len(valids)} valid (thresh = {thresh})')
            x = nym_n_stats[:, 0][valids]
            if stat_option == 1:
                y = nym_n_stats[:, 1][valids]
            elif stat_option == 2:
                y = nym_n_stats[:, 2][valids]
            elif stat_option == 3:
                y = np.sqrt(nym_n_stats[:, 2][valids])
            if inv:
                y[y > 0] = 1 / y[y > 0]
            s = np.sqrt(nym_n_stats[:, 3][valids])
            ax.scatter(x, y, s=s, facecolors='none', edgecolors=colors[nym_n], label=f'group {nym_n}')
    ax.legend()
    if savefig:
        with msg('Saving "{}" to "{}"'.format(ax.title.get_text(), outfile)):
            ax.get_figure().savefig(outfile, dpi=150)
            plt.clf()
    else:
        plt.show()
def gen_test_data(user_groups=None, item_n=100, subdir=''):
    """ Generate persistent data for testing """
    items = GroupRatings()
    item_count = items.item_count()
    user_n = user_groups.shape[0]
    test_item_means = np.zeros((user_n, items.n_groups, item_n))
    test_item_ratings = np.zeros((user_n, item_n))
    test_item_ids = np.zeros((user_n, item_n), dtype=int)
    lam_dist = items.lam() / items.lam().sum(axis=1, keepdims=True)
    dists = items.dist()
    with msg('Generating test data'):
        for n, group in enumerate(user_groups):
            # choose test items
            item_ids = np.random.choice(item_count, size=item_n, p=lam_dist[group])
            test_item_ids[n] = items.items[item_ids]
            # get group proxy ratings
            items.items = item_ids
            test_item_means[n] = items.mean()
            items.reset()
            # get sampled ratings
            for i, item_id in enumerate(item_ids):
                rating_dist = dists[group, item_id]
                test_item_ratings[n, i] = np.random.choice(items.n_rating_vals, p=rating_dist) + 1
    means_file = test_item_dist_means_file.format(subdir, user_n)
    sampled_file = test_sampled_ratings_file.format(subdir, user_n)
    item_ids_file = test_item_ids_file.format(subdir, user_n)
    with msg(f'Saving test item dist means to {means_file}'):
        np.save(means_file, test_item_means)
    with msg(f'Saving test item sampled ratings to {sampled_file}'):
        np.save(sampled_file, test_item_ratings)
    with msg(f'Saving test item ids to {item_ids_file}'):
        np.save(item_ids_file, test_item_ids)
    return test_item_means, test_item_ratings, test_item_ids
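
# Illustrative call (a sketch with made-up group assignments, not part of the
# pipeline): generate test data for three synthetic users belonging to groups
# 0, 1 and 2, drawing item_n test items per user as gen_test_data() does above.
def example_gen_test_data():
    user_groups = np.array([0, 1, 2])
    means, sampled, item_ids = gen_test_data(user_groups=user_groups, item_n=100)
    # shapes: (3, n_groups, 100), (3, 100), (3, 100)
    print(means.shape, sampled.shape, item_ids.shape)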
def get_nyms():
    """
    Returns the nyms as a list of numpy arrays.
    Cached result to allow single load on multiple calls.
    """
    filename = DataReader.nyms_file
    with msg(f'Reading nyms from "{filename}"'), open(filename, 'r') as f:
        nyms_raw = np.loadtxt(f, delimiter=',', dtype=int)
    # parse into list of nyms
    nym_count = nyms_raw[:, 1].max() + 1
    return [nyms_raw[:, 0][nyms_raw[:, 1] == nym_n] for nym_n in range(0, nym_count)]
def get_ratings():
    """
    Returns the ratings matrix in compressed sparse column (csc) format,
    loaded from ratings_file.
    Cached result to allow single load on multiple calls.
    """
    filename = DataReader.ratings_file
    if os.path.isfile(filename):
        with msg(f'Loading rating matrix from "{filename}"'):
            return sp.load_npz(filename)
    else:
        raise RuntimeError(
            f'"{filename}" does not exist. Use "netflix_data.py" to generate it.')
def parse_ratings(zipfile=netflix_data):
    filecount = 4
    basefilename = 'combined_data_{}.txt'
    ratingfiles = [basefilename.format(i) for i in range(1, filecount + 1)]
    row, col, data = [], [], []
    item_id = 1
    with msg(f'Reading from "{zipfile}"'), ZipFile(zipfile, 'r') as myzip:
        for filename in ratingfiles:
            with msg(f'Parsing "{filename}"'), myzip.open(filename, 'r') as file:
                for line in TextIOWrapper(file):
                    tokens = line.split(',')
                    if len(tokens) == 3:
                        # rating line: "user_id,rating,date"
                        row.append(int(tokens[0]))
                        col.append(item_id)
                        data.append(int(tokens[1]))
                    else:
                        # item header line: "item_id:"
                        item_id = int(line[:-2])  # -2 to remove ':' and newline at end of line
    with msg('Creating sparse matrix from ratings'):
        return sp.coo_matrix((data, (row, col)), dtype=np.float32)
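
# For reference, a minimal sketch of the line layout parse_ratings() expects in
# each combined_data_*.txt file: an "item_id:" header line followed by one
# "user_id,rating,date" line per rating of that item. The ids and dates below
# are made up for illustration; the resulting coo_matrix maps
# (user_id, item_id) -> rating.
example_lines = [
    '1:',                    # header: ratings for item 1 follow
    '100001,3,2005-09-06',   # user 100001 rated item 1 with a 3
    '100002,5,2005-05-13',   # user 100002 rated item 1 with a 5
    '2:',                    # header: ratings for item 2 follow
    '100003,4,2005-10-19',
]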
def heatmap_rating_dist(item):
    # def plot_rating_dists_across_groups(ratings, item, groups, savefig=False):
    with msg("plotting rating distribution"):
        ratings = Data.get_ratings()[:, item]
        nyms = Data.get_nyms()
        data = np.zeros((10, len(nyms)))
        for nym_n, nym in enumerate(nyms):
            unique, counts = np.unique(ratings[nym].data, return_counts=True)
            for rating, count in dict(zip(unique, counts)).items():
                data[int(2*rating - 1), nym_n] = count
        ax = sns.heatmap(data)
        ax.set(
            title="Distribution of item #{} ratings by group".format(int(item)),
            xlabel="group number",
            ylabel="rating",
            yticklabels=np.linspace(0.5, 5, 10))
        plt.show()
import numpy as np
import matplotlib.pyplot as plt

from myutils import msg
from datareader import DataReader
from dist_model import DiscreteNormal as DiscNorm

rating_count = 5
dist_gen = DiscNorm(np.linspace(0.5, 5.5, num=rating_count + 1))

with msg("Getting data"):
    Rtilde = DataReader.get_Rtilde()
    Rvar = DataReader.get_Rvar()
    R = DataReader.get_ratings()
    lam = DataReader.get_lam()
    P = DataReader.get_nyms()


def get_data_dist(data):
    ratings, counts = np.unique(data, return_counts=True)
    dist_data = np.zeros(rating_count)
    dist_data[ratings.astype(int) - 1] = counts / counts.sum()
    return dist_data


def get_err(data, mean, var):
    dist_data = get_data_dist(data)
    dist_model = dist_gen.pmf(mean, var)
    return abs(dist_data / dist_model)
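
# Worked example (a sketch, not part of the pipeline): get_data_dist() turns
# raw rating values into an empirical probability mass over the rating_count
# rating values. The ratings below put 2/5 of the mass on rating 4 and 1/5 on
# each of ratings 2, 3 and 5.
def example_data_dist():
    data = np.array([4., 2., 5., 4., 3.])
    print(get_data_dist(data))    # -> [0.  0.2 0.2 0.4 0.2]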
def run(label=None, user_n=500, sample_n=500, n_points=default_n_points, correct_error=False,
        thresh=60, best_n=100, with_rmse=True, with_accuracy=True, save_points=True,
        baseline=True, weight=False, plot_spread=False, plot=True, hard_memb=False):
    # pick item pool
    with msg('Configuring item pool'):
        g = GroupRatings()
        g.thresh(thresh)
        if label is None:
            label = 'sampling by pop'
        if label == 'highest pop':
            g.highest_pop(best_n)
        if label == 'lowest pop':
            g.lowest_pop(best_n)
        if label == 'highest variance':
            g.highest_var(best_n)
        if label == 'lowest variance':
            g.lowest_var(best_n)
        if label == 'lowest entropy':
            g.lowest_entropy(best_n)
        if label == 'highest 2-norm':
            g.highest_pnorm(best_n)
        if label == 'highest max-norm':
            g.highest_maxnorm(best_n)

    # generate user data
    with msg('Generating users'):
        users = Users(training=g, user_n=user_n)

    priors = g.group_size_dist()
    likelihoods = g.dist()

    max_accuracies = np.zeros(n_points)
    median_accuracies = np.zeros(n_points)
    min_accuracies = np.zeros(n_points)
    max_rmses = np.zeros(n_points)
    median_rmses = np.zeros(n_points)
    min_rmses = np.zeros(n_points)
    if hard_memb:
        max_rmses_hard = np.zeros(n_points)
        median_rmses_hard = np.zeros(n_points)
        min_rmses_hard = np.zeros(n_points)

    for point in range(0, n_points):
        with msg(f'Computing point {point}'):
            with msg('Getting posteriors'):
                if point > 0:
                    samples = g.sample(min(sample_n, g.item_count()**point), items_per=point, weight=weight)
                    posteriors = get_posteriors(likelihoods, users.training_ratings, samples, priors)
                else:
                    posteriors = np.full((1, user_n, priors.shape[0]), priors)
            if with_accuracy:
                with msg('Getting max accuracy'):
                    accuracies = get_accuracy(posteriors, users)
                    max_accuracies[point] = np.max(accuracies)
                    median_accuracies[point] = np.median(accuracies)
                    min_accuracies[point] = np.min(accuracies)
            if with_rmse:
                with msg('Getting min RMSE'):
                    rmses = get_rmse(posteriors, users, hard_memb=False)
                    max_rmses[point] = np.max(rmses)
                    median_rmses[point] = np.median(rmses)
                    min_rmses[point] = np.min(rmses)
                if hard_memb:
                    with msg('Getting min RMSE hard memb'):
                        rmses_hard = get_rmse(posteriors, users, hard_memb=True)
                        max_rmses_hard[point] = np.max(rmses_hard)
                        median_rmses_hard[point] = np.median(rmses_hard)
                        min_rmses_hard[point] = np.min(rmses_hard)

    if save_points:
        if label == 'sampling by pop':
            plabel = 'passive'
            if with_accuracy:
                with msg('saving', accuracy_save_file(plabel, n_points)):
                    np.save(accuracy_save_file(plabel, n_points), median_accuracies)
            if with_rmse:
                with msg('saving', rmse_save_file(plabel, n_points)):
                    np.save(rmse_save_file(plabel, n_points), median_rmses)
            if hard_memb and with_rmse:
                with msg('saving', rmse_save_file(plabel, n_points, hard_memb=True)):
                    np.save(rmse_save_file(plabel, n_points, hard_memb=True), median_rmses_hard)
        else:
            if with_accuracy:
                with msg('saving', accuracy_save_file(label, n_points)):
                    np.save(accuracy_save_file(label, n_points), max_accuracies)
            if with_rmse:
                with msg('saving', rmse_save_file(label, n_points)):
                    np.save(rmse_save_file(label, n_points), min_rmses)
            if hard_memb and with_rmse:
                with msg('saving', rmse_save_file(label, n_points, hard_memb=True)):
                    np.save(rmse_save_file(label, n_points, hard_memb=True), min_rmses_hard)

    if plot_spread:
        if with_accuracy:
            plot_accuracy_spread(min_accuracies, median_accuracies, max_accuracies, label)
        if with_rmse:
            plot_rmse_spread(min_rmses, median_rmses, max_rmses, users.test_data_rmse(), label, hard_memb=False)
        if hard_memb and with_rmse:
            plot_rmse_spread(min_rmses_hard, median_rmses_hard, max_rmses_hard, users.test_data_rmse(), label,
                             hard_memb=True)
    elif plot:
        if with_accuracy:
            plot_accuracy(max_accuracies, label, baseline)
        if with_rmse:
            plot_rmse(min_rmses, users.test_data_rmse(), label, baseline, hard_memb=False)
        if hard_memb and with_rmse:
            plot_rmse(min_rmses_hard, users.test_data_rmse(), label, baseline, hard_memb=True)