def est_si(graph, p_base=0.9):
    """
    Heuristic for setting sigma. Based on the idea that asymptotically
    log(items) ~ c + log(alpha) + si*log(s), and similarly for items.
    :param graph: (user, item) edge array of shape [e, 2]
    :param p_base: retention probability used for each successive user p-sampling
    :return: estimate of si
    """
    n_samp = 10
    items = np.zeros(n_samp)
    samp = np.copy(graph)
    for i in range(n_samp):
        # logically equivalent to samp, _ = user_p_sample(samp, p_base), but maybe faster
        _, samp = user_p_sample(samp, 1. - p_base)
        items[i] = np.unique(samp[:, 1]).shape[0]
    slope = linregress(x=np.arange(1, n_samp + 1), y=np.log(items)).slope
    # estimate:
    # I = C + si * log(s) and s = p^n * s_0 implies
    # I = C' + si * log(p) * n; i.e. si = slope / log(p)
    return slope / np.log(p_base)
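
# Illustrative usage sketch (not part of the original module): run est_si on a toy
# edge list. The random bipartite graph below is synthetic and purely for
# demonstration; in practice `graph` is the observed (user, item) edge array.
# Assumes numpy is imported as np at the top of the module, as the functions here do.
def _example_est_si():
    rng = np.random.RandomState(0)
    toy_graph = np.column_stack([rng.randint(0, 500, size=20000),
                                 rng.randint(0, 800, size=20000)])
    return est_si(toy_graph, p_base=0.9)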
def plot_rho_v_psamp_users(edges, p_base=0.9, n_samp=20, N=1, title="", color='blue'):
    """
    Plot the graph density rho = |E| / (|V_U| * |V_I|) against the retention level p
    under repeated user p-sampling; N independent sampling runs are overlaid.
    """
    def rho(edges):
        U = np.unique(edges[:, 0]).shape[0]
        I = np.unique(edges[:, 1]).shape[0]
        rho = np.float(edges.shape[0]) / np.float(U * I)
        return rho

    P = np.power(p_base, range(n_samp))
    rho_matrix = np.zeros((N, n_samp))
    rho_matrix[:, 0] = rho(edges)
    for i in range(N):
        samp = np.copy(edges)
        for j in range(n_samp - 1):
            # logically equivalent to samp, _ = user_p_sample(samp, p_base), but maybe faster
            _, samp = user_p_sample(samp, 1. - p_base)
            rho_matrix[i, j + 1] = rho(samp)

    # plt.title(title)
    # plt.xlabel("")
    # plt.ylabel("")
    line = plt.plot(P, rho_matrix[0, :], 'o', color=color, label=r'User $p$-sampling')
    for i in range(N - 1):
        plt.plot(P, rho_matrix[i + 1, :], 'o', color=color)
    plt.legend()

    return line
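
# Illustrative usage sketch (not part of the original module): draw the density decay
# for an observed (user, item) edge array `edges` and save the figure. Assumes
# matplotlib.pyplot is imported as plt at the top of the module, as the function above
# assumes; the axis labels and output path are my own choices.
def _example_plot_rho(edges, out_path="rho_v_psamp.png"):
    plt.figure()
    plot_rho_v_psamp_users(edges, p_base=0.9, n_samp=20, N=3, color='blue')
    plt.xlabel(r'$p$')
    plt.ylabel(r'$\rho$')
    plt.savefig(out_path)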
def clean_item_p_sample(graph, p):
    """
    p-sample the items to create a train-test split, and then clean up the resulting
    test set so that test contains only users that are also in train.
    Note that we *do not* zero index the items (because these will be passed in to
    something that contains the full item set)
    :param graph: (user, item) edge array
    :param p: train set is a p-sampling of the items of graph
    :return: train, test
    """
    lazy = np.copy(graph)
    # interchange users and items
    lazy[:, 0] = graph[:, 1]
    lazy[:, 1] = graph[:, 0]

    ltrain, ltest = user_p_sample(lazy, p)
    # eliminate any users in test that aren't also in train, and then give those users
    # a new common zero index
    # do not reindex the items! (that would break prediction code)
    ltrain, ltest = clean_p_samp_split(ltrain, ltest, zi_train_u=False, zi_test_u=False)

    # swap the columns back to (user, item) order
    train = ltrain.copy()
    train[:, 0] = ltrain[:, 1]
    train[:, 1] = ltrain[:, 0]

    test = ltest.copy()
    test[:, 0] = ltest[:, 1]
    test[:, 1] = ltest[:, 0]

    return train, test
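
# Illustrative usage sketch (not part of the original module): split a test edge set
# into a lookup part and a holdout part, and check the property stated in the
# docstring above (every user appearing in the holdout also appears in the lookup).
def _example_lookup_holdout(edges_test):
    lookup, holdout = clean_item_p_sample(edges_test, 0.8)
    assert np.all(np.in1d(holdout[:, 0], lookup[:, 0]))
    return lookup, holdout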
def rho_after_p_samp(graph, p):
    """
    user_p_sample a graph and compute rho (graph density)
    rho = |E| / (|V_I| * |V_U|)
    """
    samp, _ = user_p_sample(graph, p)
    U = np.unique(samp[:, 0]).shape[0]
    I = np.unique(samp[:, 1]).shape[0]
    E = samp.shape[0]
    return np.float(E) / (U * I), U, I
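
# Illustrative usage sketch (not part of the original module): trace how the density
# changes across a few subsampling levels. `graph` stands for an observed (user, item)
# edge array; the p values below are arbitrary.
def _example_density_trace(graph):
    return {p: rho_after_p_samp(graph, p)[0] for p in (0.9, 0.7, 0.5, 0.3)}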
def _build_appx_elbo(self):
    """
    Builds an estimate of \sum_{e in edges} log(prob(e)) + \sum_{e not in edges} log(1-prob(e)),
    where the edges are the observed edges self.edge_idx_d (shape [e, 2]).
    This is not actually the log likelihood because it ignores the contribution of
    uninstantiated atoms (actually, maybe this is handled after all...)
    The two terms of the estimate are stored in self.appx_elbo.
    """
    # MC estimate of the contribution from edges
    # obvious choice: uniformly sample terms... but the resulting estimator is super high variance
    # edges_sample = np.copy(self.edge_idx_d[np.random.choice(self.edge_idx_d.shape[0], 3000, replace=False)]).astype(np.int32)
    # so instead use p-sampling... although it's unclear whether this really represents a major improvement
    e = self.edge_vals_d.shape[0]
    p_inc = np.sqrt(5000. / e)  # use about 5000 edges for the MC estimate
    edges_sample = item_p_sample(user_p_sample(self.edge_idx_d, p_inc)[0], p_inc)[0].astype(np.int32)

    # clip by value because of numerical issues
    p_edge_samples = tf.clip_by_value(self._edge_prob_samples(edges_sample), 1e-15, 1.)
    # reduce_mean is the MC estimate over the params of the model, reduce_sum sums the contribution from the p-sampled edges
    edge_llhd_est = 1. / p_inc**2 * tf.reduce_sum(tf.reduce_mean(tf.log(p_edge_samples), axis=0))

    # log(1-p_ij) = -lambda_ij, so:
    tot_lam_sum = tf.reduce_sum(self.i_tot_mass_m * self.u_tot_mass_m)  # includes contribution from edges as well as non-edges
    # subtract off the edge contribution:
    user_params = tf.gather(self.q_gam.mean() * self.q_theta.mean(), self.edge_idx[:, 0])
    item_params = tf.gather(self.q_omega.mean() * self.q_beta.mean(), self.edge_idx[:, 1])
    edges_lam_sum = tf.reduce_sum(user_params * item_params)
    nonedge_llhd_term = -(tot_lam_sum - edges_lam_sum)

    # hopefully lower variance than direct MC est
    # \sum_edges log(p_ij) = -\sum_edges lam_ij + \sum_ij log(p_ij / (1-p_ij))
    # note: the reduce_mean here averages over both the sampled params in p_edge_samples, and over the random choice of edges
    # edge_llhd_est = -edges_lam_sum + e*tf.reduce_mean(tf.reduce_mean(tf.log(p_edge_samples / (1. - p_edge_samples)), axis=0))

    self.appx_elbo = [edge_llhd_est, nonedge_llhd_term]
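
# Added note (not from the original code), spelling out the identity the non-edge term
# above relies on. The in-code comment log(1-p_ij) = -lambda_ij corresponds to the link
# p_ij = 1 - exp(-lambda_ij), so
#   \sum_{non-edges} log(1 - p_ij) = -\sum_{non-edges} lambda_ij
#                                  = -(\sum_{all ij} lambda_ij - \sum_{edges} lambda_ij),
# which is exactly -(tot_lam_sum - edges_lam_sum). The full sum \sum_{all ij} lambda_ij
# is presumably what tot_lam_sum computes from the total user and item masses
# i_tot_mass_m and u_tot_mass_m.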
def user_p_samp_stats(graph, n_samp=100):
    """
    Returns |V_U|, |V_I| and |E| after repeated p-sampling
    """
    items = np.zeros(n_samp)
    users = np.zeros(n_samp)
    occ_pairs = np.zeros(n_samp)

    p_incr = 1. / n_samp
    samp = np.copy(graph)
    p_last = 1.
    for i in range(n_samp):
        p_target = 1. - i * p_incr  # 'p' value for ith entry
        p = p_target / p_last  # a p-sampling of samp(G, p_last) is samp(G, p*p_last)
        samp, _ = user_p_sample(samp, p)
        users[i] = np.unique(samp[:, 0]).shape[0]
        items[i] = np.unique(samp[:, 1]).shape[0]
        occ_pairs[i] = samp.shape[0]
        p_last = p_target

    return users, items, occ_pairs
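
# Illustrative usage sketch (not part of the original module): print how the user,
# item and edge counts shrink under repeated user p-sampling of an observed
# (user, item) edge array `graph`; 20 levels is an arbitrary choice.
def _example_p_samp_stats(graph, n_samp=20):
    users, items, occ_pairs = user_p_samp_stats(graph, n_samp=n_samp)
    for i in range(n_samp):
        p = 1. - float(i) / n_samp  # same p grid as user_p_samp_stats
        print("p={:.2f}: users={}, items={}, edges={}".format(
            p, int(users[i]), int(items[i]), int(occ_pairs[i])))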
def main(data_dir, params):
    # Simulate a dataset, then store the edges and the true parameters
    # (true_gam, true_theta, true_omega, true_beta) to data.pkl
    print "Simulating data with the following parameters"
    print params
    [true_gam, true_theta, true_omega, true_beta, edges] = sim_data(
        params['tu'], params['su'], params['size_u'], params['a'], params['b'],
        params['ti'], params['si'], params['size_i'], params['c'], params['d'],
        params['K'], params['eps'])

    print edges[:, 2].mean()  # mean of the edge values (third column of the simulated edge list)
    print("Number of occupied pairs in the dataset: {}".format(edges.shape[0]))
    print("Number of users in the dataset: {}".format(
        np.unique(edges[:, 0]).shape[0]))
    print("Number of items in the dataset: {}".format(
        np.unique(edges[:, 1]).shape[0]))
    print("e / (U*I): {}".format(
        np.float(edges.shape[0]) / np.float(
            np.unique(edges[:, 0]).shape[0] * np.unique(edges[:, 1]).shape[0])))
    print("e / (size_u*size_i): {}".format(
        np.float(edges.shape[0]) / np.float(params['size_u'] * params['size_i'])))

    print "Storing data at: " + data_dir + "/data.pkl"
    with open(data_dir + "/data.pkl", "wb") as f:
        pickle.dump(edges, f)
        pickle.dump(true_gam, f)
        pickle.dump(true_theta, f)
        pickle.dump(true_omega, f)
        pickle.dump(true_beta, f)
        pickle.dump(params, f)

    print "Splitting the dataset"
    edges_train, edges_test = user_p_sample(edges, p=0.8)
    edges_train, edges_test, allusers_train, allitems = clean_p_samp_split(
        edges_train, edges_test)
    true_gam_train = true_gam[allusers_train]
    true_theta_train = true_theta[allusers_train, :]
    true_omega_train = true_omega[allitems]
    true_beta_train = true_beta[allitems, :]

    print("Edges in train set: {}".format(edges_train.shape[0]))
    print("Edges in test set: {}".format(edges_test.shape[0]))

    print("Storing train set to {}".format(data_dir + "/train.pkl"))
    with open(data_dir + "/train.pkl", "wb") as f:
        pickle.dump(edges_train, f)
        pickle.dump(true_gam_train, f)
        pickle.dump(true_theta_train, f)
        pickle.dump(true_omega_train, f)
        pickle.dump(true_beta_train, f)
        pickle.dump(params, f)

    # Split edges_test into test_lookup and test_holdout
    print "Splitting the test set into lookup and holdout"
    edges_test_lookup, edges_test_holdout = clean_item_p_sample(edges_test, 0.8)

    print("Storing test lookup set to {}".format(data_dir + "/test_lookup.pkl"))
    print("Edges in test lookup set: {}".format(edges_test_lookup.shape[0]))
    print("Edges in test holdout set: {}".format(edges_test_holdout.shape[0]))
    with open(data_dir + "/test_lookup.pkl", "wb") as f:
        pickle.dump(edges_test_lookup, f)

    print("Storing test holdout set to {}".format(data_dir + "/test_holdout.pkl"))
    with open(data_dir + "/test_holdout.pkl", "wb") as f:
        pickle.dump(edges_test_holdout, f)

    print("Done!")
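
# Illustrative invocation sketch (not part of the original module): the parameter keys
# match those unpacked in the sim_data call above; the numeric values are arbitrary
# placeholders for demonstration, not settings taken from any experiment.
def _example_run(data_dir="/tmp/sim_run"):
    example_params = {
        'tu': 1., 'su': 0.5, 'size_u': 500., 'a': 1., 'b': 1.,
        'ti': 1., 'si': 0.5, 'size_i': 500., 'c': 1., 'd': 1.,
        'K': 10, 'eps': 1e-6,
    }
    main(data_dir, example_params)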