# Imports assumed for the excerpts below; the original modules' import blocks are
# not shown here, and helpers such as user_p_sample, item_p_sample,
# clean_p_samp_split and sim_data are defined elsewhere in the same codebase.
import pickle

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from scipy.stats import linregress


def est_si(graph, p_base=0.9):
    """
    heuristic for setting sigma. Based on the idea that asymptotically
    log(items) ~ c + log(alpha) + si*log(s) and sim for items

    :param graph:
    :return:
    """
    n_samp = 10

    items = np.zeros(n_samp)
    samp = np.copy(graph)

    for i in range(n_samp):
        _, samp = user_p_sample(
            samp, 1. - p_base
        )  # logically equivalent to samp, _ = user_p_sample(samp, p_base), but may be faster
        items[i] = np.unique(samp[:, 1]).shape[0]

    slope = linregress(x=np.arange(1, n_samp + 1), y=np.log(items)).slope
    # Estimate: log(I) = C + si * log(s), and s = p^n * s_0, so
    # log(I) = C' + si * log(p) * n; i.e. si = slope / log(p)
    return slope / np.log(p_base)
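# Hedged usage sketch (added for illustration, not part of the original code):
# est_si on a toy random bipartite graph. Assumes the first two columns of the
# edge array are (user, item) indices and that user_p_sample is available in
# this module; the toy graph below is a placeholder, not real data.
def _example_est_si():
    rng = np.random.RandomState(0)
    toy_edges = np.column_stack([rng.randint(0, 500, size=10000),    # user ids
                                 rng.randint(0, 2000, size=10000)])  # item ids
    print("estimated si:", est_si(toy_edges, p_base=0.9))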
def plot_rho_v_psamp_users(edges,
                           p_base=0.9,
                           n_samp=20,
                           N=1,
                           title="",
                           color='blue'):
    """
    Plot the density rho = |E| / (|V_U|*|V_I|) of `edges` under repeated user
    p-sampling, at p = p_base**0, ..., p_base**(n_samp-1), over N independent runs.
    """
    def rho(edges):
        U = np.unique(edges[:, 0]).shape[0]
        I = np.unique(edges[:, 1]).shape[0]
        rho = float(edges.shape[0]) / float(U * I)
        return rho

    P = np.power(p_base, range(n_samp))
    rho_matrix = np.zeros((N, n_samp))
    rho_matrix[:, 0] = rho(edges)
    for i in range(N):
        samp = np.copy(edges)
        for j in range(n_samp - 1):
            _, samp = user_p_sample(
                samp, 1. - p_base
            )  # logically equivalent to samp, _ = user_p_sample(samp, p_base), but may be faster
            rho_matrix[i, j + 1] = rho(samp)

    # plt.title(title)
    # plt.xlabel("")
    # plt.ylabel("")
    line = plt.plot(P,
                    rho_matrix[0, :],
                    'o',
                    color=color,
                    label=r'User $p$-sampling')
    for i in range(N - 1):
        plt.plot(P, rho_matrix[i + 1, :], 'o', color=color)
    plt.legend()
    return line
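# Hedged usage sketch (illustration only): draw the density curve for an edge
# array and label the axes; relies on the module-level matplotlib import above.
def _example_plot_rho(edges):
    plt.figure()
    plot_rho_v_psamp_users(edges, p_base=0.9, n_samp=15, N=3,
                           title="Density under user p-sampling", color='blue')
    plt.xlabel(r'$p$')
    plt.ylabel(r'$\rho = |E| / (|V_U| \, |V_I|)$')
    plt.show()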
def clean_item_p_sample(graph, p):
    """
    p sample the items to create a train-test split, and then clean up the resulting test set so that
    test contains only users that are also in train
    Note that we *do not* zero index the items (because these will be passed in to something that contains the full item set)
    :param graph:
    :param p: train set is p-sampling of items of graph
    :return:
    """
    lazy = np.copy(graph)
    # interchange users and items
    lazy[:, 0] = graph[:, 1]
    lazy[:, 1] = graph[:, 0]

    ltrain, ltest = user_p_sample(lazy, p)
    # eliminate any users in test that aren't also in train, and then give those users a new common zero index
    # do not reindex the items! (that would break prediction code)
    ltrain, ltest = clean_p_samp_split(ltrain,
                                       ltest,
                                       zi_train_u=False,
                                       zi_test_u=False)

    train = ltrain.copy()
    train[:, 0] = ltrain[:, 1]
    train[:, 1] = ltrain[:, 0]

    test = ltest.copy()
    test[:, 0] = ltest[:, 1]
    test[:, 1] = ltest[:, 0]

    return train, test
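# Hedged usage sketch (illustration only): split a held-out test set by items,
# mirroring how `main` below uses this helper. Per the docstring above, every
# user left in the holdout part should also appear in the lookup part.
def _example_clean_item_split(edges_test):
    lookup, holdout = clean_item_p_sample(edges_test, p=0.8)
    assert set(np.unique(holdout[:, 0])) <= set(np.unique(lookup[:, 0]))
    return lookup, holdout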
def rho_after_p_samp(graph, p):
    """
    user_p_sample a graph and compute rho (graph density) 
    rho = |E|/(|V_I|*|V_U|)
    """
    samp, _ = user_p_sample(graph, p)
    U = np.unique(samp[:, 0]).shape[0]
    I = np.unique(samp[:, 1]).shape[0]
    E = samp.shape[0]
    return float(E) / (U * I), U, I
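# Hedged sketch (illustration only): trace how the density falls as the user
# p-sampling level decreases.
def _example_density_trace(graph):
    for p in (0.9, 0.7, 0.5, 0.3, 0.1):
        rho, U, I = rho_after_p_samp(graph, p)
        print("p={:.1f}: rho={:.4g}, |V_U|={}, |V_I|={}".format(p, rho, U, I))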
# NOTE: the following is a method excerpted from a model class in the original
# code; it references attributes such as self.edge_idx_d, self.q_gam and self.q_theta.
    def _build_appx_elbo(self):
        r"""
        Computes an estimate of \sum_{e \in E} log(prob(e)) + \sum_{e \notin E} log(1 - prob(e))
        over the edge set self.edge_idx_d (and the corresponding non-edges), and stores
        the two terms in self.appx_elbo.
        This is not exactly the log likelihood because it ignores the contribution of
        uninstantiated atoms (although that may in fact be handled elsewhere).
        """

        # MC estimate of contribution from edges
        # obvious choice: uniformly sample terms... but resulting estimator is super high variance
        # edges_sample = np.copy(self.edge_idx_d[np.random.choice(self.edge_idx_d.shape[0], 3000, replace=False)]).astype(np.int32)
        # so instead use p-sampling... although it's unclear whether this really represents a major improvement
        e = self.edge_vals_d.shape[0]
        p_inc = min(1., np.sqrt(5000. / e))  # use about 5000 edges for the MC estimate (clipped so p_inc <= 1)
        edges_sample = item_p_sample(user_p_sample(self.edge_idx_d, p_inc)[0], p_inc)[0].astype(np.int32)

        # clip by value because of numerical issues
        p_edge_samples = tf.clip_by_value(self._edge_prob_samples(edges_sample), 1e-15, 1.)

        # reduce_mean is MC estimate over params of model, reduce_sum is summing cont from p-samp
        edge_llhd_est = 1. / p_inc**2 * tf.reduce_sum(tf.reduce_mean(tf.log(p_edge_samples), axis=0))

        # log(1-p_ij) = -lambda_ij, so:
        tot_lam_sum = tf.reduce_sum(self.i_tot_mass_m*self.u_tot_mass_m) # includes contribution from edges as well as non-edges
        # subtract off edge contribution:
        user_params = tf.gather(self.q_gam.mean() * self.q_theta.mean(), self.edge_idx[:,0])
        item_params = tf.gather(self.q_omega.mean() * self.q_beta.mean(), self.edge_idx[:,1])
        edges_lam_sum = tf.reduce_sum(user_params * item_params)
        nonedge_llhd_term = -(tot_lam_sum - edges_lam_sum)

        # hopefully lower variance than direct MC est
        #\sum_edges log(p_ij) = -\sum_edges lam_ij + \sum_ij log(p_ij / (1-p_ij))

        # note: the reduce mean here averages over both the sampled params in p_edge_samples, and over the random choice of edges
        # edge_llhd_est = -edges_lam_sum + e*tf.reduce_mean(tf.reduce_mean(tf.log(p_edge_samples / (1. - p_edge_samples)), axis=0))

        self.appx_elbo = [edge_llhd_est, nonedge_llhd_term]
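# Hedged numpy sketch (illustration only, not part of the original class) of the
# identity used in the non-edge term above: with p_ij = 1 - exp(-lam_ij) and
# lam_ij = <user_i, item_j>, sum_{non-edges} log(1 - p_ij)
# = -(sum_{ij} lam_ij - sum_{edges} lam_ij), and sum_{ij} lam_ij factorises into
# (total user mass) * (total item mass), so no dense rate matrix is needed.
def _example_nonedge_term(user_params, item_params, edge_idx):
    # user_params: [n_users, K], item_params: [n_items, K],
    # edge_idx: [n_edges, 2] array of unique (user, item) index pairs
    tot_lam_sum = user_params.sum(axis=0) @ item_params.sum(axis=0)  # = lam.sum()
    edges_lam_sum = np.sum(user_params[edge_idx[:, 0]] * item_params[edge_idx[:, 1]])
    fast = -(tot_lam_sum - edges_lam_sum)

    # brute-force check over the dense rate matrix (feasible only for small graphs)
    lam = user_params @ item_params.T
    mask = np.ones(lam.shape, dtype=bool)
    mask[edge_idx[:, 0], edge_idx[:, 1]] = False
    dense = -lam[mask].sum()
    return fast, dense  # the two agree up to floating-point error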
def user_p_samp_stats(graph, n_samp=100):
    """
    Returns |V_U|, |V_I| and |E| after repeated p-sampling
    """
    items = np.zeros(n_samp)
    users = np.zeros(n_samp)
    occ_pairs = np.zeros(n_samp)

    p_incr = 1. / n_samp
    samp = np.copy(graph)
    p_last = 1.
    for i in range(n_samp):
        p_target = 1. - i * p_incr  # 'p' value for ith entry
        p = p_target / p_last  # a p-sampling of samp(G, p_last) is samp(G, p*p_last)

        samp, _ = user_p_sample(samp, p)

        users[i] = np.unique(samp[:, 0]).shape[0]
        items[i] = np.unique(samp[:, 1]).shape[0]
        occ_pairs[i] = samp.shape[0]

        p_last = p_target

    return users, items, occ_pairs
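# Hedged usage sketch (illustration only): plot the vertex and edge counts
# returned above against the p-sampling level; relies on the module-level
# numpy and matplotlib imports.
def _example_p_samp_stats_plot(graph, n_samp=100):
    users, items, occ_pairs = user_p_samp_stats(graph, n_samp=n_samp)
    p_grid = 1. - np.arange(n_samp) / float(n_samp)  # p value for each entry
    plt.figure()
    plt.loglog(p_grid, users, label=r'$|V_U|$')
    plt.loglog(p_grid, items, label=r'$|V_I|$')
    plt.loglog(p_grid, occ_pairs, label=r'$|E|$')
    plt.xlabel(r'$p$')
    plt.legend()
    plt.show()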
def main(data_dir, params):
    # Store true_gam, true_theta, true_beta
    # Store edges to data.txt
    print "Simulating data with following paramaters"
    print params
    [true_gam, true_theta, true_omega, true_beta,
     edges] = sim_data(params['tu'], params['su'], params['size_u'],
                       params['a'], params['b'], params['ti'], params['si'],
                       params['size_i'], params['c'], params['d'], params['K'],
                       params['eps'])

    print(edges[:, 2].mean())

    print("Number of occupied pairs in the dataset: {}".format(edges.shape[0]))
    print("Number of users in the dataset: {}".format(
        np.unique(edges[:, 0]).shape[0]))
    print("Number of items in the dataset: {}".format(
        np.unique(edges[:, 1]).shape[0]))
    print("e / (U*I): {}").format(
        np.float(edges.shape[0]) / np.float(
            np.unique(edges[:, 0]).shape[0] * np.unique(edges[:, 1]).shape[0]))
    print("e / (size_u*size_i): {}").format(
        np.float(edges.shape[0]) /
        np.float(params['size_u'] * params['size_i']))

    print "Storing data at: " + data_dir + "/data.pkl"
    with open(data_dir + "/data.pkl", "wb") as f:
        pickle.dump(edges, f)
        pickle.dump(true_gam, f)
        pickle.dump(true_theta, f)
        pickle.dump(true_omega, f)
        pickle.dump(true_beta, f)
        pickle.dump(params, f)

    print "Splitting the dataset"
    edges_train, edges_test = user_p_sample(edges, p=0.8)
    edges_train, edges_test, allusers_train, allitems = clean_p_samp_split(
        edges_train, edges_test)
    true_gam_train = true_gam[allusers_train]
    true_theta_train = true_theta[allusers_train, :]
    true_omega_train = true_omega[allitems]
    true_beta_train = true_beta[allitems, :]

    print("Edges in train set: {}".format(edges_train.shape[0]))
    print("Edges in test set: {}".format(edges_test.shape[0]))
    print("Storing train set to {}".format(data_dir + "/train.pkl"))
    with open(data_dir + "/train.pkl", "wb") as f:
        pickle.dump(edges_train, f)
        pickle.dump(true_gam_train, f)
        pickle.dump(true_theta_train, f)
        pickle.dump(true_omega_train, f)
        pickle.dump(true_beta_train, f)
        pickle.dump(params, f)

    # Split edges_test to test_holdout and test_look
    print "Splitting the test set to lookup and holdout"
    edges_test_lookup, edges_test_holdout = clean_item_p_sample(
        edges_test, 0.8)
    print("Storing test lookup set to {}".format(data_dir +
                                                 "/test_lookup.pkl"))
    print("Edges in test lookup set: {}".format(edges_test_lookup.shape[0]))
    print("Edges in test holdout set: {}".format(edges_test_holdout.shape[0]))
    with open(data_dir + "/test_lookup.pkl", "wb") as f:
        pickle.dump(edges_test_lookup, f)
    print("Storing test holdout set to {}".format(data_dir +
                                                  "/test_holdout.pkl"))

    with open(data_dir + "/test_holdout.pkl", "wb") as f:
        pickle.dump(edges_test_holdout, f)
    print("Done!")