Example 1
    def __init__(self,
                 train_data,
                 test_data,
                 interaction_data: sp.csr_matrix,
                 n_epochs=100,
                 batch_size=256,
                 embedding_k=64,
                 top_k=10,
                 learning_rate=0.0001,
                 use_model=True):
        """
        Init function.
        :param train_data: The train data.
        :param test_data: The test data.
        :param interaction_data: The user-track interaction data.
        :param n_epochs: Train epochs.
        :param batch_size: Train batch size.
        :param embedding_k: The length of the user/track embedding vectors.
        :param top_k: Number of tracks recommended to each user in top-k recommendation.
        :param learning_rate: Optimizer learning rate.
        :param use_model: If True, restore the latest checkpoint from '../cpkt/' when one exists.
        """
        self.train_data = train_data
        self.test_data = test_data
        self.interaction_data: sp.csr_matrix = interaction_data

        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.embedding_k = embedding_k
        self.top_k = top_k
        self.learning_rate = learning_rate

        self.num_user = interaction_data.get_shape()[0]
        self.num_item = interaction_data.get_shape()[1]

        # build TF graph
        self.build_model()

        # create session
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

        if use_model:
            ckpt = tf.train.get_checkpoint_state('../cpkt/')  # directory where the checkpoints live
            if ckpt and ckpt.model_checkpoint_path:
                # restore from model_checkpoint_path, which usually points to the latest saved model
                self.saver.restore(self.sess, ckpt.model_checkpoint_path)
                print("Model restored...")
            else:
                print('No Model')
        return
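
For reference, a hedged sketch of how checkpoints under '../cpkt/' could be written during training with the same saver; the train method, its loop body, and the 'model' filename prefix are assumptions for illustration, not part of the original snippet.

    def train(self):  # hypothetical method, shown only to illustrate the save side
        for epoch in range(self.n_epochs):
            # ... run one epoch of optimisation over self.train_data ...
            self.saver.save(self.sess, '../cpkt/model', global_step=epoch)
        print("Model saved...")
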
Example 2
import numpy
from scipy.sparse import csr_matrix


def page_rank(m: csr_matrix, alpha):
    """
    PageRank-style power iteration over a sparse adjacency matrix.
    :param m: n x n adjacency matrix; rows are normalised by out-degree internally.
    :param alpha: Damping factor (fraction of rank propagated along edges each step).
    :return: Flattened rank vector of length n.
    """
    n, _ = m.get_shape()
    # out-degree of each node; add 1 to zero-degree rows to avoid division by zero
    degree = numpy.sum(m, axis=1)
    weight_mat = m / (degree + numpy.array(degree == 0, dtype=int))
    # weight_mat = m
    v = numpy.random.rand(n).reshape((-1, 1))
    last_v = v
    # power iteration: propagate rank along incoming edges until convergence
    while True:
        v = alpha * (weight_mat.T * v) + (1 - alpha) * v
        delta = numpy.sum(abs(v - last_v))
        print(delta)
        if delta < 0.0001:  # converged: total change below tolerance
            break
        last_v = v
    return numpy.array(v).flatten()
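
A minimal usage sketch; the small directed graph below is illustrative, not taken from the original code.

import numpy
from scipy.sparse import csr_matrix

# toy directed graph: 0->1, 0->2, 1->2, 2->0, 3->2
adj = csr_matrix(numpy.array([[0, 1, 1, 0],
                              [0, 0, 1, 0],
                              [1, 0, 0, 0],
                              [0, 0, 1, 0]], dtype=float))
ranks = page_rank(adj, alpha=0.85)
print(ranks)  # nodes 0 and 2 end up with the largest scores in this toy graph
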
Example 3
    def __init__(self, A: sparse.csr_matrix, L: sparse.csr_matrix, batch_size=1):
        '''
        A helper dataset for graph data: batching is handled inside the dataset,
        so the DataLoader used during training always keeps batch_size=1.
        :param A: Sparse matrix whose rows are the per-sample inputs.
        :param L: Square sparse matrix indexed by the same samples (e.g. a graph Laplacian).
        :param batch_size: Batch size applied inside the dataset.
        '''
        # self.dts = []
        # dataset_size = A.shape[0]
        # steps_per_epoch = (dataset_size - 1) // batch_size + 1
        # for i in range(steps_per_epoch):
        #     index = np.arange(
        #         i * batch_size, min((i + 1) * batch_size, dataset_size))
        #     A_train = A[index, :].todense()
        #     L_train = L[index][:, index].todense()
        #
        #     A_train = torch.tensor(A_train)
        #     L_train = torch.tensor(L_train)
        #     batch_inp = [A_train, L_train]
        #     self.dts.append(batch_inp)
        self.A = A
        self.L = L
        self.size = A.get_shape()[0]
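
The commented-out block above shows how a single batch would be materialised. Below is a self-contained sketch of that step under the same assumptions; the function name make_batch and the random test matrices are illustrative, not from the original code.

import numpy as np
import torch
from scipy import sparse


def make_batch(A: sparse.csr_matrix, L: sparse.csr_matrix, i, batch_size):
    # rows i*batch_size .. min((i+1)*batch_size, n) - 1 form one batch
    index = np.arange(i * batch_size, min((i + 1) * batch_size, A.shape[0]))
    A_train = torch.tensor(A[index, :].todense())          # dense (b, n_cols) slice of A
    L_train = torch.tensor(L[index][:, index].todense())   # dense (b, b) block of L
    return [A_train, L_train]


A = sparse.random(10, 10, density=0.3, format='csr')
L = sparse.random(10, 10, density=0.3, format='csr')
print([t.shape for t in make_batch(A, L, i=0, batch_size=4)])
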
Example 4
import random

import numpy as np
from scipy import sparse
from sklearn.cluster import KMeans


def bisecting_kmeans(points: sparse.csr_matrix, k=2):
    """
    Bisecting k-means: repeatedly split the largest cluster in two with
    standard 2-means until k clusters exist.
    :param points: Sparse matrix with one row per user.
    :param k: Desired number of clusters.
    :return: (clusters, user_to_cluster); clusters maps a cluster key to its
             sparse sub-matrix, user_to_cluster maps a user id to (key, row).
    """
    user_to_cluster = {}   # user id -> (cluster key, row within that cluster)
    cluster_to_user = {}   # (cluster key, row within that cluster) -> user id
    # start with a single cluster holding every user
    clusters = {0: points}
    for i in range(points.get_shape()[0]):
        user_to_cluster[i] = (0, i)
        cluster_to_user[(0, i)] = i
    while len(clusters) < k:
        # pick the cluster with the most rows to split next
        biggest_cluster_key = list(clusters.keys())[0]
        for i in clusters:
            if clusters[i].get_shape()[0] > clusters[biggest_cluster_key].get_shape()[0]:
                biggest_cluster_key = i
        biggest_cluster = clusters[biggest_cluster_key]
        # remove cluster from dict
        del clusters[biggest_cluster_key]
        # split the biggest cluster into two with standard k-means
        kmeans = KMeans(n_clusters=2, max_iter=100).fit(biggest_cluster)

        # fresh random keys and row counters for the two new sub-clusters
        key1 = random.randint(1, 1000000)
        key2 = random.randint(1, 1000000)
        id1 = 0
        id2 = 0
        # (rows, cols, data) buffers for rebuilding the two sub-matrices
        clusters_data1 = [[], [], []]
        clusters_data2 = [[], [], []]

        for i in range(len(kmeans.labels_)):
            # non-zero entries of this user's row in the cluster being split
            row, col = biggest_cluster.getrow(i).nonzero()
            data = np.array(biggest_cluster.getrow(i)[row, col]).flatten()
            if kmeans.labels_[i] == 0:
                row = np.ones(len(col), dtype=int) * id1
                for j in range(len(col)):
                    clusters_data1[0].append(row[j])
                    clusters_data1[1].append(col[j])
                    clusters_data1[2].append(data[j])
                # update mapping
                user_id = cluster_to_user[(key1, id1)] = cluster_to_user[(biggest_cluster_key, i)]
                user_to_cluster[user_id] = (key1, id1)
                id1 += 1

            else:
                row = np.ones(len(col), dtype=int) * id2
                for j in range(len(col)):
                    clusters_data2[0].append(row[j])
                    clusters_data2[1].append(col[j])
                    clusters_data2[2].append(data[j])
                # update mapping
                user_id = cluster_to_user[(key2, id2)] = cluster_to_user[(biggest_cluster_key, i)]
                user_to_cluster[user_id] = (key2, id2)
                id2 += 1
            del cluster_to_user[biggest_cluster_key, i]

        # rebuild the two halves as sparse matrices, keeping the original column count
        clusters[key1] = sparse.csr_matrix(
            (clusters_data1[2], (clusters_data1[0], clusters_data1[1])),
            (id1, biggest_cluster.get_shape()[1]))
        clusters[key2] = sparse.csr_matrix(
            (clusters_data2[2], (clusters_data2[0], clusters_data2[1])),
            (id2, biggest_cluster.get_shape()[1]))

    return clusters, user_to_cluster
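
A minimal usage sketch; the random binary user-item matrix is illustrative, not from the original code.

import numpy as np
from scipy import sparse

rng = np.random.default_rng(0)
# 20 users x 8 items, binary implicit-feedback style matrix
points = sparse.csr_matrix((rng.random((20, 8)) < 0.3).astype(float))

clusters, user_to_cluster = bisecting_kmeans(points, k=4)
print({key: mat.get_shape()[0] for key, mat in clusters.items()})  # users per cluster
print(user_to_cluster[0])  # (cluster key, row index) for user 0
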