Example 1
def compute_nystrom(ds_name, use_node_labels, embedding_dim,
                    community_detection_method, kernels):
    if ds_name == "SYNTHETIC":
        graphs, labels = generate_synthetic()
    else:
        graphs, labels = load_data(ds_name, use_node_labels)

    print('computing communities ...')
    communities, subgraphs = compute_communities(graphs, use_node_labels,
                                                 community_detection_method)

    print("Number of communities: ", len(communities))
    lens = []
    for community in communities:
        lens.append(community.number_of_nodes())

    print("Average size: %.2f" % np.mean(lens))
    Q = []
    for idx, k in enumerate(kernels):
        model = Nystrom(k, n_components=embedding_dim)
        model.fit(communities)
        Q_t = model.transform(communities)
        Q_t = np.vstack([np.zeros(embedding_dim), Q_t])
        Q.append(Q_t)

    return Q, subgraphs, labels, Q_t.shape
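A hedged call sketch for the function above; the dataset name, kernel list, and community-detection method are placeholders, and load_data, compute_communities, and Nystrom are assumed to come from the surrounding module:

# Hypothetical invocation; "MUTAG", "louvain", and my_graph_kernel are
# placeholders, not values confirmed by the source.
Q, subgraphs, labels, shape = compute_nystrom(
    ds_name="MUTAG",
    use_node_labels=True,
    embedding_dim=128,
    community_detection_method="louvain",
    kernels=[my_graph_kernel],
)
# Each Q[i] has shape (n_communities + 1, 128); row 0 is the all-zero
# row prepended by np.vstack above.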
Example 2

def fit(self, X, y, batch_size=100):
    '''
    Arguments:
    X: (array) The training data; must have two dimensions.
    y: (array) The training labels; must be one-hot encoded with two
       dimensions. To produce training labels in this format, one might use
       sklearn.preprocessing.LabelBinarizer.
    batch_size: (int) The mini-batch size for computing the stochastic gradients.
    '''
    # Fit the Nystrom feature map on the training data, then train a
    # linear classifier on the mapped features.
    mapper = Nystrom(kernel=self.kernel, gamma=self.gamma, degree=self.degree,
                     coef0=self.coef0, n=self.n, k=self.k, rand_svd=self.rand_svd)
    mapper.fit(X)
    self._Xrep = mapper._Xrep
    clf = TFClassifier(loss=self.loss, alpha=self.alpha, optimizer=self.optimizer)

    clf.fit(mapper.transform(X), y, batch_size=batch_size)
    # Express the learned linear coefficients as dual coefficients
    # through mapper.A.
    self.dual_coef_ = np.dot(mapper.A, clf.coef_.T).T
    self.intercept_ = clf.intercept_
    return
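A minimal usage sketch, following the docstring's pointer to sklearn.preprocessing.LabelBinarizer; the estimator name KernelClassifier is an assumption standing in for whatever class owns this fit method:

# Hypothetical usage; KernelClassifier is a placeholder class name.
import numpy as np
from sklearn.preprocessing import LabelBinarizer

X = np.random.randn(500, 20)                                      # 500 samples, 20 features
y = LabelBinarizer().fit_transform(np.random.randint(0, 3, 500))  # one-hot, shape (500, 3)

model = KernelClassifier()   # assumed constructor with default kernel settings
model.fit(X, y, batch_size=100)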
Example 3
def compute_nystrom(use_node_labels, embedding_dim, community_detection_method, kernels):
    graphs_reg, labels_reg, graphs_gen, labels_gen, graphs_mal, labels_mal = load()
    graphs = graphs_reg + graphs_gen + graphs_mal
    labels = np.concatenate((labels_reg, labels_gen, labels_mal), axis=0)
    communities, subgraphs = compute_communities(graphs, use_node_labels, community_detection_method)

    print("Number of communities: ", len(communities))
    lens = []
    for community in communities:
        lens.append(community.number_of_nodes())

    print("Average size: %.2f" % np.mean(lens))
    Q = []
    for idx, k in enumerate(kernels):
        model = Nystrom(k, n_components=embedding_dim)
        model.fit(communities)
        Q_t = model.transform(communities)
        Q_t = np.vstack([np.zeros(embedding_dim), Q_t])
        Q.append(Q_t)

    return Q, subgraphs, labels, Q_t.shape
Example 4
def compute_nystrom(ds_name, pct_data, use_node_labels, embedding_dim, community_detection_method, kernels, seed):
    communities_load_path = "communities_dump_" + ds_name + "_balance_42.pkl"
    nystrom_load_path = "nystrom_dump_" + ds_name + "_balance_42.pkl"

    if os.path.exists(nystrom_load_path):
        print('loading Nystrom results from ', nystrom_load_path)
        return pkl.load(open(nystrom_load_path, 'rb'))
    if os.path.exists(communities_load_path):
        print("loading preprocessed communities data from", communities_load_path)
        communities, subgraphs, labels = pkl.load(open(communities_load_path, 'rb'))
    else:
        if ds_name == "SYNTHETIC":
            graphs, labels = generate_synthetic()
        else:
            graphs, labels = load_data(
                dataset=ds_name, pct_data=pct_data, seed=seed)
        communities, subgraphs = compute_communities(
            graphs, use_node_labels, community_detection_method)

    print("Number of communities: ", len(communities))
    print("dumping communities to", communities_load_path)
    lens = []
    for community in communities:
        lens.append(community.number_of_nodes())

    print("Average size: %.2f" % np.mean(lens))
    sys.stdout.flush()
    Q = []

    for idx, k in enumerate(kernels):
        model = Nystrom(k, n_components=embedding_dim)
        model.fit(communities)
        Q_t = model.transform(communities)
        Q_t = np.vstack([np.zeros(embedding_dim), Q_t])
        Q.append(Q_t)

    print("Dumping Nystrom output to", nystrom_load_path)
    pkl.dump((Q, subgraphs, labels, Q_t.shape), open(nystrom_load_path, 'wb'))
    return Q, subgraphs, labels, Q_t.shape
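The two pickle caches above follow the same load-or-compute-then-dump pattern; a minimal generic sketch of that pattern with context managers (a stylistic alternative, not part of the original code):

# Generic load-or-compute pickle cache, written with explicit file closing.
import os
import pickle as pkl

def cached(path, compute):
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pkl.load(f)
    result = compute()
    with open(path, 'wb') as f:
        pkl.dump(result, f)
    return result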
Example 5

    # Fragment of a larger training routine: run minibatch SGD on a kernel
    # objective with a step-decayed learning rate, then fit a classifier on
    # the optimized Nystrom mapping.
    lr_t = learning_rate
    min_avg_loss = np.inf
    avg_obj = 0.

    for t in range(1, n_epochs + 1):

        for batch_num in range(1, len(Xtr) // batch_size + 1):
            Xb, yb = get_next_batch(batch_size)
            di_obj, _ = sess.run([objective, train_step],
                                 feed_dict={
                                     x: Xb,
                                     y: yb,
                                     lr: lr_t
                                 })
            avg_obj = ((batch_num - 1) * avg_obj + di_obj) / batch_num

        print('Epoch %d with average KDI %f' % (t, avg_obj))
        if t % lr_epochs == 0:
            lr_t = lr_t * 0.1

    print('Fitting a predictor on the optimized Nystrom mapping.')
    kmap = Nystrom(kernel='rbf', gamma=gamma, n=n)
    kmap.fit(sess.run(kernel_func.X_rep))
    sess.close()

    clf = TFClassifier(loss='logistic', alpha=0.)
    clf.fit(kmap.transform(Xtr), ytr)
    accuracy = clf.score(kmap.transform(Xte), yte)

    print('Accuracy for feature dimensionality %d: %f' % (n, accuracy))
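Design note: the final Nystrom map is re-fit on the optimized representative points (kernel_func.X_rep pulled out of the TensorFlow session) rather than directly on Xtr, so the downstream classifier operates in the learned feature space.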
Example 6

def mpgk_aa(Gs, h, n_clusters, limit, use_node_labels):
    N = len(Gs)
    # d: dimensionality of the node feature vectors (labels or attributes).
    if use_node_labels:
        d = Gs[0].node[list(Gs[0].nodes())[0]]['label'].size
    else:
        d = Gs[0].node[list(Gs[0].nodes())[0]]['attributes'].size

    # idx[i] holds the global offset of graph i's first node, so that the
    # nodes of all graphs share a single contiguous index space.
    idx = np.zeros(N + 1, dtype=np.int64)
    nbrs = dict()
    ndata = []
    for i in range(N):
        n = Gs[i].number_of_nodes()
        idx[i + 1] = idx[i] + n

        nodes = list(Gs[i].nodes())
        M = np.zeros((n, d))
        nodes2idx = dict()
        for j in range(idx[i], idx[i + 1]):
            if use_node_labels:
                M[j - idx[i], :] = Gs[i].node[nodes[j - idx[i]]]['label']
            else:
                M[j - idx[i], :] = Gs[i].node[nodes[j - idx[i]]]['attributes']
            nodes2idx[nodes[j - idx[i]]] = j

        ndata.append(M)

        for node in nodes:
            nbrs[nodes2idx[node]] = list()
            for neighbor in Gs[i].neighbors(node):
                nbrs[nodes2idx[node]].append(nodes2idx[neighbor])

    graph_hists = list()
    X = np.vstack(ndata)

    for it in range(1, h + 1):
        print("Iteration:", it)
        hists, nbrs_hists = compute_histograms(X, nbrs, n_clusters, limit)
        # Re-embed every node: 150 Nystrom features from its own histogram
        # plus 50 from its neighborhood histogram, concatenated to 200 dims.
        X = np.zeros((X.shape[0], 200))

        ny = Nystrom(n_components=150)
        ny.fit(hists)
        X[:, :150] = ny.transform(hists)

        ny = Nystrom(n_components=50)
        ny.fit(nbrs_hists)
        X[:, 150:] = ny.transform(nbrs_hists)

        graph_hists.append(list())
        for i in range(N):
            d = dict()
            for j in range(idx[i], idx[i + 1]):
                for n in hists[j]:
                    if n in d:
                        d[n] += hists[j][n]
                    else:
                        d[n] = hists[j][n]
            graph_hists[it - 1].append(d)

    # Accumulate a histogram-intersection kernel between graphs over all
    # h iterations.
    K = np.zeros((N, N))
    for it in range(h):
        for i in range(N):
            for j in range(i, N):
                for n in graph_hists[it][i]:
                    if n in graph_hists[it][j]:
                        K[i, j] += min(graph_hists[it][i][n],
                                       graph_hists[it][j][n])
                K[j, i] = K[i, j]

    return K
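A hedged invocation sketch for the kernel above, assuming a networkx version where G.node indexing exists (as the function itself uses) and that compute_histograms and Nystrom are importable from the same module; all parameter values are placeholders:

# Illustrative only: graph construction and parameter values are assumptions.
import networkx as nx
import numpy as np

G1 = nx.path_graph(4)
G2 = nx.cycle_graph(5)
for G in (G1, G2):
    for v in G.nodes():
        G.node[v]['label'] = np.ones(3)  # 3-dim node label vectors

K = mpgk_aa([G1, G2], h=2, n_clusters=10, limit=5, use_node_labels=True)
print(K.shape)  # (2, 2)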