Example #1
    def build_graph_from_matrix(self,
                                x,
                                is_directed=True,
                                remove_self_loops=False,
                                normalized_edge=True,
                                outward_prob_check=False):
        # The matrix x encodes the node-to-node relationships (the adjacency matrix).
        logger.info("Building graph.")
        self.edges = x  # for memmap compatibility
        if not issparse(self.edges):
            logger.info("Transforming into scipy sparse matrix")
            self.edges = csr_matrix(self.edges)

        # Iterate through matrix to get directed node -> node relationship
        logger.info("Build relation matrix (self.edges)")
        row_idxs, col_idxs = csr_matrix.nonzero(self.edges)
        assert len(row_idxs) == len(col_idxs)
        for i in range(len(row_idxs)):
            self.nodes[row_idxs[i]].append(col_idxs[i])

        self.make_consistent()

        if remove_self_loops:
            self.remove_self_loops()

        if not is_directed:
            self.make_bidirection()

        if normalized_edge:
            self.normalize_edges()

        self.check_valid(outward_prob_check=outward_prob_check)
Example #2
def _apply_cfidf(csr_matrix):
    num_docs, num_concepts = csr_matrix.shape
    _, nz_concept_idx = csr_matrix.nonzero()
    cf = np.bincount(nz_concept_idx, minlength=num_concepts)
    icf = np.log(num_docs / cf)
    icf[np.isinf(icf)] = 0
    return safe_sparse_dot(csr_matrix, scipy.sparse.diags(icf))
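A quick way to exercise _apply_cfidf on a toy concept-count matrix (an illustrative sketch, not part of the original; the snippet above is taken to have numpy as np, scipy.sparse and sklearn's safe_sparse_dot in scope):

import numpy as np
import scipy.sparse
from sklearn.utils.extmath import safe_sparse_dot

# 3 documents x 4 concepts; the last concept never occurs, which exercises the inf guard
counts = scipy.sparse.csr_matrix(np.array([[1, 0, 2, 0],
                                           [0, 1, 1, 0],
                                           [3, 0, 0, 0]]))
weighted = _apply_cfidf(counts)
print(weighted.toarray())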
Example #3
def csr_to_symmetric(csr_matrix):
    """
    This is suuuuupper slow
    :param csr_matrix:
    :return:
    """
    rows, cols = csr_matrix.nonzero()
    csr_matrix[cols, rows] = csr_matrix[rows, cols]
    return csr_matrix
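A small sanity check (illustrative sketch; the toy matrix is an assumption):

import numpy as np
from scipy.sparse import csr_matrix

A = csr_matrix(np.array([[0., 2., 0.],
                         [0., 0., 0.],
                         [5., 0., 0.]]))
# the stored entries are mirrored, so (1, 0) and (0, 2) get filled in as well
print(csr_to_symmetric(A).toarray())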
Example #4
def save_sparse_file(csr_matrix, filename):
    # Write the matrix as 1-indexed "row col value" triplets, one per line.
    # Note: this assumes no explicitly stored zeros, so data aligns with nonzero().
    data = csr_matrix.data
    rows, cols = csr_matrix.nonzero()
    with open(filename, 'w') as f:
        for i in range(len(data)):
            f.write(str(rows[i] + 1) + ' ')
            f.write(str(cols[i] + 1) + ' ')
            f.write(str(data[i]) + '\n')
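Example usage (a sketch; the matrix and output file name are illustrative assumptions):

import numpy as np
from scipy.sparse import csr_matrix

M = csr_matrix(np.array([[0.0, 1.5],
                         [2.0, 0.0]]))
save_sparse_file(M, 'matrix_triplets.txt')  # writes the lines "1 2 1.5" and "2 1 2.0"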
Example #5
def tpe(pc_xyz_noise, pc_weight_unit, pc_weight_sym):
    pc_smooth_plane = np.zeros_like(pc_xyz_noise)
    pc_smooth_patch = np.zeros_like(pc_xyz_noise)
    pc_avg_plane = pc_weight_unit @ pc_xyz_noise
    num_data = pc_xyz_noise.shape[0]
    normal_vec = np.zeros((num_data, 3))
    inter = np.zeros((num_data, 1))

    for ii in range(num_data):
        pc_neigh_plane_ii = pc_xyz_noise[csr_matrix.nonzero(pc_weight_unit[ii])[-1]].T
        pc_neigh_patch_ii = pc_xyz_noise[csr_matrix.nonzero(pc_weight_sym[ii])[-1]].T
        pc_neigh_mat_ii_tr = np.zeros_like(pc_neigh_plane_ii)[None, :, :]
        pc_neigh_mat_ii_tr[0, :, :] = pc_neigh_plane_ii
        pc_neigh_mat_ii = np.transpose(pc_neigh_mat_ii_tr, [1, 0, 2])
        pc_avg_ii = pc_avg_plane[ii][:, None]
        weight_ii = pc_weight_unit[ii] # sum to 1
        weight_ii = np.asarray(weight_ii[:, csr_matrix.nonzero(pc_weight_unit[ii])[-1]].todense())
        weight_mat_ii = np.tile(weight_ii, (3, 3, 1))
        M_ii = np.sum(pc_neigh_mat_ii * pc_neigh_mat_ii_tr * weight_mat_ii, axis=-1) - pc_avg_ii @ pc_avg_ii.T # k: # of xyz(3), m:1, n: # of nonzero
        D,V = np.linalg.eig(M_ii) # D: eigenvalue V: eigenvector
        D_sort = np.sort(D)[::-1]
        index = np.argsort(D)[::-1]
        
        V_sort = V[:,index]
        n_ii = V_sort[:, -1] # min eigenvector (normal vector)
        c_ii = pc_avg_ii.T @ n_ii # intercept
        normal_vec[ii , :] = n_ii
        inter[ii , :] = c_ii
        pc_neigh_plane_proj_ii = pc_xyz_noise[ii , :].T - (n_ii @ pc_xyz_noise[ii , :] - c_ii) * n_ii
        pc_neigh_patch_proj_ii = pc_neigh_patch_ii - n_ii[:, None] * np.tile((n_ii @ pc_neigh_patch_ii - np.tile(c_ii, (1 , pc_neigh_patch_ii.shape[-1]))), (3 , 1))
        weight_patch = np.asarray(pc_weight_sym[ii, csr_matrix.nonzero(pc_weight_sym[ii])[-1]].todense())
        pc_smooth_plane[ii] = pc_neigh_plane_proj_ii.T
        pc_smooth_patch[csr_matrix.nonzero(pc_weight_sym[ii])[-1], : ] = pc_smooth_patch[csr_matrix.nonzero(pc_weight_sym[ii])[-1], : ] + (weight_patch * pc_neigh_patch_proj_ii).T
    pc_smooth_patch = pc_smooth_patch/(np.sum(pc_weight_sym, axis=1))

    pc_xyz_denoise_wmp = np.array(pc_smooth_plane * 0.6 + pc_smooth_patch * 0.4)
    return pc_xyz_denoise_wmp, normal_vec, inter
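One possible way to drive tpe (an illustrative sketch; building the neighbourhoods with kneighbors_graph, the 8-neighbour size and the random point cloud are assumptions — the function itself only requires that the rows of pc_weight_unit sum to 1 and that pc_weight_sym is a symmetric sparse weight matrix):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import normalize

pts = np.random.rand(200, 3)  # noisy point cloud
conn = kneighbors_graph(pts, n_neighbors=8,
                        mode='connectivity', include_self=False)
pc_weight_unit = csr_matrix(normalize(conn, norm='l1', axis=1))  # rows sum to 1
pc_weight_sym = csr_matrix(conn.maximum(conn.T))                 # symmetric 0/1 weights
denoised, normals, intercepts = tpe(pts, pc_weight_unit, pc_weight_sym)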
Example #6
def csr_to_graph(csr_matrix):
    graph = dict()
    fathers, sons = csr_matrix.nonzero()
    for father, son in zip(fathers, sons):
        if father not in graph:
            graph[father] = set([son])
        else:
            graph[father].add(son)
    # Add the reverse edges as well (note the deliberately swapped unpacking order below)
    for son, father in zip(fathers, sons):
        if father not in graph:
            graph[father] = set([son])
        else:
            graph[father].add(son)
    return graph
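A quick check on a tiny directed graph (illustrative sketch; the adjacency matrix is an assumption):

import numpy as np
from scipy.sparse import csr_matrix

adj = csr_matrix(np.array([[0, 1, 0],
                           [0, 0, 1],
                           [0, 0, 0]]))
# edges 0 -> 1 and 1 -> 2 plus their reverses: {0: {1}, 1: {0, 2}, 2: {1}}
print(csr_to_graph(adj))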
Example #7
def shrink_corpus(corpus, length):
    TfIdf_info = TfIdf(corpus)
    scores = TfIdf_info['scores']
    column_to_vocab = TfIdf_info['column_to_vocab']
    new_corpus = []
    for i in tqdm.tqdm(range(0, len(corpus))):
        # obtain nonzero columns:
        row = scores.getrow(i)  # use getrow since scores is csr matrix
        indices = csr_matrix.nonzero(row)[1]
        # obtain the Tfidf-score of each word-index
        entries = [scores[i, j] for j in indices]
        # sort those scores and keep only the top ones
        top_entries = np.argsort(entries)[-length:]
        # get the corresponding column-index for each score
        top_word_indices = [indices[j] for j in top_entries]
        # get the corresponding word for each column index:
        top_words = [column_to_vocab[j] for j in top_word_indices]
        # shrink the comment, keeping only its top-scoring words
        shrunken_comment = [word for word in corpus[i] if word in top_words]
        new_corpus.append(shrunken_comment)
    return new_corpus
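shrink_corpus relies on a TfIdf helper that is not shown here. The sketch below uses a hypothetical stand-in built on sklearn's TfidfVectorizer, assuming only what the code above requires: a dict with a (documents x vocabulary) CSR 'scores' matrix and a 'column_to_vocab' mapping (numpy, tqdm and csr_matrix are taken to be imported as in the original module):

from sklearn.feature_extraction.text import TfidfVectorizer

def TfIdf(corpus):
    # hypothetical stand-in for the original helper
    vec = TfidfVectorizer(analyzer=lambda doc: doc)  # corpus entries are token lists
    scores = vec.fit_transform(corpus)
    column_to_vocab = {j: w for w, j in vec.vocabulary_.items()}
    return {'scores': scores, 'column_to_vocab': column_to_vocab}

small_corpus = [["good", "movie", "good"], ["bad", "plot", "bad", "movie"]]
print(shrink_corpus(small_corpus, length=1))  # keeps only the top-scoring word per comment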
Example #8
def wmp(pc_xyz_noise, pc_weight_unit, normal_vec, inter, lmd=2/3, eta=0.05):
    num_data = pc_xyz_noise.shape[0]
    pc_xyz_denoise = copy.deepcopy(pc_xyz_noise)

    for ii in range(num_data):
        pc_neigh_ii = csr_matrix.nonzero(pc_weight_unit[ii])[-1]
        mu = 0
        t = np.zeros(3)
        num_neigh = len(pc_neigh_ii)
        eta = np.log10(num_neigh)/num_neigh # might be changed
        sum_prod = 0
        for jj in pc_neigh_ii:
            proj = pc_xyz_denoise[ii] - (normal_vec[ii] @ pc_xyz_denoise[ii] - inter[ii]) * normal_vec[ii]  # project the point onto the fitted plane, argmin_t ||p - t||_2^2
            # t_prev = t
            t = 1 / (lmd*pc_weight_unit[ii, jj]+eta) * (lmd*pc_weight_unit[ii, jj] * pc_xyz_denoise[ii]
                + eta * (proj - mu/eta))
            
            # print((lmd*pc_weight_unit[ii, jj]+eta))
            # print(lmd*pc_weight_unit[ii, jj] * pc_xyz_noise[ii])
            # print(eta * (proj - mu/eta))
            mu += 2 * eta * (t - proj)
            sum_prod += pc_weight_unit[ii, jj] * t
        pc_xyz_denoise[ii] = 1 / (1+lmd) * (pc_xyz_noise[ii] + lmd * sum_prod)
    return pc_xyz_denoise
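Continuing the illustrative sketch from Example #5 above, the outputs of tpe feed straight into wmp:

# pts, pc_weight_unit, normals and intercepts come from the tpe sketch above
smoothed = wmp(pts, pc_weight_unit, normals, intercepts)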
Example #9
def get_node_neighbor_dict(adj, N):
    node_neighbors_dict = {}
    for i in range(N):
        node = adj[i]
        node_neighbors_dict[i] = csr_matrix.nonzero(node)[1]
    return node_neighbors_dict
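A quick check (illustrative sketch; the adjacency matrix is an assumption):

import numpy as np
from scipy.sparse import csr_matrix

adj = csr_matrix(np.array([[0, 1, 1],
                           [1, 0, 0],
                           [1, 0, 0]]))
# {0: array([1, 2]), 1: array([0]), 2: array([0])}
print(get_node_neighbor_dict(adj, adj.shape[0]))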
Example #10
def BGCN_model_trial(trials_per_partition, data_partition_seed, trial_index,
                     log_dir):
    def train_gcn_one_epoch(support_graph, epoch):
        # Prepare feed dict for training set
        t = time.time()
        feed_dict_train = construct_feed_dict(features, support_graph, y_train,
                                              train_mask, placeholders)
        feed_dict_train.update({placeholders['dropout']: FLAGS.dropout})

        # Training step
        outs = sess.run([model.opt_op, model.loss, model.accuracy],
                        feed_dict=feed_dict_train)

        # Validation set
        cost_val, acc_val, duration = evaluate(features, support_original,
                                               y_val, val_mask, placeholders)

        # Test set using the sampled graph
        test_cost, test_acc, test_duration = evaluate(features, support_graph,
                                                      y_test, test_mask,
                                                      placeholders)

        # Test set using the original graph
        test_cost_original_graph, test_acc_original_graph, _ = evaluate(
            features, support_original, y_test, test_mask, placeholders)

        # get the softmax output using the sampled graph
        feed_dict_val = construct_feed_dict(features, support_graph, y_test,
                                            test_mask, placeholders)
        feed_dict_val.update({placeholders['dropout']: 0})
        soft_labels_sample_graphs = sess.run(tf.nn.softmax(model.outputs),
                                             feed_dict=feed_dict_val)

        # get the softmax output using the original graph
        feed_dict_OG = construct_feed_dict(features, support_original, y_test,
                                           test_mask, placeholders)
        feed_dict_OG.update({placeholders['dropout']: 0})
        soft_labels_OG_graph = sess.run(tf.nn.softmax(model.outputs),
                                        feed_dict=feed_dict_OG)

        # evaluate cross-entropy loss
        cross_entropy_loss_avg_soft_labels = log_loss(
            labels_value[test_set_index],
            soft_labels_sample_graphs[test_set_index])

        #  Print results
        if epoch % 10 == 9:
            print(
                "==================================================================="
            )
            print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(outs[1]), "train_acc=",
                  "{:.5f}".format(outs[2]), "time=",
                  "{:.5f}".format(time.time() - t))

            print("val_loss=", "{:.5f}".format(cost_val), "val_acc=",
                  "{:.5f}".format(acc_val))

        return cost_val, soft_labels_OG_graph, soft_labels_sample_graphs

    # data_partition_seed fixes the data partition and is also used to generate the list of seeds for the neural-network weight initialization
    np.random.seed(data_partition_seed)
    trials_per_partition = trials_per_partition
    seed_list = np.random.randint(1, 1e6, trials_per_partition)

    for seed in seed_list:
        np.random.seed(data_partition_seed)  #decide the data partition seed
        # ===========================load data========================================
        timestamp = str(datetime.now())[0:10]
        log_file_name = 'trial_index_' + str(trial_index) + '_data_partition_seed_' + str(data_partition_seed) \
                        + '_seed_' + str(seed) + '_' + timestamp + '.txt'
        if FLAGS.save_log:
            sys.stdout = open(log_dir + log_file_name, 'w')

        if not FLAGS.random_data_partition:
            adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels, order = \
                data_partition_fixed(dataset_name=FLAGS.dataset, label_n_per_class=FLAGS.label_per_class_n)
        else:
            adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels, order = \
                data_partition_random(dataset_name=FLAGS.dataset, label_n_per_class=FLAGS.label_per_class_n)

        np.random.seed(seed)  # decide the seed for graph inference
        random.seed(seed)
        tf.set_random_seed(
            seed)  # decide the random seed for neural network initial weights

        print("The index number for this trial is {}".format(trial_index))
        print("The data partition seed for this trial is {}".format(
            data_partition_seed))
        print("The seed number for this trial is {}".format(seed))

        N = len(y_train)

        node_neighbors_dict = {}
        for i in range(N):
            node = adj[i]
            node_neighbors_dict[i] = csr_matrix.nonzero(node)[1]

        labels_value = labels.argmax(axis=1)  #get the label value

        test_set_index = np.where(test_mask == True)[0]

        # ===========================================GCNN model set up========================================
        features = preprocess_features(features)
        if FLAGS.model == 'gcn':
            support_original = [preprocess_adj(adj)]
            num_supports = 1
            model_func = GCN
        elif FLAGS.model == 'gcn_cheby':
            support_original = chebyshev_polynomials(adj, FLAGS.max_degree)
            num_supports = 1 + FLAGS.max_degree
            model_func = GCN
        else:
            raise ValueError('Invalid argument for model: ' + str(FLAGS.model))

        # Define placeholders
        placeholders = {
            'support':
            [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
            'features':
            tf.sparse_placeholder(tf.float32,
                                  shape=tf.constant(features[2],
                                                    dtype=tf.int64)),
            'labels':
            tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
            'labels_mask':
            tf.placeholder(tf.int32),
            'dropout':
            tf.placeholder_with_default(0., shape=()),
            'num_features_nonzero':
            tf.placeholder(tf.int32)  # helper variable for sparse dropout
        }

        order = labels_value.argsort()

        model = model_func(placeholders,
                           input_dim=features[2][1],
                           logging=True)
        upper_tri_index = np.triu_indices(N, k=1)

        # =======================================GCNN model initialization==============================================
        # Initialize session

        sess = tf.Session()

        # Define model evaluation function
        def evaluate(features, support, labels, mask, placeholders):
            t_test = time.time()
            feed_dict_val = construct_feed_dict(features, support, labels,
                                                mask, placeholders)
            outs_val = sess.run([model.loss, model.accuracy],
                                feed_dict=feed_dict_val)
            return outs_val[0], outs_val[1], (time.time() - t_test)

        # Init variables
        sess.run(tf.global_variables_initializer())

        cost_val_list = []

        soft_max_sum_OG_graph = 0  # store the softmax output of the GCNN from different weight samples (from original graph)
        soft_max_sum_sample_graph = 0  # store the softmax output of the GCNN from different weight samples (from sample graphs)

        print(
            "===============================Start training the GCNN Model========================"
        )
        for epoch in range(FLAGS.epochs):
            if FLAGS.graph_generation_mode == 'Copying':
                # =======================================GCNN pre train process=====================================
                if epoch < FLAGS.pretrain_n:
                    cost_val, soft_labels_OG_graph, soft_labels_sample_graphs = train_gcn_one_epoch(
                        support_original, epoch)
                    cost_val_list.append(cost_val)

                    obtained_labels = soft_labels_OG_graph.argmax(axis=1)

                if epoch == FLAGS.pretrain_n:

                    sampled_graph = sample_graph_copying(seed,
                                                         node_neighbors_dict,
                                                         obtained_labels,
                                                         order,
                                                         set_seed=True)

                    inferred_graph = csr_matrix(sampled_graph)
                    # pk.dump(MAP_graph, open(os.path.join(log_dir, MAP_graph), 'wb'))

                    if FLAGS.model == 'gcn_cheby':
                        support = chebyshev_polynomials(
                            inferred_graph, FLAGS.max_degree)
                    else:
                        support = [preprocess_adj(inferred_graph)]

                    cost_val, soft_labels_OG_graph, soft_labels_sample_graphs = train_gcn_one_epoch(
                        support, epoch)
                    cost_val_list.append(cost_val)

                if epoch > FLAGS.pretrain_n:

                    sampled_graph = sample_graph_copying(seed,
                                                         node_neighbors_dict,
                                                         obtained_labels,
                                                         order,
                                                         set_seed=False)

                    inferred_graph = csr_matrix(sampled_graph)
                    # pk.dump(MAP_graph, open(os.path.join(log_dir, MAP_graph), 'wb'))

                    if FLAGS.model == 'gcn_cheby':
                        support = chebyshev_polynomials(
                            inferred_graph, FLAGS.max_degree)
                    else:
                        support = [preprocess_adj(inferred_graph)]

                    cost_val, soft_labels_OG_graph, soft_labels_sample_graphs = train_gcn_one_epoch(
                        support, epoch)

                    # ===========save the softmax output from different weight samples=================
                    if epoch > FLAGS.epoch_to_start_collect_weights and epoch % FLAGS.weight_sample_interval == 0:

                        soft_max_sum_OG_graph += soft_labels_OG_graph
                        hard_label_OG_graph = soft_max_sum_OG_graph.argmax(
                            axis=1)
                        acc_OG_graph = accuracy_score(
                            labels_value[test_set_index],
                            hard_label_OG_graph[test_set_index])

                        soft_max_sum_sample_graph += soft_labels_sample_graphs
                        hard_label_sample_graph = soft_max_sum_sample_graph.argmax(
                            axis=1)
                        acc_sample_graph = accuracy_score(
                            labels_value[test_set_index],
                            hard_label_sample_graph[test_set_index])

                        obtained_labels = hard_label_sample_graph

                        if epoch % 10 == 9:
                            print(
                                "============= weight sampling results at iteration {}=========="
                                .format(epoch + 1))
                            print(
                                "The accuracy from avg weight sampling using the original graph is {}"
                                .format(acc_OG_graph))
                            print(
                                "The accuracy from avg weight sampling using the sample graph is {}"
                                .format(acc_sample_graph))

                    cost_val_list.append(cost_val)
            elif FLAGS.graph_generation_mode == 'None':
                cost_val, soft_labels_OG_graph, soft_labels_sample_graphs = train_gcn_one_epoch(
                    support_original, epoch)
                cost_val_list.append(cost_val)

                if epoch > FLAGS.epoch_to_start_collect_weights and epoch % FLAGS.weight_sample_interval == 0:

                    soft_max_sum_OG_graph += soft_labels_OG_graph
                    hard_label_OG_graph = soft_max_sum_OG_graph.argmax(axis=1)
                    acc_OG_graph = accuracy_score(
                        labels_value[test_set_index],
                        hard_label_OG_graph[test_set_index])

                    if epoch % 10 == 9:
                        print(
                            "========= weight sampling results at iteration {}========"
                            .format(epoch + 1))
                        print(
                            "The accuracy from avg weight sampling using the original graph is {}"
                            .format(acc_OG_graph))

            else:
                raise ValueError('Invalid argument for model: ' +
                                 str(FLAGS.graph_generation_mode))

        print("Optimization Finished!")
        print(
            "===============================Start evaluate the final model performance ============================="
        )

        if FLAGS.graph_generation_mode != 'None':
            softmax_log_file_name = 'BCGN_softmax_trial_index_' + str(
                trial_index) + '_data_partition_seed_' + str(
                    data_partition_seed) + '_seed_' + str(
                        seed) + '_' + timestamp + '.pk'
            pk.dump(soft_max_sum_OG_graph,
                    open(os.path.join(log_dir, softmax_log_file_name), 'wb'))
            # Test set using the sampled graph
            test_cost, test_acc, test_duration = evaluate(
                features, support, y_test, test_mask, placeholders)

            # Test set using the original graph
            test_cost_original_graph, test_acc_original_graph, _ = evaluate(
                features, support_original, y_test, test_mask, placeholders)
            print("Model: Bayesian GCNN")
            print(
                "============1) final result using the weights at the last iteration (OG graph)========="
            )
            print("The accuracy from the original graph is {}".format(
                test_acc_original_graph))

            print(
                "============1) final result using the weights at the last iteration (Sample graph)========="
            )
            print("The accuracy from the sampled graph is {}".format(test_acc))

            print(
                "============2) final result for weight samping and graph sampling========="
            )
            print(
                "The accuracy from avg weight sampling using the original graph is {}"
                .format(acc_OG_graph))
            print(
                "The accuracy from avg weight sampling using the sample graph is {}"
                .format(acc_sample_graph))

        else:
            softmax_log_file_name = 'Vanilla_softmax_trial_index_' + str(
                trial_index) + '_data_partition_seed_' + str(
                    data_partition_seed) + '_seed_' + str(
                        seed) + '_' + timestamp + '.pk'
            pk.dump(soft_max_sum_OG_graph,
                    open(os.path.join(log_dir, softmax_log_file_name), 'wb'))
            test_cost_original_graph, test_acc_original_graph, _ = evaluate(
                features, support_original, y_test, test_mask, placeholders)
            print("Model: Vanilla GCNN")
            print(
                "============1) final result using the weight at the last iteration========="
            )
            print("The accuracy from original graph is {}".format(
                test_acc_original_graph))
            print("============2) final result for weight samping===========")
            print(
                "The accuracy from avg weight sampling using the original graph is {}"
                .format(acc_OG_graph))

        sess.close()
        tf.reset_default_graph()
    return 0