Example #1
0
def test_edge_index_to_vector_and_vice_versa():
    # Build every ordered pair (u, v) of a fully-connected graph on 10 nodes.
    num_nodes = 10
    src = torch.arange(num_nodes).repeat_interleave(num_nodes)
    dst = torch.arange(num_nodes).repeat(num_nodes)
    edge_index = torch.stack([src, dst], dim=0)

    # Bipartite view: all N * N ordered pairs map onto a contiguous vector.
    vec, population = edge_index_to_vector(edge_index, (num_nodes, num_nodes),
                                           bipartite=True)
    assert population == num_nodes * num_nodes
    assert vec.tolist() == list(range(population))
    recovered = vector_to_edge_index(vec, (num_nodes, num_nodes),
                                     bipartite=True)
    assert is_undirected(recovered)
    assert edge_index.tolist() == recovered.tolist()

    # Non-bipartite view: self-loops are excluded from the population.
    vec, population = edge_index_to_vector(edge_index, (num_nodes, num_nodes),
                                           bipartite=False)
    assert population == num_nodes * num_nodes - num_nodes
    assert vec.tolist() == list(range(population))
    loop_free = edge_index[0] != edge_index[1]  # Remove self-loops.
    recovered = vector_to_edge_index(vec, (num_nodes, num_nodes),
                                     bipartite=False)
    assert is_undirected(recovered)
    assert edge_index[:, loop_free].tolist() == recovered.tolist()

    # Forced-undirected view: only one direction per pair, no self-loops.
    vec, population = edge_index_to_vector(edge_index, (num_nodes, num_nodes),
                                           bipartite=False,
                                           force_undirected=True)
    assert population == (num_nodes * (num_nodes + 1)) / 2 - num_nodes
    assert vec.tolist() == list(range(population))
    loop_free = edge_index[0] != edge_index[1]  # Remove self-loops.
    recovered = vector_to_edge_index(vec, (num_nodes, num_nodes),
                                     bipartite=False,
                                     force_undirected=True)
    assert is_undirected(recovered)
    assert edge_index[:, loop_free].tolist() == \
        to_undirected(recovered).tolist()
def test_is_undirected():
    # Every (u, v) has a matching (v, u); the self-loop (0, 0) is fine too.
    symmetric = torch.stack(
        [torch.tensor([0, 1, 0]), torch.tensor([1, 0, 0])], dim=0)
    assert is_undirected(symmetric)

    # Edge (1, 2) has no reverse counterpart, so the graph is directed.
    asymmetric = torch.stack(
        [torch.tensor([0, 1, 1]), torch.tensor([1, 0, 2])], dim=0)
    assert not is_undirected(asymmetric)
Example #3
0
    def diffusion_matrix_exact(self, edge_index, edge_weight, num_nodes,
                               method, **kwargs):
        r"""Compute the dense diffusion matrix of a sparse graph.

        These exact variants densify the adjacency matrix and take either
        its inverse or its matrix exponential, so they do not scale to
        large graphs.

        Args:
            edge_index (LongTensor): The edge indices.
            edge_weight (Tensor): One-dimensional edge weights.
            num_nodes (int): Number of nodes.
            method (str): Diffusion method:
                1. :obj:`"ppr"`: Personalized PageRank diffusion.
                   Additionally expects the parameter:
                   - **alpha** (*float*) - Return probability in PPR,
                     commonly in :obj:`[0.05, 0.2]`.
                2. :obj:`"heat"`: Heat kernel diffusion.
                   Additionally expects the parameter:
                   - **t** (*float*) - Diffusion time, commonly in
                     :obj:`[2, 10]`.
                3. :obj:`"coeff"`: Freely chosen diffusion coefficients.
                   Additionally expects the parameter:
                   - **coeffs** (*List[float]*) - Coefficients
                     :obj:`theta_k` for each power of the transition
                     matrix (starting at :obj:`0`).
        :rtype: (:class:`Tensor`)
        """
        if method == 'ppr':
            # alpha (I_n + (alpha - 1) A)^-1
            scaled_weight = (kwargs['alpha'] - 1) * edge_weight
            edge_index, scaled_weight = add_self_loops(edge_index,
                                                       scaled_weight,
                                                       fill_value=1,
                                                       num_nodes=num_nodes)
            dense = to_dense_adj(edge_index, edge_attr=scaled_weight).squeeze()
            return kwargs['alpha'] * torch.inverse(dense)

        if method == 'heat':
            # exp(t (A - I_n))
            edge_index, shifted_weight = add_self_loops(edge_index,
                                                        edge_weight,
                                                        fill_value=-1,
                                                        num_nodes=num_nodes)
            shifted_weight = kwargs['t'] * shifted_weight
            dense = to_dense_adj(edge_index,
                                 edge_attr=shifted_weight).squeeze()
            # A symmetric matrix allows a cheaper/stabler matrix exponential.
            symmetric = is_undirected(edge_index, shifted_weight, num_nodes)
            return self.__expm__(dense, symmetric)

        if method == 'coeff':
            # sum_k theta_k T^k, accumulated power by power.
            adj = to_dense_adj(edge_index, edge_attr=edge_weight).squeeze()
            power = torch.eye(num_nodes, device=edge_index.device)
            result = kwargs['coeffs'][0] * power
            for theta in kwargs['coeffs'][1:]:
                power = power @ adj
                result += theta * power
            return result

        raise ValueError('Exact GDC diffusion {} unknown.'.format(method))
Example #4
0
    def __getitem__(self, item) -> torch.Tensor:
        """Fetch one datum and attach train/val/test edge-prediction fields.

        :param item: index forwarded to the parent ``__getitem__``.
        :return: the datum, augmented with a freshly drawn negative-edge
            training set of shape [2, E * 0.8 * 2] and the fixed val/test
            edge sets and labels.
        """
        datum = super().__getitem__(item)

        # Draw fresh negative training edges from the negative sample pool.
        neg_edges = self._sample_train_neg_edge_index(
            is_undirected(self.test_edge_index))
        pos_edges = self.train_pos_edge_index
        train_edge_index = torch.cat([pos_edges, neg_edges], dim=1)
        train_edge_y = self.get_edge_y(
            train_edge_index.size(1),
            pos_num_or_ratio=pos_edges.size(1),
            device=train_edge_index.device)

        # Attach edge-prediction attributes to the datum.
        datum["train_edge_index"] = train_edge_index
        datum["train_edge_y"] = train_edge_y
        datum["val_edge_index"] = self.val_edge_index
        datum["val_edge_y"] = self.val_edge_y
        datum["test_edge_index"] = self.test_edge_index
        datum["test_edge_y"] = self.test_edge_y
        return datum
Example #5
0
    def norm(edge_index,
             num_nodes,
             edge_weight=None,
             improved=False,
             dtype=None):
        """Add remaining self-loops and compute GCN edge normalization."""
        if edge_weight is None:
            edge_weight = torch.ones((edge_index.size(1), ),
                                     dtype=dtype,
                                     device=edge_index.device)

        # Self-loops get weight 1 (or 2 in the "improved" GCN variant).
        loop_weight = 2.0 if improved else 1.0
        edge_index, edge_weight = add_remaining_self_loops(
            edge_index, edge_weight, loop_weight, num_nodes)

        row, col = edge_index
        if is_undirected(edge_index, num_nodes=num_nodes):
            # Symmetric normalization: D^-1/2 A D^-1/2.
            deg = scatter_add(edge_weight, row, dim=0, dim_size=num_nodes)
            inv_sqrt = deg.pow(-0.5)
            inv_sqrt[inv_sqrt == float('inf')] = 0
            norm = inv_sqrt[row] * edge_weight * inv_sqrt[col]
        else:
            # Directed graph: one-sided D^-1 A normalization along the
            # configured message-flow direction.
            deg_index = (row if cfg.gnn.flow == 'source_to_target' else col)
            deg = scatter_add(edge_weight, deg_index, dim=0,
                              dim_size=num_nodes)
            inv = deg.pow(-1.0)
            inv[inv == float('inf')] = 0
            norm = inv[deg_index] * edge_weight

        return edge_index, norm
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    def dense(index):
        # Boolean 4x4 adjacency matrix built from an edge index.
        out = torch.zeros(4, 4, dtype=torch.bool)
        out[index[0], index[1]] = 1
        return out

    neg_edge_index = negative_sampling(edge_index)
    assert neg_edge_index.size(1) == edge_index.size(1)
    # Sampled edges must be disjoint from the positive ones.
    assert (dense(edge_index) & dense(neg_edge_index)).sum() == 0

    neg_edge_index = negative_sampling(edge_index, num_neg_samples=2)
    assert neg_edge_index.size(1) == 2

    undirected_edge_index = to_undirected(edge_index)
    undirected_neg_edge_index = negative_sampling(undirected_edge_index,
                                                  force_undirected=True)
    assert is_undirected(undirected_neg_edge_index)
    assert undirected_neg_edge_index.size(1) <= undirected_edge_index.size(1)
    # Again: no overlap between positive and sampled negative edges.
    assert (dense(undirected_edge_index)
            & dense(undirected_neg_edge_index)).sum() == 0
    def is_undirected(self) -> bool:
        """Return whether the stored edges form an undirected relation."""
        if self.is_bipartite():  # TODO check for inverse storage.
            return False

        # Prefer a symmetry check on a sparse adjacency, if one is stored.
        for adj in self.values('adj', 'adj_t'):
            return adj.is_symmetric()

        attr = self.edge_attr if 'edge_attr' in self else None
        return is_undirected(self.edge_index, attr, num_nodes=self.size(0))
Example #8
0
    def forward(self, data, return_hidden_feature=False):
        # Runs covalent then non-covalent message passing over the graph in
        # `data`, pools ligand-atom features, and returns the model output
        # (optionally together with intermediate pooled features).

        #import pdb
        #pdb.set_trace()
        # Move the batch to GPU when available.
        if torch.cuda.is_available():
            data.x = data.x.cuda()
            data.edge_attr = data.edge_attr.cuda()
            data.edge_index = data.edge_index.cuda()
            data.batch = data.batch.cuda()

        # make sure that we have undirected graph
        if not is_undirected(data.edge_index):
            data.edge_index = to_undirected(data.edge_index)

        # make sure that nodes can propagate messages to themselves
        if not contains_self_loops(data.edge_index):
            data.edge_index, data.edge_attr = add_self_loops(
                data.edge_index, data.edge_attr.view(-1))

        # covalent_propagation
        # add self loops to enable self propagation
        covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
            data.edge_index, data.edge_attr)
        (
            non_covalent_edge_index,
            non_covalent_edge_attr,
        ) = self.non_covalent_neighbor_threshold(data.edge_index,
                                                 data.edge_attr)

        # covalent_propagation and non_covalent_propagation
        covalent_x = self.covalent_propagation(data.x, covalent_edge_index,
                                               covalent_edge_attr)
        non_covalent_x = self.non_covalent_propagation(
            covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

        # zero out the protein features then do ligand only gather...hacky sure but it gets the job done
        # NOTE(review): this is an alias, not a copy — the in-place zeroing
        # below also mutates `non_covalent_x`, so `avg_non_covalent_x`
        # further down is computed from the zeroed tensor. If the unzeroed
        # features were intended there, clone first — confirm with authors.
        non_covalent_ligand_only_x = non_covalent_x
        # presumably column 14 == -1 marks protein (non-ligand) atoms — verify.
        non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
        pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

        # fully connected and output layers
        if return_hidden_feature or self.always_return_hidden_feature:
            # return prediction and atomistic features (covalent result, non-covalent result, pool result)

            avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
            avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x,
                                               data.batch)

            fc0_x, fc1_x, output_x = self.output(pool_x,
                                                 return_hidden_feature=True)

            return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
        else:
            return self.output(pool_x)
Example #9
0
    def _create_data(self):
        """Build one ``Data`` object per graph row of ``self.file_in`` and
        save each to ``self.save_dir``.

        Rows failing the signal-to-noise filter are skipped (when
        ``self.filter_noise`` is set). The number of saved graphs is
        recorded in ``self.total_processed``.

        Raises:
            RuntimeError: If a produced edge index is not undirected.
        """
        # Include rows that are explicitly provided.
        df = pd.read_json(self.file_in, lines=True)

        counter = 0
        for node_props_x, node_props_y, e_index_1, e_index_2, edge_props, n_scored, sn_val, sn_filter in graph_props_x_(
                df, self.consider_loop_type):
            edge_index = torch.tensor([e_index_1, e_index_2], dtype=torch.long)
            if not is_undirected(edge_index):
                raise RuntimeError(
                    'Directed graph created, should be impossible')

            # Ignore all data that does not meet the signal-to-noise
            # threshold (merged nested ifs into one condition).
            if self.filter_noise and sn_filter != 1:
                continue

            # Optionally fold incoming-edge features into node features.
            if self.nonbond_nodefeature:
                node_nbond_feature = create_node_nbond(node_props_x, e_index_1,
                                                       e_index_2, edge_props)
                x = torch.tensor(node_nbond_feature, dtype=torch.float)
            else:
                x = torch.tensor(node_props_x, dtype=torch.float)
            edge_x = torch.tensor(edge_props, dtype=torch.float)

            # Ground-truth node targets may be absent (e.g. for test data).
            # Idiom fix: `node_props_y is not None` (was `not ... is None`).
            if node_props_y is not None:
                y = torch.tensor(node_props_y, dtype=torch.float)
            else:
                y = None

            # Only the first `n_scored` nodes are scored/predicted.
            train_mask = torch.zeros(x.shape[0], dtype=torch.bool)
            train_mask[:n_scored] = True

            data = Data(x=x,
                        edge_index=edge_index,
                        y=y,
                        edge_attr=edge_x,
                        train_mask=train_mask,
                        sn_val=sn_val)
            torch.save(
                data, '{}/{}_{}.pt'.format(self.save_dir, ALL_DATA_NAME,
                                           counter))
            counter += 1

        self.total_processed = counter
Example #10
0
def test_negative_sampling():
    edge_index = torch.as_tensor([[0, 0, 1, 2], [0, 1, 2, 3]])

    # Default: one negative sample per positive edge.
    neg = negative_sampling(edge_index)
    assert neg.size(1) == edge_index.size(1)
    assert is_negative(edge_index, neg, (4, 4), bipartite=False)

    # Explicit sample count.
    neg = negative_sampling(edge_index, num_neg_samples=2)
    assert neg.size(1) == 2
    assert is_negative(edge_index, neg, (4, 4), bipartite=False)

    # Forced-undirected sampling yields an undirected, disjoint edge set.
    edge_index = to_undirected(edge_index)
    neg = negative_sampling(edge_index, force_undirected=True)
    assert neg.size(1) == edge_index.size(1) - 1
    assert is_undirected(neg)
    assert is_negative(edge_index, neg, (4, 4), bipartite=False)
    def negative_sampling(self, edge_index: Tensor, num_nodes: int,
                          batch: OptTensor = None) -> Tensor:
        """Draw negative edges according to the configured sample ratios."""
        sample_count = int(self.neg_sample_ratio * self.edge_sample_ratio *
                           edge_index.size(1))

        # Symmetrize first, unless the model already assumes an
        # undirected graph or the edges are symmetric already.
        if not self.is_undirected and not is_undirected(
                edge_index, num_nodes=num_nodes):
            edge_index = to_undirected(edge_index, num_nodes=num_nodes)

        if batch is None:
            return negative_sampling(edge_index, num_nodes,
                                     num_neg_samples=sample_count)
        return batched_negative_sampling(edge_index, batch,
                                         num_neg_samples=sample_count)
Example #12
0
def print_statistics(data):
    """Print basic feature/graph/split statistics for a dataset object."""
    print('Original feature size:', data.x.shape[1])
    remove_unique_feature(data)
    print('Current feature size:', data.x.shape[1])

    undirected = is_undirected(data.edge_index)
    edge_direction = 'Undirected' if undirected else 'Directed'
    print('{} graph'.format(edge_direction))
    print('{} graph'.format('Weighted' if is_weighted(data) else 'Unweighted'))
    print('Number of nodes: ', data.x.shape[0])

    # Undirected graphs store each edge twice, so halve the raw count.
    num_edges = data.edge_index.shape[1]
    if undirected:
        num_edges = num_edges // 2
    print('Number of edges: ', num_edges)
    print('Number of classes: {}'.format(int(max(data.y)) + 1))

    for attr, label in (('train_mask', 'Number of training nodes:'),
                        ('val_mask', 'Number of validation nodes:'),
                        ('test_mask', 'Number of test nodes:')):
        if hasattr(data, attr):
            print(label, getattr(data, attr).sum().item())
Example #13
0
    def get_edge_info(self, df):
        """Record edge statistics in ``self.info`` and print them:
        number of edges, weightedness, directedness, weight range.
        """
        self.info['num_edges'] = df.shape[0]

        # The graph is "weighted" when edge weights are not all identical.
        min_weight = df['edge_weight'].min()
        max_weight = df['edge_weight'].max()
        self.info['weighted'] = bool(min_weight != max_weight)

        pairs = df[['src_idx', 'dst_idx']].to_numpy()
        pairs = sorted(pairs, key=lambda edge: edge[0])
        edge_index = torch.tensor(pairs, dtype=torch.long).transpose(0, 1)

        self.info['directed'] = not is_undirected(
            edge_index, num_nodes=self.info['num_nodes'])

        print('Number of Edges:', self.info['num_edges'])
        print('Is Directed Graph:', self.info['directed'])
        print('Is Weighted Graph:', self.info['weighted'])
        print('Max Weight:', max_weight, 'Min Weight:', min_weight)
Example #14
0
def print_stats():
    """Print per-dataset size and structural-property statistics."""
    for name in DATASETS:
        dataset = load_data(name)
        num_graphs = len(dataset)
        avg_nodes = dataset.data.x.size(0) / num_graphs
        avg_edges = dataset.data.edge_index.size(1) / num_graphs
        num_features = dataset.num_features
        num_classes = dataset.num_classes
        print(
            f'{name}\t{num_graphs}\t{avg_nodes}\t{avg_edges}\t{num_features}\t{num_classes}',
            end='\t')

        # Aggregate structural flags over every graph in the dataset.
        undirected, self_loops, isolated_nodes, onehot = True, False, False, True
        for graph in dataset:
            if not is_undirected(graph.edge_index, num_nodes=graph.num_nodes):
                undirected = False
            if contains_self_loops(graph.edge_index):
                self_loops = True
            if contains_isolated_nodes(graph.edge_index,
                                       num_nodes=graph.num_nodes):
                isolated_nodes = True
            # A node row is one-hot iff exactly one entry is positive.
            if ((graph.x > 0).sum(dim=1) != 1).sum() > 0:
                onehot = False
        print(f'{undirected}\t{self_loops}\t{isolated_nodes}\t{onehot}')
Example #15
0
    def generate_pyg_data(self, data):
        """Assemble a PyG ``Data`` object from the raw competition dict.

        Args:
            data (dict): expects keys ``fea_table``, ``edge_file``,
                ``train_indices``, ``test_indices`` and ``train_label``.

        Returns:
            torch_geometric.data.Data: graph with features, labels, edge
            weights and train/valid/test masks. Also initializes
            ``self._sampler`` and several cached index/mask attributes.
        """
        # get x feature table
        x = data['fea_table'].copy()
        df = data['edge_file']
        edges = df[['src_idx', 'dst_idx', 'edge_weight']]

        # get indices first
        train_indices = data['train_indices']
        if self.config.use_valid:
            train_indices, valid_indices = train_test_split(train_indices, test_size=0.2, shuffle=False)

        try:
            if x.shape[1] == 1:        # 0-dimensional feature
                x = x.set_index(keys="node_index")
                x = feat_engineering(
                    x,
                    edges=edges,
                    num_nodes=self.metadata["n_node"].iloc[0]
                )
            else:
                x_feat = x.drop('node_index', axis=1).to_numpy()
                conf_name = self.config.filename.split("/")[-1].split(".")[0]
                is_only_one_zero = not ((x_feat != 0) & (x_feat != 1)).any()
                logger.info("use {} config".format(conf_name))
                logger.info(
                    "feature only contains zero: {}, only one and zero: {}".format((x_feat == 0).all(), is_only_one_zero))

                if conf_name in self.citation_configs:  # Judge whether it is a citation graph
                    if is_only_one_zero:
                        logger.info("Normalize features")
                        normal_feat = feat_row_sum_inv_normalize(x_feat)
                        normal_df = pd.DataFrame(data=normal_feat)
                        normal_df["node_index"] = x["node_index"]
                        x = normal_df

                    pre_feat = prepredict(data, train_indices=train_indices, use_valid=self.config.use_valid, use_ohe=False)
                    x = x.set_index(keys="node_index")
                    x_index = x.index.tolist()
                    lpa_preds, lpa_train_acc = lpa_predict(data, n_class=self._n_class, train_indices=train_indices, use_valid=self.config.use_valid)
                    if not np.isnan(lpa_train_acc) and lpa_train_acc > 0.8:
                        logger.info("Use LPA predicts")
                        x = pd.concat([x, pre_feat, lpa_preds], axis=1).values[x_index]
                    else:
                        x = pd.concat([x, pre_feat], axis=1).values[x_index]
                else:
                    x = x.set_index(keys="node_index")
                    x = feat_engineering(
                        x,
                        edges=edges,
                        num_nodes=self.metadata["n_node"].iloc[0]
                    )
        except Exception as e:
            # Best-effort fallback: use raw (or zero) features when feature
            # engineering fails for any reason.
            logger.error(e)
            if x.shape[1] == 0:
                # BUG FIX: `np.float` was removed in NumPy >= 1.24; the
                # alias pointed at the builtin `float`.
                x = np.zeros((x.shape[0], 64), dtype=float)
            else:
                x = x.to_numpy()

        logger.info("x shape: {}".format(x.shape))
        node_index = torch.tensor(data['fea_table']['node_index'].to_numpy(), dtype=torch.long)
        x = torch.tensor(x, dtype=torch.float)

        # get edge_index, edge_weight
        edges = edges.to_numpy()
        # BUG FIX: `np.int` was removed in NumPy >= 1.24; use builtin `int`.
        edge_index = edges[:, :2].astype(int)
        # transpose from [edge_num, 2] to [2, edge_num] which is required by PyG
        edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
        edge_weight = edges[:, 2]
        edge_weight = torch.tensor(edge_weight, dtype=torch.float32)

        undirected = gtils.is_undirected(edge_index)

        edge_index, edge_weight = gtils.sort_edge_index(edge_index, edge_weight)
        logger.info(f"is undirected ? {undirected}")
        logger.info(f"edge index {edge_index.shape}, edge weight {edge_weight.shape}")

        # get train/test mask
        num_nodes = x.size(0)
        self._num_nodes = num_nodes
        y = torch.zeros(num_nodes, dtype=torch.long)
        inds = data['train_label'][['node_index']].to_numpy()
        train_y = data['train_label'][['label']].to_numpy()
        self.y_train = train_y
        y[inds] = torch.tensor(train_y, dtype=torch.long)

        self._origin_graph_data_indices = copy.deepcopy(data['train_indices'])
        if self.config.use_valid:
            # Restrict labels to the reduced (non-validation) train split.
            self.y_train = data['train_label'].set_index('node_index').loc[train_indices][['label']].to_numpy()
        test_indices = data['test_indices']

        data = Data(x=x, node_index=node_index, edge_index=edge_index, y=y, edge_weight=edge_weight)

        data.num_nodes = num_nodes

        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[train_indices] = 1
        data.train_indices = np.asarray(train_indices)
        data.train_mask = train_mask
        self._train_indices = np.asarray(train_indices)
        self._train_mask = train_mask

        if self.config.use_valid:
            valid_mask = torch.zeros(num_nodes, dtype=torch.bool)
            valid_mask[valid_indices] = 1
            data.valid_indices = valid_indices
            data.valid_mask = valid_mask
            self._valid_indices = valid_indices
            self._valid_mask = valid_mask

        # BUG FIX: `np.bool` was removed in NumPy >= 1.24; use builtin `bool`.
        self._test_mask = np.zeros(num_nodes, dtype=bool)
        self._test_mask[test_indices] = True
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask[test_indices] = 1
        data.test_mask = test_mask
        data.test_indices = np.asarray(test_indices)

        self._sampler = Sampler(data, self.metadata["n_edge"].iloc[0], self.device)

        return data
Example #16
0
# Labels for the AC task.
ac_label = np.array(hkl.load(ac_label_fp))

# Binary feature matrix loaded from each gene file
# (idiom fix: comprehension instead of manual append loop).
ac_gene_list = [hkl.load(path) for path in ac_gene_fp]


# Loads PPI
ppi_fp = './dataset/onecc_noiso_string_matrix_interactome_data_filtered.hkl'
ppi = hkl.load(ppi_fp)


# generate graph topology and edge_attr: columns 0-1 are the endpoints,
# column 2 is the interaction score, rescaled to [0, 1].
edge_index = torch.from_numpy(ppi[:, 0:2]).transpose(dim0=0, dim1=1)
# Idiom fix: `is True` dropped — comparing to True by identity is fragile;
# a plain truthiness assert is equivalent for a boolean result.
assert is_undirected(edge_index), "ppi graph should be undirected graph"
edge_attr = torch.from_numpy(ppi[:, 2]).to(torch.float32) / 1000.0


# create data with single feature: collect, per sample index, one feature
# vector from every gene file.
ac_data_list = [[] for _ in range(ac_gene_list[0].shape[0])]

for gene_matrix in ac_gene_list:
    for idx, vector in enumerate(gene_matrix):
        ac_data_list[idx].append(torch.from_numpy(vector))


# Stack each sample's vectors and transpose (comprehension, same order).
new_ac_data_list = [
    torch.stack(vectors, dim=0).t() for vectors in ac_data_list
]
    
Example #17
0
    def forward(self,
                x,
                edge_index,
                size=None,
                batch=None,
                neg_edge_index=None,
                attention_edge_index=None):
        """Propagate features and, when training a SuperGAT (or when given
        explicit edge sets), cache attention logits/labels for the
        auxiliary link-prediction loss.

        :param x: [N, F]
        :param edge_index: [2, E]
        :param size: optional (sparse) size for bipartite propagation
        :param batch: None or [B]
        :param neg_edge_index: When using explicitly given negative edges.
        :param attention_edge_index: [2, E'], Use for link prediction
        :return: propagated node features
        """
        # During pretraining, inject noise by randomly dropping edges;
        # preserve undirectedness if the input graph already has it.
        if self.pretraining and self.pretraining_noise_ratio > 0.0:
            edge_index, _ = dropout_adj(
                edge_index,
                p=self.pretraining_noise_ratio,
                force_undirected=is_undirected(edge_index),
                num_nodes=x.size(0),
                training=self.training)

        # Re-add clean self-loops for the non-bipartite tensor case.
        if size is None and torch.is_tensor(x):
            edge_index, _ = remove_self_loops(edge_index)
            edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))

        # [N, F0] * [F0, heads * F] = [N, heads * F]
        if torch.is_tensor(x):
            x = torch.matmul(x, self.weight)
        else:
            # Bipartite input: transform each side independently.
            x = (None if x[0] is None else torch.matmul(x[0], self.weight),
                 None if x[1] is None else torch.matmul(x[1], self.weight))

        propagated = self.propagate(edge_index, size=size, x=x)

        # Attention supervision path: active while training a SuperGAT, or
        # whenever explicit attention/negative edges are supplied.
        if (self.is_super_gat
                and self.training) or (attention_edge_index
                                       is not None) or (neg_edge_index
                                                        is not None):

            device = next(self.parameters()).device
            num_pos_samples = int(self.edge_sample_ratio * edge_index.size(1))
            num_neg_samples = int(self.neg_sample_ratio *
                                  self.edge_sample_ratio * edge_index.size(1))

            # Choose the source of negative edges: explicit attention edges
            # suppress negatives; given negatives are used as-is; otherwise
            # sample (batched when a batch vector is available).
            if attention_edge_index is not None:
                neg_edge_index = None

            elif neg_edge_index is not None:
                pass

            elif batch is None:
                if self.to_undirected_at_neg:
                    edge_index_for_ns = to_undirected(edge_index,
                                                      num_nodes=x.size(0))
                else:
                    edge_index_for_ns = edge_index
                neg_edge_index = negative_sampling(
                    edge_index=edge_index_for_ns,
                    num_nodes=x.size(0),
                    num_neg_samples=num_neg_samples,
                )
            else:
                neg_edge_index = batched_negative_sampling(
                    edge_index=edge_index,
                    batch=batch,
                    num_neg_samples=num_neg_samples,
                )

            # Subsample positive edges when configured to use only a
            # fraction of them.
            if self.edge_sample_ratio < 1.0:
                pos_indices = random.sample(range(edge_index.size(1)),
                                            num_pos_samples)
                pos_indices = torch.tensor(pos_indices).long().to(device)
                pos_edge_index = edge_index[:, pos_indices]
            else:
                pos_edge_index = edge_index

            att_with_negatives = self._get_attention_with_negatives(
                x=x,
                edge_index=pos_edge_index,
                neg_edge_index=neg_edge_index,
                total_edge_index=attention_edge_index,
            )  # [E + neg_E, heads]

            # Labels: 1 for positive edges, 0 for negatives; reuse the
            # cached labels when label caching is enabled.
            if self.training and (self.cache["att_label"] is None
                                  or not self.cache_label):
                att_label = torch.zeros(
                    att_with_negatives.size(0)).float().to(device)
                att_label[:pos_edge_index.size(1)] = 1.
            elif self.training and self.cache["att_label"] is not None:
                att_label = self.cache["att_label"]
            else:
                att_label = None
            self._update_cache("att_label", att_label)
            self._update_cache("att_with_negatives", att_with_negatives)

        return propagated
Example #18
0
def main():
    """Prepare Giant-XRT data for an OGB node-property-prediction dataset:
    save the adjacency label matrix Y, degree-filtered raw text, and
    tf-idf features."""
    parser = argparse.ArgumentParser(description='Prepare data for Giant-XRT')
    parser.add_argument(
        '--raw-text-path',
        type=str,
        required=True,
        help="Path of raw text (.txt file, each raw correspond to a node)")
    parser.add_argument(
        '--vectorizer-config-path',
        type=str,
        required=True,
        help="a path to a json file that specify the tfidf hyper-paramters")
    parser.add_argument('--data-root-dir', type=str, default="./dataset")
    parser.add_argument('--xrt-data-dir', type=str, default="./proc_data_xrt")
    parser.add_argument('--dataset', type=str, default="ogbn-arxiv")
    parser.add_argument('--max-deg', type=int, default=1000)
    args = parser.parse_args()
    print(args)

    # Change args.save_data_dir to args.save_data_dir/args.dataset
    save_data_dir = os.path.join(args.xrt_data_dir, args.dataset)
    dataset = PygNodePropPredDataset(name=args.dataset,
                                     root=args.data_root_dir)
    data = dataset[0]
    edge_index = data.edge_index

    # Make sure edge_index is undirected!!!
    if not is_undirected(edge_index):
        edge_index = to_undirected(edge_index)
    # Keep only nodes whose out-degree is below max-deg.
    Degree = degree(edge_index[0])
    Filtered_idx = torch.where(Degree < args.max_deg)[0]
    print('Number of original nodes:{}'.format(data.x.shape[0]))
    print('Number of filtered nodes:{}'.format(len(Filtered_idx)))

    # Construct and save label matrix (adjacency matrix) Y.
    Y_csr_all = smat.csr_matrix(to_scipy_sparse_matrix(edge_index))
    Y_csr_trn = Y_csr_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/Y.trn.npz", Y_csr_trn)
    smat_util.save_matrix(f"{save_data_dir}/Y.all.npz", Y_csr_all)
    print("Saved Y.trn.npz and Y.all.npz")

    # Apply the same filtering for raw text
    with open(args.raw_text_path, "r") as fin:
        node_text_list = fin.readlines()
    print("|node_text_list={}".format(len(node_text_list)))
    count = 0
    num_kept = len(Filtered_idx)
    with open(f"{save_data_dir}/X.trn.txt", "w") as fout:
        for cur_idx, line in enumerate(node_text_list):
            # BUG FIX: guard the index — once every kept node has been
            # written (count == num_kept), `Filtered_idx[count]` raised
            # IndexError whenever the last node was filtered out.
            if count < num_kept and Filtered_idx[count].item() == cur_idx:
                fout.writelines(line)
                count += 1
    assert count == len(Filtered_idx), "count={}, len(Filtered_idx)={}".format(
        count, len(Filtered_idx))
    print("Saved X.trn.txt")

    # Apply the same filtering for tfidf features
    vectorizer_config = Vectorizer.load_config_from_args(
        args)  # using args.vectorizer_config_path
    preprocessor = Preprocessor.train(node_text_list,
                                      vectorizer_config,
                                      dtype=np.float32)
    preprocessor.save(f"{save_data_dir}/tfidf-model")
    X_tfidf_all = preprocessor.predict(node_text_list)
    X_tfidf_trn = X_tfidf_all[Filtered_idx]
    smat_util.save_matrix(f"{save_data_dir}/X.all.tfidf.npz", X_tfidf_all)
    smat_util.save_matrix(f"{save_data_dir}/X.trn.tfidf.npz", X_tfidf_trn)
    print("Saved X.trn.npz and X.all.npz")
Example #19
0
 def is_undirected(self):
     """Return whether every edge of this graph has a reverse counterpart."""
     # BUG FIX: pass num_nodes by keyword. Elsewhere in this codebase
     # `is_undirected` takes (edge_index, edge_attr, num_nodes), so passing
     # self.num_nodes positionally treated the node count as edge
     # attributes. Confirm against the installed torch_geometric version.
     return is_undirected(self.edge_index, num_nodes=self.num_nodes)
Example #20
0
def graph_data(
    edge_list_path,
    node_features_path,
    protein_ids_path,
    protein_id_col_node="Gene",
    protein_id_col_prot="ensembl.gene",
    sparse_tensor=True,
    cut=0,
):
    """Creates a data object from the given tsv files.

    Parameters
    ----------
    edge_list_path : str
        Path to edge list file. The first two columns -- edges, the rest are
        edge attributes.
    node_features_path : str
        Path to a file with node features.
    protein_ids_path : str
        Protein ids filepath. File should contain an ``id`` column and the
        column named by ``protein_id_col_prot``.
    protein_id_col_node : str, optional
        Column with ids in the node features file, by default "Gene"
    protein_id_col_prot : str, optional
        Column with ids in the protein_ids_file, by default "ensembl.gene"
    sparse_tensor : bool, optional
        If true, a sparse tensor will be constructed instead of edge index and
        edge_weight, by default True
    cut : int, optional
        Edges with values below the cut will be dropped, by default 0

    Returns
    -------
    torch_geometric.data.Data
        Data file with a graph
    """
    a = pd.read_csv(edge_list_path).values
    # Attribute columns are divided by 1000 -- presumably STRING-style scores
    # stored as integers in [0, 1000]; TODO confirm against the data source.
    edge_attr = a[:, 2:] / 1000.0

    # Drop edges whose last attribute does not exceed the cut-off.
    cut_mask = edge_attr[:, -1] > cut
    edge_ind = torch.tensor(a[:, :2][cut_mask], dtype=torch.long)  # (E, 2)
    edge_attr = torch.tensor(edge_attr[cut_mask], dtype=torch.float32)

    # Force undirected by appending the reversed edges (and duplicating
    # their attributes).
    # FIX: `edge_ind` is laid out (E, 2) here, but `is_undirected` expects
    # the (2, E) edge-index layout (note the `.T` when `Data` is built
    # below), so the tensor must be transposed for the check.
    if not is_undirected(edge_ind.t()):
        edge_ind = torch.cat([edge_ind, edge_ind[:, [1, 0]]], 0)
        edge_attr = torch.cat([edge_attr, edge_attr], 0)

    # Node features: align the feature table to the protein id table and
    # z-score each feature column.
    protein_ids = pd.read_csv(protein_ids_path,
                              sep="\t")[["id", protein_id_col_prot]]
    x = pd.read_csv(node_features_path, sep="\t")
    # FIX: use the `columns=` keyword -- the positional `axis` argument of
    # DataFrame.drop was deprecated and removed in pandas 2.0.
    feature_columns = x.drop(columns=protein_id_col_node).columns
    x = pd.merge(
        protein_ids,
        x,
        how="left",
        left_on=protein_id_col_prot,
        right_on=protein_id_col_node,
    ).sort_values("id")[feature_columns]
    # Proteins missing from the feature file get the column mean.
    x.fillna(x.mean(), inplace=True)
    x = torch.tensor(((x - x.mean()) / x.std()).values, dtype=torch.float32)
    data = Data(x, edge_ind.T, edge_attr, id=torch.arange(x.shape[0]))

    if sparse_tensor:
        # Replace (edge_index, edge_attr) with a SparseTensor adjacency.
        tsp = ToSparseTensor(False)
        data = tsp(data)

    return data
Example #21
0
def get_graph_property(graph_property_list,
                       dataset_class,
                       dataset_name,
                       data_root,
                       verbose=True,
                       **kwargs):
    """Compute global graph properties for a dataset.

    Depending on ``graph_property_list`` (``None`` means "all"), computes
    diameter statistics over connected components, the average clustering
    coefficient, degree/closeness centrality statistics and the degree
    assortativity coefficient.

    Parameters
    ----------
    graph_property_list : list or None
        Names of properties to compute; ``None`` computes everything.
    dataset_class, dataset_name, data_root
        Forwarded to ``get_dataset_or_loader`` (seed fixed to 42).
    verbose : bool, optional
        If True, print intermediate results, by default True.

    Returns
    -------
    dict
        Mapping from property name to its computed value.
    """
    _data_attr = get_dataset_or_loader(dataset_class,
                                       dataset_name,
                                       data_root,
                                       seed=42,
                                       **kwargs)
    train_d, val_d, test_d = _data_attr

    if dataset_name in ["PPI", "WebKB4Univ", "CLUSTER"]:
        # Multi-graph datasets: build the disjoint union of all graphs by
        # shifting node indices with the cumulative node count.
        cum_sum = 0
        y_list, edge_index_list = [], []
        for _data in chain(train_d, val_d, test_d):
            y_list.append(_data.y)
            edge_index_list.append(_data.edge_index + cum_sum)
            cum_sum += _data.y.size(0)
        y = torch.cat(y_list, dim=0)
        edge_index = torch.cat(edge_index_list, dim=1)

    else:
        data = train_d[0]
        y, edge_index = data.y, data.edge_index
        y_list, edge_index_list = [y], [edge_index]

    # Convert to networkx, as undirected iff the edge set is symmetric.
    one_nxg = to_networkx(Data(edge_index=edge_index),
                          to_undirected=is_undirected(edge_index))
    # FIX: decide undirectedness per graph from `ei`; the original reused
    # the merged `edge_index` for every element of the list.
    nxg_list = [
        to_networkx(Data(edge_index=ei), to_undirected=is_undirected(ei))
        for ei in edge_index_list
    ]

    # Copies with isolated nodes removed, used for diameter computation.
    ni_nxg_list = [deepcopy(nxg) for nxg in nxg_list]
    for ni_nxg in ni_nxg_list:
        ni_nxg.remove_nodes_from(list(nx.isolates(ni_nxg)))

    gp_dict = {}
    if graph_property_list is None or "diameter" in graph_property_list:
        # Diameter is computed per connected component (it is undefined on
        # disconnected graphs), then aggregated.
        diameter_list = []
        for ni_nxg in ni_nxg_list:
            ni_nxg = ni_nxg.to_undirected()  # important for computing cc.
            for cc in nx.connected_components(ni_nxg):
                ni_nxg_cc = ni_nxg.subgraph(cc).copy()
                diameter_list.append(
                    nx.algorithms.distance_measures.diameter(ni_nxg_cc))
        gp_dict["diameter_mean"] = float(np.mean(diameter_list))
        gp_dict["diameter_std"] = float(np.std(diameter_list))
        gp_dict["diameter_max"] = float(np.max(diameter_list))
        gp_dict["diameter_min"] = float(np.min(diameter_list))
        gp_dict["diameter_n"] = len(diameter_list)

    if graph_property_list is None or "average_clustering_coefficient" in graph_property_list:
        gp_dict["average_clustering_coefficient"] = nx.average_clustering(
            one_nxg)

    # Intermediate progress print (the cheap properties are done first).
    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    if graph_property_list is None or "centrality" in graph_property_list:
        dc = nx.degree_centrality(one_nxg)
        gp_dict["degree_centrality_mean"] = float(np.mean(list(dc.values())))
        gp_dict["degree_centrality_std"] = float(np.std(list(dc.values())))
        cc = nx.closeness_centrality(one_nxg)
        gp_dict["closeness_centrality_mean"] = float(np.mean(list(
            cc.values())))
        gp_dict["closeness_centrality_std"] = float(np.std(list(cc.values())))

    if graph_property_list is None or "assortativity" in graph_property_list:
        gp_dict[
            "degree_assortativity_coefficient"] = nx.degree_assortativity_coefficient(
                one_nxg)

    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    return gp_dict
Example #22
0
def generate_pyg_data(data, n_class, time_budget, use_dim_reduction=True, use_feature_generation=True,
                      use_label_distribution=False, use_node_degree=False, use_node_degree_binary=False,
                      use_node_embed=True, use_one_hot_label=False):
    """Build a PyG ``Data`` object from the raw competition dictionary.

    Performs edge construction, label assembly, feature engineering
    (optional dimensionality reduction and feature generation), and
    train/test mask construction.

    Parameters
    ----------
    data : dict
        Expects keys 'fea_table', 'edge_file', 'train_label',
        'train_indices', 'test_indices'.
    n_class : int
        Number of classes; unlabeled nodes get the out-of-range label
        ``n_class``.
    time_budget : float
        Remaining time budget, forwarded to feature generation.
    use_* : bool
        Feature-engineering switches, forwarded to ``feature_generation``.

    Returns
    -------
    (torch_geometric.data.Data, dict, bool)
        The graph, the bag of intermediate artifacts needed downstream,
        and whether features degenerated to a one-hot node-id encoding.
    """
    other_needed = dict()

    x = data['fea_table']

    df = data['edge_file']
    edges = df[['src_idx', 'dst_idx']].to_numpy()
    weights = df['edge_weight'].to_numpy()

    # FIX: sort edges by source node *together with* their weights. The
    # original sorted only `edge_index`, leaving `edge_weight` in file
    # order, so weights no longer corresponded to their edges. A stable
    # argsort reproduces exactly the edge order `sorted(..., key=d[0])`
    # produced before.
    order = np.argsort(edges[:, 0], kind='stable')
    edge_index = torch.tensor(edges[order], dtype=torch.long).transpose(0, 1)
    edge_weight = torch.tensor(weights[order], dtype=torch.float32)

    num_nodes = x.shape[0]
    # Unlabeled nodes get the sentinel class id `n_class`.
    y = n_class * torch.ones(num_nodes, dtype=torch.long)
    inds = data['train_label'][['node_index']].to_numpy()
    train_y = data['train_label'][['label']].to_numpy()
    y[inds] = torch.tensor(train_y, dtype=torch.long)

    train_indices = data['train_indices']
    test_indices = data['test_indices']

    flag_directed_graph = not is_undirected(edge_index)

    ###   feature engineering  ###
    flag_none_feature = False
    if use_dim_reduction:
        x, flag_none_feature = dim_reduction(x)
    else:
        x = x.to_numpy()
        # A single column means the table only holds the node index.
        flag_none_feature = (x.shape[1] == 1)

    if use_feature_generation:
        other_needed["x"] = x
        other_needed["y"] = y
        other_needed["n_class"] = n_class
        other_needed["edge_index"] = edge_index
        other_needed["edge_weight"] = edge_weight
        other_needed["time_budget"] = time_budget
        other_needed["flag_none_feature"] = flag_none_feature
        other_needed["flag_directed_graph"] = flag_directed_graph
        other_needed["use_label_distribution"] = use_label_distribution
        other_needed["use_node_degree"] = use_node_degree
        other_needed["use_node_degree_binary"] = use_node_degree_binary
        other_needed["use_node_embed"] = use_node_embed
        other_needed["use_one_hot_label"] = use_one_hot_label
        added_features = feature_generation(x, y, n_class, edge_index, edge_weight,
                                            flag_none_feature, flag_directed_graph, time_budget,
                                            use_label_distribution=use_label_distribution,
                                            use_node_degree=use_node_degree,
                                            use_node_degree_binary=use_node_degree_binary,
                                            use_node_embed=use_node_embed,
                                            use_one_hot_label=use_one_hot_label)
        if added_features:
            x = np.concatenate([x]+added_features, axis=1)

    only_one_hot_id = False
    if x.shape[1] != 1:
        # Remove the raw node_index column.
        x = x[:, 1:]
    else:
        # One-hot encoding of node_index (backup plan when there are no
        # real features).
        x = np.eye(num_nodes)
        only_one_hot_id = True

    logger.info('x.shape after feature engineering: {}'.format(x.shape))
    x = torch.tensor(x, dtype=torch.float)

    # Drop zero-weight edges.
    non_zero_index = torch.nonzero(edge_weight).reshape(-1)
    edge_weight = edge_weight[non_zero_index]
    edge_index = edge_index[:, non_zero_index]
    data = Data(x=x, edge_index=edge_index, y=y, edge_weight=edge_weight)

    data.num_nodes = num_nodes
    data.train_indices = train_indices
    data.test_indices = test_indices

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[train_indices] = 1
    data.train_mask = train_mask

    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[test_indices] = 1
    data.test_mask = test_mask

    data["directed"] = flag_directed_graph  # used for directed DGL-GCN

    return data, other_needed, only_one_hot_id
Example #23
0
def test_random_link_split_on_hetero_data():
    """Exercise RandomLinkSplit on a HeteroData graph in three
    configurations: an undirected homogeneous edge type, a directed
    bipartite edge type with a reverse type, and a mixed list of both.
    """
    data = HeteroData()

    # 100 'p' nodes and 200 'a' nodes.
    data['p'].x = torch.arange(100)
    data['a'].x = torch.arange(100, 300)

    # 'p'-'p': 500 random edges, symmetrized; attrs enumerate the edges.
    data['p', 'p'].edge_index = get_edge_index(100, 100, 500)
    data['p', 'p'].edge_index = to_undirected(data['p', 'p'].edge_index)
    data['p', 'p'].edge_attr = torch.arange(data['p', 'p'].num_edges)
    # 'p'->'a': 1000 directed edges; 'a'->'p' is its flipped reverse.
    data['p', 'a'].edge_index = get_edge_index(100, 200, 1000)
    data['p', 'a'].edge_attr = torch.arange(500, 1500)
    data['a', 'p'].edge_index = data['p', 'a'].edge_index.flip([0])
    data['a', 'p'].edge_attr = torch.arange(1500, 2500)

    # Case 1: split only the undirected 'p'-'p' edge type.
    transform = RandomLinkSplit(num_val=0.2, num_test=0.2, is_undirected=True,
                                edge_types=('p', 'p'))
    train_data, val_data, test_data = transform(data)

    # Only ('p', 'p') gains edge_label / edge_label_index (2 extra keys).
    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 4
    assert len(train_data['p', 'a']) == 2
    assert len(train_data['a', 'p']) == 2

    # Every split of an undirected edge type must stay undirected.
    assert is_undirected(train_data['p', 'p'].edge_index,
                         train_data['p', 'p'].edge_attr)
    assert is_undirected(val_data['p', 'p'].edge_index,
                         val_data['p', 'p'].edge_attr)
    assert is_undirected(test_data['p', 'p'].edge_index,
                         test_data['p', 'p'].edge_attr)

    # Case 2: split the directed bipartite type with its reverse type.
    transform = RandomLinkSplit(num_val=0.2, num_test=0.2,
                                edge_types=('p', 'a'),
                                rev_edge_types=('a', 'p'))
    train_data, val_data, test_data = transform(data)

    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 2
    assert len(train_data['p', 'a']) == 4
    assert len(train_data['a', 'p']) == 2

    # 60/20/20 split of 1000 edges: 600 train message-passing edges; the
    # reverse type is subset to match; labels include negative samples
    # (hence 2x the positive count).
    assert train_data['p', 'a'].edge_index.size() == (2, 600)
    assert train_data['p', 'a'].edge_attr.size() == (600, )
    assert train_data['p', 'a'].edge_attr.min() >= 500
    assert train_data['p', 'a'].edge_attr.max() <= 1500
    assert train_data['a', 'p'].edge_index.size() == (2, 600)
    assert train_data['a', 'p'].edge_attr.size() == (600, )
    assert train_data['a', 'p'].edge_attr.min() >= 500
    assert train_data['a', 'p'].edge_attr.max() <= 1500
    assert train_data['p', 'a'].edge_label_index.size() == (2, 1200)
    assert train_data['p', 'a'].edge_label.size() == (1200, )

    # Validation keeps the training message-passing graph and adds 200
    # positive + 200 negative evaluation edges.
    assert val_data['p', 'a'].edge_index.size() == (2, 600)
    assert val_data['p', 'a'].edge_attr.size() == (600, )
    assert val_data['p', 'a'].edge_attr.min() >= 500
    assert val_data['p', 'a'].edge_attr.max() <= 1500
    assert val_data['a', 'p'].edge_index.size() == (2, 600)
    assert val_data['a', 'p'].edge_attr.size() == (600, )
    assert val_data['a', 'p'].edge_attr.min() >= 500
    assert val_data['a', 'p'].edge_attr.max() <= 1500
    assert val_data['p', 'a'].edge_label_index.size() == (2, 400)
    assert val_data['p', 'a'].edge_label.size() == (400, )

    # Test graph contains train + val edges (800) for message passing.
    assert test_data['p', 'a'].edge_index.size() == (2, 800)
    assert test_data['p', 'a'].edge_attr.size() == (800, )
    assert test_data['p', 'a'].edge_attr.min() >= 500
    assert test_data['p', 'a'].edge_attr.max() <= 1500
    assert test_data['a', 'p'].edge_index.size() == (2, 800)
    assert test_data['a', 'p'].edge_attr.size() == (800, )
    assert test_data['a', 'p'].edge_attr.min() >= 500
    assert test_data['a', 'p'].edge_attr.max() <= 1500
    assert test_data['p', 'a'].edge_label_index.size() == (2, 400)
    assert test_data['p', 'a'].edge_label.size() == (400, )

    # Case 3: mixed list of edge types with per-type reverse types.
    transform = RandomLinkSplit(num_val=0.2, num_test=0.2, is_undirected=True,
                                edge_types=[('p', 'p'), ('p', 'a')],
                                rev_edge_types=[None, ('a', 'p')])
    train_data, val_data, test_data = transform(data)

    assert len(train_data['p']) == 1
    assert len(train_data['a']) == 1
    assert len(train_data['p', 'p']) == 4
    assert len(train_data['p', 'a']) == 4
    assert len(train_data['a', 'p']) == 2

    assert is_undirected(train_data['p', 'p'].edge_index,
                         train_data['p', 'p'].edge_attr)
    assert train_data['p', 'a'].edge_index.size() == (2, 600)
    assert train_data['a', 'p'].edge_index.size() == (2, 600)
Example #24
0
 def is_undirected(self) -> bool:
     r"""Returns :obj:`True` if graph edges are undirected."""
     # Flatten the heterogeneous graph into a single homogeneous edge index
     # before testing the edge set for symmetry.
     edge_index, _, _ = to_homogeneous_edge_index(self)
     return is_undirected(edge_index, num_nodes=self.num_nodes)
Example #25
0
 def is_undirected(self):
     r"""Returns :obj:`True`, if graph edges are undirected."""
     # Delegates to the module-level `is_undirected` utility on this
     # graph's edge index.
     return is_undirected(self.edge_index, self.num_nodes)
Example #26
0
    def forward(self, data, return_hidden_feature=False):
        """Run covalent then non-covalent message passing and pool ligand
        atoms into a graph-level prediction.

        Parameters
        ----------
        data : torch_geometric.data.Batch
            Batched graphs with `x`, `edge_index`, `edge_attr`, `batch`.
        return_hidden_feature : bool, optional
            If True, also return intermediate atom- and graph-level
            representations, by default False.

        Returns
        -------
        Prediction tensor, or the tuple (avg_covalent_x,
        avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x) when
        `return_hidden_feature` is True.
        """
        # Move the batch to the GPU; the model is assumed to live on CUDA.
        data.x = data.x.cuda()
        data.edge_attr = data.edge_attr.cuda()
        data.edge_index = data.edge_index.cuda()
        data.batch = data.batch.cuda()

        # make sure that we have undirected graph
        if not is_undirected(data.edge_index):
            # NOTE(review): only edge_index is symmetrized here; edge_attr
            # is not duplicated for the reversed edges, so if this branch
            # ever fires the attributes below are misaligned with the
            # edges -- confirm inputs are already undirected.
            data.edge_index = to_undirected(data.edge_index)

        # make sure that nodes can propagate messages to themselves
        if not contains_self_loops(data.edge_index):
            data.edge_index, data.edge_attr = add_self_loops(
                data.edge_index, data.edge_attr.view(-1))

        # Threshold the edge set into covalent and non-covalent subgraphs.
        covalent_edge_index, covalent_edge_attr = self.covalent_neighbor_threshold(
            data.edge_index, data.edge_attr)
        (
            non_covalent_edge_index,
            non_covalent_edge_attr,
        ) = self.non_covalent_neighbor_threshold(data.edge_index,
                                                 data.edge_attr)

        # covalent_propagation and non_covalent_propagation
        covalent_x = self.covalent_propagation(data.x, covalent_edge_index,
                                               covalent_edge_attr)
        non_covalent_x = self.non_covalent_propagation(
            covalent_x, non_covalent_edge_index, non_covalent_edge_attr)

        # Zero out the protein rows (x[:, 14] == -1 marks protein atoms)
        # before the ligand-only pooling.
        # FIX: clone first -- the original aliased `non_covalent_x` and
        # zeroed it in place, which also mutated the tensor used for
        # `avg_non_covalent_x` in the hidden-feature branch below.
        non_covalent_ligand_only_x = non_covalent_x.clone()
        non_covalent_ligand_only_x[data.x[:, 14] == -1] = 0
        pool_x = self.global_add_pool(non_covalent_ligand_only_x, data.batch)

        # fully connected and output layers
        if return_hidden_feature:
            # return prediction and atomistic features (covalent result,
            # non-covalent result, pool result)
            avg_covalent_x, _ = avg_pool_x(data.batch, covalent_x, data.batch)
            avg_non_covalent_x, _ = avg_pool_x(data.batch, non_covalent_x,
                                               data.batch)

            fc0_x, fc1_x, output_x = self.output(pool_x,
                                                 return_hidden_feature=True)

            return avg_covalent_x, avg_non_covalent_x, pool_x, fc0_x, fc1_x, output_x
        else:
            return self.output(pool_x)