Example #1
File: cora_data.py Project: xssstory/cogdl
    def __init__(self, args=None):
        dataset = "jknet_cora"
        path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
        if not osp.exists(path):
            os.makedirs(path)
        super(CoraDataset, self).__init__(path)
        with open(self.processed_paths[0], 'rb') as fin:
            load_data = pickle.load(fin)
        self.num_nodes = load_data['node_num']

        data = Data()
        data.x = load_data['xs']
        data.y = load_data['ys']

        # Randomly assign 80% of nodes to the training set; the remaining
        # 20% serve as both the validation and the test set.
        train_size = int(self.num_nodes * 0.8)
        train_mask = np.zeros((self.num_nodes,), dtype=bool)
        train_idx = np.random.choice(np.arange(self.num_nodes), size=train_size, replace=False)
        train_mask[train_idx] = True
        test_mask = np.ones((self.num_nodes,), dtype=bool)
        test_mask[train_idx] = False
        val_mask = test_mask

        # Edges come as a list of (src, dst) pairs; transpose to the
        # (2, num_edges) layout expected for edge_index.
        edges = load_data['edges']
        edges = np.array(edges, dtype=int).transpose((1, 0))

        data.edge_index = torch.from_numpy(edges)
        data.train_mask = torch.from_numpy(train_mask)
        data.test_mask = torch.from_numpy(test_mask)
        data.val_mask = torch.from_numpy(val_mask)
        data.x = torch.Tensor(data.x)
        data.y = torch.LongTensor(data.y)

        self.data = data
        self.num_classes = torch.max(self.data.y).item() + 1
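
A quick note on the split logic above: the random 80/20 mask construction is easy to sanity-check in isolation. A minimal standalone sketch using only numpy (the toy num_nodes value is an assumption for illustration):

import numpy as np

num_nodes = 10  # toy value for illustration
train_size = int(num_nodes * 0.8)
train_mask = np.zeros((num_nodes,), dtype=bool)
train_idx = np.random.choice(np.arange(num_nodes), size=train_size, replace=False)
train_mask[train_idx] = True
test_mask = ~train_mask  # equivalent to the ones-then-clear pattern above
assert train_mask.sum() == train_size
assert not (train_mask & test_mask).any()  # the splits are disjoint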
Example #2
File: kg_data.py Project: qazcy1983/cogdl
def read_triplet_data(folder):
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            # The first line stores the number of triples in the file; skip it.
            _ = int(f.readline().strip())
            for line in f:
                # Each line is "head tail relation".
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        # Boolean mask over all triples, True only on [start, end).
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data
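
A hedged usage sketch for this loader, assuming the folder holds OpenKE-style files whose first line is a triple count followed by one "head tail relation" triple per line (the dataset path below is an assumption for illustration):

# Hypothetical usage; the dataset path is an assumption.
data = read_triplet_data("./data/FB15k")
print(data.edge_index.shape)       # torch.Size([2, total_num_triples])
print(int(data.train_mask.sum()))  # number of training triples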
Example #3
    def get_subgraph(self, phase, require_norm=True):
        """
        Generate one minibatch for the model. In the 'train' phase, one minibatch
        corresponds to one sampled subgraph of the training graph. In the 'val' or
        'test' phase, the batch corresponds to the full graph (i.e., full-batch
        rather than minibatch evaluation for the validation / test sets).

        Inputs:
            phase               str, one of 'train', 'val', 'test'
            require_norm        boolean

        Outputs:
            data                Data object modeling the sampled subgraph
            data.norm_aggr      aggregation normalization coefficients
            data.norm_loss      loss normalization coefficients
        """
        if phase in ['val', 'test']:
            node_subgraph = np.arange(self.data.num_nodes)
            data = self.data
            if require_norm:
                data.norm_aggr = torch.ones(self.num_edges)
                data.norm_loss = self.norm_loss_test
        else:
            if len(self.subgraphs_nodes) == 0:
                self.gen_subgraph()

            node_subgraph = self.subgraphs_nodes.pop()
            edge_subgraph = self.subgraphs_edge_index.pop()
            num_nodes_subgraph = node_subgraph.size
            adj = sp.csr_matrix(
                (self.subgraphs_data.pop(), self.subgraphs_indices.pop(),
                 self.subgraphs_indptr.pop()),
                shape=(num_nodes_subgraph, num_nodes_subgraph))

            if require_norm:
                adj.data[:] = self.norm_aggr_train[edge_subgraph][:]
                # Row-normalize the adjacency matrix: A <- D^{-1} A
                D = adj.sum(1).flatten()
                norm_diag = sp.dia_matrix((1 / D, 0), shape=adj.shape)
                adj = norm_diag.dot(adj)
                adj.sort_indices()

            adj = adj.tocoo()
            data = Data(
                x=self.data.x[node_subgraph],
                edge_index=torch.LongTensor(np.vstack((adj.row, adj.col))),
                edge_attr=None if self.data.edge_attr is None
                else self.data.edge_attr[edge_subgraph],
                y=self.data.y[node_subgraph],
                pos=None if self.data.pos is None
                else self.data.pos[node_subgraph],
            )

            if require_norm:
                data.norm_aggr = torch.FloatTensor(adj.data)
                data.norm_loss = self.norm_loss_train[node_subgraph]
            data.train_mask = self.data.train_mask[node_subgraph]
            data.val_mask = self.data.val_mask[node_subgraph]
            data.test_mask = self.data.test_mask[node_subgraph]

        return data
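
A hedged sketch of how this method might sit in a GraphSAINT-style training loop; sampler, model, and num_batches are assumed names, not identifiers from the source:

import torch.nn.functional as F

for _ in range(num_batches):               # num_batches is an assumption
    batch = sampler.get_subgraph("train")  # pops one pre-sampled subgraph
    out = model(batch.x, batch.edge_index)
    # Per-node losses, re-weighted by norm_loss to correct for sampling bias
    loss = F.cross_entropy(out, batch.y, reduction="none")
    loss = (loss * batch.norm_loss)[batch.train_mask].sum()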
Example #4
def read_planetoid_data(folder, prefix):
    prefix = prefix.lower()
    names = ["x", "tx", "allx", "y", "ty", "ally", "graph", "test.index"]
    objects = []
    for item in names[:-1]:
        with open(f"{folder}/ind.{prefix}.{item}", "rb") as f:
            if sys.version_info > (3, 0):
                # latin1 encoding is required to unpickle files written by Python 2.
                objects.append(pkl.load(f, encoding="latin1"))
            else:
                objects.append(pkl.load(f))
    test_index = parse_index_file(f"{folder}/ind.{prefix}.{names[-1]}")
    test_index = torch.Tensor(test_index).long()
    test_index_reorder = test_index.sort()[0]

    x, tx, allx, y, ty, ally, graph = tuple(objects)
    x, tx, allx = tuple(
        [torch.from_numpy(item.todense()).float() for item in [x, tx, allx]])
    y, ty, ally = tuple(
        [torch.from_numpy(item).float() for item in [y, ty, ally]])

    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)

    if prefix == "citeseer":  # prefix was already lowercased above
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1

        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[test_index_reorder - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[test_index_reorder - test_index.min(), :] = ty

        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0).float()
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1].long()

    x[test_index] = x[test_index_reorder]
    y[test_index] = y[test_index_reorder]

    train_mask = index_to_mask(train_index, size=y.size(0))
    val_mask = index_to_mask(val_index, size=y.size(0))
    test_mask = index_to_mask(test_index, size=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data
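
A hedged usage sketch (the folder path is an assumption; the files must follow the Planetoid layout, ind.<prefix>.x, ind.<prefix>.tx, and so on):

# Hypothetical usage; the folder path is an assumption.
data = read_planetoid_data("./data/cora", "cora")
print(data.x.shape)                # (num_nodes, num_features)
print(int(data.train_mask.sum()))  # 140 for the standard Cora split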
Example #5
File: kg_data.py Project: zrt/cogdl
def read_triplet_data(folder):
    filenames = ["train2id.txt", "valid2id.txt", "test2id.txt"]
    count = 0
    edge_index = []
    edge_attr = []
    count_list = []
    triples = []
    num_entities = 0
    num_relations = 0
    entity_dic = {}
    relation_dic = {}
    for filename in filenames:
        with open(osp.join(folder, filename), "r") as f:
            # The first line stores the number of triples in the file; skip it.
            _ = int(f.readline().strip())
            # Record where each split begins in the flat `triples` list.
            if "train" in filename:
                train_start_idx = len(triples)
            elif "valid" in filename:
                valid_start_idx = len(triples)
            elif "test" in filename:
                test_start_idx = len(triples)
            for line in f:
                # Each line is "head tail relation"; triples are stored as
                # (head, relation, tail).
                items = line.strip().split()
                edge_index.append([int(items[0]), int(items[1])])
                edge_attr.append(int(items[2]))
                triples.append((int(items[0]), int(items[2]), int(items[1])))
                if items[0] not in entity_dic:
                    entity_dic[items[0]] = num_entities
                    num_entities += 1
                if items[1] not in entity_dic:
                    entity_dic[items[1]] = num_entities
                    num_entities += 1
                if items[2] not in relation_dic:
                    relation_dic[items[2]] = num_relations
                    num_relations += 1
                count += 1
            count_list.append(count)

    edge_index = torch.LongTensor(edge_index).t()
    edge_attr = torch.LongTensor(edge_attr)
    data = Data()
    data.edge_index = edge_index
    data.edge_attr = edge_attr

    def generate_mask(start, end):
        # Boolean mask over all triples, True only on [start, end).
        mask = torch.zeros(count, dtype=torch.bool)
        mask[start:end] = True
        return mask

    data.train_mask = generate_mask(0, count_list[0])
    data.val_mask = generate_mask(count_list[0], count_list[1])
    data.test_mask = generate_mask(count_list[1], count_list[2])
    return data, triples, train_start_idx, valid_start_idx, test_start_idx, num_entities, num_relations
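
Compared with example #2, this variant additionally returns the raw triples, the offset at which each split starts, and the entity/relation counts. A sketch of recovering the per-split triple lists (the dataset path is an assumption):

# Hypothetical usage; the dataset path is an assumption.
(data, triples, train_start, valid_start, test_start,
 num_entities, num_relations) = read_triplet_data("./data/FB15k-237")
train_triples = triples[train_start:valid_start]
valid_triples = triples[valid_start:test_start]
test_triples = triples[test_start:]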
Example #6
    def process(self):
        data = np.load(osp.join(self.raw_dir, "reddit_data.npz"))
        x = torch.from_numpy(data["feature"]).to(torch.float)
        y = torch.from_numpy(data["label"]).to(torch.long)
        # node_types encodes the split assignment: 1 = train, 2 = val, 3 = test
        split = torch.from_numpy(data["node_types"])

        adj = sp.load_npz(osp.join(self.raw_dir, "reddit_graph.npz"))
        row = torch.from_numpy(adj.row).to(torch.long)
        col = torch.from_numpy(adj.col).to(torch.long)
        edge_index = torch.stack([row, col], dim=0)
        # Coalesce deduplicates edges and sorts the edge index.
        edge_index, _ = coalesce(edge_index, None, x.size(0), x.size(0))

        data = Data(x=x, edge_index=edge_index, y=y)
        data.train_mask = split == 1
        data.val_mask = split == 2
        data.test_mask = split == 3

        torch.save(self.collate([data]), self.processed_paths[0])
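
The split == k comparisons above yield boolean masks directly; a minimal standalone illustration with a toy node_types array:

import torch

split = torch.tensor([1, 1, 2, 3, 1])  # toy node_types: 1=train, 2=val, 3=test
train_mask = split == 1
print(train_mask)  # tensor([ True,  True, False, False,  True])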
Example #7
def read_planetoid_data(folder, prefix):
    names = ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']
    items = [read_file(folder, prefix, name) for name in names]
    x, tx, allx, y, ty, ally, graph, test_index = items
    train_index = torch.arange(y.size(0), dtype=torch.long)
    val_index = torch.arange(y.size(0), y.size(0) + 500, dtype=torch.long)
    sorted_test_index = test_index.sort()[0]

    if prefix.lower() == 'citeseer':
        # There are some isolated nodes in the Citeseer graph, resulting in
        # non-consecutive test indices. We need to identify them and add them
        # as zero vectors to `tx` and `ty`.
        len_test_indices = (test_index.max() - test_index.min()).item() + 1

        tx_ext = torch.zeros(len_test_indices, tx.size(1))
        tx_ext[sorted_test_index - test_index.min(), :] = tx
        ty_ext = torch.zeros(len_test_indices, ty.size(1))
        ty_ext[sorted_test_index - test_index.min(), :] = ty

        tx, ty = tx_ext, ty_ext

    x = torch.cat([allx, tx], dim=0)
    y = torch.cat([ally, ty], dim=0).max(dim=1)[1]

    x[test_index] = x[sorted_test_index]
    y[test_index] = y[sorted_test_index]

    train_mask = sample_mask(train_index, num_nodes=y.size(0))
    val_mask = sample_mask(val_index, num_nodes=y.size(0))
    test_mask = sample_mask(test_index, num_nodes=y.size(0))

    edge_index = edge_index_from_dict(graph, num_nodes=y.size(0))

    data = Data(x=x, edge_index=edge_index, y=y)
    data.train_mask = train_mask
    data.val_mask = val_mask
    data.test_mask = test_mask

    return data
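
The sample_mask helper is not shown in this excerpt; a plausible implementation consistent with how it is called above (an assumption, not the repository's actual code):

import torch

def sample_mask(index, num_nodes):
    # Assumed helper: boolean mask with True at the given node indices.
    mask = torch.zeros(num_nodes, dtype=torch.bool)
    mask[index] = True
    return mask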