forked from anhngml/DataUtils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_convert.py
253 lines (193 loc) · 8.75 KB
/
data_convert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
import numpy as np
import pickle as pkl
import scipy.sparse as sp
import sys
import os
import networkx as nx
from sklearn import preprocessing
import json
from networkx.readwrite import json_graph
import utils_gcn
import utils_sage
from my_encoder import my_encoder
def get_one_hot(targets, nb_classes):
    """Return one-hot encoded rows for integer class labels.

    ``targets`` is converted to an array and flattened to 1-D; every entry
    then selects the matching row of an ``nb_classes`` x ``nb_classes``
    identity matrix. The result therefore has one row per label and
    ``nb_classes`` columns.
    """
    identity = np.eye(nb_classes)
    flat_labels = np.array(targets).reshape(-1)
    return identity[flat_labels]
def save_object(obj, filename):
    """Pickle ``obj`` into ``filename`` with the highest pickle protocol.

    The target is opened in binary write mode, so any existing file at that
    path is overwritten.
    """
    with open(filename, mode='wb') as sink:
        pkl.dump(obj, sink, protocol=pkl.HIGHEST_PROTOCOL)
"""
GCN Format:
In order to use your own data, you have to provide
an N by N adjacency matrix (N is the number of nodes),
an N by D feature matrix (D is the number of features per node), and
an N by E binary label matrix (E is the number of classes).
Have a look at the load_data() function in utils.py for an example.
~~~~~~~~~~~~~~~~~~~~~~~
The input to the inductive model contains:
x, the feature vectors of the labeled training instances,
y, the one-hot labels of the labeled training instances,
allx, the feature vectors of both labeled and unlabeled training instances (a superset of x),
graph, a dict in the format {index: [index_of_neighbor_nodes]}.
Let n be the number of both labeled and unlabeled training instances. These n instances should be indexed from 0 to n - 1 in graph with the same order as in allx.
~~~~~~~~~~~~~~~~~~~~~~~
In addition to x, y, allx, and graph as described above, the preprocessed datasets also include:
tx, the feature vectors of the test instances,
ty, the one-hot labels of the test instances,
test.index, the indices of test instances in graph, for the inductive setting,
ally, the labels for instances in allx.
The indices of test instances in graph for the transductive setting are from #x to #x + #tx - 1, with the same order as in tx.
You can use pickle.load(open(filename, 'rb')) (cPickle.load on Python 2) to load the numpy/scipy objects x, y, tx, ty, allx, ally, and graph. test.index is stored as a text file.
"""
def sage_format_to_GCN_format(num_of_x=1000, sage_prefix='ppi', root_dir='data'):
    """Convert a GraphSAGE-format dataset into GCN ``ind.<prefix>.*`` files.

    The dataset is loaded through ``utils_sage.load_data``. Labels are
    one-hot encoded, nodes are split into test / non-test sets using each
    node's ``'test'`` attribute, a random subset of the non-test nodes is
    drawn as the labelled training set, and the ``allx, ally, graph, tx, ty,
    x, y`` pickles plus the ``test.index`` text file are written to
    ``root_dir``.

    Args:
        num_of_x: Number of non-test nodes sampled (without replacement) for
            ``x``/``y``. Capped at the number of non-test nodes available.
        sage_prefix: Dataset prefix passed to ``utils_sage.load_data`` and
            used in the output file names.
        root_dir: Directory the dataset is read from and the output files
            are written to.

    Returns:
        None. Returns early, after printing a message, when the first label
        is not a plain ``int`` (multi-label data is not supported).
    """
    # Read from the caller-supplied directory; a hard-coded 'data' here
    # would silently ignore ``root_dir``.
    train_data = utils_sage.load_data(sage_prefix, root_dir=root_dir)
    # Only these positions of the loaded sequence are used by the converter.
    G = train_data[0]
    features = train_data[1]
    id_map = train_data[2]
    class_map = train_data[4]
    # NOTE(review): ``G.node`` was removed in networkx 2.4; newer releases
    # expose the same mapping as ``G.nodes``.
    print('num of nodes: {}'.format(len(G.node)))
    print('num of features: {}'.format(len(features)))
    print('num of ids: {}'.format(len(id_map)))
    print('num of classes: {}'.format(len(class_map)))
    print('data file is read successfully')

    print('splitting data...')
    # Re-key the labels by consecutive integer index instead of node id.
    labels = {index: class_map[node_id] for node_id, index in id_map.items()}
    size = len(labels)
    print("num of nodes: {}".format(size))

    indexed_labels = []
    for i in range(size):
        if i in labels:
            indexed_labels.append(labels[i])
        else:
            print("have not label for node {}".format(i))
            # -1 marks an index that has no label.
            indexed_labels.append(-1)

    if not isinstance(indexed_labels[0], int):
        print("Not support multi class")
        return

    # One column per distinct label value (including the -1 marker, if any).
    one_hot_all = get_one_hot(indexed_labels, len(set(indexed_labels)))

    # Integer indices of every node flagged as a test node.
    test_index = [id_map[node_id]
                  for node_id, attrs in G.node.items() if attrs['test']]
    # A set keeps the split linear instead of scanning the list per node.
    test_lookup = set(test_index)
    all_x_index = [i for i in range(size) if i not in test_lookup]

    # Sampling without replacement raises ValueError when more items are
    # requested than exist, so cap the request.
    sample_size = min(num_of_x, len(all_x_index))
    picked = np.random.choice(len(all_x_index), sample_size, replace=False)
    x_index = np.array(all_x_index, dtype=int)[picked]

    x = sp.csr_matrix(features[x_index])
    y = one_hot_all[x_index]
    ty = one_hot_all[test_index]
    tx = sp.csr_matrix(features[test_index])
    allx = sp.csr_matrix(features[all_x_index])
    ally = one_hot_all[all_x_index]

    # Relabel nodes with their integer ids so the adjacency dict uses the
    # same indexing as the feature and label matrices.
    graph = nx.to_dict_of_lists(nx.relabel_nodes(G, id_map))

    outputs = (('allx', allx), ('ally', ally), ('graph', graph),
               ('tx', tx), ('ty', ty), ('x', x), ('y', y))
    for suffix, obj in outputs:
        save_object(obj, "{}/ind.{}.{}".format(root_dir, sage_prefix, suffix))
    # test.index is stored as plain text, one index per line.
    np.savetxt("{}/ind.{}.test.index".format(root_dir, sage_prefix),
               test_index, delimiter='\n', fmt='%s')
    print('Done')
"""
Sage Format:
As input, at minimum the code requires that a --train_prefix option is specified which specifies the following data files:
<train_prefix>-G.json -- A networkx-specified json file describing the input graph. Nodes have 'val' and 'test' attributes specifying if they are a part of the validation and test sets, respectively.
<train_prefix>-id_map.json -- A json-stored dictionary mapping the graph node ids to consecutive integers.
<train_prefix>-class_map.json -- A json-stored dictionary mapping the graph node ids to classes.
<train_prefix>-feats.npy [optional] --- A numpy-stored array of node features; ordering given by id_map.json. Can be omitted and only identity features will be used.
<train_prefix>-walks.txt [optional] --- A text file specifying random walk co-occurrences (one pair per line) (*only for unsupervised version of graphsage)
To run the model on a new dataset, you need to make data files in the format described above. To run random walks for the unsupervised model (and to generate the -walks.txt file) you can use the run_walks function in graphsage.utils.
"""
def GCN_format_to_sage_format(data_name='citeseer', root_dir='data'):
    """Convert a GCN-format dataset into GraphSAGE-format files.

    Loads the dataset through ``utils_gcn.load_data`` and writes
    ``<data_name>-G.json``, ``<data_name>-id_map.json``,
    ``<data_name>-class_map.json`` and ``<data_name>-feats.npy`` into
    ``root_dir``.

    Args:
        data_name: Dataset name passed to ``utils_gcn.load_data`` and used
            as the prefix of the output files.
        root_dir: Directory the dataset is loaded from and the output files
            are written to.
    """
    (adj, features, y_train, y_val, y_test,
     train_mask, val_mask, test_mask) = utils_gcn.load_data(
        data_name, root_dir=root_dir)
    G = nx.from_scipy_sparse_matrix(adj)

    # Plain-int sets give O(1) membership tests in the per-node loop below.
    train_nodes = set(np.where(train_mask)[0].tolist())
    val_nodes = set(np.where(val_mask)[0].tolist())
    test_nodes = set(np.where(test_mask)[0].tolist())

    # The three label matrices are summed and the arg-max taken as class id.
    # NOTE(review): a node whose summed row is all zeros ends up as class 0.
    y = np.argmax(y_train + y_val + y_test, axis=1)

    for i in range(len(y)):
        # A node gets at most one role, with precedence val > test > train.
        is_val = i in val_nodes
        is_test = not is_val and i in test_nodes
        is_train = not is_val and not is_test and i in train_nodes
        # NOTE(review): ``G.node`` was removed in networkx 2.4; newer
        # releases expose the same mapping as ``G.nodes``.
        attrs = G.node[i]
        attrs['test'] = is_test
        attrs['val'] = is_val
        attrs['train'] = is_train

    data = json_graph.node_link_data(G)
    with open("{}/{}-G.json".format(root_dir, data_name), "w", encoding="utf8") as f:
        json.dump(data, f, cls=my_encoder)

    # Node ids already are consecutive integers, so the id map is the identity.
    classMap = {i: y[i] for i in range(len(y))}
    idMap = {i: i for i in range(len(y))}
    with open("{}/{}-id_map.json".format(root_dir, data_name), "w", encoding="utf8") as f:
        json.dump(idMap, f, cls=my_encoder)
    with open("{}/{}-class_map.json".format(root_dir, data_name), "w", encoding="utf8") as f:
        json.dump(classMap, f, cls=my_encoder)

    # Use a context manager so the features file is flushed and closed.
    with open("{}/{}-feats.npy".format(root_dir, data_name), "wb") as f:
        np.save(f, features.todense())
"""
node2vec Format:
Input
The supported input format is an edgelist:
node1_id_int node2_id_int <weight_float, optional>
The graph is assumed to be undirected and unweighted by default. These options can be changed by setting the appropriate flags.
Output
The output file has n+1 lines for a graph with n vertices. The first line has the following format:
num_of_nodes dim_of_representation
The next n lines are as follows:
node_id dim1 dim2 ... dimd
where dim1, ... , dimd is the d-dimensional representation learned by node2vec.
"""
def sage_format_to_node2vec_format(sage_prefix='ppi', root_dir='data'):
    """Write the graph of a GraphSAGE-format dataset as a node2vec edge list.

    The dataset is loaded through ``utils_sage.load_data`` and every
    (node, neighbour) pair of its adjacency lists is written, one pair per
    line and space separated, to ``<root_dir>/<sage_prefix>.edgelist``.

    Args:
        sage_prefix: Dataset prefix passed to ``utils_sage.load_data`` and
            used as the output file name stem.
        root_dir: Directory the dataset is read from and the edge list is
            written to.

    Returns:
        None. Returns early, after printing a message, when the class map
        values cannot be converted to integers (multi-label data).
    """
    # Read from the caller-supplied directory; a hard-coded 'data' here
    # would silently ignore ``root_dir``.
    train_data = utils_sage.load_data(sage_prefix, root_dir=root_dir)
    G = train_data[0]
    class_map = train_data[4]

    # Only single-label datasets are accepted: the integer conversion fails
    # when a class entry is a sequence rather than a scalar.
    try:
        np.fromiter(class_map.values(), dtype=int)
    except (TypeError, ValueError):
        print("Not support multiple classes")
        return

    # NOTE(review): if G is undirected, each edge presumably shows up once
    # per direction because adjacency lists are expanded as-is.
    edges = []
    for node, neighbors in nx.to_dict_of_lists(G).items():
        edges.extend([node, neighbor] for neighbor in neighbors)

    np.savetxt("{}/{}.edgelist".format(root_dir, sage_prefix),
               edges, delimiter=' ', fmt='%s')
    return None
if __name__ == '__main__':
    # Script entry point: convert the GCN-format 'pubmed' dataset found in
    # 'data' into GraphSAGE-format files. The other converters can be run
    # from here instead, e.g.
    #   sage_format_to_GCN_format(sage_prefix='ppi', root_dir='data')
    #   sage_format_to_node2vec_format(sage_prefix='citeseer', root_dir='data')
    GCN_format_to_sage_format(root_dir='data', data_name='pubmed')