def load_pdata(dataset_str):
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        # The pickle files are binary Python 2 dumps, so open them in 'rb' mode
        # (encoding='latin1' matches how the other loaders in this collection read them).
        with open("./data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            objects.append(pkl.load(f, encoding='latin1'))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("./data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Citeseer has isolated nodes; insert them as zero-vectors at the right positions.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    train_mask = sample_mask(idx_train, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    # Pack (node index, 1-based label) pairs for the train and test nodes.
    train_out = []
    for i in idx_train:
        ll = y_train[i].tolist()
        ll = ll.index(1) + 1
        train_out.append([i, ll])
    train_out = np.array(train_out)
    np.random.shuffle(train_out)

    test_out = []
    for i in idx_test:
        ll = y_test[i].tolist()
        ll = ll.index(1) + 1
        test_out.append([i, ll])
    test_out = np.array(test_out)

    # Build the adjacency matrix and strip self-loops (computed here but not returned).
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero:
    assert np.diag(adj.todense()).sum() == 0

    adj_triu = sp.triu(adj)
    adj_tuple = sparse_to_tuple(adj_triu)
    edges = adj_tuple[0]
    edges_all = sparse_to_tuple(adj)[0]
    num_mask = int(np.floor(edges.shape[0] / 10.))

    return graph, features, train_out, test_out
def skeleton_to_nx_graph(skeleton):
    """Converts a binary skeleton image to a networkx graph

    Arguments:
        skeleton (array): 2d/3d binary skeleton image

    Returns:
        networkx.Graph: graph whose nodes are skeleton voxel coordinates and
        whose edges connect neighbouring voxels
    """
    ids, nh = skeleton_to_list(skeleton, with_neighborhoods=True)
    print('ids done...')
    if len(ids) == 0:
        return nx.Graph()
    elif len(ids) == 1:
        adj = {}
        adj[tuple(ids[0])] = []
        return nx.from_dict_of_lists(adj)
    else:
        g = nx.Graph()
        for i, pos in enumerate(ids):
            if i % 500 == 0:
                print('%d/%d nodes constructed...' % (i, len(ids)))
            p = tuple(pos)
            g.add_node(p)
            posnh = np.where(nh[i])
            for pp in np.transpose(posnh):
                g.add_edge(p, tuple(pp + pos - 1))
        return g
def draw_graph(graphDic, nodesStatus, imageName):
    node_colors = []
    # first writing the number of nodes
    # nx.draw(G)
    # select the color
    newGraphDic = {}  # without the status
    for element in graphDic.keys():
        status = nodesStatus[element[0] - 1]
        if status == "INACTIVE":
            node_colors += ['white']
        if status == "ACTIVE":
            node_colors += ['red']
        if status == "SELECTED":
            node_colors += ['green']
    # generating the graph from the dictionary
    G = nx.from_dict_of_lists(graphDic)
    nx.draw_circular(G, node_color=node_colors, with_labels=True, node_size=50)
    # G.text(3, 8, 'boxed italics text in data coords', style='italic',
    #        bbox={'facecolor': 'red', 'alpha': 0.5, 'pad': 10})
    # plt.legend(handles=[green_patch])
    # nx.draw_networkx(G, node_color=node_colors, with_labels=True)
    # nx.draw_networkx(G)
    # save the result semiSparseRep
    print("image name is " + imageName)
    plt.savefig(imageName)
def shortest_path(self, target):
    # checkmark 1
    d0 = time.clock()
    dict_links = {self.url[24:]: WikiWeb(self.url).links()}
    links = WikiWeb(self.url).links()
    wiki = 'https://en.wikipedia.org'
    print(time.clock() - d0)
    # checkmark 2
    d0 = time.clock()
    count = 0
    while target[24:] not in links:
        link = links[count]
        dict_links.update({link: WikiWeb(wiki + link).links()})
        for link1 in dict_links[link]:
            if link1 not in links:
                links.append(link1)
        count += 1
    print(time.clock() - d0)
    # checkmark 3
    d0 = time.clock()
    gr = nx.from_dict_of_lists(dict_links)
    sp = nx.shortest_path(gr, self.url[24:], target[24:])
    print(time.clock() - d0)
    return sp
def from_ajacency_map(amap, directed=False):
    """
    Turns a map of adjacencies into a graph.

    amap: adjacency dict mapping each node to a list of neighbours.
    directed: if True, a directed graph (nx.DiGraph) is created;
        otherwise an undirected nx.Graph is returned.
    """
    return nx.from_dict_of_lists(amap, nx.DiGraph() if directed else nx.Graph())
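A minimal usage sketch for from_ajacency_map above; the adjacency dict is made up for illustration:

import networkx as nx

# Hypothetical adjacency map: node -> list of neighbours.
amap = {0: [1, 2], 1: [2], 2: []}

g_undirected = from_ajacency_map(amap)               # nx.Graph
g_directed = from_ajacency_map(amap, directed=True)  # nx.DiGraph, edges follow the dict direction
print(g_undirected.number_of_edges(), g_directed.number_of_edges())  # 3 3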
def parse_graph(file_string):
    # Open the file and decode the JSON information
    with open(file_string, 'r') as f:
        data = json.load(f)
    # Create a networkx graph from our adjacency list data
    G = nx.from_dict_of_lists(data)
    return G
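For reference, parse_graph expects the JSON file to hold a plain adjacency list (node -> list of neighbours). The file name and contents below are made up for illustration:

import json

# Write a tiny example adjacency list; the keys become node labels.
with open('example_graph.json', 'w') as f:
    json.dump({"a": ["b", "c"], "b": ["c"], "c": []}, f)

G = parse_graph('example_graph.json')
print(sorted(G.edges()))  # [('a', 'b'), ('a', 'c'), ('b', 'c')]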
def neighbourhoods(distribution, areal_units, classes=None):
    """
    Return the neighbourhoods where different classes gather

    Parameters
    ----------
    distribution: nested dictionaries
        Number of people per class, per areal unit as given in the raw data
        (ungrouped). The dictionary must have the following formatting:
        > {areal_id: {class_id: number}}

    areal_units: dictionary
        Dictionary of areal unit ids with shapely polygon objects representing
        the unit's geometry as values.

    classes: dictionary of lists
        When the original categories need to be aggregated into different
        classes.
        > {class: [categories belonging to this class]}
        This can be arbitrarily imposed, or computed with the uncover_classes
        function of this package.

    Returns
    -------
    neighbourhoods: dictionary
        Dictionary of class names with lists of neighbourhoods (each
        represented by a list of areal units) as values.
        > {'class': [[areal units in cluster i], ...]}
    """
    # Regroup into classes if specified. Otherwise use the categories
    # indicated in the data.
    if not classes:
        classes = return_categories(distribution)
    # Find the areal units where classes are overrepresented.
    or_units = overrepresented_units(distribution, classes)
    # Compute the adjacency list.
    adjacency = _adjacency(areal_units)
    # Extract neighbourhoods as connected components.
    G = nx.from_dict_of_lists(adjacency)  # Graph from adjacency
    neighbourhoods = {cl: [list(subgraph) for subgraph in
                           nx.connected_component_subgraphs(G.subgraph(or_units[cl]))]
                      for cl in classes}
    return neighbourhoods
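One caveat on the snippet above: nx.connected_component_subgraphs was removed in NetworkX 2.4. A hedged sketch of the same component extraction on newer NetworkX, using nx.connected_components and a made-up adjacency list:

import networkx as nx

def clusters_of(G, units):
    """Connected groups among `units`, without connected_component_subgraphs."""
    sub = G.subgraph(units)
    return [list(component) for component in nx.connected_components(sub)]

# Toy adjacency list, for illustration only.
G = nx.from_dict_of_lists({1: [2], 2: [3], 3: [], 4: [5], 5: []})
print(clusters_of(G, [1, 2, 4, 5]))  # [[1, 2], [4, 5]] (ordering may vary)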
def load_data(dataset_str):
    """Load data."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
import numpy as np
import networkx as nx


def edges_to_matrix(filename):
    data = np.loadtxt(filename)
    adjdict = dict()
    data = data.astype(int)
    # Build the adjacency dict from the edge list.
    for i in range(data.shape[0]):
        if data[i, 0] in adjdict:
            adjdict[data[i, 0]].append(data[i, 1])
        else:
            adjdict[data[i, 0]] = [data[i, 1]]
    # Make sure every target node has a (possibly empty) entry.
    for i in range(data.shape[0]):
        if data[i, 1] not in adjdict:
            adjdict[data[i, 1]] = []
    # Collect all node ids and map them to consecutive indices.
    nodes_set = set()
    for i in range(data.shape[0]):
        for j in range(data.shape[1]):
            nodes_set.add(data[i, j])
    nodes_list = list(nodes_set)
    values = dict()
    for i in range(len(nodes_list)):
        values[nodes_list[i]] = i
    # Re-index the adjacency dict with the consecutive node indices.
    refactored = dict()
    for node in adjdict.keys():
        edges = adjdict[node]
        refactored_edges = []
        for e in edges:
            refactored_edges.append(values[e])
        refactored[values[node]] = refactored_edges
    G = nx.from_dict_of_lists(refactored, create_using=nx.DiGraph())
    n = len(G.nodes())
    A = np.zeros((n, n))
    for u in G.nodes():
        for v in refactored[u]:
            A[u, v] = 1
    return A
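A quick usage sketch for edges_to_matrix, which expects a whitespace-separated two-column edge list; the file name and edges are made up:

import numpy as np

# Three made-up directed edges between nodes 10, 20 and 30.
np.savetxt("toy_edges.txt", np.array([[10, 20], [20, 30], [10, 30]]), fmt="%d")

A = edges_to_matrix("toy_edges.txt")
print(A.shape, A.sum())  # (3, 3) 3.0 -- three nodes, three directed edges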
def game_from_file(filename):
    game = Game()
    game.network = nx.from_dict_of_lists(json.loads(open(filename).read()))
    # We split up the graph name, to get
    #  - the number of players,
    #  - the number of seeds per player,
    #  - the graph id.
    basename = os.path.basename(filename)
    num_list = list(map(int, basename.split(".")[:3]))  # list() so indexing works on Python 3
    game.num_players = num_list[0]
    game.num_seeds = num_list[1]
    game.id = num_list[2]
    return game
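A usage sketch for game_from_file, assuming the Game class from the surrounding module; the file name follows the <players>.<seeds>.<id>.json convention described in the comment, and the tiny graph is made up:

import json

# Write a toy adjacency list under a name encoding 2 players, 10 seeds, graph id 7.
with open("2.10.7.json", "w") as f:
    json.dump({"0": ["1"], "1": []}, f)

game = game_from_file("2.10.7.json")
print(game.num_players, game.num_seeds, game.id)  # 2 10 7
print(game.network.number_of_nodes())             # 2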
def load_data(dataset):
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for i in range(len(names)):
        objects.append(pkl.load(open("data/ind.{}.{}".format(dataset, names[i]))))
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
    tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
    tx_extended[test_idx_range - min(test_idx_range), :] = tx
    tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    return adj, features
def get_networkx_graph_from_array(binary_arr):
    """
    Return a networkx graph from a binary numpy array

    Parameters
    ----------
    binary_arr : numpy array
        binary numpy array, can only be 2D or 3D

    Returns
    -------
    networkx_graph : Networkx graph
        graphical representation of the input array after clique removal
    """
    assert np.max(binary_arr) in [0, 1], "input must always be a binary array"
    start = time.time()
    dict_of_indices_and_adjacent_coordinates = _set_adjacency_list(binary_arr)
    networkx_graph = nx.from_dict_of_lists(dict_of_indices_and_adjacent_coordinates)
    _remove_clique_edges(networkx_graph)
    print("time taken to obtain networkx graph is %0.3f seconds" % (time.time() - start))
    return networkx_graph
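A usage sketch for get_networkx_graph_from_array, assuming the module-level helpers _set_adjacency_list and _remove_clique_edges it calls are available; the toy skeleton array is made up:

import numpy as np

# Toy 2D binary "skeleton": an L-shaped path of foreground pixels.
skeleton = np.zeros((4, 4), dtype=np.uint8)
skeleton[0, 0:3] = 1
skeleton[1:3, 2] = 1

graph = get_networkx_graph_from_array(skeleton)
print(graph.number_of_nodes(), graph.number_of_edges())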
def load_data(dataset):
    # load the data: x, tx, allx, graph
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for i in range(len(names)):
        objects.append(pkl.load(open("data/ind.{}.{}".format(dataset, names[i]))))
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    return adj, features
def process(self): """Loads input data from data directory ind.name.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.name.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.name.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.name.x) as scipy.sparse.csr.csr_matrix object; ind.name.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.name.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.name.ally => the labels for instances in ind.name.allx as numpy.ndarray object; ind.name.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.name.test.index => the indices of test instances in graph, for the inductive setting as list object. """ root = self.raw_path objnames = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(objnames)): with open("{}/ind.{}.{}".format(root, self.name, objnames[i]), 'rb') as f: objects.append(_pickle_load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format( root, self.name)) test_idx_range = np.sort(test_idx_reorder) if self.name == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] graph = nx.DiGraph(nx.from_dict_of_lists(graph)) onehot_labels = np.vstack((ally, ty)) onehot_labels[test_idx_reorder, :] = onehot_labels[test_idx_range, :] labels = np.argmax(onehot_labels, 1) idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) train_mask = generate_mask_tensor( _sample_mask(idx_train, labels.shape[0])) val_mask = generate_mask_tensor(_sample_mask(idx_val, labels.shape[0])) test_mask = generate_mask_tensor( _sample_mask(idx_test, labels.shape[0])) self._graph = graph g = from_networkx(graph) g.ndata['train_mask'] = train_mask g.ndata['val_mask'] = val_mask g.ndata['test_mask'] = test_mask g.ndata['label'] = F.tensor(labels) g.ndata['feat'] = F.tensor(_preprocess_features(features), dtype=F.data_type_dict['float32']) self._num_classes = onehot_labels.shape[1] self._labels = labels self._g = g if self.verbose: print('Finished data loading and preprocessing.') print(' NumNodes: {}'.format(self._g.number_of_nodes())) print(' NumEdges: {}'.format(self._g.number_of_edges())) print(' NumFeats: {}'.format(self._g.ndata['feat'].shape[1])) print(' NumClasses: {}'.format(self.num_classes)) print(' NumTrainingSamples: {}'.format( F.nonzero_1d(self._g.ndata['train_mask']).shape[0])) print(' NumValidationSamples: {}'.format( F.nonzero_1d(self._g.ndata['val_mask']).shape[0])) print(' NumTestSamples: {}'.format( F.nonzero_1d(self._g.ndata['test_mask']).shape[0]))
# ...tail of hamiltonian_backend(g, v, visited, once); the opening of the function
# is not part of this fragment.
        else:
            print('sciezka')  # 'sciezka' is Polish for 'path'
            print(q)
            if once:
                return True
    else:
        visited[v] = True
        for x in g[v]:
            if not visited[x]:
                found = hamiltonian_backend(g, x, visited, once)
                if found:
                    return True
        visited[v] = False
        q.pop(-1)


def hamiltonian(g, once=False):
    v = [False] * len(g)
    return hamiltonian_backend(g, 1, v, once)


if __name__ == "__main__":
    hamiltonian(graph, once=True)
    G = nx.from_dict_of_lists(graph, create_using=nx.MultiDiGraph())
    nx.draw(G)
    plt.show()
def load_data(dataset_str, train_size, validation_size = 500, timeseta = 3, validate = False, shuffle=True): """Load data.""" if dataset_str in ['USPS-Fea', 'CIFAR-Fea', 'Cifar_10000_fea', 'Cifar_R10000_fea', 'MNIST-Fea', 'MNIST-10000', 'MNIST-5000']: data = sio.loadmat('data/{}.mat'.format(dataset_str)) l = data['labels'].flatten() labels = np.zeros([l.shape[0],np.max(data['labels'])+1]) labels[np.arange(l.shape[0]), l.astype(np.int8)] = 1 features = data['X'] sample = features[0].copy() adj = data['G'] else: names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() # features = sp.eye(features.shape[0]).tolil() # features = sp.lil_matrix(allx) labels = np.vstack((ally, ty)) # labels = np.vstack(ally) features[test_idx_reorder, :] = features[test_idx_range, :] labels[test_idx_reorder, :] = labels[test_idx_range, :] features = preprocess_features(features) global all_labels all_labels = labels.copy() # split the data set idx = np.arange(len(labels)) no_class = labels.shape[1] # number of class train_size = [train_size for i in range(labels.shape[1])] if shuffle: np.random.shuffle(idx) idx_train = [] count = [0 for i in range(no_class)] label_each_class = train_size next = 0 for i in idx: if count == label_each_class: break next += 1 for j in range(no_class): if labels[i, j] and count[j] < label_each_class[j]: idx_train.append(i) count[j] += 1 test_size = None if validate: if test_size: assert next+validation_size<len(idx) idx_val = idx[next:next+validation_size] assert next+validation_size+test_size < len(idx) idx_test = idx[-test_size:] if test_size else idx[next+validation_size:] else: if test_size: assert next+test_size<len(idx) idx_val = idx[-test_size:] if test_size else idx[next:] idx_test = idx[-test_size:] if test_size else idx[next:] print('labels of each class : ', np.sum(labels[idx_train], axis=0)) eta = np.float(adj.shape[0])/(np.float(adj.sum())/adj.shape[0])**2 t = (labels[idx_train].sum(axis=0)*timeseta*eta/labels[idx_train].sum()).astype(np.int64) features = torch.FloatTensor(np.array(features.todense())) labels = torch.LongTensor(np.argmax(labels,1)) adj = adj + sp.eye(adj.shape[0]) adj = normalize_adj(adj) adj = sparse_mx_to_torch_sparse_tensor(adj) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) return adj, features, labels, idx_train, idx_val, idx_test, t
def load_data_vis_multi(dataset_str, use_trainval, X_dense_file, train_y_file, graph_file, test_index_file): """Load data.""" names = [X_dense_file, train_y_file, graph_file] objects = [] for name in names: with open(os.path.join(dataset_str, name), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) allx, ally, graph = tuple(objects) with open(os.path.join(dataset_str, test_index_file), 'rb') as f: train_test_mask = pkl.load(f) features = allx adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.array(ally) idx_test = [] idx_train = [] idx_trainval = [] if use_trainval: for i in range(len(train_test_mask)): if train_test_mask[i] == 0: idx_train.append(i) if train_test_mask[i] == 1: idx_test.append(i) if train_test_mask[i] >= 0: idx_trainval.append(i) else: for i in range(len(train_test_mask)): if train_test_mask[i] >= 0: idx_train.append(i) if train_test_mask[i] == 1: idx_test.append(i) if train_test_mask[i] >= 0: idx_trainval.append(i) idx_val = idx_test train_mask = sample_mask_sigmoid(idx_train, labels.shape[0], labels.shape[1]) train_adj_mask = sample_mask_sigmoid(idx_train, labels.shape[0], labels.shape[0]) val_mask = sample_mask_sigmoid(idx_val, labels.shape[0], labels.shape[1]) val_adj_mask = sample_mask_sigmoid(idx_val, labels.shape[0], labels.shape[0]) trainval_mask = sample_mask_sigmoid(idx_trainval, labels.shape[0], labels.shape[1]) trainval_adj_mask = sample_mask_sigmoid(idx_trainval, labels.shape[0], labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_trainval = np.zeros(labels.shape) y_train[train_mask] = labels[train_mask] y_val[val_mask] = labels[val_mask] y_trainval[trainval_mask] = labels[trainval_mask] return adj, features, y_train, train_mask, train_adj_mask, val_mask, val_adj_mask, trainval_mask, trainval_adj_mask
def load_data(dataset="cora", modified=False, attacked=False): """ Load Citation Networks Datasets. """ path = '../LAGCN/' names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] if modified: names[-1] = 'graph_lite' if attacked: names[-1] = 'graph_attack' print(names[-1]) objects = [] for i in range(len(names)): with open(path + "data/ind.{}.{}".format(dataset.lower(), names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( path + "data/ind.{}.test.index".format(dataset)) test_idx_range = np.sort(test_idx_reorder) if dataset == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) features = normalize(features) adj = normalize(adj + sp.eye(adj.shape[0])) idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) features = torch.FloatTensor(np.array(features.todense())) labels = torch.LongTensor(labels) labels = torch.max(labels, dim=1)[1] # labels = torch.LongTensor(np.where(labels)[1]) adj = sparse_mx_to_torch_sparse_tensor(adj) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) return adj, features, labels, idx_train, idx_val, idx_test
def load_citation(dataset_str="cora", normalization="AugNormAdj", cuda=True):
    """
    Load Citation Networks Datasets.
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("../data/ind.{}.{}".format(dataset_str.lower(), names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("../data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]

    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    adj, features = preprocess_citation(adj, features, normalization)

    # porting to pytorch
    features = torch.FloatTensor(np.array(features.todense())).float()
    labels = torch.LongTensor(labels)
    labels = torch.max(labels, dim=1)[1]
    adj = sparse_mx_to_torch_sparse_tensor(adj).float()
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    if cuda:
        features = features.cuda()
        adj = adj.cuda()
        labels = labels.cuda()
        idx_train = idx_train.cuda()
        idx_val = idx_val.cuda()
        idx_test = idx_test.cuda()

    return adj, features, labels, idx_train, idx_val, idx_test
def adj_lists_to_directed_graph(adjacency_lists):
    """Turns a dict of lists of nodes to a directed graph"""
    return nx.from_dict_of_lists(adjacency_lists, create_using=nx.DiGraph())
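A minimal usage sketch for adj_lists_to_directed_graph; the adjacency dict is made up:

# Hypothetical adjacency lists; each key points to its successors.
adjacency_lists = {'a': ['b', 'c'], 'b': ['c'], 'c': []}

dg = adj_lists_to_directed_graph(adjacency_lists)
print(dg.is_directed())          # True
print(list(dg.successors('a')))  # ['b', 'c']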
def load_data(dataset_str, train_size, validation_size, model_config, shuffle=True, repeat_state=None): if train_size == 'public': return load_public_split_data(dataset_str) """Load data.""" if dataset_str in ['large_cora']: data = sio.loadmat('data/{}.mat'.format(dataset_str)) l = data['labels'].flatten() labels = np.zeros([l.shape[0], np.max(l) + 1]) labels[np.arange(l.shape[0]), l.astype(np.int8)] = 1 features = data['X'] adj = data['G'] else: names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) adj = nx.to_scipy_sparse_matrix(nx.from_dict_of_lists(graph)) # adj = sp.csr_matrix(adj) test_idx_reorder = parse_index_file( "data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), ty.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() labels = np.vstack((ally, ty)) if dataset_str.startswith('nell'): # Find relation nodes, add them as zero-vecs into the right position test_idx_range_full = range(allx.shape[0], len(graph)) isolated_node_idx = np.setdiff1d(test_idx_range_full, test_idx_reorder) tx_extended = sp.lil_matrix( (len(test_idx_range_full), tx.shape[1])) tx_extended[test_idx_range - allx.shape[0], :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), ty.shape[1])) ty_extended[test_idx_range - allx.shape[0], :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_all = np.setdiff1d(range(len(graph)), isolated_node_idx) if not os.path.isfile("data/{}.features.npz".format(dataset_str)): print( "Creating feature vectors for relations - this might take a while..." 
) features_extended = sp.hstack( (features, sp.lil_matrix( (features.shape[0], len(isolated_node_idx)))), dtype=np.int32).todense() features_extended[isolated_node_idx, features.shape[1]:] = np.eye( len(isolated_node_idx)) features = sp.csr_matrix(features_extended, dtype=np.float32) print("Done!") save_sparse_csr("data/{}.features".format(dataset_str), features) else: features = load_sparse_csr( "data/{}.features.npz".format(dataset_str)) idx_train = np.arange(x.shape[0]) idx_test = test_idx_reorder if model_config['validate']: assert x.shape[0] + validation_size < allx.shape[0] + tx.shape[ 0] idx_val = np.arange(x.shape[0], x.shape[0] + validation_size) else: idx_val = test_idx_reorder train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask features[test_idx_reorder, :] = features[test_idx_range, :] labels[test_idx_reorder, :] = labels[test_idx_range, :] features = preprocess_features(features, feature_type=model_config['feature']) # split the data set idx_train, idx_val, idx_test = split_dataset( labels, train_size, model_config['test_size'], validation_size, validate=model_config['validate'], shuffle=shuffle) if model_config['verbose']: print('labels of each class : ', np.sum(labels[idx_train], axis=0)) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] size_of_each_class = np.sum(labels[idx_train], axis=0) features = features.astype(np.float32) return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
def build_edge_index_nx(adjacency_list_dict):
    nx_graph = nx.from_dict_of_lists(adjacency_list_dict)
    adj = nx.adjacency_matrix(nx_graph)
    adj = adj.tocoo()  # convert to COO (COOrdinate sparse format)
    return np.row_stack((adj.row, adj.col))
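A usage sketch for build_edge_index_nx: it returns a (2, E) array of source/target node indices, and each undirected edge appears twice because the adjacency matrix is symmetric. The toy adjacency dict is made up:

adjacency_list_dict = {0: [1, 2], 1: [2], 2: []}
edge_index = build_edge_index_nx(adjacency_list_dict)
print(edge_index.shape)  # (2, 6): three undirected edges, stored in both directions
print(edge_index)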
# ## Generate graph from edges and node data ##

# Read edges.csv and make a network out of it
edges = defaultdict(list)
with open('datasets/BlogCatalog-dataset/data/edges.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if crop is not None:
            if int(row[0]) in nodes and int(row[1]) in nodes:
                edges[int(row[0])].append(int(row[1]))
        else:
            edges[int(row[0])].append(int(row[1]))

g = nx.from_dict_of_lists(edges, create_using=nx.Graph())
if crop is not None:
    g.add_nodes_from(nodes)

# Read group-edges.csv and add that info to each node
group_edges = defaultdict(list)
with open('datasets/BlogCatalog-dataset/data/group-edges.csv') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if crop is not None:
            if int(row[0]) in nodes:
                group_edges[int(row[0])].append(int(row[1]))
        else:
            group_edges[int(row[0])].append(int(row[1]))

for node, data in g.nodes.items():
def dict_to_adj(the_dict, directed=True):
    if directed:
        graph = nx.from_dict_of_lists(the_dict, create_using=nx.DiGraph())
    else:
        graph = nx.from_dict_of_lists(the_dict)
    return nx.adjacency_matrix(graph, nodelist=sorted(graph.nodes()))
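A usage sketch for dict_to_adj; sorting the nodelist makes the row/column order deterministic. The example dict is made up:

the_dict = {2: [0], 0: [1], 1: []}
A = dict_to_adj(the_dict, directed=True)
print(A.todense())
# Rows/columns follow sorted node order [0, 1, 2]:
# [[0 1 0]
#  [0 0 0]
#  [1 0 0]]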
def load_citation_data(dataset_str, use_feats, data_path, split_seed=None): if dataset_str[:3] == 'my_': names1 = ['adj_matrix.npz', 'attr_matrix.npz'] names2 = [ 'label_matrix.npy', 'train_mask.npy', 'val_mask.npy', 'test_mask.npy' ] objects = [] for tmp_name in names1: tmp_path = 'data/{}/{}.{}'.format(dataset_str, dataset_str, tmp_name) objects.append(sp.load_npz(tmp_path)) for tmp_name in names2: tmp_path = 'data/{}/{}.{}'.format(dataset_str, dataset_str, tmp_name) objects.append(np.load(tmp_path)) adj, features, label_matrix, train_mask, val_mask, test_mask = tuple( objects) labels = np.argmax(label_matrix, 1) arr = np.arange(len(train_mask)) idx_train = list(arr[train_mask]) idx_val = list(arr[val_mask]) idx_test = list(arr[test_mask]) return adj, features, labels, idx_train, idx_val, idx_test names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open( os.path.join(data_path, "ind.{}.{}".format(dataset_str, names[i])), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( os.path.join(data_path, "ind.{}.test.index".format(dataset_str))) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] labels = np.argmax(labels, 1) idx_test = test_idx_range.tolist() idx_train = list(range(len(y))) idx_val = range(len(y), len(y) + 500) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) if not use_feats: features = sp.eye(adj.shape[0]) return adj, features, labels, idx_train, idx_val, idx_test
def get_mlp_embeddings(**kwargs): data = kwargs.get('data') vocab = kwargs.get('vocab') clf = MLP(n_epochs=50, batch_size=10000, init_parameters=None, complete_prob=False, add_hidden=True, regul_coefs=[1e-6, 1e-6], save_results=False, hidden_layer_size=2048, drop_out=True, drop_out_coefs=[0.5, 0.5], early_stopping_max_down=5, loss_name='log', nonlinearity='rectify') metainfo, X_train, Y_train, U_train, X_dev, Y_dev, U_dev, X_test, Y_test, U_test, classLatMedian, classLonMedian, userLocation, vectorizer = data convolution = False if convolution: logging.info('loading graph...') with open('/home/arahimi/git/jointgeo/data/trans.cmu.graph', 'rb') as fin: dev_graph = pickle.load(fin) ''' dev_graph_indices = xrange(X_train.shape[0], X_train.shape[0] + X_dev.shape[0]) X_test = X_test.tolil() for i in dev_graph_indices: nbrs = dev_graph[i] dev_index = i - X_train.shape[0] count = 1 for nbr in nbrs: if nbr < X_train.shape[0]: X_test[i - X_train.shape[0], :] += X_train[nbr, :] count += 1 X_test[i - X_train.shape[0], :] /= count X_test = X_test.tocsr().astype('float32') ''' for i in range(0, X_train.shape[0] + X_dev.shape[0]): dev_graph[i].append(i) logging.info('creating adjacency matrix...') adj = nx.adjacency_matrix(nx.from_dict_of_lists(dev_graph)) adj.setdiag(1) pdb.set_trace() logging.info('normalizing adjacency matrix...') normalize(adj, axis=1, norm='l1', copy=False) adj = adj.astype('float32') logging.info('vstacking...') X = sp.sparse.vstack([X_train, X_test]) logging.info('convolution...') X_conv = adj * X X_conv = X_conv.tocsr().astype('float32') #X_train = X_conv[0:X_train.shape[0], :] X_test = X_conv[X_train.shape[0]:, :] clf.fit(X_train, Y_train, X_dev, Y_dev) print('Test classification accuracy is %f' % clf.accuracy(X_test, Y_test)) y_pred = clf.predict(X_test) geo_eval(Y_test, y_pred, U_test, classLatMedian, classLonMedian, userLocation) print('Dev classification accuracy is %f' % clf.accuracy(X_dev, Y_dev)) y_pred = clf.predict(X_dev) geo_eval(Y_dev, y_pred, U_dev, classLatMedian, classLonMedian, userLocation) X_dare = vectorizer.transform(vocab) X_dare = X_dare.astype('float32') mlp_embeddings = clf.get_embedding(X_dare) return vocab, mlp_embeddings
def load_data(dataset_str): """ Loads input data from gcn/data directory ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. All objects above must be saved using python pickle module. :param dataset_str: Dataset name :return: All data input files loaded (as well the training/test data). """ names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("../dataset/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "../dataset/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] # preprocess feature features = preprocess_features(features) features = torch.tensor(features, dtype=torch.float32) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) # preprocess adj adj = sparse_mx_to_torch_sparse_tensor(adj).to_dense() # adj = torch_normalize_adj(adj) # adj2 = preprocess_adj(adj) # adj2 = sparse_mx_to_torch_sparse_tensor(adj2).to_dense() labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] _, l_num = labels.shape labels = torch.tensor((labels * range(l_num)).sum(axis=1), dtype=torch.int64) idx_test = test_idx_range.tolist() idx_train = list(range(len(y))) idx_val = list(range(len(y), len(y) + 500)) return adj, features, labels, idx_train, idx_val, idx_test
def load_data(dataset_name='cora', normalize_features=True): """ Loads a citation dataset using the public splits as defined in [Kipf & Welling (2016)](https://arxiv.org/abs/1609.02907). :param dataset_name: name of the dataset to load ('cora', 'citeseer', or 'pubmed'); :param normalize_features: if True, the node features are normalized; :return: the citation network in numpy format, with train, test, and validation splits for the targets and masks. """ if dataset_name not in AVAILABLE_DATASETS: raise ValueError('Available datasets: {}'.format(AVAILABLE_DATASETS)) if not os.path.exists(DATA_PATH + dataset_name): download_data(dataset_name) print('Loading {} dataset'.format(dataset_name)) names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] data_path = os.path.join(DATA_PATH, dataset_name) for n in names: filename = "{}/ind.{}.{}".format(data_path, dataset_name, n) objects.append(load_binary(filename)) x, y, tx, ty, allx, ally, graph = tuple(objects) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) test_idx_reorder = _parse_index_file("{}/ind.{}.test.index".format( data_path, dataset_name)) test_idx_range = np.sort(test_idx_reorder) if dataset_name == 'citeseer': test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) train_mask = _sample_mask(idx_train, labels.shape[0]) val_mask = _sample_mask(idx_val, labels.shape[0]) test_mask = _sample_mask(idx_test, labels.shape[0]) # Row-normalize the features if normalize_features: print('Pre-processing node features') features = preprocess_features(features) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask, labels
def load_graph_data(training_config, device): dataset_name = training_config['dataset_name'].lower() layer_type = training_config['layer_type'] should_visualize = training_config['should_visualize'] if dataset_name == DatasetType.CORA.name.lower(): # shape = (N, FIN), where N is the number of nodes and FIN is the number of input features node_features_csr = pickle_read( os.path.join(CORA_PATH, 'node_features.csr')) # shape = (N, 1) node_labels_npy = pickle_read( os.path.join(CORA_PATH, 'node_labels.npy')) # shape = (N, number of neighboring nodes) <- this is a dictionary not a matrix! adjacency_list_dict = pickle_read( os.path.join(CORA_PATH, 'adjacency_list.dict')) # Normalize the features node_features_csr = normalize_features_sparse(node_features_csr) num_of_nodes = len(node_labels_npy) if layer_type == LayerType.IMP3: # Build edge index explicitly (faster than nx ~100 times and as fast as PyGeometric imp but less complex) # shape = (2, E), where E is the number of edges, and 2 for source and target nodes. Basically edge index # contains tuples of the format S->T, e.g. 0->3 means that node with id 0 points to a node with id 3. topology = build_edge_index(adjacency_list_dict, num_of_nodes, add_self_edges=True) elif layer_type == LayerType.IMP2 or layer_type == LayerType.IMP1: # adjacency matrix shape = (N, N) topology = nx.adjacency_matrix( nx.from_dict_of_lists(adjacency_list_dict)).todense().astype( np.float) topology += np.identity(topology.shape[0]) # add self connections topology[topology > 0] = 1 # multiple edges not allowed topology[ topology == 0] = -np.inf # make it a mask instead of adjacency matrix (used to mask softmax) topology[topology == 1] = 0 else: raise Exception(f'Layer type {layer_type} not yet supported.') # Note: topology is just a fancy way of naming the graph structure data # (be it in the edge index format or adjacency matrix) if should_visualize: # network analysis and graph drawing plot_in_out_degree_distributions(topology, num_of_nodes, dataset_name) visualize_graph(topology, node_labels_npy, dataset_name) # Convert to dense PyTorch tensors # Needs to be long int type (in implementation 3) because later functions like PyTorch's index_select expect it topology = torch.tensor( topology, dtype=torch.long if layer_type == LayerType.IMP3 else torch.float, device=device) node_labels = torch.tensor( node_labels_npy, dtype=torch.long, device=device) # Cross entropy expects a long int node_features = torch.tensor(node_features_csr.todense(), device=device) # Indices that help us extract nodes that belong to the train/val and test splits train_indices = torch.arange(CORA_TRAIN_RANGE[0], CORA_TRAIN_RANGE[1], dtype=torch.long, device=device) val_indices = torch.arange(CORA_VAL_RANGE[0], CORA_VAL_RANGE[1], dtype=torch.long, device=device) test_indices = torch.arange(CORA_TEST_RANGE[0], CORA_TEST_RANGE[1], dtype=torch.long, device=device) return node_features, node_labels, topology, train_indices, val_indices, test_indices elif dataset_name == DatasetType.PPI.name.lower(): # Instead of checking it in, I'd rather download it on-the-fly the first time it's needed (lazy execution ^^) if not os.path.exists(PPI_PATH): os.makedirs(PPI_PATH) # Step 1: Download the ppi.zip (contains the PPI dataset) zip_tmp_path = os.path.join(PPI_PATH, 'ppi.zip') download_url_to_file(PPI_URL, zip_tmp_path) # Step 2: Unzip it with zipfile.ZipFile(zip_tmp_path) as zf: zf.extractall(path=PPI_PATH) print(f'Unzipping to: {PPI_PATH} finished.') # Step3: Remove the temporary resource file 
os.remove(zip_tmp_path) print(f'Removing tmp file {zip_tmp_path}.') # todo: load PPI raise Exception(f'{dataset_name} not yet supported.') else: raise Exception(f'{dataset_name} not yet supported.')
def load_citation(dataset_str="cora", normalization="AugNormAdj", cuda=True): """ Load Citation Networks Datasets. """ names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str.lower(), names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] if dataset_str == 'citeseer': idx_test = torch.LongTensor(test_idx_range.tolist()) #[1708, 2707] idx_train = torch.LongTensor(range(len(y))) #[0,140) idx_val = torch.LongTensor(range(len(y), len(y) + 500)) #[140,640) else: ### setting for cora ### take from https://github.com/tkipf/pygcn/blob/master/pygcn/utils.py idx_train = range(140) idx_val = range(200, 500) idx_test = range(500, 1500) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) labels = torch.LongTensor(labels) labels = torch.max(labels, dim=1)[1] features = normalize(features) A_tilde = normalize_adjacency_matrix(adj, sp.eye(adj.shape[0])) adj_p = normalizemx(adj) features = torch.FloatTensor(np.array(features.todense())) print('Loading') adj_sct1 = scattering1st(adj_p, 1) print('SCT 1 done') print('Loading') adj_sct2 = scattering1st(adj_p, 2) print('SCT 2 done') adj_sct4 = scattering1st(adj_p, 4) print('SCT 4 done') adj_p = sparse_mx_to_torch_sparse_tensor(adj_p) A_tilde = sparse_mx_to_torch_sparse_tensor(A_tilde) adj = sparse_mx_to_torch_sparse_tensor(adj) return adj, adj_p, A_tilde, adj_sct1, adj_sct2, adj_sct4, features, labels, idx_train, idx_val, idx_test
def load_citation_data(cfg): """ (DCMMC) Github repo of - planetoid (Zhilin Yang, William W. - Cohen, Ruslan Salakhutdinov, Revisiting Semi-Supervised Learning with Graph Embeddings, ICML 2016) provided a preprocessed Cora dataset and a fixed splitting Copied from gcn citeseer/cora/pubmed with gcn split Loads input data from gcn/data directory ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. All objects above must be saved using python pickle module. :param dataset_str: Dataset name :return: All data input files loaded (as well the training/test data). """ names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open( "{}/ind.{}.{}".format(cfg['citation_root'], cfg['activate_dataset'], names[i]), 'rb') as f: objects.append(pkl.load(f, encoding='latin1')) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format( cfg['citation_root'], cfg['activate_dataset'])) test_idx_range = np.sort(test_idx_reorder) if cfg['activate_dataset'] == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] features = preprocess_features(features) features = features.todense() G = nx.from_dict_of_lists(graph) edge_list = G.adjacency_list() degree = [0] * len(edge_list) if cfg['add_self_loop']: for i in range(len(edge_list)): edge_list[i].append(i) degree[i] = len(edge_list[i]) max_deg = max(degree) mean_deg = sum(degree) / len(degree) print(f'max degree: {max_deg}, mean degree:{mean_deg}') labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] # one-hot labels n_sample = labels.shape[0] n_category = labels.shape[1] lbls = np.zeros((n_sample, )) if cfg['activate_dataset'] == 'citeseer': n_category += 1 # one-hot labels all zero: new category for i in range(n_sample): try: lbls[i] = np.where(labels[i] == 1)[0] # numerical labels except ValueError: # labels[i] all zeros lbls[i] = n_category + 1 # new category else: for i in range(n_sample): lbls[i] = np.where(labels[i] == 1)[0] # numerical labels idx_test = test_idx_range.tolist() idx_train = list(range(len(y))) idx_val = list(range(len(y), len(y) + 500)) return 
features, lbls, idx_train, idx_val, idx_test, n_category, edge_list, edge_list
def load_data_vis_multi(dataset_str, use_trainval, feat_suffix, label_suffix='ally_multi'): """Load data.""" names = [feat_suffix, label_suffix, 'graph'] objects = [] for i in range(len(names)): with open("{}/ind.NELL.{}".format(dataset_str, names[i]), 'rb') as f: print("{}/ind.NELL.{}".format(dataset_str, names[i])) if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) allx, ally, graph = tuple(objects) train_test_mask = [] with open("{}/ind.NELL.index".format(dataset_str), 'rb') as f: train_test_mask = pkl.load(f) features = allx # .tolil() adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.array(ally) idx_test = [] idx_train = [] idx_trainval = [] if use_trainval == True: for i in range(len(train_test_mask)): if train_test_mask[i] == 0: idx_train.append(i) if train_test_mask[i] == 1: idx_test.append(i) if train_test_mask[i] >= 0: idx_trainval.append(i) else: for i in range(len(train_test_mask)): if train_test_mask[i] >= 0: idx_train.append(i) if train_test_mask[i] == 1: idx_test.append(i) if train_test_mask[i] >= 0: idx_trainval.append(i) idx_val = idx_test train_mask = sample_mask_sigmoid(idx_train, labels.shape[0], labels.shape[1]) val_mask = sample_mask_sigmoid(idx_val, labels.shape[0], labels.shape[1]) trainval_mask = sample_mask_sigmoid(idx_trainval, labels.shape[0], labels.shape[1]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_trainval = np.zeros(labels.shape) y_train[train_mask] = labels[train_mask] y_val[val_mask] = labels[val_mask] y_trainval[trainval_mask] = labels[trainval_mask] return adj, features, y_train, y_val, y_trainval, train_mask, val_mask, trainval_mask
def prepare_data(dataset_dir, dataset_name):
    # data load #
    # To read data saved under Python 2 from Python 3, the pickle encoding has to be given explicitly:
    # x = pickle.load(open('Data/citeseer/ind.citeseer.x', 'rb'), encoding='iso-8859-1')  # (120, 3703)
    # y = pickle.load(open('Data/citeseer/ind.citeseer.y', 'rb'), encoding='iso-8859-1')  # (120, 6), one-hot
    print("Loading raw data from files...")
    # dataset_dir = './Data'
    # dataset_name = 'citeseer'
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open(dataset_dir + "/ind.{}.{}".format(dataset_name, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pickle.load(f, encoding='latin1'))
            else:
                objects.append(pickle.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    print("Done.")

    print("Processing Data...")
    index = []
    for line in open(dataset_dir + "/ind.{0}.test.index".format(dataset_name)):
        index.append(int(line.strip()))
    test_idx_reorder = index
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_name == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)  # [2312, 3326]
        tx_extended = sparse.lil_matrix((len(test_idx_range_full), x.shape[1]))  # shape=(1015, 3703)
        # Indices with no data keep all-zero feature rows. Note that the assignment below uses
        # test_idx_range, so at this point the rows inside tx are still not sorted by index.
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))  # shape=(1015, 6)
        ty_extended[test_idx_range - min(test_idx_range), :] = ty  # indices with no data keep all-zero labels
        ty = ty_extended
        # After this step, rows of tx and ty whose row number matches an index hold the corresponding values.

    features = sparse.vstack([allx, tx]).toarray()  # (3327, 3703)
    # This guarantees that, for every test node, the row number in `features`
    # matches the index its values belong to.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # Row-normalize the features.
    features = row_normalize_safe(features)

    labels = np.vstack((ally, ty))  # (N, n_classes), (3327, 6)
    labels[test_idx_reorder, :] = labels[test_idx_range, :]  # likewise, align row numbers with indices

    # The blank isolated nodes do not take part in evaluation: their indices are not in this list.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))  # range(120)
    idx_val = range(len(y), len(y) + 500)  # range(120, 620)
    # In the citeseer dataset, nodes [0, 119] are in x, nodes [0, 2311] (2312 in total) are in allx,
    # and nodes [2312, 3326] (1015 in total) are in tx.
    # It is not clear why the training set has 120 nodes while only 500 are set aside for validation.

    # Graph structure.
    # Out of convenience this uses an nx helper, at the cost of importing an extra package
    # for a single function; one could instead write a routine that builds the adjacency
    # matrix from the dict directly.
    adj_matrix = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    hatA = torch.FloatTensor(get_hatA(adj_matrix))  # (3327, 3327)
    print('Data processed to numpy.')

    # Everything so far is numpy; now adapt it to the shapes the torch model expects.
    n_classes = labels.shape[1]  # (N, cls)
    labels = torch.LongTensor(labels).argmax(dim=1)  # (N,)
    features = torch.FloatTensor(features)  # (N, D)
    print('Data adjusted to torch.')
    return hatA, features, labels, n_classes, idx_train, idx_val, idx_test
def load_data(dataset_str): """ Loads input data from gcn/data directory ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object; ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object; ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object. All objects above must be saved using python pickle module. :param dataset_str: Dataset name :return: All data input files loaded (as well the training/test data). """ names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) print(x.shape, y.shape, tx.shape, ty.shape, allx.shape, ally.shape) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] # print(len(labels)) idx_test = test_idx_range.tolist() # print(idx_test) idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
def train(args): env = make_env(args.domain, args.instance) num_action_vars = env.num_action_vars # neural net parameters num_valid_actions = num_action_vars + 2 state_dim = env.num_state_vars # nn hidden layer parameters num_gcn_features = args.num_features num_hidden_transition = int((2 * state_dim + num_action_vars) / 2) global_step = tf.Variable(0, name="global_step", trainable=False) instance_parser = InstanceParser(args.domain, args.instance) fluent_feature_dims = instance_parser.fluent_feature_dims nonfluent_feature_dims = instance_parser.nonfluent_feature_dims # Build network model = TransitionModel(num_inputs=state_dim, num_outputs=num_valid_actions, num_features=num_gcn_features, num_hidden_transition=num_hidden_transition, fluent_feature_dims=fluent_feature_dims, nonfluent_feature_dims=nonfluent_feature_dims, to_train="decoder", activation=args.activation, learning_rate=args.lr) # Loader current_sa_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='current_state_encoder') next_sa_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='next_state_encoder') transition_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='transition') loader = tf.train.Saver({ 'global/policy_net/gconv1_vars/weights_0': current_sa_vars[0], 'global/policy_net/gconv1_vars/weights_0': next_sa_vars[0], 'global/policy_net/transition_hidden1/weights': transition_vars[0], 'global/policy_net/transition_hidden1/biases': transition_vars[1], 'global/policy_net/transition_hidden2/weights': transition_vars[2], 'global/policy_net/transition_hidden2/biases': transition_vars[3], }) restore_dir = args.restore_dir config = tf.ConfigProto() # config.gpu_options.allow_growth = True # config.gpu_options.per_process_gpu_memory_fraction = 0.9 adjacency_list = instance_parser.get_adjacency_list() adjacency_list = nx.adjacency_matrix(nx.from_dict_of_lists(adjacency_list)) MODEL_DIR = os.path.join( args.model_dir, '{}-{}-{}'.format(args.domain, args.instance, args.num_features)) summary_writer = tf.summary.FileWriter(os.path.join(MODEL_DIR, "train")) summaries_freq = 100 CHECKPOINT_DIR = os.path.join(MODEL_DIR, "checkpoints") if not os.path.exists(CHECKPOINT_DIR): os.makedirs(CHECKPOINT_DIR) checkpoint_path = os.path.join(CHECKPOINT_DIR, 'model') saver = tf.train.Saver(max_to_keep=10) checkpoint_freq = 5000 with tf.Session(config=config) as sess: load_model(sess, loader, restore_dir) # Training for counter in xrange(args.num_train_iter): # Generate state tuples state_tuples = generate_data_from_env(env, args.domain) # Compute transition probabilities states = [] next_states = [] action_probs = [] for st in state_tuples: state = np.array(st[0]) next_state = np.array(st[1]) action_prob = instance_parser.get_action_probs( state, next_state) states.append(state) next_states.append(next_state) action_probs.append(np.array(action_prob)) batch_size = len(states) # adj_preprocessed = get_processed_adj(adjacency_list, batch_size) # current_input_features_preprocessed = get_processed_input( # states, env.num_state_vars) # next_input_features_preprocessed = get_processed_input( # next_states, env.num_state_vars) adj_preprocessed = get_processed_adj(adjacency_list, batch_size) current_input_features_preprocessed = get_processed_input( states, instance_parser) next_input_features_preprocessed = get_processed_input( next_states, instance_parser) # Backprop feed_dict = { model.current_state: states, model.current_inputs: current_input_features_preprocessed, model.next_inputs: next_input_features_preprocessed, 
model.placeholders_hidden1['support'][0]: adj_preprocessed, model.placeholders_hidden1['dropout']: 0.0, model.placeholders_hidden1['num_features_nonzero']: current_input_features_preprocessed[1].shape, model.placeholders_hidden2['support'][0]: adj_preprocessed, model.placeholders_hidden2['dropout']: 0.0, model.placeholders_hidden2['num_features_nonzero']: next_input_features_preprocessed[1].shape, model.action_probs: action_probs } step, loss, _, summaries = sess.run( [global_step, model.loss, model.train_op, model.summaries], feed_dict) # Write summaries if counter % summaries_freq == 0: summary_writer.add_summary(summaries, step) summary_writer.flush() # Store checkpoints if counter % checkpoint_freq == 0: saver.save(sess, checkpoint_path, step)
def maze_graph(nodelist): flower_graph = { 1: [2, 7], 2: [1, 3], 3: [2, 4, 9], 4: [3, 5], 5: [4, 11], 6: [7, 13], 7: [6, 1, 8], 8: [7, 9, 15], 9: [3, 8, 10], 10: [9, 11, 17], 11: [5, 10, 12], 12: [11, 19], 13: [6, 14], 14: [13, 15, 20], 15: [8, 14, 16], 16: [15, 17, 22], 17: [10, 16, 18], 18: [17, 19, 24], 19: [12, 18], 20: [14, 21], 21: [20, 22], 22: [16, 21, 23], 23: [22, 24], 24: [18, 23] } island_prefixes = ['1', '2', '3', '4'] bridge_edges = [('124', '201', 60), ('302', '121', 172), ('223', '404', 169), ('324', '401', 60), ('305', '220', 60)] bridge_edges_uw = [('124', '201'), ('121', '302'), ('223', '404'), ('324', '401'), ('305', '220')] graph_prototype = {} for letter in island_prefixes: for node_suffix, edges in flower_graph.items(): if node_suffix < 10: first_point = letter + '{}{}'.format(0, str(node_suffix)) else: first_point = letter + '{}'.format(node_suffix) edge_list = [] for n in edges: if n < 10: second_point = letter + '{}{}'.format(0, str(n)) else: second_point = letter + '{}'.format(n) edge_list.append(second_point) graph_prototype[first_point] = edge_list mg = nx.Graph() xg = nx.Graph() mg = nx.from_dict_of_lists(graph_prototype) xg = nx.from_dict_of_lists(graph_prototype) for e in mg.edges(): mg[e[0]][e[1]]['weight'] = 30 mg.add_weighted_edges_from(bridge_edges) xg.add_edges_from(bridge_edges_uw) simple_path = dict(nx.all_pairs_shortest_path(xg)) dijkstra_path = dict(nx.all_pairs_dijkstra_path(mg, weight='weight')) return mg, simple_path, dijkstra_path
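# Example use of maze_graph (a sketch): node labels such as '101' and '424'
# follow the island-prefix plus zero-padded-suffix naming built above, and the
# nodelist argument is unused by the function, so an empty list is passed.
mg, simple_path, dijkstra_path = maze_graph([])
print(simple_path['101']['424'])    # fewest-hop route across the bridges
print(dijkstra_path['101']['424'])  # route minimizing total edge weight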
def load_nell(dataset="nell.0.001", normalization="AugNormAdj", porting_to_torch=True, data_path=datadir, task_type="full"): names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/nell_data/ind.{}.{}".format(dataset, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/nell_data/ind.{}.test.index".format(dataset)) test_idx_range = np.sort(test_idx_reorder) if dataset == 'nell.0.001': # Find relation nodes, add them as zero-vecs into the right position test_idx_range_full = range(allx.shape[0], len(graph)) isolated_node_idx = np.setdiff1d(test_idx_range_full, test_idx_reorder) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - allx.shape[0], :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - allx.shape[0], :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] idx_all = np.setdiff1d(range(len(graph)), isolated_node_idx) if not os.path.isfile("data/{}.features.npz".format(dataset)): print( "Creating feature vectors for relations - this might take a while..." ) features_extended = sp.hstack( (features, sp.lil_matrix((features.shape[0], len(isolated_node_idx)))), dtype=np.int32).todense() features_extended[isolated_node_idx, features.shape[1]:] = np.eye( len(isolated_node_idx)) features = sp.csr_matrix(features_extended) print("Done!") save_sparse_csr("data/{}.features".format(dataset), features) else: features = load_sparse_csr("data/{}.features.npz".format(dataset)) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) # degree = np.asarray(G.degree) degree = np.sum(adj, axis=1) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] if task_type == "full": print("Load full supervised task.") #supervised setting idx_test = test_idx_range.tolist() idx_train = range(len(ally) - 500) idx_val = range(len(ally) - 500, len(ally)) elif task_type == "semi": print("Load semi-supervised task.") #semi-supervised setting idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) else: raise ValueError( "Task type: %s is not supported. Available option: full and semi.") features = features.astype(float) adj, features = preprocess_citation(adj, features, normalization) # features = np.array(features.todense()) labels = np.argmax(labels, axis=1) # porting to pytorch if porting_to_torch: features = torch.FloatTensor(features).float() labels = torch.LongTensor(labels) # labels = torch.max(labels, dim=1)[1] adj = sparse_mx_to_torch_sparse_tensor(adj).float() idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) degree = torch.LongTensor(degree) learning_type = "transductive" return adj, features, labels, idx_train, idx_val, idx_test, degree, learning_type
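# load_nell above calls sparse_mx_to_torch_sparse_tensor, which is not shown in
# this file. A common implementation looks like the sketch below (a scipy COO
# matrix converted to a torch sparse tensor); the original helper may differ.
def sparse_mx_to_torch_sparse_tensor_sketch(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse_coo_tensor(indices, values, shape)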
# get D1 Friends, the Rest will be iterable
# Get the top five reciprocal friends and put them into the list of tuples
list_of_tuples = []
list_of_ids = []
final_graph = {}
client = pymongo.MongoClient()
db = client.final
for val in db.collection_names():
    final_graph[int(val)] = load_from_mongo('final', str(val))[0]['reciprocal_friends']
final_graph[24551258] = intersection

# Start creating the graph piece by piece.
G = nx.from_dict_of_lists(final_graph)
pos = nx.spring_layout(G)  # positions for all nodes
nx.draw_networkx_nodes(G, pos, node_size=3)
nx.draw_networkx_edges(G, pos, width=1)
nx.draw_networkx_labels(G, pos, font_size=2, font_family='sans-serif')
plt.show()
print("The number of nodes in the network is " + str(G.number_of_nodes()))
print("The diameter of the network is: " + str(nx.diameter(G)))
print("The average distance of the network is " + str(nx.average_shortest_path_length(G)))
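# nx.diameter and nx.average_shortest_path_length above raise a NetworkXError
# when the friendship graph is disconnected. A defensive variant (a sketch, not
# part of the original script) restricts both metrics to the largest component:
if not nx.is_connected(G):
    largest_cc = max(nx.connected_components(G), key=len)
    G_cc = G.subgraph(largest_cc)
else:
    G_cc = G
print("Diameter of the largest component: " + str(nx.diameter(G_cc)))
print("Average distance of the largest component: " + str(nx.average_shortest_path_length(G_cc)))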
from pprint import pprint import networkx as nx import matplotlib.pyplot as plt a, b, c, d, e, f, g, h = range(8) N = { a: [b, c, d, e, f], b: [c, e], c: [d], d: [e], e: [f], f: [c, g, h], g: [f, h], h: [f, g] } G = nx.from_dict_of_lists(N) nx.draw(G) plt.show() print(type(N)) pprint(N)
def load_data(dataset_str, is_sparse): if dataset_str == "ppi": return load_graphsage_data('data/ppi/ppi', is_sparse) """Load data.""" if dataset_str != 'nell': names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] features = preprocess_features(features, is_sparse) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) support = preprocess_adj(adj) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) # y_train = np.zeros(labels.shape) # y_val = np.zeros(labels.shape) # y_test = np.zeros(labels.shape) # y_train = labels[train_mask, :] # y_val[val_mask, :] = labels[val_mask, :] # y_test[test_mask, :] = labels[test_mask, :] else: names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/savedData/{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( "data/savedData/{}.test.index".format(dataset_str)) features = allx.tolil() adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = ally features = preprocess_features(features, is_sparse) support = preprocess_adj(adj) idx_test = test_idx_reorder idx_train = range(len(y)) idx_val = range(len(y), len(y) + 969) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) # if not os.path.isfile("data/{}.nbrs.npz".format(dataset_str)): # N = adj.shape[0] # pool = multiprocessing.Pool(processes=56) # # lis = [] # for i in range(32): # li = range(int(N/32)*i, max(int(N/32)*(i+1), N)) # lis.append(li) # adjs = [adj] * 32 # results = pool.map(starfind_4o_nbrs, zip(adjs, lis)) # # pool.close() # pool.join() # nbrs = results[0] # # cnt = 0 # # for i in range(32): # # # # cnt += len(results[i]) # # print(cnt) # # nbrs += results[i] # # np.savez("data/{}.nbrs.npz".format(dataset_str), data = nbrs) # else: # loader = np.load("data/{}.nbrs.npz".format(dataset_str)) # nbrs = loader['data'] print(adj.shape) return None, support, support, features, labels, train_mask, val_mask, test_mask
def load_data(dataset_str): """Load data.""" names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) print("graph is....") print(type(graph)) print("allx is....") print(type(allx)) # test_idx_reorder = parse_index_file( "data/ind.{}.test.index".format(dataset_str)) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended if dataset_str == 'nell.0.001': # Find relation nodes, add them as zero-vecs into the right position test_idx_range_full = range(allx.shape[0], len(graph)) isolated_node_idx = np.setdiff1d(test_idx_range_full, test_idx_reorder) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - allx.shape[0], :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - allx.shape[0], :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil().toarray() print(features) features[test_idx_reorder, :] = features[test_idx_range, :] idx_all = np.setdiff1d(range(len(graph)), isolated_node_idx) if not os.path.isfile( "data/planetoid/{}.features.npz".format(dataset_str)): print( "Creating feature vectors for relations - this might take a while..." ) features_extended = sp.hstack( (features, sp.lil_matrix((features.shape[0], len(isolated_node_idx)))), dtype=np.int32).todense() features_extended[isolated_node_idx, features.shape[1]:] = np.eye( len(isolated_node_idx)) features = sp.csr_matrix(features_extended) print("Done!") save_sparse_csr("data/planetoid/{}.features".format(dataset_str), features) else: features = load_sparse_csr( "data/planetoid/{}.features.npz".format(dataset_str)) adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph)) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) train_mask = sample_mask(idx_train, labels.shape[0]) val_mask = sample_mask(idx_val, labels.shape[0]) test_mask = sample_mask(idx_test, labels.shape[0]) y_train = np.zeros(labels.shape) y_val = np.zeros(labels.shape) y_test = np.zeros(labels.shape) y_train[train_mask, :] = labels[train_mask, :] y_val[val_mask, :] = labels[val_mask, :] y_test[test_mask, :] = labels[test_mask, :] np.savetxt('labels', labels) return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
for i, city in enumerate(msa): print "Compute the number of neighbourhoods for %s (%s/%s)"%(msa[city], i+1, len(msa)) ## Import adjacency matrix adjacency = {} with open('extr/adjacency_bg/msa/%s.csv'%city, 'r') as source: reader = csv.reader(source, delimiter='\t') reader.next() for rows in reader: adjacency[rows[0]] = rows[1:] ## Transform into graph G = nx.from_dict_of_lists(adjacency) ## Import list of bg where each class is overrepresented over_bg = {cl:[] for cl in classes} with open('extr/neighbourhoods/classes/msa/%s.csv'%city, 'r') as source: reader = csv.reader(source, delimiter='\t') for rows in reader: over_bg[rows[0]].append(rows[1]) ## Extract neighbourhoods (the connected components of the subgraph ## constituted of the areal units where the class is overrepresented) neighbourhoods = {cl: nx.connected_component_subgraphs(G.subgraph(over_bg[cl])) for cl in classes} neigh_num[city] = {cl: len(list(neighbourhoods[cl])) for cl in classes}
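# Note: nx.connected_component_subgraphs, used above, was removed in networkx 2.4.
# On newer versions the same neighbourhood extraction can be written with
# nx.connected_components directly; a self-contained sketch of the equivalent step:
def extract_neighbourhoods(G, over_bg, classes):
    # connected components of the subgraph induced by the over-represented units
    return {cl: [G.subgraph(c) for c in
                 nx.connected_components(G.subgraph(over_bg[cl]))]
            for cl in classes}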
def graph_to_json(): json_file = {} position_file = 'https://s3-us-west-2.amazonaws.com/pollstr/visuals/dataBrexit.txt' open_s3 = urllib.URLopener() position = eval(open_s3.open(position_file).read()) neighborhood_file = 'https://s3-us-west-2.amazonaws.com/pollstr/visuals/net4.txt' open_s3 = urllib.URLopener() neighborhood_dict = eval(open_s3.open(neighborhood_file).read()) Graph = nx.from_dict_of_lists(neighborhood_dict) nodes = Graph.nodes() list_of_nodes = [] id_of_nodes = {} i = 0 for node in nodes: id_of_nodes[node] = i i += 1 node_info_dict = {} for node in nodes: node_info = {} node_info['name'] = str(node) try: if position[node]['position'] == 'leave': node_info['color'] = 'blue' node_info['followers'] = position[node]['followers'] node_info['logFollowers'] = position[node]['log'] elif position[node]['position'] == 'remain': node_info['color'] = 'yellow' node_info['followers'] = position[node]['followers'] node_info['logFollowers'] = position[node]['log'] else: node_info['color'] = '#e7e7e7' node_info['followers'] = position[node]['followers'] node_info['logFollowers'] = position[node]['log'] except: node_info['color'] = '#e7e7e7' node_info['followers'] = 'DK' node_info['logFollowers'] = 3 node_info_dict[str(node)] = node_info list_of_nodes.append(node_info) edges = Graph.edges() list_of_edges = [] for node in nodes: neighbors = Graph.neighbors(node) for neighbor in neighbors: edge_info = {} edge_info['source'] = id_of_nodes[node] edge_info['target'] = id_of_nodes[neighbor] edge_info['value'] = 1 try: edge_info['color'] = node_info_dict[node]['color'] except: edge_info['color'] = '#e7e7e7' list_of_edges.append(edge_info) json_file['nodes'] = list_of_nodes json_file['links'] = list_of_edges json_file = json.dumps(json_file) return json_file
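# Example use of graph_to_json (a sketch): write the node-link JSON produced
# above to a local file for a d3-style force layout. The output path here is
# illustrative, not taken from the original script.
if __name__ == '__main__':
    with open('brexit_graph.json', 'w') as out:
        out.write(graph_to_json())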
import sys
import json
import heapq as heap
from operator import itemgetter

import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

import sim
import betweenness_centrality

# Load data from file given by command line argument
filename = sys.argv[1]
N = int(filename.split('.')[-3])
f = open(filename)
graph_data = json.load(f)
f.close()
G = nx.from_dict_of_lists(graph_data)

def save_graph(graph, save_name):
    '''
    Saves networkx graph "graph" as pdf named "save_name"
    Source: http://stackoverflow.com/a/17388676
    '''
    # initialize Figure
    plt.figure(num=None, figsize=(20, 20), dpi=80)
    plt.axis('off')
    fig = plt.figure(1)
    pos = nx.spring_layout(graph)
    nx.draw_networkx_nodes(graph, pos)
    nx.draw_networkx_edges(graph, pos)
    nx.draw_networkx_labels(graph, pos)
def construct_tree_from_graph(adjacency_list, density, prune_threshold=None, num_levels=None, verbose=False): """ Construct a level set tree from a similarity graph and a density estimate. Parameters ---------- adjacency_list : list [list] Adjacency list of the k-nearest neighbors graph on the data. Each entry contains the indices of the `k` closest neighbors to the data point at the same row index. density : list [float] Estimate of the density function, evaluated at the data points represented by the keys in `adjacency_list`. prune_threshold : int, optional Leaf nodes with fewer than this number of members are recursively merged into larger nodes. If 'None' (the default), then no pruning is performed. num_levels : list int, optional Number of density levels in the constructed tree. If None (default), `num_levels` is internally set to be the number of rows in `X`. verbose : bool, optional If True, a progress indicator is printed at every 100th level of tree construction. Returns ------- T : levelSetTree See the LevelSetTree class for attributes and method definitions. See Also -------- construct_tree, LevelSetTree Examples -------- >>> X = numpy.random.rand(100, 2) >>> knn_graph, radii = debacl.utils.knn_graph(X, k=8) >>> density = debacl.utils.knn_density(radii, n=100, p=2, k=8) >>> tree = debacl.construct_tree_from_graph(knn_graph, density, ... prune_threshold=5) >>> print tree +----+-------------+-----------+------------+----------+------+--------+----------+ | id | start_level | end_level | start_mass | end_mass | size | parent | children | +----+-------------+-----------+------------+----------+------+--------+----------+ | 0 | 0.000 | 0.768 | 0.000 | 0.390 | 100 | None | [1, 2] | | 1 | 0.768 | 1.494 | 0.390 | 0.790 | 30 | 0 | [7, 8] | | 2 | 0.768 | 4.812 | 0.390 | 1.000 | 31 | 0 | [] | | 7 | 1.494 | 2.375 | 0.790 | 0.950 | 6 | 1 | [] | | 8 | 1.494 | 2.308 | 0.790 | 0.940 | 5 | 1 | [] | +----+-------------+-----------+------------+----------+------+--------+----------+ """ ## Initialize the graph and cluster tree levels = _utl.define_density_mass_grid(density, num_levels=num_levels) G = _nx.from_dict_of_lists( {i: neighbors for i, neighbors in enumerate(adjacency_list)}) T = LevelSetTree(density, levels) ## Figure out roots of the tree cc0 = _nx.connected_components(G) for i, c in enumerate(cc0): # c is only the vertex list, not the subgraph T._subgraphs[i] = G.subgraph(c) T.nodes[i] = ConnectedComponent( i, parent=None, children=[], start_level=0., end_level=None, start_mass=0., end_mass=None, members=c) # Loop through the removal grid previous_level = 0. n = float(len(adjacency_list)) for i, level in enumerate(levels): if verbose and i % 100 == 0: _logging.info("iteration {}".format(i)) ## figure out which points to remove, i.e. the background set. bg = _np.where((density > previous_level) & (density <= level))[0] previous_level = level ## compute the mass after the current bg set is removed old_vcount = sum([x.number_of_nodes() for x in T._subgraphs.itervalues()]) current_mass = 1. - ((old_vcount - len(bg)) / n) # loop through active components, i.e. 
subgraphs deactivate_keys = [] # subgraphs to deactivate at the iter end activate_subgraphs = {} # new subgraphs to add at the end of the iter for (k, H) in T._subgraphs.iteritems(): ## remove nodes at the current level H.remove_nodes_from(bg) ## check if subgraph has vanished if H.number_of_nodes() == 0: T.nodes[k].end_level = level T.nodes[k].end_mass = current_mass deactivate_keys.append(k) else: # subgraph hasn't vanished ## check if subgraph now has multiple connected components # NOTE: this is *the* bottleneck if not _nx.is_connected(H): ## deactivate the parent subgraph T.nodes[k].end_level = level T.nodes[k].end_mass = current_mass deactivate_keys.append(k) ## start a new subgraph & node for each child component cc = _nx.connected_components(H) for c in cc: new_key = max(T.nodes.keys()) + 1 T.nodes[k].children.append(new_key) activate_subgraphs[new_key] = H.subgraph(c) T.nodes[new_key] = ConnectedComponent( new_key, parent=k, children=[], start_level=level, end_level=None, start_mass=current_mass, end_mass=None, members=c) # update active components for k in deactivate_keys: del T._subgraphs[k] T._subgraphs.update(activate_subgraphs) ## Prune the tree if prune_threshold is not None: T = T.prune(threshold=prune_threshold) return T
import simplejson as json import networkx as nx from networkx.readwrite import json_graph import matplotlib.pyplot as plt from numpy import cumsum print 'Running Graph Properties script' with open('net_sci_coauthorships.txt', 'r') as f: js_graph = json.load(f) # Dictionary of key-value pairs G = nx.from_dict_of_lists(js_graph) #### Plot histogram #### # Get degrees of all nodes, create sorted list degrees = nx.degree(G).values() plt.hist(degrees, bins=10, log=True) plt.title('Degree Histogram') plt.ylabel('Number of Nodes') plt.xlabel('Degree') # plt.show() plt.savefig('degree_histogram.png') plt.clf() #### Plot cumulative distribution function #### cumsums = cumsum(degrees) plt.plot(cumsums) plt.title('Cumulative Node Degrees') plt.ylabel('Cumulative Node Degree') plt.xlabel('Number of Nodes') # plt.show() plt.savefig('degree_cumsum.png')
letters = {'a':['c','e','g'],
           'b':['a','d','c','e','f','g'],
           'c':['a','d','g','h'],
           'd':['d','c','f','g'],
           'e':['d','g'],
           'f':['j'],
           'g':['h'],
           'h':['a','e','g'],
           'i':[],
           'j':['a','g'],
           'k':['e','f','g']
           }

# Build the digraph from the LangDict adjacency structure
g = nx.DiGraph()
nx.from_dict_of_lists(LangDict(letters), create_using=g)
g = g.reverse()

# Draw the graph before condensation
nx.draw_networkx(g, arrows=True)
plt.draw()
plt.show()

scc = nx.strongly_connected_components(g)
print('yup')
G = nx.condensation(g, scc=scc)
print('done')

# Display the info
sys.exit(1) path = sys.argv[1] # Define global var num_rounds = 50 # Parse the input path to find filename and number of players and seeds (directory, filename, num_players, num_seeds) = parse_file_path(path) # Find output file name output_filename = directory + filename.rsplit('.', 1)[0] + ".txt" # Get the adjacency list graph = load_graph(path) # Generate graph from nodes G = nx.from_dict_of_lists(graph) # Generate a list of random nodes as root nodes # strategy = random_nodes_strategy(graph, num_seeds, num_rounds) # strategy = highest_degree_strategy(graph, num_seeds, num_rounds) # strategy = eigenvector_strategy(G, num_seeds, num_rounds) # strategy = dominating_set_strategy(G, num_seeds, num_rounds) # strategy = load_centraility(G, num_seeds, num_rounds) # strategy = mixed_strategy(G, num_seeds, num_rounds) strategy = closeness_strategy(G, num_seeds, num_rounds) # Save input file save_output(output_filename, strategy)
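# closeness_strategy is called above but not defined in this excerpt. A plausible
# implementation (an assumption, not the original) picks the num_seeds nodes with
# the highest closeness centrality and repeats that seed set for every round:
def closeness_strategy_sketch(G, num_seeds, num_rounds):
    centrality = nx.closeness_centrality(G)
    seeds = [node for node, _ in
             sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)[:num_seeds]]
    return seeds * num_rounds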
def matching(items):
    """Return a maximum bipartite matching of `items`.

    `items` maps each left-hand node to the right-hand nodes it may be paired
    with; the result is a tuple of (left, right) pairs, one per matched key.
    """
    G = nx.from_dict_of_lists(items)
    return tuple((k, v) for k, v in
                 nx.bipartite.maximum_matching(G, top_nodes=items).items()
                 if k in items)
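# Example use of matching (a sketch): keys are the left-hand items, values are
# the right-hand candidates each item may be paired with.
pairs = matching({'a': [1, 2], 'b': [1]})
print(pairs)  # e.g. (('a', 2), ('b', 1)) -- one (item, partner) pair per matched key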
# In[123]: word_list = ['halloween', 'love', 'follow', 'happi', 'night', 'bihday', 'dress'] word_ass_dict = {} for x in word_list: word_ass_dict[x] = count_ass(x) # In[124]: word_ass_dict # In[125]: Gword=nx.from_dict_of_lists(word_ass_dict) # In[156]: pos = nx.shell_layout(Gword) nx.draw_networkx(Gword, pos, node_size = 1500, node_color = 'w', font_color = 'b', font_size = '12') plt.axis('off') plt.title('Graph of associations between most frequent words', fontsize='20') plt.show() # In[ ]:
def load_data(dataset_str):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => feature vectors of the training instances (nodes), scipy.sparse.csr.csr_matrix
    ind.dataset_str.tx => feature vectors of the test instances, scipy.sparse.csr.csr_matrix
    ind.dataset_str.allx => labeled and unlabeled training instances (a superset of ind.dataset_str.x), scipy.sparse.csr.csr_matrix
    ind.dataset_str.y => one-hot labels of the labeled training instances, numpy.ndarray
    ind.dataset_str.ty => one-hot labels of the test instances, numpy.ndarray
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => the network, {index: [index_of_neighbor_nodes]} as collections.defaultdict object;
    ind.dataset_str.test.index => indices of the test instances in graph, for the inductive setting as list object.

    All objects above must be saved with the python pickle module.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']  # [allx has shape (1708, 1433); ally has shape (1708, 7)]
    objects = []
    # Read each object
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str, name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)  # pylint: disable=unbalanced-tuple-unpacking
    #print(ally.shape)
    #p = np.sum(ally,axis=1)

    # Load the test-set indices
    test_idx_reorder = parse_index_file(
        "data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)  # sort the indices

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack(
        (allx, tx)).tolil()  # switch to the LIL sparse format (good for adding entries one by one and for fast row access)
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))  # adjacency matrix
    #print(adj.shape)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[
        test_idx_range, :]  # [replace the rows of labels at test_idx_reorder with the rows at test_idx_range, e.g. row 1708 of labels moves to row 2692]

    idx_test = test_idx_range.tolist()  # [1708~2707]
    idx_train = range(len(y))  # [0~139]
    idx_val = range(len(y), len(y) + 500)  # [140~639]

    train_mask = sample_mask(idx_train, labels.shape[0])  # [rows 0~139 marked as 1]
    val_mask = sample_mask(idx_val, labels.shape[0])  # [rows 140~639 marked as 1]
    test_mask = sample_mask(idx_test, labels.shape[0])  # [rows 1708~2707 marked as 1]

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]  # [rows 0~139 set to rows 0~139 of labels]
    y_val[val_mask, :] = labels[val_mask, :]  # [rows 140~639 set to rows 140~639 of labels]
    y_test[test_mask, :] = labels[test_mask, :]  # [rows 1708~2707 set to rows 1708~2707 of labels]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
for i in targetlist: global memberlist memberlist = [] memberlist=data['Source'][data.Target==i].tolist() print memberlist cxlist = (list(itertools.permutations(memberlist,2))) for n in cxlist: mytuple = tuple([n,i]) connectionlist += [mytuple] #print mytuple resultdict=collections.defaultdict(list) #for x in connectionlist: # resultdict[x[0]].append(x[1]) # for x in connectionlist: resultdict[x[0]].append(x[1]) #resultdict G=networkx.from_dict_of_lists(resultdict) mymatrix = networkx.to_numpy_matrix(G) mymatrix.shape mymatrix[0,:] #G.nodes() label = [memberlist,memberlist] mylarry = la.larry(mymatrix,label, dtype=float)
def load_data(dataset_str):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object;
    ind.dataset_str.test.index (the ids of the test instances) => the indices of test instances in graph,
        for the inductive setting as list object.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).

    LIL (row-based linked-list sparse format): the matrix is stored as two lists; data[k] is the list of
        non-zero values in row k, and rows[k] is the list of the column indices of those non-zero entries.
    adj (adjacency matrix): LIL format
    features (feature matrix): LIL format
    labels: built by stacking the ally and ty arrays
    train_mask, val_mask, test_mask: vectors of shape (2708,); train_mask is True on [0, 140) and False elsewhere;
        val_mask is True on [140, 640) and False elsewhere; test_mask is True on [1708, 2707] and False elsewhere
    y_train, y_val, y_test: all of shape (2708, 7); y_train equals the rows of labels where train_mask is True and
        is 0 elsewhere; y_val equals the rows where val_mask is True and is 0 elsewhere; y_test equals the rows
        where test_mask is True and is 0 elsewhere
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file(
        "data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)

    train_mask = sample_mask(idx_train, labels.shape[0])
    val_mask = sample_mask(idx_val, labels.shape[0])
    test_mask = sample_mask(idx_test, labels.shape[0])

    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
def isolate_all(xyt_filename, BINS=6, force=False, sparse=False): filaments_filename = filament_filename_from_xyt_filename(xyt_filename) # Assertions inside function if not SILENT: print "Isolating filaments from:", xyt_filename if not force and os.path.isfile(filaments_filename): if SILENT: return filaments_filename else: print "Filaments already saved as", filaments_filename if "y" not in raw_input("Run isolate_all() anyway? ([no]/yes): "): print "Aborted: isolate_all()" return filaments_filename hdu_list = config.default_open(xyt_filename) ntheta = hdu_list[0].header["NTHETA"] wlen = hdu_list[0].header["WLEN"] frac = hdu_list[0].header["FRAC"] naxis1 = hdu_list[0].header["NAXIS1"] naxis2 = hdu_list[0].header["NAXIS2"] original = hdu_list[0].header["ORIGINAL"] Hi = hdu_list[1].data["hi"] Hj = hdu_list[1].data["hj"] # Compute TheteRHT for all pixels given, then bin by theta B = map(rht.theta_rht, hdu_list[1].data["hthets"]) # List of theta_rht values C = np.multiply(np.asarray(B), BINS / np.pi).astype(np.int_) del B # Ready the output HDUList and close the input HDUList output_hdulist = fits.HDUList(hdu_list[0].copy()) # , open(filaments_filename, 'w')) #Overwrites hdu_list.close() # Set Assignment # unprocessed = list() list_of_HDUs = list() search_pattern = [ (-1, -1), (-1, 0), (-1, 1), (0, -1), ] # [(-1, 1), (-1,-1), (-1, 0), (0, -1), (-2, -2), (-2, -1), (-2, 0), (-2, 1), (-2, 2), (-1, -2), (-1, 2), (0,-2)] for bin in range(BINS): delimiter = np.nonzero(C == bin)[0] raw_points = zip(Hi[delimiter], Hj[delimiter]) del delimiter problem_size = len(raw_points) # message='Step '+str(bin+1)+'/'+str(BINS)+': (N='+str(problem_size)+')' # progress_bar = Progress(problem_size, message=message, incrementing=True) point_dict = dict([x[::-1] for x in enumerate(raw_points)]) set_dict = collections.defaultdict(list) # theta_dict = dict() for coord in raw_points: # rht.update_progress(0.3*(i/problem_size), message=message) # progress_bar.update() # theta_dict[coord] = B[point_dict[coord]] for rel_coord in search_pattern: try: j = point_dict[config.rel_add(coord, rel_coord)] set_dict[point_dict[coord]].append(j) except Exception: continue G = nx.from_dict_of_lists(set_dict) # Undirected graph made using set_dict as an adjacency list del set_dict # progress_bar = Progress(problem_size, message=message, incrementing=False) sources = range(problem_size) flags = np.ones((problem_size), dtype=np.int_) while len(sources) > 0: source = sources.pop() if not flags[source]: continue else: # rht.update_progress(0.3+0.3*(1.0-len(sources)/problem_size), message=message) # progress_bar.update(len(sources)) try: for member in nx.descendants(G, source): flags[member] = False point_dict[raw_points[member]] = source G.remove_node(member) # TODO Remove members from G if that would speed up subsequent calls? 
except nx.NetworkXError: # Assume we hit an isolated pixel (never made it into G) and move on pass del sources, flags, G histogram = np.bincount(map(point_dict.get, raw_points)) mask = np.nonzero(histogram >= int(frac * wlen))[0] del histogram # progress_bar = Progress(problem_size, message=message, incrementing=False) mask_dict = dict([x[::-1] for x in enumerate(mask)]) out_clouds = collections.defaultdict(list) while len(point_dict) > 0: temp = point_dict.popitem() try: # Keying into mask_dict is the only operation that ought to throw an exception out_clouds[mask_dict[temp[1]]].append(temp[0]) # progress_bar.update(len(point_dict)) # rht.update_progress(0.6+0.399*(1.0-len(point_dict)/problem_size), message=message) except Exception: continue while len(out_clouds) > 0: cloud = out_clouds.popitem()[1] # unprocessed.append(cloud) list_of_HDUs.append(config.Cloud(cloud).as_HDU(sparse=sparse)) # TODO Incorporate theta_dict # rht.update_progress(1.0, final_message='Finished joining '+str(problem_size)+' points! Time Elapsed:') # Convert lists of two-integer tuples into ImageHDUs # unprocessed.sort(key=len, reverse=True) # output_hdulist = fits.HDUList(map(config.Cloud.as_ImageHDU, map(config.Cloud, unprocessed))) # del unprocessed list_of_HDUs.sort(key=lambda h: h.header["DIAG"], reverse=False) while len(list_of_HDUs) > 0: output_hdulist.append(list_of_HDUs.pop()) # Output HDUList to File output_hdulist.writeto(filaments_filename, output_verify="silentfix", clobber=True, checksum=True) try: output_hdulist.flush() except Exception: pass try: output_hdulist.close() except Exception: pass if not SILENT: print "Results successfully output to " + filaments_filename return filaments_filename
def load_citation(dataset_str="cora", normalization="AugNormAdj", porting_to_torch=True, data_path=datadir, task_type="full"): """ Load Citation Networks Datasets. """ names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'] objects = [] for i in range(len(names)): with open( os.path.join(data_path, "ind.{}.{}".format(dataset_str.lower(), names[i])), 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, y, tx, ty, allx, ally, graph = tuple(objects) test_idx_reorder = parse_index_file( os.path.join(data_path, "ind.{}.test.index".format(dataset_str))) test_idx_range = np.sort(test_idx_reorder) if dataset_str == 'citeseer': # Fix citeseer dataset (there are some isolated nodes in the graph) # Find isolated nodes, add them as zero-vecs into the right position test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1) tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1])) tx_extended[test_idx_range - min(test_idx_range), :] = tx tx = tx_extended ty_extended = np.zeros((len(test_idx_range_full), y.shape[1])) ty_extended[test_idx_range - min(test_idx_range), :] = ty ty = ty_extended features = sp.vstack((allx, tx)).tolil() features[test_idx_reorder, :] = features[test_idx_range, :] G = nx.from_dict_of_lists(graph) adj = nx.adjacency_matrix(G) adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) # degree = np.asarray(G.degree) degree = np.sum(adj, axis=1) labels = np.vstack((ally, ty)) labels[test_idx_reorder, :] = labels[test_idx_range, :] if task_type == "full": print("Load full supervised task.") #supervised setting idx_test = test_idx_range.tolist() idx_train = range(len(ally) - 500) idx_val = range(len(ally) - 500, len(ally)) elif task_type == "semi": print("Load semi-supervised task.") #semi-supervised setting idx_test = test_idx_range.tolist() idx_train = range(len(y)) idx_val = range(len(y), len(y) + 500) else: raise ValueError( "Task type: %s is not supported. Available option: full and semi.") adj, features = preprocess_citation(adj, features, normalization) features = np.array(features.todense()) labels = np.argmax(labels, axis=1) # porting to pytorch if porting_to_torch: features = torch.FloatTensor(features).float() labels = torch.LongTensor(labels) # labels = torch.max(labels, dim=1)[1] #adj = sparse_mx_to_torch_sparse_tensor(adj).float() adj = torch.FloatTensor(np.array(adj.todense())) idx_train = torch.LongTensor(idx_train) idx_val = torch.LongTensor(idx_val) idx_test = torch.LongTensor(idx_test) degree = torch.LongTensor(degree) learning_type = "transductive" return adj, features, labels, idx_train, idx_val, idx_test, degree, learning_type