def preprocess_citation_bigraph(adj, features, normalization="FirstOrderGCN"):
    """Normalize the topology graph and build node<->attribute bipartite adjacencies."""
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    adj_cn = features.T                 # attribute -> node adjacency, taken pre-normalization
    features = row_normalize(features)
    adj_cn = row_normalize(adj_cn)
    adj_nc = features                   # node -> attribute adjacency (row-normalized features)
    return adj, features, adj_nc, adj_cn

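# A minimal usage sketch for preprocess_citation_bigraph. The shapes are
# assumptions: with N nodes and F attributes, adj_nc (node -> attribute) is the
# row-normalized N x F feature matrix and adj_cn (attribute -> node) is the
# row-normalized F x N transpose.
#
#   adj, feats, adj_nc, adj_cn = preprocess_citation_bigraph(adj, features)
#   assert adj_nc.shape == feats.shape
#   assert adj_cn.shape == (feats.shape[1], feats.shape[0])
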
def get_hops(adj, n_hops, args):
    """Compute (and cache) row-normalized k-hop adjacency matrices, k = 1..n_hops.

    adj is a scipy csr_matrix. The cache file is keyed on args.nlayer, which is
    assumed to equal n_hops.
    """
    hop_file = 'hop_adj/' + args.dataset + '_hop_{}'.format(args.nlayer) + '.pickle'
    if os.path.isfile(hop_file):
        with open(hop_file, 'rb') as f:
            adj_result = pickle.load(f)
    else:
        n_node, _ = adj.shape
        # Remove self-loops before taking powers.
        adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]),
                                  shape=adj.shape, dtype=float)
        adj_orig = adj
        adj_result = []
        for i in range(n_hops):
            if i == 0:
                adj_ = adj_orig.tocoo()
            else:
                # (i+1)-hop reachability: multiply by the (transposed) 1-hop matrix.
                adj = sp.csr_matrix(adj.dot(adj_orig.toarray().T))
                adj_ = adj
            print('---> Sparse rate of %d is : %.4f' % (i + 1, adj_.nnz / n_node / n_node))
            adj_ = row_normalize(adj_)
            adj_result.append(adj_)
        with open(hop_file, 'wb') as pfile:
            pickle.dump(adj_result, pfile, pickle.HIGHEST_PROTOCOL)
    return adj_result

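# A hedged usage sketch for get_hops on a toy graph. `args` is mocked with
# argparse.Namespace; args.nlayer is assumed to equal n_hops (the cache file is
# keyed on it), and the hop_adj/ cache directory is assumed to exist.
#
#   import argparse
#   toy_adj = sp.csr_matrix(np.array([[0., 1., 0.],
#                                     [1., 0., 1.],
#                                     [0., 1., 0.]]))
#   toy_args = argparse.Namespace(dataset='toy', nlayer=2)
#   hops = get_hops(toy_adj, n_hops=2, args=toy_args)  # [1-hop, 2-hop], row-normalized
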
def preprocess_citation_graph(adj, features, normalization="FirstOrderGCN"):
    """Normalize the topology graph and a feature-similarity graph."""
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    adj2 = features * features.T   # attribute co-occurrence similarity graph
    adj2 = adj_normalizer(adj2)
    features = row_normalize(features)
    return adj, features, adj2

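# The second output is a similarity graph over nodes built from shared
# attributes: adj2[i, j] = <x_i, x_j>. A toy sketch (dense values shown for
# illustration):
#
#   feats = sp.csr_matrix(np.array([[1., 0.], [1., 1.]]))
#   sim = feats * feats.T   # [[1, 1], [1, 2]]: nodes 0 and 1 share one attribute
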
def preprocess_citation(adj, features, normalization="FirstOrderGCN", gamma=1):
    adj_normalizer = fetch_normalization(normalization)
    if 'Aug' in normalization:
        # Augmented normalizations take a self-loop weight gamma.
        adj = adj_normalizer(adj, gamma=gamma)
    else:
        adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features

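# fetch_normalization is defined elsewhere; for the 'Aug*' variants, a common
# convention (an assumption here, not confirmed by this file) is symmetric
# normalization with a gamma-weighted self-loop:
# A_hat = D^-1/2 (A + gamma * I) D^-1/2. A minimal sketch under that assumption:
def aug_normalized_adjacency_sketch(adj, gamma=1.0):
    # Add gamma-weighted self-loops, then symmetrically normalize by degree.
    adj = adj + gamma * sp.eye(adj.shape[0])
    row_sum = np.array(adj.sum(1)).flatten()
    d_inv_sqrt = np.power(row_sum, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.   # guard isolated nodes
    d_mat = sp.diags(d_inv_sqrt)
    return d_mat.dot(adj).dot(d_mat).tocoo()
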
def load_data(dataset_str="cora", normalization=[], feat_normalize=True,
              cuda=False, split="default", random_state=None, **kwargs):
    """Load pickle-packed datasets."""
    with open(dataf + dataset_str + ".graph", "rb") as f:
        graph = pkl.load(f)
    with open(dataf + dataset_str + ".X", "rb") as f:
        X = pkl.load(f)
    with open(dataf + dataset_str + ".y", "rb") as f:
        y = pkl.load(f)

    if split != "default":
        # split is "train_val_test" fractions, e.g. "0.6_0.2_0.2".
        tr_size, va_size, te_size = [float(i) for i in split.split("_")]
        idx_train, idx_val, idx_test = train_val_test_split(
            np.arange(len(y)), train_size=tr_size, val_size=va_size,
            test_size=te_size, stratify=y, random_state=random_state)
    else:
        with open(dataf + dataset_str + ".split", "rb") as f:
            split = pkl.load(f)
        idx_train = split['train']
        idx_test = split['test']
        idx_val = split['valid']

    normed_adj = []
    if len(normalization) > 0:
        adj = nx.adjacency_matrix(graph)
        for n in normalization:
            nf = fetch_normalization(n, **kwargs)
            normed_adj.append(nf(adj))

    if feat_normalize:
        X = row_normalize(X)

    X = torch.FloatTensor(X)
    y = torch.LongTensor(y)
    normed_adj = [sparse_mx_to_torch_sparse_tensor(adj).float()
                  for adj in normed_adj]
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    if cuda:
        X = X.cuda()
        normed_adj = [adj.cuda() for adj in normed_adj]
        y = y.cuda()
        idx_train = idx_train.cuda()
        idx_val = idx_val.cuda()
        idx_test = idx_test.cuda()
    return graph, normed_adj, X, y, idx_train, idx_val, idx_test

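# A hedged usage sketch for load_data; the "tr_va_te" fraction format of the
# split string follows the parsing above, and `dataf` is the data directory
# configured elsewhere in this module:
#
#   graph, adjs, X, y, idx_tr, idx_va, idx_te = load_data(
#       "cora", normalization=["AugNormAdj"], split="0.6_0.2_0.2", random_state=42)
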
def preprocess_citation(adj, features, normalization, extra=None):
    adj_normalizer = fetch_normalization(normalization, extra)
    adj = adj_normalizer(adj)
    # Features are left unnormalized when no adjacency normalization is requested.
    if normalization != "":
        features = row_normalize(features)
    return adj, features

def preprocess_citation(adj, features, normalization='AugNormAdj'):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features

def preprocess_citation_feat(features):
    features = row_normalize(features)
    return features

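# row_normalize is imported from elsewhere; a minimal sketch of the assumed
# behavior (each row rescaled to sum to 1, all-zero rows left untouched):
def row_normalize_sketch(mx):
    rowsum = np.array(mx.sum(1)).flatten()
    r_inv = np.power(rowsum, -1.)
    r_inv[np.isinf(r_inv)] = 0.   # guard all-zero rows
    return sp.diags(r_inv).dot(mx)
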
def preprocess_citation(adj, features, normalization="FirstOrderGCN"):
    adj_normalizer = fetch_normalization(normalization)
    features = row_normalize(features)
    adj = adj_normalizer(adj)
    return adj, features

def load_citation_gac(dataset_str="cora", semi=1):
    """Load Planetoid-format citation network datasets (Cora/Citeseer/Pubmed)."""
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    for name in names:
        with open("data/ind.{}.{}".format(dataset_str.lower(), name), 'rb') as f:
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, y, tx, ty, allx, ally, graph = tuple(objects)
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph):
        # find isolated nodes and add them as zero-vectors into the right position.
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        tx = tx_extended
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended

    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    # Symmetrize the adjacency matrix.
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y) + 500)
    features = row_normalize(features)

    # Port to PyTorch tensors.
    features = sparse_mx_to_torch_sparse_tensor(features).float()
    labels = torch.LongTensor(labels)
    labels = torch.max(labels, dim=1)[1]
    if semi == 0:
        # Fully-supervised setting: fold all unused nodes into the training set.
        idx_all = list(range(labels.shape[0]))
        used_all = set(idx_train).union(set(idx_val)).union(set(idx_test))
        unused_all = list(set(idx_all).difference(used_all))
        idx_train = list(idx_train) + unused_all
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    return adj, features, labels, idx_train, idx_val, idx_test

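# Usage sketch (assumes the Planetoid-format files under data/ exist):
#
#   adj, feats, labels, idx_tr, idx_va, idx_te = load_citation_gac("citeseer", semi=1)
#   # adj stays a scipy sparse matrix; features come back as a torch sparse tensor.
#   # semi=0 folds every node outside val/test into the training set.
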
def load_webANEmat_gac(dataset_str="texas", semi=1, semi_rate=0.1):
    """Load a WebKB-style .mat dataset with cached train/valid/test splits."""
    data_file = 'data/{}/{}'.format('web', dataset_str) + '.mat'
    file_train = 'data/{}/{}_train'.format('web', dataset_str) + '.pickle'
    file_valid = 'data/{}/{}_valid'.format('web', dataset_str) + '.pickle'
    file_test = 'data/{}/{}_test'.format('web', dataset_str) + '.pickle'
    data = scio.loadmat(data_file)
    features = data['Attributes']
    labels = data['Label'].reshape(-1)
    adj = data['Network']
    features = row_normalize(features)
    # Symmetrize and binarize the adjacency matrix if necessary.
    if (adj != adj.T).sum() != 0:
        adj = adj + adj.T
    if np.any(np.unique(adj[adj.nonzero()].A1) != 1):
        adj.data = np.ones_like(adj.data)
    # Shift labels to start from 0.
    label_min = np.min(labels)
    if label_min != 0:
        labels = labels - 1
    with open(file_test, 'rb') as f:
        idx_test = pickle.load(f)
    with open(file_valid, 'rb') as f:
        idx_val = pickle.load(f)
    with open(file_train, 'rb') as f:
        idx_train = pickle.load(f)
    if semi == 1:
        # Semi-supervised setting: load (or build and cache) a class-stratified
        # split with a train fraction of semi_rate per class.
        train_idx_file = 'data/web/' + dataset_str + '_train_{}'.format(semi_rate) + '.pickle'
        valid_idx_file = 'data/web/' + dataset_str + '_valid_{}'.format(semi_rate) + '.pickle'
        test_idx_file = 'data/web/' + dataset_str + '_test_{}'.format(semi_rate) + '.pickle'
        if os.path.isfile(train_idx_file):
            with open(test_idx_file, 'rb') as f:
                idx_test = pickle.load(f)
            with open(valid_idx_file, 'rb') as f:
                idx_val = pickle.load(f)
            with open(train_idx_file, 'rb') as f:
                idx_train = pickle.load(f)
        else:
            mask = np.unique(labels)
            label_count = [np.sum(labels == v) for v in mask]
            idx_train = []
            idx_val = []
            idx_test = []
            for i, v in enumerate(mask):
                cnt = label_count[i]
                idx_all = np.where(labels == v)[0]
                np.random.shuffle(idx_all)
                idx_all = idx_all.tolist()
                test_len = math.ceil(cnt * 0.2)
                valid_len = math.ceil(cnt * 0.2)
                train_len = math.ceil(cnt * semi_rate)
                idx_test.extend(idx_all[-test_len:])
                idx_val.extend(idx_all[-(test_len + valid_len):-test_len])
                # Cap the training share so splits never overlap.
                train_len_ = min(train_len, cnt - test_len - valid_len)
                idx_train.extend(idx_all[:train_len_])
            idx_train = np.array(idx_train)
            idx_val = np.array(idx_val)
            idx_test = np.array(idx_test)
            with open(train_idx_file, 'wb') as pfile:
                pickle.dump(idx_train, pfile, pickle.HIGHEST_PROTOCOL)
            with open(test_idx_file, 'wb') as pfile:
                pickle.dump(idx_test, pfile, pickle.HIGHEST_PROTOCOL)
            with open(valid_idx_file, 'wb') as pfile:
                pickle.dump(idx_val, pfile, pickle.HIGHEST_PROTOCOL)
    features = sparse_mx_to_torch_sparse_tensor(features).float()
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    labels = torch.LongTensor(labels)
    return adj, features, labels, idx_train, idx_val, idx_test

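# A hedged usage sketch; the data/web/<name>.mat layout with 'Attributes',
# 'Label', and 'Network' keys, and the pre-pickled default splits, are taken
# from the loader above:
#
#   adj, feats, labels, idx_tr, idx_va, idx_te = load_webANEmat_gac(
#       "texas", semi=1, semi_rate=0.1)
#   # Per class: last 20% of shuffled indices -> test, preceding 20% -> valid,
#   # first ceil(semi_rate * count) -> train (capped so splits never overlap).
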
def load_citationANEmat_gac(dataset_str="BlogCatalog", semi_rate=0.1):
    """Load an attributed-network .mat dataset with a class-stratified split."""
    data_file = 'data/{}/{}'.format('social', dataset_str) + '.mat'
    data = scio.loadmat(data_file)
    if dataset_str == 'ACM':
        features = data['Features']
    else:
        features = data['Attributes']
    labels = data['Label'].reshape(-1)
    adj = data['Network']
    features = row_normalize(features)
    # Shift labels to start from 0.
    label_min = np.min(labels)
    if label_min != 0:
        labels = labels - 1
    train_idx_file = 'data/social/' + dataset_str + '_train_{}'.format(semi_rate) + '.pickle'
    valid_idx_file = 'data/social/' + dataset_str + '_valid_{}'.format(semi_rate) + '.pickle'
    test_idx_file = 'data/social/' + dataset_str + '_test_{}'.format(semi_rate) + '.pickle'
    if os.path.isfile(train_idx_file):
        with open(test_idx_file, 'rb') as f:
            idx_test = pickle.load(f)
        with open(valid_idx_file, 'rb') as f:
            idx_val = pickle.load(f)
        with open(train_idx_file, 'rb') as f:
            idx_train = pickle.load(f)
    else:
        # Per class: last 20% -> test, preceding 20% -> valid,
        # first ceil(semi_rate * count) -> train (capped to avoid overlap).
        mask = np.unique(labels)
        label_count = [np.sum(labels == v) for v in mask]
        idx_train = []
        idx_val = []
        idx_test = []
        for i, v in enumerate(mask):
            cnt = label_count[i]
            idx_all = np.where(labels == v)[0]
            np.random.shuffle(idx_all)
            idx_all = idx_all.tolist()
            test_len = math.ceil(cnt * 0.2)
            valid_len = math.ceil(cnt * 0.2)
            train_len = math.ceil(cnt * semi_rate)
            idx_test.extend(idx_all[-test_len:])
            idx_val.extend(idx_all[-(test_len + valid_len):-test_len])
            train_len_ = min(train_len, cnt - test_len - valid_len)
            idx_train.extend(idx_all[:train_len_])
        idx_train = np.array(idx_train)
        idx_val = np.array(idx_val)
        idx_test = np.array(idx_test)
        with open(train_idx_file, 'wb') as pfile:
            pickle.dump(idx_train, pfile, pickle.HIGHEST_PROTOCOL)
        with open(test_idx_file, 'wb') as pfile:
            pickle.dump(idx_test, pfile, pickle.HIGHEST_PROTOCOL)
        with open(valid_idx_file, 'wb') as pfile:
            pickle.dump(idx_val, pfile, pickle.HIGHEST_PROTOCOL)
    features = sparse_mx_to_torch_sparse_tensor(features).float()
    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)
    labels = torch.LongTensor(labels)
    return adj, features, labels, idx_train, idx_val, idx_test

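# sparse_mx_to_torch_sparse_tensor is imported from elsewhere; a minimal sketch
# of the assumed scipy -> torch conversion:
def sparse_mx_to_torch_sparse_tensor_sketch(mx):
    mx = mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(np.vstack((mx.row, mx.col)).astype(np.int64))
    values = torch.from_numpy(mx.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(mx.shape))
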
def preprocess_citation(adj, features, normalization='FirstOrderGCN'):
    adj_normalizer = fetch_normalization(normalization)
    adj = adj_normalizer(adj)
    features = row_normalize(features)
    return adj, features