def _add_ndata(self):
    vectorizer = CountVectorizer(min_df=5)
    features = vectorizer.fit_transform(self.data['plot_keywords'].fillna('').values)
    self.g.nodes['movie'].data['feat'] = torch.from_numpy(features.toarray()).float()
    self.g.nodes['movie'].data['label'] = torch.from_numpy(self.labels).long()

    # Features of actor and director nodes are the mean of the features
    # of their associated movie nodes
    self.g.multi_update_all({
        'ma': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat')),
        'md': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))
    }, 'sum')

    n_movies = len(self.movies)
    train_idx, val_idx, test_idx = split_idx(np.arange(n_movies), 400, 400, self._seed)
    self.g.nodes['movie'].data['train_mask'] = generate_mask_tensor(idx2mask(train_idx, n_movies))
    self.g.nodes['movie'].data['val_mask'] = generate_mask_tensor(idx2mask(val_idx, n_movies))
    self.g.nodes['movie'].data['test_mask'] = generate_mask_tensor(idx2mask(test_idx, n_movies))
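
# The split/mask helpers used throughout this listing (split_idx, idx2mask,
# generate_mask_tensor) are not shown here. Below is a minimal sketch of what
# they plausibly look like, inferred only from their call sites; the actual
# implementations may differ.
import numpy as np
import torch

def idx2mask(idx, n):
    """Boolean numpy mask of length n that is True at the given indices."""
    mask = np.zeros(n, dtype=bool)
    mask[idx] = True
    return mask

def generate_mask_tensor(mask):
    """Convert a boolean numpy mask into a torch.bool tensor."""
    return torch.from_numpy(mask)

def split_idx(samples, train_size, val_size, seed):
    """Shuffle samples and split into train/val/test index arrays."""
    rng = np.random.default_rng(seed)
    perm = rng.permutation(samples)
    return (perm[:train_size],
            perm[train_size:train_size + val_size],
            perm[train_size + val_size:])
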
def process(self):
    with open(os.path.join(self.raw_dir, 'ACM3025.pkl'), 'rb') as f:
        data = pickle.load(f)
    features = torch.from_numpy(data['feature'].todense()).float()  # (3025, 1870)
    labels = torch.from_numpy(data['label'].todense()).long().nonzero(as_tuple=True)[1]  # (3025,)

    # Adjacency matrices for meta-path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])
    subject_g = dgl.from_scipy(data['PLP'])
    self.gs = [author_g, subject_g]

    num_nodes = data['label'].shape[0]
    train_mask = generate_mask_tensor(idx2mask(data['train_idx'][0], num_nodes))
    val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0], num_nodes))
    test_mask = generate_mask_tensor(idx2mask(data['test_idx'][0], num_nodes))
    for g in self.gs:
        g.ndata['feat'] = features
        g.ndata['label'] = labels
        g.ndata['train_mask'] = train_mask
        g.ndata['val_mask'] = val_mask
        g.ndata['test_mask'] = test_mask
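
# Mini example of the label decoding above: a one-hot (N, C) label matrix
# becomes a vector of class indices by taking the column index of each row's
# nonzero entry (toy values; the real matrix comes from ACM3025.pkl).
import torch

onehot = torch.tensor([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
labels = onehot.nonzero(as_tuple=True)[1]  # tensor([1, 0, 2])
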
def _add_ndata(self):
    _raw_file2 = os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat')
    if not os.path.exists(_raw_file2):
        raise FileNotFoundError(
            'Please manually download the file {} (extraction code: 6b3h) '
            'and save it to {}'.format(self._url2, _raw_file2)
        )
    mat = sio.loadmat(_raw_file2)
    self.g.nodes['author'].data['feat'] = torch.from_numpy(mat['features']).float()
    self.g.nodes['author'].data['label'] = torch.tensor(self.authors['label'].to_list())

    n_authors = len(self.authors)
    train_idx, val_idx, test_idx = split_idx(np.arange(n_authors), 800, 400, self._seed)
    self.g.nodes['author'].data['train_mask'] = generate_mask_tensor(idx2mask(train_idx, n_authors))
    self.g.nodes['author'].data['val_mask'] = generate_mask_tensor(idx2mask(val_idx, n_authors))
    self.g.nodes['author'].data['test_mask'] = generate_mask_tensor(idx2mask(test_idx, n_authors))

    self.g.nodes['conf'].data['label'] = torch.tensor(self.confs['label'].to_list())
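
# Mini example of the pandas-to-tensor label conversion above (made-up
# values; the real labels come from self.authors / self.confs):
import pandas as pd
import torch

authors = pd.DataFrame({'label': [0, 2, 1]})
labels = torch.tensor(authors['label'].to_list())  # tensor([0, 2, 1])
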
def process(self):
    data = sio.loadmat(os.path.join(self.raw_dir, 'DBLP4057_GAT_with_idx.mat'))
    apa_g = dgl.graph(data['net_APA'].nonzero())
    apcpa_g = dgl.graph(data['net_APCPA'].nonzero())
    aptpa_g = dgl.graph(data['net_APTPA'].nonzero())
    self.gs = [apa_g, apcpa_g, aptpa_g]

    features = torch.from_numpy(data['features']).float()
    labels = torch.from_numpy(data['label'].nonzero()[1])
    num_nodes = data['label'].shape[0]
    train_mask = generate_mask_tensor(idx2mask(data['train_idx'][0], num_nodes))
    val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0], num_nodes))
    test_mask = generate_mask_tensor(idx2mask(data['test_idx'][0], num_nodes))
    for g in self.gs:
        g.ndata['feat'] = features
        g.ndata['label'] = labels
        g.ndata['train_mask'] = train_mask
        g.ndata['val_mask'] = val_mask
        g.ndata['test_mask'] = test_mask
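
# Minimal sketch of the dgl.graph(adj.nonzero()) pattern used above: the
# nonzero entries of an adjacency matrix become directed (src, dst) edge
# arrays (toy adjacency; the real ones come from the .mat file).
import numpy as np
import dgl

adj = np.array([[0, 1], [1, 0]])
g = dgl.graph(adj.nonzero())  # 2 nodes, edges 0->1 and 1->0
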
def process(self):
    data = sio.loadmat(os.path.join(self.raw_dir, 'imdb5k.mat'))
    mam_g = dgl.graph(data['MAM'].nonzero())
    mdm_g = dgl.graph(data['MDM'].nonzero())
    # mym_g = dgl.graph(data['MYM'].nonzero())
    self.gs = [mam_g, mdm_g]

    features = torch.from_numpy(data['feature']).float()
    num_nodes = features.shape[0]
    # Movies without a label keep -1
    labels = torch.full((num_nodes,), -1, dtype=torch.long)
    idx, label = data['label'].nonzero()
    labels[idx] = torch.from_numpy(label)

    train_mask = generate_mask_tensor(idx2mask(data['train_idx'][0], num_nodes))
    val_mask = generate_mask_tensor(idx2mask(data['val_idx'][0], num_nodes))
    test_mask = generate_mask_tensor(idx2mask(data['test_idx'][0], num_nodes))
    for g in self.gs:
        g.ndata['feat'] = features
        g.ndata['label'] = labels
        g.ndata['train_mask'] = train_mask
        g.ndata['val_mask'] = val_mask
        g.ndata['test_mask'] = test_mask
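
# Mini example of the partial-label pattern above: every node starts with
# label -1, and only the rows of the one-hot matrix with a nonzero entry get
# a real class index (toy matrix; row 1 stays unlabeled).
import numpy as np
import torch

label_onehot = np.array([[0, 1], [0, 0], [1, 0]])
labels = torch.full((3,), -1, dtype=torch.long)
idx, label = label_onehot.nonzero()
labels[idx] = torch.from_numpy(label)  # tensor([ 1, -1,  0])
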
def process(self):
    self.g = dgl.heterograph(self._read_edges())
    feats = self._read_feats()
    for ntype, feat in feats.items():
        self.g.nodes[ntype].data['feat'] = feat

    labels = torch.from_numpy(np.load(os.path.join(self.raw_path, 'labels.npy'))).long()
    self._num_classes = labels.max().item() + 1
    self.g.nodes[self.predict_ntype].data['label'] = labels

    n = self.g.num_nodes(self.predict_ntype)
    for split in ('train', 'val', 'test'):
        idx = np.load(os.path.join(self.raw_path, f'{split}_60.npy'))
        mask = generate_mask_tensor(idx2mask(idx, n))
        self.g.nodes[self.predict_ntype].data[f'{split}_mask'] = mask

    pos_i, pos_j = sp.load_npz(os.path.join(self.raw_path, 'pos.npz')).nonzero()
    self.pos_i, self.pos_j = torch.from_numpy(pos_i).long(), torch.from_numpy(pos_j).long()
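
# Minimal sketch of the positive-pair extraction above: the nonzero entries
# of a sparse (N, N) matrix yield (pos_i, pos_j) index pairs, typically
# consumed by a contrastive objective (toy matrix; the real one is pos.npz).
import scipy.sparse as sp
import torch

pos = sp.csr_matrix([[1, 0], [1, 1]])
pos_i, pos_j = pos.nonzero()
pos_i, pos_j = torch.from_numpy(pos_i).long(), torch.from_numpy(pos_j).long()
# pos_i = tensor([0, 1, 1]), pos_j = tensor([0, 0, 1])
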
def process(self):
    data = sio.loadmat(os.path.join(self.raw_dir, 'ACM.mat'))
    p_vs_l = data['PvsL']  # paper-field?
    p_vs_a = data['PvsA']  # paper-author
    p_vs_t = data['PvsT']  # paper-term, bag of words
    p_vs_c = data['PvsC']  # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MobiCOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    # Keep only papers published at the selected conferences
    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    self.g = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    paper_features = torch.FloatTensor(p_vs_t.toarray())  # (4025, 1903)
    pc_p, pc_c = p_vs_c.nonzero()
    paper_labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        paper_labels[pc_p[pc_c == conf_id]] = label_id
    paper_labels = torch.from_numpy(paper_labels)

    # Random train/val/test split, stratified per conference
    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_paper_nodes = self.g.num_nodes('paper')
    train_mask = generate_mask_tensor(idx2mask(train_idx, num_paper_nodes))
    val_mask = generate_mask_tensor(idx2mask(val_idx, num_paper_nodes))
    test_mask = generate_mask_tensor(idx2mask(test_idx, num_paper_nodes))

    self.g.nodes['paper'].data['feat'] = paper_features
    self.g.nodes['paper'].data['label'] = paper_labels
    self.g.nodes['paper'].data['train_mask'] = train_mask
    self.g.nodes['paper'].data['val_mask'] = val_mask
    self.g.nodes['paper'].data['test_mask'] = test_mask

    # Features of author nodes are the mean of the features of their
    # associated paper nodes; field nodes get one-hot features
    self.g.multi_update_all(
        {'pa': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))}, 'sum')
    self.g.nodes['field'].data['feat'] = torch.eye(self.g.num_nodes('field'))
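
# Mini example of the multi_update_all averaging used above: each author
# node's feature becomes the mean of its papers' features (toy graph with
# two papers written by one author).
import torch
import dgl
import dgl.function as fn

g = dgl.heterograph({
    ('paper', 'pa', 'author'): (torch.tensor([0, 1]), torch.tensor([0, 0]))
})
g.nodes['paper'].data['feat'] = torch.tensor([[2.0], [4.0]])
g.multi_update_all({'pa': (fn.copy_u('feat', 'm'), fn.mean('m', 'feat'))}, 'sum')
print(g.nodes['author'].data['feat'])  # tensor([[3.]])
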