def get_model_and_optimizer(training_method, dataset_name, features_dimension, device):
    training_method_signature = 'BP' if training_method == 'bp' else 'ALT'
    if training_method_signature == 'BP':
        model = GAE(GraphEncoder(features_dimension, 16))
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    else:
        model = GAE(DFAGraphEncoder(features_dimension, 16, training_method=training_method))
        # dataset-specific learning rates for the non-BP training methods
        if dataset_name == 'cora':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
        elif dataset_name == 'citeseer':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.02)
        elif dataset_name == 'pubmed':
            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    return model.to(device), optimizer
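# A minimal usage sketch for the factory above (illustrative only, not from the
# original file): it assumes `GraphEncoder`/`DFAGraphEncoder` are defined in this
# repo and that `data` is a PyG `Data` object already split with
# `train_test_split_edges`, so it carries `train_pos_edge_index`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, optimizer = get_model_and_optimizer('bp', 'cora', data.num_features, device)
x = data.x.to(device)
train_pos_edge_index = data.train_pos_edge_index.to(device)

for epoch in range(1, 201):
    model.train()
    optimizer.zero_grad()
    z = model.encode(x, train_pos_edge_index)
    loss = model.recon_loss(z, train_pos_edge_index)
    loss.backward()
    optimizer.step()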
    def forward(self, x, edge_index):
        return self.conv_mu(x, edge_index), self.conv_logstd(x, edge_index)


in_channels, out_channels = dataset.num_features, 16

if not args.variational and not args.linear:
    model = GAE(GCNEncoder(in_channels, out_channels))
elif not args.variational and args.linear:
    model = GAE(LinearEncoder(in_channels, out_channels))
elif args.variational and not args.linear:
    model = VGAE(VariationalGCNEncoder(in_channels, out_channels))
elif args.variational and args.linear:
    model = VGAE(VariationalLinearEncoder(in_channels, out_channels))

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    loss = model.recon_loss(z, train_data.pos_edge_label_index)
    if args.variational:
        loss = loss + (1 / train_data.num_nodes) * model.kl_loss()
    loss.backward()
    optimizer.step()
    return float(loss)
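# A companion evaluation sketch (assumption: `test_data` comes from the same
# RandomLinkSplit that produced `train_data`, so it exposes positive and
# negative edge label indices). `GAE.test` returns (AUC, AP) over the given
# positive/negative edge sets.
@torch.no_grad()
def test(data):
    model.eval()
    z = model.encode(data.x, data.edge_index)
    return model.test(z, data.pos_edge_label_index, data.neg_edge_label_index)

# auc, ap = test(test_data)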
class UnsGAE(object):
    def __init__(self, data, embed_dim, **kwargs):
        super(UnsGAE, self).__init__()
        self.data = data
        self.input_dim = self.data.dim
        self.embed_dim = embed_dim
        # for now, we only work with 2-layer encoders
        self.hidden_dim = kwargs.get('hidden_dim', 2 * embed_dim)
        self.encoder = kwargs.get('encoder', batched_SAGEEncoder)
        self.encoder = self.encoder(self.input_dim, self.hidden_dim, self.embed_dim)
        self.model = GAE(self.encoder)

        # preparing the device
        device = kwargs.get('device', 'cuda')
        if device == 'cuda' and not torch.cuda.is_available():
            print('CUDA is not available in PyTorch. The model '
                  'will be initialized on CPU.')
            device = 'cpu'
        self.device = torch.device(device)

    def init_model(self, sizes, weights_path=None):
        self.model = self.model.to(self.device)
        # `sizes` is used here to set up the neighbor sampler, and it is also
        # used in every forward pass as the neighbor-sampling fan-out
        assert len(sizes) == self.model.encoder.num_layers, \
            'Number of sizes should be equal to the number of layers in the encoder.'
        self.sizes = sizes
        if not hasattr(self.data, 'loader'):
            self.data.get_neighbor_sampler(self.sizes)

        if weights_path is not None:
            self.model.load_state_dict(torch.load(weights_path, map_location=self.device))

    def init_training(self, neg_num, optim='Adam', lr=1e-5, smooth_par=0.75, **kwargs):
        if optim == 'Adam':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=lr)
        elif optim == 'SGD':
            self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        self.train_one_epoch = self._train_edge_batching
        self.neg_num = neg_num

        if not hasattr(self.data, 'pos_pairs'):
            assert 'pos_samples_path' in kwargs, 'The provided data does ' +\
                'not come with positive pairs, and we need a path to the ' +\
                'already selected positive samples. You can provide it through ' +\
                'input pos_samples_path.'
            include_nodes = kwargs.get('include_nodes', None)
            self.data.load_positive_pairs(kwargs['pos_samples_path'], include_nodes)

        if not hasattr(self.data, 'neg_sampler'):
            self.data.get_negative_sampler(smooth_par)

        if not hasattr(self.data, 'x_all'):
            self.data._fetch_node_features()

    def init_validation(self):
        if not hasattr(self.data, 'x_all'):
            self.data._fetch_node_features()

    def embed_some(self, sample_inds, b=100):
        """Used during training, when the embeddings of a batch of samples are needed."""
        quot, rem = np.divmod(len(sample_inds), b)
        Z = []
        for i in range(quot + 1):
            if i < quot:
                b_ids = sample_inds[i*b:(i+1)*b]
            elif rem > 0:
                b_ids = sample_inds[i*b:]
            else:
                # b divides len(sample_inds) evenly, so there is no tail batch
                break
            # neighbor-sampling for each sample
            _, n_id, adjs = self.data.train_loader.sample(b_ids)
            adjs = [adj.to(self.device) for adj in adjs]
            # get feature vectors through the neighbors sampled above
            batch_X = torch.from_numpy(self.data.get_node_features(n_id))
            batch_X = batch_X.to(torch.float).to(self.device)
            # the encoder's output as the embedding
            try:
                batch_Z = self.model.encoder(batch_X, adjs)
            except Exception:
                # drop into the debugger if the encoder fails on this batch
                pdb.set_trace()
            Z += [batch_Z]
        Z = torch.cat(Z, dim=0)
        return Z

    def embed_all(self):
        L = self.model.encoder.num_layers
        pbar = tqdm(total=self.data.n_x * L, position=0, leave=True)
        pbar.set_description('Evaluating')
        self.model.encoder.eval()
        # Inference is used in the evaluation stage (not in training), when the
        # embeddings of *all* nodes are computed. It proceeds layer by layer,
        # which is faster than the forward pass used for single batches during
        # training.
        with torch.no_grad():
            for i in range(L):
                xs = []
                for batch_size, n_id, adj in self.data.test_loader:
                    edge_index, _, size = adj.to(self.device)
                    if i == 0:
                        x = torch.from_numpy(self.data.get_node_features(n_id))
                        x = x.to(torch.float).to(self.device)
                    else:
                        x = x_all[n_id, :].to(self.device)
                    x_target = x[:size[1]]
                    x = self.model.encoder.convs[i]((x, x_target), edge_index)
                    if i != L - 1:
                        x = F.relu(x)
                    xs.append(x[:batch_size, :].cpu())
                    pbar.update(batch_size)
                x_all = torch.cat(xs, dim=0)
        pbar.close()
        return x_all

    def _train_edge_batching(self, ep, batch_size=5000):
        assert hasattr(self.data, 'pos_pairs'), 'Positive and negative ' + \
            'samples must be generated before starting the training'
        self.model.train()
        neg_num = self.neg_num
        torch.multiprocessing.set_sharing_strategy('file_system')
        pbar = tqdm(total=self.data.pos_pairs.shape[1], position=0, leave=True)
        pbar.set_description(f'Epoch {ep:02d}')
        total_loss = 0
        np.random.shuffle(self.data.pos_pairs.T)
        quot, rem = np.divmod(self.data.pos_pairs.shape[1], batch_size)
        for i in range(quot + 1):
            # positive mini-batch (size: batch_size)
            if i < quot:
                batch_pos_pairs = self.data.pos_pairs[:, i*batch_size:(i+1)*batch_size]
            elif rem > 0:
                batch_pos_pairs = self.data.pos_pairs[:, i*batch_size:]
            else:
                break
            batch_pos_samples, pos_edge_index = np.unique(batch_pos_pairs,
                                                          return_inverse=True)
            pos_edge_index = pos_edge_index.reshape(batch_pos_pairs.shape)

            # negative mini-batch (size: batch_size * neg_num)
            batch_neg_samples = self.data.neg_sampler.sample(
                torch.Size([neg_num * batch_pos_pairs.shape[1]]))
            neg_edge_index = np.array([
                np.repeat(pos_edge_index[0, :], neg_num),
                np.arange(pos_edge_index.max() + 1,
                          pos_edge_index.max() + len(batch_neg_samples) + 1)])

            # embeddings of the nodes involved in positive and negative edges
            self.optimizer.zero_grad()
            unodes = batch_pos_samples.tolist() + batch_neg_samples.tolist()
            Z = self.embed_some(unodes)

            # reconstruction loss
            pos_edge_index = torch.from_numpy(pos_edge_index).to(self.device)
            neg_edge_index = torch.from_numpy(neg_edge_index).to(self.device)
            loss = self.model.recon_loss(Z, pos_edge_index, neg_edge_index)
            loss.backward()
            self.optimizer.step()
            total_loss += float(loss)
            pbar.update(batch_size)
        pbar.close()
        loss = total_loss / (quot + 1)
        return loss

    def validate(self):
        self.model.eval()
        Z = self.embed_all()
        ents_Z = Z[:-1, :][self.data.selected_inds[:-1] >= self.data.nA, :].detach().numpy()
        prop_Z = Z[self.data.tags == 'prop', :].detach().numpy()
        scores = np.dot(ents_Z, prop_Z.T).squeeze()
        sorted_ents = self.data.selected_ents[np.argsort(-scores)]
        unstudied_sorted_ents = np.array(
            [x for x in sorted_ents if x not in self.data.studied_ents])
        preds = unstudied_sorted_ents[:50]
        prec = np.isin(preds, self.data.GT).sum() / len(preds)
        return prec
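# Illustrative driver loop for UnsGAE (assumptions: `data` is a wrapper exposing
# the loaders and metadata used above, the encoder has two layers, and positive
# pairs are cached at a hypothetical `pos_samples_path`); this is roughly how the
# pieces are meant to fit together, not code from the original file.
gae = UnsGAE(data, embed_dim=128, device='cuda')
gae.init_model(sizes=[10, 5])          # one neighbor fan-out per encoder layer
gae.init_training(neg_num=5, optim='Adam', lr=1e-5,
                  pos_samples_path='pos_pairs.npy')
gae.init_validation()

for ep in range(10):
    loss = gae.train_one_epoch(ep)
    prec = gae.validate()
    print(f'epoch {ep}: loss={loss:.4f}, precision@50={prec:.3f}')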
    pool_ratios,
    act,
    variational,
)
decoder = VariationalGraphDecoder(
    graph_out_channels,
    hidden_channels,
    node_out_channels,
    depth,
    sum_res,
    act,
)

# Hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
node_ae = node_ae.to(device)
encoder, decoder = encoder.to(device), decoder.to(device)

# Optimizer
optimizer = torch.optim.Adam(
    set(node_ae.parameters()) | set(encoder.parameters()) | set(decoder.parameters()),
    lr=0.01,
)


def train():
    node_ae.train()
    encoder.train()
    decoder.train()
    train_loss = 0.0
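# Design note (illustrative): passing the union of the three parameter sets to a
# single Adam instance, as above, trains node_ae, encoder, and decoder jointly.
# An order-preserving equivalent would be:
#
#     optimizer = torch.optim.Adam(
#         itertools.chain(node_ae.parameters(), encoder.parameters(), decoder.parameters()),
#         lr=0.01,
#     )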
def perturb_edges(data, name, remove_pct, add_pct, hidden_channels=16, epochs=400):
    if remove_pct == 0 and add_pct == 0:
        return
    try:
        cached = pickle.load(
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'rb'))
        print(f'Use cached edge augmentation for dataset {name}')
        if data.setting == 'inductive':
            data.train_edge_index = cached
        else:
            data.edge_index = cached
        return
    except FileNotFoundError:
        try:
            A_pred, adj_orig = pickle.load(
                open(f'{ROOT}/cache/edge/{name}.pt', 'rb'))
            A = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)
            data.edge_index, _ = from_scipy_sparse_matrix(A)
            pickle.dump(
                data.edge_index,
                open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
            return
        except FileNotFoundError:
            print(f'cache/edge/{name}_{remove_pct}_{add_pct}.pt not found! '
                  'Regenerating it now')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if data.setting == 'inductive':
        train_data = Data(x=data.train_x,
                          ori_x=data.ori_x,
                          edge_index=data.train_edge_index,
                          y=data.train_y)
    else:
        train_data = deepcopy(data)

    edge_index = deepcopy(train_data.edge_index)
    train_data = train_test_split_edges(train_data, val_ratio=0.1, test_ratio=0)
    num_features = train_data.ori_x.shape[1]
    model = GAE(GCNEncoder(num_features, hidden_channels))
    model = model.to(device)
    x = train_data.ori_x.to(device)
    train_pos_edge_index = train_data.train_pos_edge_index.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    best_val_auc = 0
    best_z = None
    for epoch in range(1, epochs + 1):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        loss.backward()
        optimizer.step()

        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            auc, ap = model.test(z, train_data.val_pos_edge_index,
                                 train_data.val_neg_edge_index)
        print('Val | Epoch: {:03d}, AUC: {:.4f}, AP: {:.4f}'.format(epoch, auc, ap))
        if auc > best_val_auc:
            best_val_auc = auc
            best_z = deepcopy(z)

    # score all node pairs with the embeddings from the best validation epoch
    A_pred = torch.sigmoid(torch.mm(best_z, best_z.T)).cpu().numpy()
    adj_orig = to_scipy_sparse_matrix(edge_index).asformat('csr')
    adj_pred = sample_graph_det(adj_orig, A_pred, remove_pct, add_pct)

    if data.setting == 'inductive':
        data.train_edge_index, _ = from_scipy_sparse_matrix(adj_pred)
    else:
        data.edge_index, _ = from_scipy_sparse_matrix(adj_pred)

    pickle.dump((A_pred, adj_orig), open(f'{ROOT}/cache/edge/{name}.pt', 'wb'))

    if data.setting == 'inductive':
        pickle.dump(
            data.train_edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
    else:
        pickle.dump(
            data.edge_index,
            open(f'{ROOT}/cache/edge/{name}_{remove_pct}_{add_pct}.pt', 'wb'))
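# Possible call site (illustrative only): `load_graph_dataset` is a hypothetical
# stand-in for whatever loader this repo uses, assumed to return a Data-like
# object with a `.setting` attribute, and remove_pct/add_pct are read here as
# percentages.
data = load_graph_dataset('cora')
perturb_edges(data, 'cora', remove_pct=10, add_pct=5)
# data.edge_index (or data.train_edge_index in the inductive setting) now holds
# the augmented graph, and the GAE link predictions are cached under {ROOT}/cache/edge/.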
def run_model(dataset, conf):
    # ## 1) Build the table graph
    # ### Table tokenization
    tokenized_tables, vocabulary, cell_dict, reversed_dictionary = corpus_tuple = create_corpus(
        dataset, include_attr=conf["add_attr"])
    if conf["shuffle_vocab"]:
        shuffled_vocab = shuffle_vocabulary(vocabulary)
    else:
        shuffled_vocab = None

    nodes = build_node_features(vocabulary)
    row_edges_index, row_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["row_edges_sample"],
        columns=False)
    col_edges_index, col_edges_weights = build_graph_edges(
        tokenized_tables,
        s_vocab=shuffled_vocab,
        sample_frac=conf["column_edges_sample"],
        columns=True)

    edges = torch.cat((row_edges_index, col_edges_index), dim=1)
    weights = torch.cat((row_edges_weights, col_edges_weights), dim=0)
    graph_data = Data(x=nodes, edge_index=edges, edge_attr=weights)

    # ## 2) Run the table auto-encoder model
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    loader = DataLoader(torch.arange(graph_data.num_nodes), batch_size=128, shuffle=True)
    graph_data = graph_data.to(device)

    # use the on-device copies of the node features and edges
    x, train_pos_edge_index = graph_data.x, graph_data.edge_index

    class Encoder(torch.nn.Module):
        def __init__(self, in_channels, out_channels):
            super(Encoder, self).__init__()
            self.conv1 = GCNConv(in_channels, 2 * out_channels, cached=True)
            self.conv_mu = GCNConv(2 * out_channels, out_channels, cached=True)
            self.conv_logvar = GCNConv(2 * out_channels, out_channels, cached=True)

        def forward(self, x, edge_index):
            x = F.relu(self.conv1(x, edge_index))
            return self.conv_mu(x, edge_index), self.conv_logvar(x, edge_index)

    channels = conf["vector_size"]
    enc = Encoder(graph_data.num_features, channels)
    # the encoder produces (mu, logvar), so it is wrapped in the variational model
    model = VGAE(enc)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    def train(model, optimizer, x, train_pos_edge_index):
        model.train()
        optimizer.zero_grad()
        z = model.encode(x, train_pos_edge_index)
        loss = model.recon_loss(z, train_pos_edge_index)
        # loss = model.kl_loss()
        loss.backward()
        optimizer.step()
        return loss

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = train(model, optimizer, x, train_pos_edge_index)
        losses.append(float(loss))
        print(epoch, loss)

    # ## 3) Extract the latent cell vectors and generate table vectors
    def get_cell_vectors(model, x, train_pos_edge_index):
        model.eval()
        with torch.no_grad():
            z = model.encode(x, train_pos_edge_index)
            cell_vectors = z.cpu().numpy()
        return z, cell_vectors

    z, cell_vectors = get_cell_vectors(model, x, train_pos_edge_index)

    vec_list = generate_table_vectors(cell_vectors, tokenized_tables, s_vocab=shuffled_vocab)

    # ## 4) Evaluate the model
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
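# A sketch of the configuration dict consumed by run_model (the keys are exactly
# those read via conf[...] above; the concrete values are illustrative assumptions).
conf = {
    "add_attr": False,           # include table attributes during tokenization
    "shuffle_vocab": False,      # optionally shuffle the vocabulary ids
    "row_edges_sample": 1.0,     # fraction of row co-occurrence edges to keep
    "column_edges_sample": 1.0,  # fraction of column co-occurrence edges to keep
    "vector_size": 64,           # latent dimension of the cell embeddings
    "epoch_num": 200,            # auto-encoder training epochs
}
# cell_vectors, vec_list, losses, score = run_model(dataset, conf)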
# Model
if not args.variational:
    if not args.linear:
        model = GAE(GCNEncoder(num_features, out_channels))
    else:
        model = GAE(LinearEncoder(num_features, out_channels))
else:
    if args.linear:
        model = VGAE(VariationalLinearEncoder(num_features, out_channels))
    else:
        model = VGAE(VariationalGCNEncoder(num_features, out_channels))

# Hardware
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model, data = model.to(device), data.to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    if args.constant:
        num_nodes = int(data.edge_index.max().item()) + 1
        x = torch.ones((num_nodes, num_features))
        x = x.to(device)
    else:
        x = data.x
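# Illustrative reconstruction of the CLI flags referenced above (the names are
# taken from the args.* accesses in this snippet; the defaults are assumptions):
#
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--variational', action='store_true')  # use VGAE instead of GAE
#     parser.add_argument('--linear', action='store_true')       # use the linear encoder variant
#     parser.add_argument('--constant', action='store_true')     # replace node features with all-ones
#     args = parser.parse_args()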