def __init__(self, ckpt_path, newModel=False):
    """Build a Node2Vec model over the dual graph and train or load it.

    Args:
        ckpt_path: checkpoint file to save to (``newModel=True``) or load from.
        newModel: when True, train from scratch for 20 epochs and save to
            ``ckpt_path``; otherwise restore weights from ``ckpt_path``.
    """
    self.dualGraph = readGraph()
    # Persist the node ordering so downstream consumers can map embedding
    # rows back to graph nodes.
    with open("dualGraphNodes.pkl", "wb") as f:
        pickle.dump(list(self.dualGraph.nodes), f)
    self.data = from_networkx(self.dualGraph)
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Persist the edge index for other modules that re-load "edge_index.pkl".
    with open("edge_index.pkl", "wb") as f:
        pickle.dump(self.data.edge_index, f)
    # Use the tensor directly instead of round-tripping it through the
    # pickle file that was just written (the original dumped and
    # immediately re-loaded the same object).
    edge_index = self.data.edge_index
    self.model = Node2Vec(edge_index, embedding_dim=32, walk_length=20,
                          context_size=10, walks_per_node=10,
                          num_negative_samples=1, p=1, q=1,
                          sparse=True).to(self.device)
    self.loader = self.model.loader(batch_size=128, shuffle=True,
                                    num_workers=0)
    self.optimizer = torch.optim.SparseAdam(list(self.model.parameters()),
                                            lr=0.01)
    if newModel:
        self.train(epochs=20)
        self.saveTo(ckpt_path)
    else:
        self.loadFrom(ckpt_path)
def train(self, data):
    """Train a GCN on ``data``; if the graph has no node features, first
    learn Node2Vec embeddings and use them as the feature matrix.

    Returns:
        Whatever ``generic_training_loop`` returns for the fitted model.
    """
    # Idiomatic truthiness test instead of `== False`.
    if not data.has_features:
        embedding_dim = 128
        embedder = Node2Vec(
            data.x.size()[0],  # num nodes
            embedding_dim,     # embedding dimension
            5,                 # walk length
            3,                 # context size
        )
        # First train the embedder, then use its embeddings as features.
        embedder = n2v_trainer(data, embedder, self.device, lr=0.1,
                               epochs=400)
        data.x = embedder.embedding.weight
    model = BenGCN(
        features_num=data.x.size()[1],
        num_class=int(max(data.y)) + 1,
        num_layers=2,
    )
    return generic_training_loop(data, model, self.device, lr=0.01)
def node2vec(edge_index):
    """Train a Node2Vec model on ``edge_index`` and return the learned
    embeddings as a numpy array of shape ``(num_nodes, 128)``."""
    embedding_dim = 128
    walk_length = 80
    context_size = 20
    walks_per_node = 10
    batch_size = 256
    lr = 0.01
    epochs = 5
    log_steps = 1

    device = torch.device(f'cuda:{0}' if torch.cuda.is_available() else 'cpu')

    model = Node2Vec(edge_index, embedding_dim, walk_length, context_size,
                     walks_per_node, sparse=True).to(device)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=lr)
    loader = model.loader(batch_size=batch_size, shuffle=True, num_workers=4)

    model.train()
    for epoch in range(1, epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            if step % log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')

    print(f'node2vec total params are '
          f'{sum(p.numel() for p in model.parameters())}')
    return model.embedding.weight.data.cpu().numpy()
def generate_node2vec_feature(self, data, epochs=20, num_features=64):
    """Train Node2Vec on the edge list in ``data['edge_file']`` and append
    the learned embeddings to the feature table.

    Args:
        data: dict with an 'edge_file' DataFrame (columns 'src_idx',
            'dst_idx') and a 'fea_table' DataFrame.
        epochs: number of training epochs.
        num_features: embedding dimensionality.

    Returns:
        ``data['fea_table']`` concatenated column-wise with the embedding
        matrix (one row per node).
    """
    # BUG FIX: the device was hard-coded to 'cuda', which crashes on
    # CPU-only machines; fall back to CPU when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    edge_index = data['edge_file'][['src_idx', 'dst_idx']].to_numpy()
    edge_index = sorted(edge_index, key=lambda d: d[0])
    edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
    model = Node2Vec(edge_index, embedding_dim=num_features, walk_length=20,
                     context_size=10, walks_per_node=10,
                     num_negative_samples=1, sparse=True).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01)

    def train():
        # One pass over all random-walk batches; returns the mean loss.
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(loader)

    for epoch in range(1, epochs + 1):
        loss = train()
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
    return pd.concat(
        [data['fea_table'], pd.DataFrame(model().detach().cpu().numpy())],
        axis=1)
def train_nn(self):
    """Fit a (dense-API) Node2Vec model on ``self.data`` for
    ``self.epochs`` epochs and return the embedding matrix for all nodes."""
    self.model = Node2Vec(self.data.num_nodes, embedding_dim=128,
                          walk_length=20, context_size=10, walks_per_node=10)
    self.model = self.model.to(self.device)
    self.data = self.data.to(self.device)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
    self.loader = DataLoader(torch.arange(self.data.num_nodes),
                             batch_size=128, shuffle=True)

    for epoch in range(1, self.epochs + 1):
        epoch_start = time.time()
        self.model.train()
        epoch_loss = 0
        for batch in self.loader:
            self.optimizer.zero_grad()
            loss = self.model.loss(self.data.edge_index,
                                   batch.to(self.device))
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
        epoch_loss = epoch_loss / len(self.loader)
        print("epoch: %d, time elapsed: %.2f, loss: %.5f"
              % (epoch, time.time() - epoch_start, epoch_loss))

    self.model.eval()
    with torch.no_grad():
        embeddings = self.model(
            torch.arange(self.data.num_nodes, device=self.device))
    return embeddings
def __init__(self, n2v_dim, attention_dim, feature_dim, embedding_dim,
             num_heads, output_dimension, windowsz=3):
    """Attention model combining Node2Vec contextual embeddings with
    categorical and continuous segment features.

    Args:
        n2v_dim: dimensionality of the node2vec vectors.
        attention_dim: projection size used by the attention layers.
        feature_dim: number of continuous input features.
        embedding_dim: per-categorical-feature embedding sizes (len >= 7).
        num_heads: number of attention heads.
        output_dimension: size of the final output.
        windowsz: sliding-window length over segments.
    """
    super(ConstGat, self).__init__()
    # Edge index is produced (pickled) by the graph-preprocessing step.
    # Context manager replaces the unclosed open/close triple.
    with open("edge_index.pkl", "rb") as f:
        edge_index = pickle.load(f)
    self.n2v = Node2Vec(edge_index, embedding_dim=32, walk_length=20,
                        context_size=10, walks_per_node=10,
                        num_negative_samples=1, p=1, q=1, sparse=True)
    self.linearContextual = nn.Linear(n2v_dim, attention_dim)
    self.embedding_dim = embedding_dim
    self.feature_dim = feature_dim
    self.num_heads = num_heads
    self.output_dimension = output_dimension
    self.background_dim = embedding_dim[1] + embedding_dim[2] + 1
    self.linearQ = nn.Linear(self.background_dim + n2v_dim, attention_dim)
    # embedding layers for 7 categorical features:
    # "road_type", "time_stage", "week_day", "lanes", "bridge",
    # "endpoint_u", "endpoint_v", "trip_id"; 0 represents Unknown
    # 0-21
    self.embedding_road_type = nn.Embedding(22, self.embedding_dim[0])
    # 0-6
    self.embedding_time_stage = nn.Embedding(7, self.embedding_dim[1])
    # 0-7
    self.embedding_week_day = nn.Embedding(8, self.embedding_dim[2])
    # 0-8
    self.embedding_lanes = nn.Embedding(9, self.embedding_dim[3])
    # 0-1
    self.embedding_bridge = nn.Embedding(2, self.embedding_dim[4])
    # 0-16
    self.embedding_endpoint_u = nn.Embedding(17, self.embedding_dim[5])
    self.embedding_endpoint_v = nn.Embedding(17, self.embedding_dim[6])
    self.selfattn = nn.MultiheadAttention(embed_dim=attention_dim,
                                          num_heads=self.num_heads,
                                          batch_first=True)
    self.traffic_dim = (embedding_dim[0] + sum(embedding_dim[3:])
                        + feature_dim - 1)
    self.linearTraffic = nn.Linear(self.traffic_dim, attention_dim)
    self.feed_forward = PositionwiseFeedForward(
        (2 * attention_dim + self.background_dim) * windowsz,
        self.output_dimension)
    self.activate = nn.ReLU()
def main():
    """Train Node2Vec on an OGB dataset (or just dump its adjacency
    matrix) and periodically persist the embedding."""
    parser = argparse.ArgumentParser(description='OGB (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--task', type=str, default='ogbn')
    parser.add_argument('--dataset', type=str, default='arxiv')
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=80)
    parser.add_argument('--context_size', type=int, default=20)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    parser.add_argument('--dump_adj_only', dest="dump_adj_only",
                        action="store_true", help="dump adj matrix for proX")
    parser.set_defaults(dump_adj_only=False)
    args = parser.parse_args()

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = create_dataset(name=f'{args.task}-{args.dataset}')
    data = dataset[0]
    if args.dataset == 'arxiv':
        data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    elif args.dataset == 'papers100M':
        # Randomly drop edges first to keep the undirected graph tractable.
        data.edge_index, _ = dropout_adj(data.edge_index,
                                         p=args.dropedge_rate,
                                         num_nodes=data.num_nodes)
        data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    if args.dump_adj_only:
        adj = to_scipy_sparse_matrix(data.edge_index)
        # BUG FIX: argparse defines --dataset, not --name; `args.name`
        # raised AttributeError whenever --dump_adj_only was passed.
        sp.save_npz(f'data/{args.dataset}-adj.npz', adj)
        return

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)
    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in enumerate(loader):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')
            if (i + 1) % 100 == 0:  # Save model every 100 steps.
                save_embedding(model, args.embedding_dim, args.dataset,
                               args.context_size)
        save_embedding(model, args.embedding_dim, args.dataset,
                       args.context_size)
def main():
    """Train Node2Vec on Cora, report per-epoch loss/accuracy, and
    visualise the final embeddings with t-SNE."""
    dataset = 'Cora'
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset)
    dataset = Planetoid(path, dataset)
    data = dataset[0]

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=20,
                     context_size=10, walks_per_node=10,
                     num_negative_samples=1, p=1, q=1,
                     sparse=True).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    def train():
        # One optimisation pass over all walk batches; returns mean loss.
        model.train()
        running = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            batch_loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()
        return running / len(loader)

    @torch.no_grad()
    def test():
        # Fit a classifier on train-mask embeddings, score on test mask.
        model.eval()
        z = model()
        return model.test(z[data.train_mask], data.y[data.train_mask],
                          z[data.test_mask], data.y[data.test_mask],
                          max_iter=150)

    for epoch in range(1, 101):
        loss = train()
        acc = test()
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, Acc: {acc:.4f}')

    @torch.no_grad()
    def plot_points(colors):
        # Project embeddings to 2-D with t-SNE and scatter by class label.
        model.eval()
        z = model(torch.arange(data.num_nodes, device=device))
        z = TSNE(n_components=2).fit_transform(z.cpu().numpy())
        y = data.y.cpu().numpy()

        plt.figure(figsize=(8, 8))
        for i in range(dataset.num_classes):
            plt.scatter(z[y == i, 0], z[y == i, 1], s=20, color=colors[i])
        plt.axis('off')
        plt.show()

    colors = [
        '#ffc0cb', '#bada55', '#008080', '#420420', '#7fe5f0', '#065535',
        '#ffd700'
    ]
    plot_points(colors)
def n2v(edge_list, node2id, round_id, init_dict=None, embedding_dim=128,
        walk_length=10, context_size=5, walks_per_node=10, tol=1e-4,
        verbose=False, epochs=100):
    """Train Node2Vec on ``edge_list`` (optionally warm-started from
    ``init_dict``) with loss-plateau early stopping.

    Args:
        edge_list: iterable of (src, dst) index pairs.
        node2id: mapping node -> row index in the embedding matrix.
        round_id: training round; the learning rate decays as 1/(round+1).
        init_dict: optional node -> vector mapping used to initialise
            embeddings; missing nodes keep a random initialisation.
        tol: relative loss-improvement threshold for early stopping.

    Returns:
        numpy array of learned embeddings, one row per node.
    """
    edge_index = torch.tensor(np.array(edge_list).T, dtype=torch.long)
    data = Data(edge_index=edge_index)
    model = Node2Vec(data.edge_index, embedding_dim=embedding_dim,
                     walk_length=walk_length, context_size=context_size,
                     walks_per_node=walks_per_node, sparse=True)
    if init_dict is not None:
        miss_nodes = []
        X = np.random.randn(len(node2id), embedding_dim)
        for node, idx in node2id.items():
            try:
                X[idx] = init_dict[node]
            except KeyError:  # narrowed from a bare except
                miss_nodes.append(node)
        print("Missing {} nodes: {} ".format(len(miss_nodes), miss_nodes))
        # BUG FIX: `model.embedding` is an nn.Embedding module; assigning
        # to `.data` on the module itself never touched the weights. Write
        # into the weight tensor and cast to float32 to match its dtype.
        model.embedding.weight.data = torch.tensor(X, dtype=torch.float)
    model = model.to(device)
    loader = model.loader(batch_size=32, shuffle=True)
    optimizer = torch.optim.SparseAdam(model.parameters(),
                                       lr=0.01 / (int(round_id) + 1))
    best_loss = 10e8
    n_step_without_progress = 0
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        # Same loop either way; verbose just wraps the loader in tqdm.
        batches = (tqdm(loader, desc="Train epoch {}".format(epoch + 1))
                   if verbose else loader)
        for pos_rw, neg_rw in batches:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            total_loss += loss.item()
            optimizer.step()
        # Early stop after 3 epochs without a relative improvement > tol.
        if (best_loss - total_loss) / best_loss < tol:
            n_step_without_progress += 1
            if n_step_without_progress == 3:
                break
        else:
            best_loss = total_loss
            n_step_without_progress = 0
        if verbose:
            print("Epoch {}: loss {} best loss {} #step without progress {}"
                  .format(epoch, total_loss, best_loss,
                          n_step_without_progress))
    model.eval()
    out = model().cpu().detach().numpy()
    return out
def main():
    """Train Node2Vec on Planetoid-Citeseer and periodically persist the
    embedding via ``save_embedding``."""
    parser = argparse.ArgumentParser(description='OGBN-citeseer (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=256)
    parser.add_argument('--walk_length', type=int, default=80)
    parser.add_argument('--context_size', type=int, default=20)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--log_steps', type=int, default=1)
    args = parser.parse_args()

    device = torch.device(f'cuda:{args.device}'
                          if torch.cuda.is_available() else 'cpu')

    # root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'arxiv')
    data_dir = 'planetoid'
    dataset = pyg.datasets.Planetoid(name='Citeseer', root=data_dir)
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)
    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')
            if step % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)
def main(): parser = argparse.ArgumentParser(description="OGBL-Citation2 (Node2Vec)") parser.add_argument("--device", type=int, default=0) parser.add_argument("--embedding_dim", type=int, default=128) parser.add_argument("--walk_length", type=int, default=40) parser.add_argument("--context_size", type=int, default=20) parser.add_argument("--walks_per_node", type=int, default=10) parser.add_argument("--batch_size", type=int, default=256) parser.add_argument("--lr", type=float, default=0.01) parser.add_argument("--epochs", type=int, default=1) parser.add_argument("--log_steps", type=int, default=1) args = parser.parse_args() device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu" device = torch.device(device) dataset = PygLinkPropPredDataset(name="ogbl-citation2") data = dataset[0] data.edge_index = to_undirected(data.edge_index, data.num_nodes) model = Node2Vec( data.edge_index, args.embedding_dim, args.walk_length, args.context_size, args.walks_per_node, sparse=True, ).to(device) loader = model.loader(batch_size=args.batch_size, shuffle=True, num_workers=4) optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr) model.train() for epoch in range(1, args.epochs + 1): for i, (pos_rw, neg_rw) in enumerate(loader): optimizer.zero_grad() loss = model.loss(pos_rw.to(device), neg_rw.to(device)) loss.backward() optimizer.step() if (i + 1) % args.log_steps == 0: print(f"Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, " f"Loss: {loss:.4f}") if (i + 1) % 100 == 0: # Save model every 100 steps. save_embedding(model) save_embedding(model)
def main():
    """Train Node2Vec on ogbn-arxiv and periodically persist the
    embedding via ``save_embedding``."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=80)
    parser.add_argument('--context_size', type=int, default=20)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=5)
    parser.add_argument('--log_steps', type=int, default=1)
    args = parser.parse_args()

    device = torch.device(f'cuda:{args.device}'
                          if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(
        name='ogbn-arxiv', root='/srv/scratch/ogb/datasets/nodeproppred')
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)
    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')
            if step % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)
def main():
    """Train (dense-API) Node2Vec on ogbn-proteins and periodically
    persist the embedding via ``save_embedding``."""
    parser = argparse.ArgumentParser(description='OGBN-Proteins (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=80)
    parser.add_argument('--context_size', type=int, default=20)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1)
    parser.add_argument('--log_steps', type=int, default=1)
    args = parser.parse_args()

    device = torch.device(f'cuda:{args.device}'
                          if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    data = dataset[0]

    edge_index = data.edge_index.to(device)
    # Sort edges lexicographically by (src, dst).
    perm = torch.argsort(edge_index[0] * data.num_nodes + edge_index[1])
    edge_index = edge_index[:, perm]

    model = Node2Vec(data.num_nodes, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    loader = DataLoader(torch.arange(data.num_nodes),
                        batch_size=args.batch_size, shuffle=True)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for step, subset in enumerate(loader, start=1):
            optimizer.zero_grad()
            loss = model.loss(edge_index, subset.to(edge_index.device))
            loss.backward()
            optimizer.step()

            if step % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {step:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')
            if step % 100 == 0:  # Save model every 100 steps.
                save_embedding(model)
        save_embedding(model)
def __init__(self, num_nodes, embedding_dim=16, walk_length=5,
             context_size=5, walks_per_node=1, num_layers=2, hidden=32,
             features_num=16, num_class=2):
    """GNN whose inputs are node features concatenated with Node2Vec
    embeddings.

    The parent network is sized for ``features_num + embedding_dim``
    input channels; ``self.n2v`` supplies the embedding part.
    """
    super().__init__(num_layers, hidden, features_num + embedding_dim,
                     num_class)
    self.n2v = Node2Vec(num_nodes, embedding_dim, walk_length, context_size,
                        walks_per_node=walks_per_node)
def test_node2vec():
    """Smoke-test Node2Vec: repr, forward shape, walk sampling, loss, and
    the built-in classification evaluation helper."""
    edge_index = torch.tensor([[0, 1, 1, 2], [1, 0, 2, 1]])

    model = Node2Vec(edge_index, embedding_dim=16, walk_length=2,
                     context_size=2)
    # Idiom fix: call repr() instead of the __repr__ dunder directly.
    assert repr(model) == 'Node2Vec(3, 16)'

    z = model(torch.arange(3))
    assert z.size() == (3, 16)

    pos_rw, neg_rw = model.sample(torch.arange(3))
    loss = model.loss(pos_rw, neg_rw)
    assert 0 <= loss.item()

    acc = model.test(torch.ones(20, 16), torch.randint(10, (20, )),
                     torch.ones(20, 16), torch.randint(10, (20, )))
    assert 0 <= acc and acc <= 1
def __init__(self, feature_dim, embedding_dim, num_heads, output_dimension,
             n2v_dim, window_size, attention_dim=64):
    """Self-attention model over a sliding window of road segments,
    combining node2vec vectors with categorical/continuous features.

    Args:
        feature_dim: number of continuous input features.
        embedding_dim: per-categorical-feature embedding sizes (len >= 7).
        num_heads: number of attention heads.
        output_dimension: size of the final linear output.
        n2v_dim: dimensionality of the node2vec vectors.
        window_size: sliding-window length; the middle element is tracked
            via ``self.middleOfTheWindow``.
        attention_dim: initial value only; overwritten below with the
            total concatenated embedding size.
    """
    super(Pigat, self).__init__()
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Edge index is produced (pickled) by the graph-preprocessing step.
    # Context manager replaces the unclosed open/close triple.
    with open("edge_index.pkl", "rb") as f:
        edge_index = pickle.load(f)
    self.n2v = Node2Vec(edge_index, embedding_dim=32, walk_length=20,
                        context_size=10, walks_per_node=10,
                        num_negative_samples=1, p=1, q=1, sparse=True)
    self.attention_dim = attention_dim
    self.embedding_dim = embedding_dim
    self.feature_dim = feature_dim
    self.total_embed_dim = (self.feature_dim + sum(self.embedding_dim)
                            + n2v_dim)
    self.output_dimension = output_dimension
    self.num_heads = num_heads
    # embedding layers for 7 categorical features:
    # "road_type", "time_stage", "week_day", "lanes", "bridge",
    # "endpoint_u", "endpoint_v", "trip_id"; 0 represents Unknown
    # 0-21
    self.embedding_road_type = nn.Embedding(22, self.embedding_dim[0])
    # 0-6
    self.embedding_time_stage = nn.Embedding(7, self.embedding_dim[1])
    # 0-7
    self.embedding_week_day = nn.Embedding(8, self.embedding_dim[2])
    # 0-8
    self.embedding_lanes = nn.Embedding(9, self.embedding_dim[3])
    # 0-1
    self.embedding_bridge = nn.Embedding(2, self.embedding_dim[4])
    # 0-16
    self.embedding_endpoint_u = nn.Embedding(17, self.embedding_dim[5])
    self.embedding_endpoint_v = nn.Embedding(17, self.embedding_dim[6])
    # Attention operates directly on the full concatenated embedding.
    self.attention_dim = self.total_embed_dim
    self.selfattn = nn.MultiheadAttention(embed_dim=self.attention_dim,
                                          num_heads=self.num_heads)
    self.norm = LayerNorm(self.attention_dim)
    self.feed_forward = PositionwiseFeedForward(self.attention_dim)
    self.linear = nn.Linear(self.attention_dim, self.output_dimension)
    self.activate = nn.Softplus()
    self.middleOfTheWindow = window_size // 2
def main_node2vec(data):
    """Train a (dense-API) Node2Vec model on ``data`` for 100 steps and
    return the embeddings of all nodes on the CPU."""
    torch.backends.cudnn.deterministic = True
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    num_nodes = data.x.size(0)
    model = Node2Vec(num_nodes, embedding_dim=64, walk_length=10,
                     context_size=10, walks_per_node=10)
    model = model.to(device)
    data = data.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005,
                                 weight_decay=5e-4)
    for i in range(100):
        model.train()
        optimizer.zero_grad()
        loss = model.loss(data.edge_index)
        loss.backward()
        optimizer.step()
    # Idiom fix: torch.arange replaces building a Python list of indices
    # and converting it; same values (int64), allocated on `device`.
    node_index = torch.arange(num_nodes, device=device)
    return model.forward(node_index).cpu()
def train(self, data):
    """Train a GraphSAGE model on ``data``; when the graph lacks node
    features, bootstrap them with quickly-trained Node2Vec embeddings.

    Returns:
        Whatever ``generic_training_loop`` returns for the fitted model.
    """
    # Idiomatic truthiness test instead of `== False`.
    if not data.has_features:
        embedding_dim = 128
        embedder = Node2Vec(
            data.x.size()[0],  # num nodes
            embedding_dim,     # embedding dimension
            7,                 # walk length
            3,                 # context size
        )
        # Train the embedder with a higher learning rate, because this
        # stage is meant to be quick and dirty.
        embedder = n2v_trainer(data, embedder, self.device, lr=0.1)
        # Then use the n2v embeddings as features.
        data.x = embedder.embedding.weight
    model = BenSAGE(features_num=data.x.size()[1],
                    num_class=int(max(data.y)) + 1,
                    num_layers=2)
    return generic_training_loop(data, model, self.device)
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import Node2Vec

# Load the Cora citation graph from ../data/Cora.
dataset = 'Cora'
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
dataset = Planetoid(path, dataset)
data = dataset[0]

# Mini-batches of node indices for the (dense-API) Node2Vec loss.
loader = DataLoader(torch.arange(data.num_nodes), batch_size=128,
                    shuffle=True)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Node2Vec(data.num_nodes, embedding_dim=128, walk_length=20,
                 context_size=10, walks_per_node=10)
model, data = model.to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    # One full pass over all node batches, accumulating the skip-gram loss.
    # NOTE(review): no return statement is visible here -- the function
    # body appears truncated in this chunk (a mean-loss return is
    # presumably missing); confirm against the full file.
    model.train()
    total_loss = 0
    for subset in loader:
        optimizer.zero_grad()
        loss = model.loss(data.edge_index, subset.to(device))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
import onnxruntime from sklearn.manifold import TSNE from torch_geometric.datasets import Planetoid from torch_geometric.nn import Node2Vec dataset = 'Cora' path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset) dataset = Planetoid(path, dataset) data = dataset[0] device = 'cuda' if torch.cuda.is_available() else 'cpu' device = 'cpu' model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=20, context_size=10, walks_per_node=10, num_negative_samples=1, sparse=True).to(device) loader = model.loader(batch_size=128, shuffle=True, num_workers=4) optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01) def export_to_onnx_pt(model, data, use_dynamic=True): input_names = ["input_1"] output_names = ["output1"] batch = torch.arange(data.num_nodes) if use_dynamic: torch_out = torch.onnx.export( model, # model being run
def node2vec(fp, PARAMS):
    """Generate a node2vec embedding for the graph stored under ``fp``.

    Args:
        fp (str): root path of the data directory.
        PARAMS (dict): node2vec configuration; keys include GRAPH_NAME,
            EMBEDDING_DIM, WALK_LENGTH, CONTEXT_SIZE, WALKS_PER_NODE, P, Q,
            LEARNING_RATE, BATCH_SIZE, NUM_EPOCH, CUDA and EMBEDDING_NAME.

    Returns:
        np.array: embedding rows for the posts selected by ``post_indx``.
    """
    graph_path = osp.join(fp, 'interim', 'graph', PARAMS['GRAPH_NAME'])
    N = io.loadmat(graph_path)['N']
    edge_idx, x = from_scipy_sparse_matrix(N)
    post_indx = io.loadmat(graph_path)['post_indx'].reshape(-1, )
    data = Data(x=x, edge_index=edge_idx)

    if PARAMS['CUDA']:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    else:
        device = 'cpu'

    model = Node2Vec(data.edge_index,
                     embedding_dim=PARAMS['EMBEDDING_DIM'],
                     walk_length=PARAMS['WALK_LENGTH'],
                     context_size=PARAMS['CONTEXT_SIZE'],
                     walks_per_node=PARAMS['WALKS_PER_NODE'],
                     p=PARAMS['P'], q=PARAMS['Q'],
                     sparse=True).to(device)
    loader = model.loader(batch_size=PARAMS['BATCH_SIZE'], shuffle=True,
                          num_workers=8)
    optimizer = torch.optim.SparseAdam(model.parameters(),
                                       lr=PARAMS['LEARNING_RATE'])

    def run_epoch():
        # One pass over all random-walk batches; returns the mean loss.
        model.train()
        running = 0
        for pos_rw, neg_rw in tqdm(loader):
            optimizer.zero_grad()
            batch_loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            batch_loss.backward()
            optimizer.step()
            running += batch_loss.item()
        return running / len(loader)

    print('number of nodes to be embedded {}'.format(len(post_indx)))
    print('Start Node2vec Embedding Process with Following Parameters:')
    print(PARAMS)

    losses = []
    for epoch in range(1, PARAMS['NUM_EPOCH'] + 1):
        loss = run_epoch()
        losses.append(loss)
        print('Epoch: {:02d}, Node2vec Loss: {:.4f}'.format(epoch, loss))

    model.eval()
    with torch.no_grad():
        z = model()

    if not os.path.exists(os.path.join(fp, 'processed', 'node2vec')):
        os.makedirs(os.path.join(fp, 'processed', 'node2vec'),
                    exist_ok=True)
    with open(osp.join(fp, 'processed', 'node2vec',
                       PARAMS['EMBEDDING_NAME'] + 'log.json'), 'w') as f:
        json.dump({'loss': losses}, f)

    z = z.detach().cpu().numpy()[post_indx, :]
    np.save(osp.join(fp, 'processed', 'node2vec', PARAMS['EMBEDDING_NAME']),
            z)
    print('successfully saved embedding')
    return z
def main():
    """Train Node2Vec on ogbn-papers100M and periodically dump the data
    dict (model state + splits) via ``save_data_dict``."""
    parser = argparse.ArgumentParser(description='OGBN-Papers100M (Node2Vec)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--embedding_dim', type=int, default=128)
    parser.add_argument('--walk_length', type=int, default=20)
    parser.add_argument('--context_size', type=int, default=10)
    parser.add_argument('--walks_per_node', type=int, default=10)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=2)
    parser.add_argument('--log_steps', type=int, default=10)
    parser.add_argument('--dropedge_rate', type=float, default=0.4)
    args = parser.parse_args()

    device = torch.device(f'cuda:{args.device}'
                          if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-papers100M')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    print('Making the graph undirected.')
    # Randomly drop some edges to avoid segmentation fault.
    data.edge_index, _ = dropout_adj(data.edge_index, p=args.dropedge_rate,
                                     num_nodes=data.num_nodes)
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)
    filename = 'data_dict.pt'
    print(data)

    model = Node2Vec(data.edge_index, args.embedding_dim, args.walk_length,
                     args.context_size, args.walks_per_node,
                     sparse=True).to(device)
    loader = model.loader(batch_size=args.batch_size, shuffle=True,
                          num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=args.lr)

    print('Saving data_dict before training...')
    save_data_dict(model, data, split_idx, save_file=filename)

    model.train()
    for epoch in range(1, args.epochs + 1):
        for i, (pos_rw, neg_rw) in tqdm(enumerate(loader)):
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()

            if (i + 1) % args.log_steps == 0:
                print(f'Epoch: {epoch:02d}, Step: {i+1:03d}/{len(loader)}, '
                      f'Loss: {loss:.4f}')
            if (i + 1) % 1000 == 0:  # Save model every 1000 steps.
                print('Saving data dict...')
                save_data_dict(model, data, split_idx, save_file=filename)
        print('Saving data dict...')
        save_data_dict(model, data, split_idx, save_file=filename)
def main(_):
    """Train Node2Vec for link prediction and append per-epoch
    ``epoch,time,accuracy`` (ROC-AUC) rows to an output CSV."""
    if not os.path.exists(FLAGS.output_dir):
        os.makedirs(FLAGS.output_dir)
    start = time.time()
    outfile = os.path.join(FLAGS.output_dir,
                           '%s_%i_%i' % (FLAGS.dataset, FLAGS.dim, FLAGS.C))
    if FLAGS.run:
        outfile += '_' + FLAGS.run
    device = 'cuda'
    main_directory = os.path.expanduser(FLAGS.datasets_dir)
    dataset_dir = os.path.join(main_directory, FLAGS.dataset)
    if not os.path.exists(dataset_dir):
        print('Dataset not found ' + FLAGS.dataset)
        # BUG FIX: listing the *missing* dataset_dir raised
        # FileNotFoundError and masked the error message; list the
        # datasets that are actually available instead.
        print(', '.join(os.listdir(main_directory)))
        exit(-1)
    graph_file = os.path.join(dataset_dir, 'train.txt.npy')
    edges = np.load(graph_file)
    # Symmetrize: stack each edge in both directions -> shape (2, 2E).
    pyg_edges = np.concatenate([edges, edges[:, ::-1]], axis=0).T
    pyg_edges = torch.from_numpy(np.array(pyg_edges, dtype='int64'))

    # np.load accepts paths directly; the original passed unclosed file
    # objects, leaking two file handles.
    test_neg_file = os.path.join(dataset_dir, 'test.neg.txt.npy')
    test_neg_arr = np.load(test_neg_file)
    test_pos_file = os.path.join(dataset_dir, 'test.txt.npy')
    test_pos_arr = np.load(test_pos_file)

    model = Node2Vec(pyg_edges, embedding_dim=FLAGS.dim, walk_length=FLAGS.C,
                     context_size=FLAGS.C, walks_per_node=20,
                     num_negative_samples=1, sparse=True).to(device)
    loader = model.loader(batch_size=128, shuffle=True, num_workers=4)
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

    def train():
        # One pass over all walk batches; returns the mean loss.
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        return total_loss / len(loader)

    def test():
        # Score test pairs by embedding dot products; report ROC-AUC.
        model.eval()
        embeds = model()
        npe = embeds.detach().cpu().numpy()
        test_scores = (npe[test_pos_arr[:, 0]]
                       * npe[test_pos_arr[:, 1]]).sum(-1)
        test_neg_scores = (npe[test_neg_arr[:, 0]]
                           * npe[test_neg_arr[:, 1]]).sum(-1)
        test_y = [0] * len(test_neg_scores) + [1] * len(test_scores)
        test_y_pred = np.concatenate([test_neg_scores, test_scores], 0)
        test_accuracy = metrics.roc_auc_score(test_y, test_y_pred)
        return test_accuracy

    header = 'epoch,time,accuracy'
    with open(outfile, 'w') as fout:
        print('writing to ' + outfile)
        fout.write(header + '\n')
        print(header)
        for epoch in range(1, 100):  # Over 100, it starts overfitting.
            loss = train()
            acc = test()
            line = '%i,%f,%f' % (epoch, time.time() - start, acc)
            print(line)
            fout.write(line + '\n')
# We need to add loop edges for vertices with no outgoing edges. # https://github.com/rusty1s/pytorch_cluster/issues/45 index, counts = np.unique(es.src, return_counts=True) degree = np.zeros(num_nodes) degree[index] = counts deadends = (degree == 0).nonzero()[0] # Also convert from uint32 to int64 for PyTorch. srcs = np.concatenate((es.src, deadends)).astype('int64') dsts = np.concatenate((es.dst, deadends)).astype('int64') # Configure Node2Vec. edges = torch.tensor([srcs, dsts]).to(device) model = Node2Vec(edges, num_nodes=num_nodes, embedding_dim=op.params['dimensions'], walk_length=op.params['walkLength'], context_size=op.params['contextSize'], walks_per_node=op.params['walksPerNode']).to(device) loader = model.loader(batch_size=128, shuffle=True) optimizer = torch.optim.Adam(model.parameters(), lr=0.01) # Train model. for epoch in range(op.params['iterations']): model.train() total_loss = 0 for pos_rw, neg_rw in loader: optimizer.zero_grad() loss = model.loss(pos_rw, neg_rw) loss.backward() optimizer.step() total_loss += loss.item()
try: id2 = node2id[node2] except: id2 = len(node2id) node2id[node2] = id2 edge_list.add((id1, id2)) # edge_list.add((id2, id1)) except: pass edge_index = torch.tensor(np.array(edge_list).T, dtype=torch.long) data = Data(edge_index=edge_index) model = Node2Vec(data.edge_index, embedding_dim=128, walk_length=4, context_size=2, walks_per_node=2, sparse=True).to(device) loader = model.loader(batch_size=2000, shuffle=True, num_workers=12) optimizer = torch.optim.SparseAdam(model.parameters(), lr=0.01) for epoch in range(EPOCHS): model.train() # total_loss = 0 for pos_rw, neg_rw in loader: optimizer.zero_grad() loss = model.loss(pos_rw.to(device), neg_rw.to(device)) loss.backward() optimizer.step() # total_loss += loss.item() # total_loss = total_loss / len(loader)
def __init__(self, A):
    """Set up the two-stage pipeline: a Node2Vec embedder over the graph
    described by `A`, followed by an SVC classifier on the embeddings.

    Args:
        A: graph container whose length gives the node count
           (presumably an adjacency structure — TODO confirm with caller).
    """
    node_count = len(A)
    # `params` is a Node2Vec hyperparameter dict defined at module level
    # (presumably — not visible in this chunk).
    self.n2v = Node2Vec(node_count, **params)
    self.svm = SVC()
def run_model(dataset, conf):
    """End-to-end table-embedding pipeline.

    Builds a cell co-occurrence graph from tokenized tables, trains a
    Node2Vec model over it, turns the learned cell vectors into per-table
    vectors, and evaluates them.

    Args:
        dataset: input tables (schema defined by `create_corpus` upstream).
        conf: dict of pipeline settings (sampling fractions, Node2Vec
              hyperparameters, epoch count, etc.).

    Returns:
        Tuple of (cell_vectors, vec_list, losses, result_score).
    """
    # --- 1) Build the table graph -------------------------------------
    tables, vocab, _cells, _rev = corpus = create_corpus(
        dataset, include_attr=conf["add_attr"])
    s_vocab = shuffle_vocabulary(vocab) if conf["shuffle_vocab"] == True else None
    node_feats = build_node_features(vocab)
    # Row-wise and column-wise co-occurrence edges, concatenated into one graph.
    row_idx, row_w = build_graph_edges(
        tables, s_vocab=s_vocab,
        sample_frac=conf["row_edges_sample"], columns=False)
    col_idx, col_w = build_graph_edges(
        tables, s_vocab=s_vocab,
        sample_frac=conf["column_edges_sample"], columns=True)
    graph = Data(x=node_feats,
                 edge_index=torch.cat((row_idx, col_idx), dim=1),
                 edge_attr=torch.cat((row_w, col_w), dim=0))

    # --- 2) Train the Node2Vec model ----------------------------------
    dev = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Batches of node indices; each batch drives one walk-sampling step.
    batches = DataLoader(torch.arange(graph.num_nodes),
                         batch_size=128, shuffle=True)
    graph = graph.to(dev)
    n2v = Node2Vec(graph.num_nodes,
                   embedding_dim=conf["vector_size"],
                   walk_length=conf["n2v_walk_length"],
                   context_size=conf["n2v_context_size"],
                   walks_per_node=conf["n2v_walks_per_node"]).to(dev)
    opt = torch.optim.Adam(n2v.parameters(), lr=0.01)

    def run_epoch():
        # One optimization pass over all node batches; returns mean batch loss.
        n2v.train()
        acc = 0
        for subset in batches:
            opt.zero_grad()
            batch_loss = n2v.loss(graph.edge_index, subset.to(dev))
            batch_loss.backward()
            opt.step()
            acc += batch_loss.item()
        return acc / len(batches)

    losses = []
    for epoch in range(conf["epoch_num"]):
        loss = run_epoch()
        print('Epoch: {:02d}, Loss: {:.4f}'.format(epoch, loss))
        losses.append(float(loss))

    # --- 3) Extract cell vectors, build table vectors, evaluate -------
    n2v.eval()
    with torch.no_grad():
        cell_vectors = n2v(torch.arange(graph.num_nodes, device=dev)).cpu().numpy()
    vec_list = generate_table_vectors(cell_vectors, tables, s_vocab=s_vocab)
    result_score = evaluate_model(dataset, vec_list, k=5)
    return cell_vectors, vec_list, losses, result_score
def train(self, data, start_time, time_budget):
    """Train a GCN/GAT node classifier on `data` within `time_budget` seconds.

    Pipeline: optionally learn Node2Vec embeddings and append them (and
    precomputed graph statistics) to the node features, choose
    hyperparameters heuristically or via grid search, then run an
    early-stopping training loop that checkpoints the best-validation model
    and restarts from it with a decayed learning rate when progress stalls.

    Args:
        data: graph object with `edge_index`, `x`, `y`, train/val masks and
            custom attributes `has_features`, `graph_data`, `weighted_loss`
            (presumably set by upstream preprocessing — TODO confirm schema).
        start_time: `time.time()` captured when the budget clock started.
        time_budget: total seconds allotted; training stops early once fewer
            than roughly MIN_TIME + one-epoch seconds remain.

    Returns:
        The trained model, reloaded from the best checkpoint if stopped early.
    """
    ADD_N2V = False
    ADD_GRAPH_FEATS = True
    MIN_TIME = 3  # Stop training loop early if less than this many seconds remain
    # Graph data
    avg_degree = degree(data.edge_index[0], data.x.size()[0]).mean()
    # Add n2v embeddings to features if there are an order of magnitude more
    # edges than there are features
    if int(log(data.x.size()[0], 10)) < int(
            log(data.edge_index.size()[1], 10)):
        ADD_N2V = True
    # Hyperparamters
    # NOTE(review): `train_epochs` and `num_layers` are never read below.
    train_epochs = 1000
    num_layers = 2  # gcn layers
    # Different algorithms for the number of hidden dims for each
    if data.has_features:
        hidden = min([int(max(data.y) + 1)**2, 128])
        attn_heads = 'N/a'
    else:
        attn_heads = min([int(log(max(data.y) + 1)) + 2, 4])
        hidden = (min([int(max(data.y) + 1)**2, 32]) // attn_heads) + 1
    early_stopping = True
    val_patience = 100  # how long validation loss can increase before we stop
    # Use heuristic-based hyperparams if too many edges to handle
    simplified = True if data.edge_index.size()[1] > 1e6 else False
    print('Hidden dimensions: %d' % hidden)
    print('Attention heads: %s' % str(attn_heads))
    if not data.has_features or ADD_N2V:
        # Requires at least len(class) dimensions, but give it a little more
        embedding_dim = 128 + int(avg_degree**(1 / 2))
        # The larger the avg degree, the less distant walks matter
        # Of course, a minimum is still important
        context_size = int(log(data.edge_index.size()[1]) / avg_degree)
        context_size = context_size if context_size >= 3 else 3
        # We should look at at least 1 context per walk
        walk_len = context_size + 1
        print('Embedding dim: %d\tWalk Len: %d\tContext size: %d' %
              (embedding_dim, walk_len, context_size))
        embedder = Node2Vec(
            data.x.size()[0],  # Num nodes
            embedding_dim,  # Embedding dimesion
            walk_len,  # Walk len
            context_size,  # Context size
            num_negative_samples=context_size**2)
        # First, train embedder
        # Use a higher learning rate, bc this part is
        # meant to be kind of "quick and dirty"
        embedder = self.n2v_trainer(
            data,
            embedder,
            lr=0.05,
            patience=50  # lower patience when time is important
        )
        # Training moves data to GPU. Have to put it back before manipulating
        # it further.
        data = data.to('cpu')
        embedder = embedder.to('cpu')
        if data.has_features and ADD_N2V:
            data.x = torch.cat(
                (self.var_thresh(data.x), embedder.embedding.weight), axis=1)
        else:
            # Then use n2v embeddings as features
            data.x = embedder.embedding.weight
        # Remove reference to embedder to free up memory on GPU
        del embedder
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    else:
        print('Num feature before: %d' % data.x.size()[1])
        data.x = self.var_thresh(data.x)
        print('Num features after: %d' % data.x.size()[1])
    if ADD_GRAPH_FEATS:
        print('Num feature before: %d' % data.x.size()[1])
        data.x = torch.cat((data.x, data.graph_data), axis=1)
        print('Num features after: %d' % data.x.size()[1])
    if data.has_features:
        print("Using GCN")
        # Make sure we actually need this.. Only if it crashes on test data
        # Just use heuristics-based
        #if simplified:
        #    params = {
        #        'features_num': data.x.size()[1],
        #        'num_class': int(max(data.y)) + 1,
        #        'hidden': hidden
        #    }
        # Grid search to find best
        #else:
        params = self.grid_search(data, hidden, h_dist=10, epochs=50, h_step=1)
        model = GCN(**params)
    else:
        print("Using GAT")
        # Just use heuristics based
        if simplified:
            params = {
                'features_num': data.x.size()[1],
                'num_class': int(max(data.y)) + 1,
                'hidden': hidden,
                'heads': attn_heads,
                'dropout': 0.7  # Increase dropout if complex graph
            }
        # Do grid search for best params
        else:
            params = self.grid_search(data,
                                      hidden,
                                      attn_heads=attn_heads,
                                      h_dist=10,
                                      h_step=1,
                                      a_dist=1,
                                      a_step=1,
                                      epochs=25)
        model = GAT(**params)
    # Move data to compute device
    model = model.to(self.device)
    data = data.to(self.device)
    # Configure optimizer
    lr = 0.005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    # Main training loop
    # NOTE(review): `min_loss` is never read below.
    min_loss = float('inf')
    val_loss_min = 1000
    train_loss_min = 1000
    val_increase = 0
    stopped_early = False
    state_dict_save = 'checkpoint.model'
    # Fraction of patience before restarting with lower lr from best model
    FRUSTRATION = 1 if data.has_features else 25
    # Number of times to retry training from prev best with lower lr
    NUM_REDOS = 15
    # Smallest value we allow loss to be, usually 5e-6
    # NOTE(review): despite the name, MIN_LOSS is used below as the
    # learning-rate floor, not a loss threshold.
    MIN_LOSS = 5e-6
    # How much we decrease loss when frustrated
    LOSS_DEC_INC = 1.25
    # What percent validation loss increase we're willing to accept if training loss
    # goes down by more than that much. The hope is this balances for overfitting?
    # E.g., an epoch that increases val loss from 1 to 1.01 but decreases train loss
    # from 1 to 0.98 is considered the best model
    GOOD_ENOUGH = 1.00025
    BECOME_FRUSTRATED = False
    lr_decays = 0
    # LR must decay at least this many times before GOOD_ENOUGH training
    # is activated. This way we're sure it's in a local minimum before we
    # allow it to stray
    GOOD_ENOUGH_THRESH = 3
    epoch = 0
    while (True):
        train_start = time.time()
        model.train()
        optimizer.zero_grad()
        loss = F.nll_loss(model(data)[data.train_mask],
                          data.y[data.train_mask],
                          weight=data.weighted_loss)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()
        # calculate loss on validation set
        model.eval()
        loss = F.nll_loss(model(data)[data.val_mask],
                          data.y[data.val_mask],
                          weight=data.weighted_loss)
        val_loss = loss.item()
        if ((val_loss > val_loss_min and early_stopping)
                and not (BECOME_FRUSTRATED
                         and val_loss <= GOOD_ENOUGH * val_loss_min
                         and train_loss * GOOD_ENOUGH <= train_loss_min)):
            val_increase += 1
        else:
            print("===New Minimum validation loss===")
            print('[%d] Train loss: %.3f Val Loss: %.3f' %
                  (epoch, train_loss, val_loss))
            val_loss_min = val_loss
            train_loss_min = train_loss
            val_increase = 0
            # NOTE(review): `redos` is only ever initialized here; if no epoch
            # improves on the initial val_loss_min (1000), the frustration
            # branch below reads it before assignment — TODO confirm.
            redos = 1
            torch.save(model.state_dict(), state_dict_save)
        # Want to make sure we have the amount of time it takes
        # to loop and however much extra we need later
        time_cutoff = MIN_TIME + (time.time() - train_start)
        if (val_increase > val_patience
                or time_budget - (time.time() - start_time) < time_cutoff):
            print("Early stopping!")
            stopped_early = True
            break
        # Lower learning rate and start from prev best after model becomes
        # frustrated with poor progress
        if val_increase > val_patience // FRUSTRATION and lr > MIN_LOSS:
            if redos % NUM_REDOS == 0:
                lr /= LOSS_DEC_INC
                lr = lr if lr > MIN_LOSS else MIN_LOSS  # make sure not less than 5e-6
                print('LR decay: New lr: %.6f' % lr)
                for g in optimizer.param_groups:
                    g['lr'] = lr
                lr_decays += 1
                if lr_decays >= GOOD_ENOUGH_THRESH:
                    BECOME_FRUSTRATED = True
            model.load_state_dict(torch.load(state_dict_save))
            redos += 1
        # Use simple LR decay for data w features
        elif data.has_features and epoch > 0 and epoch % 10 == 0 and lr > 5e-6:
            lr -= 0.00025
            lr = lr if lr > 5e-6 else 5e-6  # make sure not less than 5e-6
            print('LR decay: New lr: %.6f' % lr)
            for g in optimizer.param_groups:
                g['lr'] = lr
        epoch += 1
    if stopped_early:
        print("Reloading best parameters!")
        # State dict saved to CPU so have to load from there(?)
        model.load_state_dict(torch.load(state_dict_save))
    return model