def main(args):
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  vnum = train_mask.shape[0]
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  for epoch in range(args.n_epochs):
    #epoch_load_vnum = 0
    freq = np.zeros(vnum, dtype=np.int64)
    for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size,
                                                   args.num_neighbors,
                                                   neighbor_type='in',
                                                   shuffle=True,
                                                   num_workers=16,
                                                   num_hops=num_hops,
                                                   seed_nodes=train_nid,
                                                   prefetch=False):
      #epoch_load_vnum += count_nf_vnum(nf)
      count_vertex_freq(nf, freq)
    hit_rate = optimal_cache_hit(freq, 0.2)
    print('Oracle cache hit rate: ', hit_rate)
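# The helpers count_vertex_freq and optimal_cache_hit are used above but not shown
# in this listing. A minimal sketch of what they might look like, assuming freq
# counts how often each parent-graph vertex appears in the sampled NodeFlows and
# the oracle cache pins the hottest `ratio` fraction of vertices; this is an
# illustrative reconstruction, not the original implementation.
def count_vertex_freq(nf, freq):
  # accumulate per-vertex access counts over every layer of the NodeFlow
  for i in range(nf.num_layers):
    nids = nf.layer_parent_nid(i).numpy()
    np.add.at(freq, nids, 1)

def optimal_cache_hit(freq, ratio):
  # hit rate of an oracle cache holding the top `ratio` most frequently accessed vertices
  cache_size = int(freq.shape[0] * ratio)
  total = freq.sum()
  hot = np.sort(freq)[::-1][:cache_size].sum()
  return hot / total if total > 0 else 0.0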
def main(args):
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  test_nid = np.nonzero(test_mask)[0].astype(np.int64)
  labels = torch.LongTensor(labels)
  if args.arch == 'gcn-nssc':
    from PaGraph.model.gcn_nssc import GCNSampling, GCNInfer
    infer_model = GCNInfer(args.feat_size, 32, n_classes,
                           args.n_layers, F.relu, args.preprocess)
  elif args.arch == 'gs-nssc':
    from PaGraph.model.graphsage_nssc import GraphSageSampling
    infer_model = GraphSageSampling(args.feat_size, 16, n_classes,
                                    args.n_layers, F.relu, 0, 'mean',
                                    args.preprocess)
  else:
    print('Unknown arch')
    sys.exit(-1)
  gnneval(args, infer_model, g, labels, 0, test_nid)
def main(args):
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  for epoch in range(args.n_epochs):
    epoch_load_vnum = 0
    for nf in dgl.contrib.sampling.NeighborSampler(g, args.batch_size,
                                                   args.num_neighbors,
                                                   neighbor_type='in',
                                                   shuffle=True,
                                                   num_workers=16,
                                                   num_hops=num_hops,
                                                   seed_nodes=train_nid,
                                                   prefetch=False):
      epoch_load_vnum += count_nf_vnum(nf)
    print('Epoch loaded vertex#: ', epoch_load_vnum)
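# count_nf_vnum is referenced above but not defined in this listing. One plausible
# definition, assuming "loaded vertex#" means the total number of parent-graph
# vertex ids pulled in across all NodeFlow layers (an illustrative sketch only):
def count_nf_vnum(nf):
  return sum(nf.layer_parent_nid(i).shape[0] for i in range(nf.num_layers))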
                    type=int, default=2, help="num of partitions")
parser.add_argument("--num-hops", type=int, default=1,
                    help="num of hop neighbors required for a batch")
parser.add_argument("--ordering", dest='ordering', action='store_true')
parser.set_defaults(ordering=False)
args = parser.parse_args()

# get data
adj = spsp.load_npz(os.path.join(args.dataset, 'adj.npz'))
train_mask, val_mask, test_mask = data.get_masks(args.dataset)
train_nids = np.nonzero(train_mask)[0].astype(np.int64)
labels = data.get_labels(args.dataset)

# ordering
if args.ordering:
  print('re-ordering graphs...')
  adj = adj.tocsc()
  adj, vmap = ordering.reordering(adj, depth=args.num_hops)  # vmap: orig -> new
  # save to files
  mapv = np.zeros(vmap.shape, dtype=np.int64)
  mapv[vmap] = np.arange(vmap.shape[0])  # mapv: new -> orig
  train_nids = np.sort(vmap[train_nids])
  spsp.save_npz(os.path.join(args.dataset, 'adj.npz'), adj)
  np.save(os.path.join(args.dataset, 'labels.npy'), labels[mapv])
  np.save(os.path.join(args.dataset, 'train.npy'), train_mask[mapv])
  np.save(os.path.join(args.dataset, 'val.npy'), val_mask[mapv])
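# The vmap/mapv pair above is easy to misread: vmap maps original vertex ids to
# new ids after re-ordering, and mapv is its inverse permutation, so indexing an
# array laid out by original id with mapv re-lays it out by new id. A tiny
# self-contained check with a made-up permutation (not part of the original code):
import numpy as np

vmap = np.array([2, 0, 3, 1])                  # orig -> new
mapv = np.zeros(vmap.shape, dtype=np.int64)
mapv[vmap] = np.arange(vmap.shape[0])          # new -> orig
labels = np.array([10, 11, 12, 13])            # indexed by original id
relabeled = labels[mapv]                       # indexed by new id
assert all(relabeled[vmap[v]] == labels[v] for v in range(4))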
def trainer(rank, world_size, args, backend='nccl'):
  # init multi process
  init_process(rank, world_size, backend)
  # load data
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  # masks for semi-supervised learning
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  test_nid = np.nonzero(test_mask)[0].astype(np.int64)
  chunk_size = int(train_nid.shape[0] / world_size) - 1
  train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
  # to torch tensor
  labels = torch.LongTensor(labels)
  train_mask = torch.ByteTensor(train_mask)
  val_mask = torch.ByteTensor(val_mask)
  test_mask = torch.ByteTensor(test_mask)
  # prepare model
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                      args.n_layers, F.relu, args.dropout, args.preprocess)
  infer_model = GCNInfer(args.feat_size, args.n_hidden, n_classes,
                         args.n_layers, F.relu)
  loss_fcn = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
  model.cuda(rank)
  infer_model.cuda(rank)
  model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
  ctx = torch.device(rank)
  # start training
  epoch_dur = []
  sock = PaGraph.utils.trainer(port=8200 + rank)
  port = 8760
  sampler = dgl.contrib.sampling.SamplerReceiver(g, '127.0.0.1:' + str(port + rank),
                                                 1, net_type='socket')
  with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
    for epoch in range(args.n_epochs):
      model.train()
      epoch_start_time = time.time()
      step = 0
      for nf in sampler:
        with torch.autograd.profiler.record_function('gpu-load'):
          nf.copy_from_parent(ctx=ctx)
          batch_nids = nf.layer_parent_nid(-1)
          label = labels[batch_nids]
          label = label.cuda(rank, non_blocking=True)
        with torch.autograd.profiler.record_function('gpu-compute'):
          pred = model(nf)
          loss = loss_fcn(pred, label)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
        step += 1
        if rank == 0 and step % 20 == 0:
          print('epoch [{}] step [{}]. Loss: {:.4f}'.format(
                epoch + 1, step, loss.item()))
        if step % 100 == 0:
          PaGraph.utils.barrier(sock, role='trainer')
      if rank == 0:
        epoch_dur.append(time.time() - epoch_start_time)
        print('Epoch average time: {:.4f}'.format(
              np.mean(np.array(epoch_dur[2:]))))
      # epoch barrier
      PaGraph.utils.barrier(sock, role='trainer')
  if rank == 0:
    print(prof.key_averages().table(sort_by='cuda_time_total'))
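# init_process is called by every trainer in these listings but its body is not
# shown. A typical implementation for single-machine multi-GPU training, assuming
# NCCL rendezvous over localhost (the address, port, and env-var choices below are
# assumptions, not the original code):
import os
import torch
import torch.distributed as dist

def init_process(rank, world_size, backend='nccl'):
  os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
  os.environ.setdefault('MASTER_PORT', '29500')
  dist.init_process_group(backend, rank=rank, world_size=world_size)
  # pin this process to its own GPU so .cuda(rank) and DDP device_ids line up
  torch.cuda.set_device(rank)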
def trainer(rank, world_size, args, backend='nccl'):
  # init multi process
  init_process(rank, world_size, backend)
  # load data
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = args.n_classes
  # masks for semi-supervised learning
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  chunk_size = int(train_nid.shape[0] / world_size) - 1
  train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
  test_nid = np.nonzero(test_mask)[0].astype(np.int64)
  # to torch tensor
  labels = torch.LongTensor(labels)
  train_mask = torch.ByteTensor(train_mask)
  val_mask = torch.ByteTensor(val_mask)
  test_mask = torch.ByteTensor(test_mask)
  # cacher
  embed_names = ['features', 'norm']
  vnum = train_mask.size(0)
  maps = torch.arange(vnum)
  cacher = storage.GraphCacheServer(g, vnum, maps, rank)
  cacher.init_field(embed_names)
  cacher.log = False
  # prepare model
  model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                      args.n_layers, F.relu, args.dropout, args.preprocess)
  loss_fcn = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
  model.cuda(rank)
  model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
  # sampler
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  if args.remote_sample:
    sampler = SampleLoader(g, rank, one2all=False)
  else:
    sampler = dgl.contrib.sampling.NeighborSampler(
        g, args.batch_size, args.num_neighbors,
        neighbor_type='in', shuffle=True, num_workers=16,
        num_hops=num_hops, seed_nodes=train_nid, prefetch=True)
  # start training
  epoch_dur = []
  tic = time.time()
  with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
    for epoch in range(args.n_epochs):
      model.train()
      epoch_start_time = time.time()
      step = 0
      for nf in sampler:
        with torch.autograd.profiler.record_function('gpu-load'):
          cacher.fetch_data(nf)
          batch_nids = nf.layer_parent_nid(-1)
          label = labels[batch_nids]
          label = label.cuda(rank, non_blocking=True)
        with torch.autograd.profiler.record_function('gpu-compute'):
          pred = model(nf)
          loss = loss_fcn(pred, label)
          optimizer.zero_grad()
          loss.backward()
          optimizer.step()
        step += 1
        if epoch == 0 and step == 1:
          # build the GPU feature cache once the first mini-batch has been seen
          cacher.auto_cache(g, embed_names)
        if rank == 0 and step % 20 == 0:
          print('epoch [{}] step [{}]. Loss: {:.4f}'
                .format(epoch + 1, step, loss.item()))
      if rank == 0:
        epoch_dur.append(time.time() - epoch_start_time)
        print('Epoch average time: {:.4f}'.format(
              np.mean(np.array(epoch_dur[2:]))))
        if cacher.log:
          miss_rate = cacher.get_miss_rate()
          print('Epoch average miss rate: {:.4f}'.format(miss_rate))
  toc = time.time()
  if rank == 0:
    print(prof.key_averages().table(sort_by='cuda_time_total'))
    print('Total Time: {:.4f}s'.format(toc - tic))
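# How such a trainer is launched is not shown in these listings. One common
# pattern is one process per GPU via torch.multiprocessing; the launcher below is
# a sketch under that assumption (mp.spawn passes the rank as the first argument,
# which matches trainer(rank, world_size, args)):
import torch.multiprocessing as mp

def launch(args, world_size):
  # hypothetical entry point: spawn world_size trainer processes, one per GPU
  mp.spawn(trainer, args=(world_size, args), nprocs=world_size, join=True)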
def trainer(rank, world_size, args, backend='nccl'):
  # init multi process
  init_process(rank, world_size, backend)
  # load data
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  # masks for semi-supervised learning
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  chunk_size = int(train_nid.shape[0] / world_size) - 1
  train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
  test_nid = np.nonzero(test_mask)[0].astype(np.int64)
  # to torch tensor
  labels = torch.LongTensor(labels)
  train_mask = torch.ByteTensor(train_mask)
  val_mask = torch.ByteTensor(val_mask)
  test_mask = torch.ByteTensor(test_mask)
  # prepare model
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  model = GCNSampling(args.feat_size, args.n_hidden, n_classes,
                      args.n_layers, F.relu, args.dropout, args.preprocess)
  loss_fcn = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.weight_decay)
  model.cuda(rank)
  model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
  ctx = torch.device(rank)
  # start training
  epoch_dur = []
  sampler = dgl.contrib.sampling.NeighborSampler(
      g, args.batch_size, args.num_neighbors,
      neighbor_type='in', shuffle=True, num_workers=args.num_workers,
      num_hops=num_hops, seed_nodes=train_nid, prefetch=False)
  # NOTE: the loop below only copies NodeFlow data and labels to the GPU;
  # no forward/backward pass is issued, so it isolates data-loading cost.
  profile_begin = time.time()
  with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
    for epoch in range(args.n_epochs):
      model.train()
      epoch_start_time = time.time()
      step = 0
      for nf in sampler:
        with torch.autograd.profiler.record_function('gpu-load'):
          nf.copy_from_parent(ctx=ctx)
          batch_nids = nf.layer_parent_nid(-1)
          label = labels[batch_nids]
          label = label.cuda(rank, non_blocking=True)
        step += 1
        if rank == 0 and step % 20 == 0:
          print('epoch [{}] step [{}]'.format(epoch + 1, step))
      if rank == 0:
        epoch_dur.append(time.time() - epoch_start_time)
        print('Epoch average time: {:.4f}'.format(
              np.mean(np.array(epoch_dur[2:]))))
  print('Total Time: {:.4f}s'.format(time.time() - profile_begin))
  if rank == 0:
    print(prof.key_averages().table(sort_by='cuda_time_total'))
def trainer(rank, world_size, args, backend='nccl'):
  # init multi process
  init_process(rank, world_size, backend)
  # load data
  dataname = os.path.basename(args.dataset)
  g = dgl.contrib.graph_store.create_graph_from_store(dataname, "shared_mem")
  labels = data.get_labels(args.dataset)
  n_classes = len(np.unique(labels))
  # masks for semi-supervised learning
  train_mask, val_mask, test_mask = data.get_masks(args.dataset)
  train_nid = np.nonzero(train_mask)[0].astype(np.int64)
  chunk_size = int(train_nid.shape[0] / world_size) - 1
  train_nid = train_nid[chunk_size * rank:chunk_size * (rank + 1)]
  test_nid = np.nonzero(test_mask)[0].astype(np.int64)
  # to torch tensor
  labels = torch.LongTensor(labels)
  train_mask = torch.ByteTensor(train_mask)
  val_mask = torch.ByteTensor(val_mask)
  test_mask = torch.ByteTensor(test_mask)
  # generate data: synthetic feature/norm tensors stand in for real node data,
  # so only index-select and host-to-device copy costs are measured
  vnum = 4036538
  data1 = torch.rand((vnum, args.feat_size))
  data2 = torch.rand((vnum, 1))  # norm
  # start training
  epoch_dur = []
  num_hops = args.n_layers if args.preprocess else args.n_layers + 1
  sampler = dgl.contrib.sampling.NeighborSampler(
      g, args.batch_size, args.num_neighbors,
      neighbor_type='in', shuffle=True, num_workers=args.num_workers,
      num_hops=num_hops, seed_nodes=train_nid, prefetch=False)
  profile_begin = time.time()
  with torch.autograd.profiler.profile(enabled=(rank == 0), use_cuda=True) as prof:
    for epoch in range(args.n_epochs):
      epoch_start_time = time.time()
      step = 0
      for nf in sampler:
        datas = []
        for i in range(nf.num_layers):
          with torch.autograd.profiler.record_function('index-select'):
            tnid = nf.layer_parent_nid(i)
            fdata1 = data1[tnid]
            fdata2 = data2[tnid]
            datas.append([fdata1, fdata2])
          with torch.autograd.profiler.record_function('h2d-load'):
            fdata1 = fdata1.cuda()
            fdata2 = fdata2.cuda()
        with torch.autograd.profiler.record_function('index-select'):
          label = labels[tnid]
        with torch.autograd.profiler.record_function('h2d-load'):
          label = label.cuda()
        # clear data
        del label
        for d in datas:
          del d
        step += 1
        if rank == 0 and step % 20 == 0:
          print('epoch [{}] step [{}]'.format(epoch + 1, step))
      if rank == 0:
        epoch_dur.append(time.time() - epoch_start_time)
        print('Epoch average time: {:.4f}'.format(
              np.mean(np.array(epoch_dur[2:]))))
  print('Total Time: {:.4f}s'.format(time.time() - profile_begin))
  if rank == 0:
    print(prof.key_averages().table(sort_by='cuda_time_total'))