def train_ray(opt, checkpoint_dir=None, data_dir="../data"):
    """Ray Tune trainable that trains ``opt['num_init']`` models on one graph.

    All models are built from the same hyperparameters and trained on the
    same graph. After every epoch the state of the model with the highest
    validation accuracy is checkpointed, and the loss and accuracies averaged
    over all models are reported to Tune.

    Args:
        opt: Hyperparameter mapping. Keys read here: ``gpu``, ``num_heads``,
            ``num_layers``, ``num_out_heads``, ``num_init``, ``model``,
            ``num_hidden``, ``in_drop``, ``attn_drop``, ``negative_slope``,
            ``residual``, ``optimizer``, ``lr``, ``weight_decay`` and
            ``epochs``. The whole mapping is also forwarded to
            ``get_dataset`` and to the model constructors.
        checkpoint_dir: Passed by Ray Tune when a checkpoint should be
            restored. When set, every model/optimizer pair is initialised
            from the file ``checkpoint`` inside that directory.
        data_dir: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Raises:
        ValueError: If ``opt['model']`` is neither ``'GAT'`` nor ``'AGNN'``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    # NOTE(review): the graph follows opt['gpu'] while the models follow
    # `device`; the two can disagree (e.g. opt['gpu'] < 0 on a CUDA host).
    # Confirm that this combination is never used or is handled elsewhere.
    if opt['gpu'] >= 0:
        g = g.int().to(opt['gpu'])

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    print(
        "----Data statistics------' #Edges %d #Classes %d "
        "#Train samples %d #Val samples %d #Test samples %d"
        % (
            data.graph.number_of_edges(),
            n_classes,
            train_mask.int().sum().item(),
            val_mask.int().sum().item(),
            test_mask.int().sum().item(),
        )
    )

    # Replace whatever self loops the dataset ships with by freshly added ones.
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)

    # Head counts for the hidden layers followed by the output layer.
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]

    train_this = train
    models = []
    optimizers = []
    # Every model is evaluated on the same graph object.
    datas = [g] * opt['num_init']

    for _ in range(opt['num_init']):
        if opt['model'] == 'GAT':
            model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                        n_classes, heads, F.elu, opt['in_drop'],
                        opt['attn_drop'], opt['negative_slope'],
                        opt['residual'], opt)
        elif opt['model'] == 'AGNN':
            model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                         n_classes, opt['in_drop'], opt)
        else:
            raise ValueError(
                "unsupported model %r; expected 'GAT' or 'AGNN'" % (opt['model'],)
            )
        model = model.to(device)
        models.append(model)

        # Training, evaluation and checkpointing all operate on the bare
        # models stored in `models`, so no nn.DataParallel wrapper is created
        # here: its state-dict keys carry a 'module.' prefix and would not
        # match the checkpoints written below.
        parameters = [p for p in model.parameters() if p.requires_grad]
        optimizer = get_optimizer(opt['optimizer'], parameters, lr=opt['lr'],
                                  weight_decay=opt['weight_decay'])
        optimizers.append(optimizer)

        # The `checkpoint_dir` parameter gets passed by Ray Tune when a
        # checkpoint should be restored.
        if checkpoint_dir:
            checkpoint = os.path.join(checkpoint_dir, "checkpoint")
            # map_location lets a checkpoint written on a GPU host be
            # restored on whatever device this trial runs on.
            model_state, optimizer_state = torch.load(
                checkpoint, map_location=device)
            model.load_state_dict(model_state)
            optimizer.load_state_dict(optimizer_state)

    # NOTE(review): this reads opt['epochs'] while train_ray_int reads
    # opt['epoch'] -- confirm which key the configuration really provides.
    # The range starts at 1, so opt['epochs'] - 1 epochs are run.
    for epoch in range(1, opt['epochs']):
        loss = np.mean([
            train_this(model, optimizer, features, train_mask, labels)[0].item()
            for model, optimizer in zip(models, optimizers)
        ])
        train_accs, val_accs, tmp_test_accs = average_test(models, datas)

        # A distinct name keeps the restore-directory parameter intact.
        with tune.checkpoint_dir(step=epoch) as epoch_checkpoint_dir:
            best = np.argmax(val_accs)
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save(
                (models[best].state_dict(), optimizers[best].state_dict()),
                path)
        tune.report(loss=loss,
                    accuracy=np.mean(val_accs),
                    test_acc=np.mean(tmp_test_accs),
                    train_acc=np.mean(train_accs))
def train_ray_int(opt, checkpoint_dir=None, data_dir="../data"):
    """Ray Tune trainable that trains a single model and tracks its best epoch.

    After every epoch the model is evaluated. Whenever the validation accuracy
    improves, the train/val/test accuracies and the epoch number are
    remembered. The model and optimizer state are checkpointed every epoch and
    the remembered (best-so-far) metrics are reported to Tune.

    Args:
        opt: Hyperparameter mapping. Keys read here: ``gpu``, ``num_heads``,
            ``num_layers``, ``num_out_heads``, ``model``, ``num_hidden``,
            ``in_drop``, ``attn_drop``, ``negative_slope``, ``residual``,
            ``optimizer``, ``lr``, ``weight_decay``, ``dataset``, ``epoch``,
            ``no_early`` and, when ``no_early`` is truthy, ``time``. The whole
            mapping is also forwarded to ``get_dataset`` and to the model
            constructors.
        checkpoint_dir: Directory holding a ``checkpoint`` file to restore the
            model and optimizer from; skipped when falsy.
        data_dir: Not used by this function; kept so the signature stays
            compatible with existing callers.

    Raises:
        ValueError: If ``opt['model']`` is neither ``'GAT'`` nor ``'AGNN'``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data = get_dataset(opt)
    g = data[0]
    # NOTE(review): the graph follows opt['gpu'] while the model follows
    # `device`; the two can disagree (e.g. opt['gpu'] < 0 on a CUDA host).
    # Confirm that this combination is never used or is handled elsewhere.
    if opt['gpu'] >= 0:
        g = g.int().to(opt['gpu'])

    features = g.ndata['feat']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']
    num_feats = features.shape[1]
    n_classes = data.num_classes
    print(
        "----Data statistics------' #Edges %d #Classes %d "
        "#Train samples %d #Val samples %d #Test samples %d"
        % (
            data.graph.number_of_edges(),
            n_classes,
            train_mask.int().sum().item(),
            val_mask.int().sum().item(),
            test_mask.int().sum().item(),
        )
    )

    # Replace whatever self loops the dataset ships with by freshly added ones.
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)

    # Head counts for the hidden layers followed by the output layer.
    heads = ([opt['num_heads']] * opt['num_layers']) + [opt['num_out_heads']]

    if opt['model'] == 'GAT':
        model = GAT(g, opt['num_layers'], num_feats, opt['num_hidden'],
                    n_classes, heads, F.elu, opt['in_drop'], opt['attn_drop'],
                    opt['negative_slope'], opt['residual'], opt)
    elif opt['model'] == 'AGNN':
        model = AGNN(g, opt['num_layers'], num_feats, opt['num_hidden'],
                     n_classes, opt['in_drop'], opt)
    else:
        raise ValueError(
            "unsupported model %r; expected 'GAT' or 'AGNN'" % (opt['model'],)
        )

    model = model.to(device)
    if torch.cuda.device_count() > 1:
        # The wrapped model is the one that is trained, saved and restored,
        # so checkpoint keys stay consistent within a given host setup.
        model = nn.DataParallel(model)

    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer = get_optimizer(opt["optimizer"], parameters, lr=opt["lr"],
                              weight_decay=opt["weight_decay"])

    if checkpoint_dir:
        checkpoint = os.path.join(checkpoint_dir, "checkpoint")
        # map_location lets a checkpoint written on a GPU host be restored on
        # whatever device this trial runs on.
        model_state, optimizer_state = torch.load(
            checkpoint, map_location=device)
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    train_this = train
    this_test = test_OGB if opt['dataset'] == 'ogbn-arxiv' else test

    best_time = best_epoch = train_acc = val_acc = test_acc = 0
    # NOTE(review): this reads opt['epoch'] while train_ray reads
    # opt['epochs'] -- confirm which key the configuration really provides.
    # The range starts at 1, so opt['epoch'] - 1 epochs are run.
    for epoch in range(1, opt["epoch"]):
        loss = train_this(model, optimizer, features, train_mask, labels)[0].item()

        # Evaluation is the same with or without early stopping; only the
        # reported best_time differs.
        if opt["no_early"]:
            best_time = opt['time']
        tmp_train_acc, tmp_val_acc, tmp_test_acc = this_test(model, g)

        # Remember the metrics of the epoch with the best validation accuracy.
        if tmp_val_acc > val_acc:
            best_epoch = epoch
            train_acc = tmp_train_acc
            val_acc = tmp_val_acc
            test_acc = tmp_test_acc

        # A distinct name keeps the restore-directory parameter intact.
        with tune.checkpoint_dir(step=epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((model.state_dict(), optimizer.state_dict()), path)
        tune.report(loss=loss, accuracy=val_acc, test_acc=test_acc,
                    train_acc=train_acc, best_time=best_time,
                    best_epoch=best_epoch)