def load_dataset_for_classification(args):
    """Load dataset for classification tasks.

    Parameters
    ----------
    args : dict
        Configurations.

    Returns
    -------
    dataset
        The whole dataset.
    train_set
        Subset for training.
    val_set
        Subset for validation.
    test_set
        Subset for test.
    """
    assert args['dataset'] in ['Tox21']
    if args['dataset'] == 'Tox21':
        from dgl.data.chem import Tox21
        dataset = Tox21(atom_featurizer=args['atom_featurizer'])
        train_set, val_set, test_set = split_dataset(dataset, args['train_val_test_split'])

    return dataset, train_set, val_set, test_set
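# Hedged usage sketch (illustration only, not part of the original source):
# one way to assemble the ``args`` dict consumed above. CanonicalAtomFeaturizer
# is assumed as the atom featurizer here, and the split fractions are likewise
# an assumption.
def _example_load_classification():
    from dgl.data.chem import CanonicalAtomFeaturizer
    args = {
        'dataset': 'Tox21',
        'atom_featurizer': CanonicalAtomFeaturizer(),
        'train_val_test_split': [0.8, 0.1, 0.1],
    }
    return load_dataset_for_classification(args)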
def train_val_test_split(dataset, frac_train=0.8, frac_val=0.1, frac_test=0.1):
    """Split the dataset into three consecutive chunks for training, validation and test.

    Parameters
    ----------
    dataset
        We assume ``len(dataset)`` gives the size for the dataset and ``dataset[i]``
        gives the ith datapoint.
    frac_train : float
        Fraction of data to use for training. By default, we set this to be 0.8, i.e.
        80% of the dataset is used for training.
    frac_val : float
        Fraction of data to use for validation. By default, we set this to be 0.1, i.e.
        10% of the dataset is used for validation.
    frac_test : float
        Fraction of data to use for test. By default, we set this to be 0.1, i.e.
        10% of the dataset is used for test.

    Returns
    -------
    list of length 3
        Subsets for training, validation and test that also have ``len(dataset)``
        and ``dataset[i]`` behaviors.
    """
    return split_dataset(dataset, frac_list=[frac_train, frac_val, frac_test], shuffle=False)
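# Hedged usage sketch (added for illustration): with shuffle=False the subsets
# are consecutive chunks, so the earliest datapoints always land in the
# training subset. Assumes ``split_dataset`` from ``dgl.data.utils`` is in
# scope, as in the function above; the toy list is an assumption.
def _example_consecutive_split():
    data = list(range(10))  # any object with len() and indexing works
    train, val, test = train_val_test_split(data)
    assert len(train) + len(val) + len(test) == len(data)
    # With shuffle=False the split is deterministic: train[0] == data[0].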
def load_dataset_for_regression(args):
    """Load dataset for regression tasks.

    Parameters
    ----------
    args : dict
        Configurations.

    Returns
    -------
    train_set
        Subset for training.
    val_set
        Subset for validation.
    test_set
        Subset for test.
    """
    assert args['dataset'] in ['Alchemy', 'Aromaticity']

    if args['dataset'] == 'Alchemy':
        from dgl.data.chem import TencentAlchemyDataset
        train_set = TencentAlchemyDataset(mode='dev')
        val_set = TencentAlchemyDataset(mode='valid')
        test_set = None
    elif args['dataset'] == 'Aromaticity':
        from dgl.data.chem import PubChemBioAssayAromaticity
        dataset = PubChemBioAssayAromaticity(
            atom_featurizer=args['atom_featurizer'],
            bond_featurizer=args['bond_featurizer'])
        train_set, val_set, test_set = split_dataset(
            dataset, frac_list=args['train_val_test_split'],
            shuffle=True, random_state=args['random_seed'])

    return train_set, val_set, test_set
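# Hedged usage sketch (illustration only): loading the Aromaticity dataset.
# The featurizer classes, split fractions and seed below are assumptions for
# illustration, not values taken from the original source.
def _example_load_regression():
    from dgl.data.chem import CanonicalAtomFeaturizer, CanonicalBondFeaturizer
    args = {
        'dataset': 'Aromaticity',
        'atom_featurizer': CanonicalAtomFeaturizer(),
        'bond_featurizer': CanonicalBondFeaturizer(),
        'train_val_test_split': [0.8, 0.1, 0.1],
        'random_seed': 0,
    }
    return load_dataset_for_regression(args)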
def main(args):
    args['device'] = "cuda" if torch.cuda.is_available() else "cpu"
    set_random_seed()

    # Interchangeable with other datasets
    if args['dataset'] == 'Tox21':
        from dgl.data.chem import Tox21
        dataset = Tox21()

    trainset, valset, testset = split_dataset(dataset, args['train_val_test_split'])
    train_loader = DataLoader(trainset, batch_size=args['batch_size'],
                              collate_fn=collate_molgraphs_for_classification)
    val_loader = DataLoader(valset, batch_size=args['batch_size'],
                            collate_fn=collate_molgraphs_for_classification)
    test_loader = DataLoader(testset, batch_size=args['batch_size'],
                             collate_fn=collate_molgraphs_for_classification)

    if args['pre_trained']:
        args['num_epochs'] = 0
        model = model_zoo.chem.load_pretrained(args['exp'])
    else:
        # Interchangeable with other models
        if args['model'] == 'GCN':
            model = model_zoo.chem.GCNClassifier(
                in_feats=args['in_feats'],
                gcn_hidden_feats=args['gcn_hidden_feats'],
                classifier_hidden_feats=args['classifier_hidden_feats'],
                n_tasks=dataset.n_tasks)
        elif args['model'] == 'GAT':
            model = model_zoo.chem.GATClassifier(
                in_feats=args['in_feats'],
                gat_hidden_feats=args['gat_hidden_feats'],
                num_heads=args['num_heads'],
                classifier_hidden_feats=args['classifier_hidden_feats'],
                n_tasks=dataset.n_tasks)

    loss_criterion = BCEWithLogitsLoss(
        pos_weight=dataset.task_pos_weights.to(args['device']), reduction='none')
    optimizer = Adam(model.parameters(), lr=args['lr'])
    stopper = EarlyStopping(patience=args['patience'])
    model.to(args['device'])

    for epoch in range(args['num_epochs']):
        # Train
        run_a_train_epoch(args, epoch, model, train_loader, loss_criterion, optimizer)

        # Validation and early stop
        val_roc_auc = run_an_eval_epoch(args, model, val_loader)
        early_stop = stopper.step(val_roc_auc, model)
        print('epoch {:d}/{:d}, validation roc-auc score {:.4f}, '
              'best validation roc-auc score {:.4f}'.format(
                  epoch + 1, args['num_epochs'], val_roc_auc, stopper.best_score))
        if early_stop:
            break

    if not args['pre_trained']:
        stopper.load_checkpoint(model)
    test_roc_auc = run_an_eval_epoch(args, model, test_loader)
    print('test roc-auc score {:.4f}'.format(test_roc_auc))
def main(args):
    args = setup(args)

    dataset = Tox21()
    train_set, val_set, test_set = split_dataset(dataset, shuffle=True)
    train_loader = DataLoader(train_set, batch_size=args['batch_size'],
                              shuffle=True, collate_fn=collate_molgraphs)
    val_loader = DataLoader(val_set, batch_size=args['batch_size'],
                            shuffle=True, collate_fn=collate_molgraphs)
    test_loader = DataLoader(test_set, batch_size=args['batch_size'],
                             shuffle=True, collate_fn=collate_molgraphs)

    model = model_zoo.chem.GCNClassifier(
        in_feats=args['n_input'],
        gcn_hidden_feats=[args['n_hidden'] for _ in range(args['n_layers'])],
        n_tasks=dataset.n_tasks,
        classifier_hidden_feats=args['n_hidden']).to(args['device'])
    loss_criterion = BCEWithLogitsLoss(
        pos_weight=torch.tensor(dataset.task_pos_weights).to(args['device']),
        reduction='none')
    optimizer = Adam(model.parameters(), lr=args['lr'])
    stopper = EarlyStopper(args['patience'])

    history = []
    for epoch in range(args['n_epochs']):
        # Train
        train_score = run_a_train_epoch(args, epoch, model, train_loader,
                                        loss_criterion, optimizer)

        # Validation and early stop
        val_score = run_an_eval_epoch(args, model, val_loader)
        history.append([train_score, val_score])
        early_stop = stopper.step(val_score, model)
        print('epoch {:d}/{:d}, validation roc-auc {:.4f}, '
              'best validation roc-auc {:.4f}'.format(
                  epoch + 1, args['n_epochs'], val_score, stopper.best_score))
        torch.save(history, "./history.pt")
        if early_stop:
            break

    stopper.load_checkpoint(model)
    test_score = run_an_eval_epoch(args, model, test_loader)
    plot_save(history)
    print('Best validation score {:.4f}'.format(stopper.best_score))
    print('Test score {:.4f}'.format(test_score))
def main():
    # Setup variables
    config_dir = '/opt/ml/input/config'
    model_dir = '/opt/ml/model'
    with open(os.path.join(config_dir, 'hyperparameters.json'), 'r') as file:
        parameters_dict = json.load(file)
    learning_rate = float(parameters_dict['learning-rate'])
    epochs = int(parameters_dict['epochs'])

    # Getting the dataset
    dataset = CoraFullDataset()
    graph = dataset[0]
    features = graph.ndata['feat']
    labels = graph.ndata['label']

    # Splitting the dataset
    train_mask, val_mask = split_dataset(graph, [0.8, 0.2])

    # Creating the model
    model = GraphConvolutionalNetwork(features.shape[1], 16, dataset.num_classes)
    optimizer = th.optim.Adam(model.parameters(), lr=learning_rate)

    # Training
    for epoch in range(epochs):
        pred = model(graph, features)
        loss = F.cross_entropy(pred[train_mask.indices],
                               labels[train_mask.indices].to(th.long))
        train_acc = (labels[train_mask.indices] ==
                     pred[train_mask.indices].argmax(1)).float().mean()
        val_acc = (labels[val_mask.indices] ==
                   pred[val_mask.indices].argmax(1)).float().mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch {epoch}/{epochs} | Loss: {loss.item()}, '
              f'train_accuracy: {train_acc}, val_accuracy: {val_acc}')

    # Saving the graph
    save_graphs(os.path.join(model_dir, 'dgl-citation-network-graph.bin'), graph)

    # Saving the model
    th.save(model, os.path.join(model_dir, 'dgl-citation-network-model.pt'))
def main(args): args = setup(args) dataset = Tox21() train_set, val_set, test_set = split_dataset(dataset, shuffle=True) train_loader = DataLoader(train_set, batch_size=args["batch_size"], shuffle=True, collate_fn=collate_molgraphs) val_loader = DataLoader(val_set, batch_size=args["batch_size"], shuffle=True, collate_fn=collate_molgraphs) test_loader = DataLoader(test_set, batch_size=args["batch_size"], shuffle=True, collate_fn=collate_molgraphs) model = model_zoo.chem.GCNClassifier( in_feats=args["n_input"], gcn_hidden_feats=[args["n_hidden"] for _ in range(args["n_layers"])], n_tasks=dataset.n_tasks, classifier_hidden_feats=args["n_hidden"], ).to(args["device"]) loss_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor( dataset.task_pos_weights).to(args["device"]), reduction="none") optimizer = Adam(model.parameters(), lr=args["lr"]) stopper = EarlyStopper(args["patience"]) for epoch in range(args["n_epochs"]): # Train run_a_train_epoch(args, epoch, model, train_loader, loss_criterion, optimizer) # Validation and early stop val_score = run_an_eval_epoch(args, model, val_loader) early_stop = stopper.step(val_score, model) print( "epoch {:d}/{:d}, validation roc-auc {:.4f}, best validation roc-auc {:.4f}" .format(epoch + 1, args["n_epochs"], val_score, stopper.best_score)) if early_stop: break stopper.load_checkpoint(model) test_score = run_an_eval_epoch(args, model, test_loader) print("Best validation score {:.4f}".format(stopper.best_score)) print("Test score {:.4f}".format(test_score))
def main():
    # Downloading the dataset
    dataset = CoraFullDataset()
    graph = dataset[0]

    # Saving the graph
    save_graphs(os.path.join(model_dir, 'dgl-citation-network-graph.bin'), graph)

    # Saving train/val indices. Note that the file names passed to os.path.join
    # must not start with '/', otherwise model_dir would be discarded.
    train_mask, val_mask = split_dataset(graph, [0.8, 0.2])
    with open(os.path.join(model_dir, 'train_indices.bin'), 'wb') as train_file:
        pickle.dump(train_mask.indices, train_file)
    with open(os.path.join(model_dir, 'validation_indices.bin'), 'wb') as validation_file:
        pickle.dump(val_mask.indices, validation_file)
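# Hedged counterpart (illustration only, not from the original source): how the
# saved artifacts could be loaded back, e.g. at inference time. Assumes os and
# pickle are in scope as above; load_graphs from dgl.data.utils returns a list
# of graphs plus a label dict.
def _example_load_artifacts(model_dir):
    from dgl.data.utils import load_graphs
    graphs, _ = load_graphs(os.path.join(model_dir, 'dgl-citation-network-graph.bin'))
    graph = graphs[0]
    with open(os.path.join(model_dir, 'train_indices.bin'), 'rb') as train_file:
        train_indices = pickle.load(train_file)
    return graph, train_indices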
def train_val_test_split(dataset, frac_train=0.8, frac_val=0.1, frac_test=0.1,
                         random_state=None):
    """Randomly permute the dataset and then split it into three consecutive chunks
    for training, validation and test.

    Parameters
    ----------
    dataset
        We assume ``len(dataset)`` gives the size for the dataset and ``dataset[i]``
        gives the ith datapoint.
    frac_train : float
        Fraction of data to use for training. By default, we set this to be 0.8, i.e.
        80% of the dataset is used for training.
    frac_val : float
        Fraction of data to use for validation. By default, we set this to be 0.1, i.e.
        10% of the dataset is used for validation.
    frac_test : float
        Fraction of data to use for test. By default, we set this to be 0.1, i.e.
        10% of the dataset is used for test.
    random_state : None, int or array_like, optional
        Random seed used to initialize the pseudo-random number generator.
        Can be any integer between 0 and 2**32 - 1 inclusive, an array
        (or other sequence) of such integers, or None (the default).
        If seed is None, then RandomState will try to read data from /dev/urandom
        (or the Windows analogue) if available, or seed from the clock otherwise.

    Returns
    -------
    list of length 3
        Subsets for training, validation and test, which also have ``len(dataset)``
        and ``dataset[i]`` behaviors.
    """
    return split_dataset(dataset, frac_list=[frac_train, frac_val, frac_test],
                         shuffle=True, random_state=random_state)
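# Hedged usage sketch (illustration only): passing the same random_state makes
# the shuffled split reproducible across runs. The toy data is an assumption.
def _example_reproducible_split():
    data = list(range(100))
    split_a = train_val_test_split(data, random_state=42)
    split_b = train_val_test_split(data, random_state=42)
    # Same seed, same permutation: corresponding subsets hold the same items.
    assert [s[0] for s in split_a] == [s[0] for s in split_b]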
def main(args):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    batch_size = 128
    learning_rate = 0.001
    num_epochs = 100
    set_random_seed()

    # Interchangeable with other datasets
    dataset = Tox21()
    atom_data_field = 'h'
    trainset, valset, testset = split_dataset(dataset, [0.8, 0.1, 0.1])
    train_loader = DataLoader(trainset, batch_size=batch_size,
                              collate_fn=collate_molgraphs)
    val_loader = DataLoader(valset, batch_size=batch_size,
                            collate_fn=collate_molgraphs)
    test_loader = DataLoader(testset, batch_size=batch_size,
                             collate_fn=collate_molgraphs)

    if args.pre_trained:
        num_epochs = 0
        model = model_zoo.chem.load_pretrained('GCN_Tox21')
    else:
        # Interchangeable with other models
        model = model_zoo.chem.GCNClassifier(in_feats=74,
                                             gcn_hidden_feats=[64, 64],
                                             n_tasks=dataset.n_tasks)

    loss_criterion = BCEWithLogitsLoss(
        pos_weight=torch.tensor(dataset.task_pos_weights).to(device),
        reduction='none')
    optimizer = Adam(model.parameters(), lr=learning_rate)
    stopper = EarlyStopping(patience=10)
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        print('Start training')
        train_meter = Meter()
        for batch_id, batch_data in enumerate(train_loader):
            smiles, bg, labels, mask = batch_data
            atom_feats = bg.ndata.pop(atom_data_field)
            atom_feats, labels, mask = atom_feats.to(device), labels.to(device), mask.to(device)
            logits = model(atom_feats, bg)
            # Mask non-existing labels
            loss = (loss_criterion(logits, labels) * (mask != 0).float()).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('epoch {:d}/{:d}, batch {:d}/{:d}, loss {:.4f}'.format(
                epoch + 1, num_epochs, batch_id + 1, len(train_loader), loss.item()))
            train_meter.update(logits, labels, mask)
        train_roc_auc = train_meter.roc_auc_averaged_over_tasks()
        print('epoch {:d}/{:d}, training roc-auc score {:.4f}'.format(
            epoch + 1, num_epochs, train_roc_auc))

        val_meter = Meter()
        model.eval()
        with torch.no_grad():
            for batch_id, batch_data in enumerate(val_loader):
                smiles, bg, labels, mask = batch_data
                atom_feats = bg.ndata.pop(atom_data_field)
                atom_feats, labels = atom_feats.to(device), labels.to(device)
                logits = model(atom_feats, bg)
                val_meter.update(logits, labels, mask)

        val_roc_auc = val_meter.roc_auc_averaged_over_tasks()
        if stopper.step(val_roc_auc, model):
            break
        print('epoch {:d}/{:d}, validation roc-auc score {:.4f}, '
              'best validation roc-auc score {:.4f}'.format(
                  epoch + 1, num_epochs, val_roc_auc, stopper.best_score))

    test_meter = Meter()
    model.eval()
    with torch.no_grad():
        for batch_id, batch_data in enumerate(test_loader):
            smiles, bg, labels, mask = batch_data
            atom_feats = bg.ndata.pop(atom_data_field)
            atom_feats, labels = atom_feats.to(device), labels.to(device)
            logits = model(atom_feats, bg)
            test_meter.update(logits, labels, mask)
    print('test roc-auc score {:.4f}'.format(test_meter.roc_auc_averaged_over_tasks()))
parser.add_argument("-p", "--load-pretrain", action="store_true", help="load model.pt") args = parser.parse_args() data = ClassificationData(args.data_file[0]) dataset = get_data(data) dgl_graphs = [build_graph(data) for data in dataset] labels = data.get_labels() dataset = list(zip(dgl_graphs, labels)) train_dataset, valid_dataset, test_dataset = split_dataset(dataset, [0.8, 0.1, 0.1], shuffle=True) def collate(samples): graphs, labels = map(list, zip(*samples)) batched_graph = dgl.batch(graphs) return batched_graph, torch.tensor(labels) train_loader = DataLoader(train_dataset, hyperparams["bsz"], shuffle=True, collate_fn=collate) valid_loader = DataLoader(valid_dataset, hyperparams["bsz"], shuffle=True, collate_fn=collate)
def main(args):
    # Step 1: Prepare graph data and retrieve train/validation/test index ============= #
    # Load from DGL dataset
    dataset = LegacyTUDataset(args.dataset)

    # Node degree, clustering coefficients and node labels as additional node features
    dataset = add_degree_feature(dataset)
    dataset = add_clustering_coefficients_feature(dataset)
    dataset = add_node_label_feature(dataset)

    # Data split
    train_data, valid_data, test_data = split_dataset(dataset)

    # Data loaders
    train_loader = GraphDataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    valid_loader = GraphDataLoader(valid_data, batch_size=args.batch_size, shuffle=False)
    test_loader = GraphDataLoader(test_data, batch_size=args.batch_size, shuffle=False)

    # Check cuda
    device = f'cuda:{args.gpu}' if args.gpu >= 0 and torch.cuda.is_available() else 'cpu'

    # Retrieve the number of classes and node features
    n_features, n_classes, _ = dataset.statistics()

    # Step 2: Create model ============================================================= #
    model = ARMA4GC(in_dim=n_features,
                    hid_dim=args.hid_dim,
                    out_dim=n_classes,
                    num_stacks=args.num_stacks,
                    num_layers=args.num_layers,
                    activation=nn.ReLU(),
                    dropout=args.dropout).to(device)
    best_model = copy.deepcopy(model)

    # Step 3: Create training components =============================================== #
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.lamb)

    # Step 4: Training epochs ========================================================== #
    acc = 0
    no_improvement = 0
    epochs = trange(args.epochs, desc='Accuracy & Loss')

    for _ in epochs:
        # Training
        train_loss = train(device, model, opt, loss_fn, train_loader)

        # Validation
        valid_acc = evaluate(device, model, valid_loader)

        # Print out performance
        epochs.set_description(f'Train Loss {train_loss:.4f} | Valid Acc {valid_acc:.4f}')

        if valid_acc < acc:
            no_improvement += 1
            if no_improvement == args.early_stopping:
                print('Early stop.')
                break
        else:
            no_improvement = 0
            acc = valid_acc
            best_model = copy.deepcopy(model)

    test_acc = evaluate(device, best_model, test_loader)
    print(f'Test Acc {test_acc:.4f}')

    return test_acc