Example #1
import argparse
import torch
import yaml

# construct_loader comes from this project's featurization module (see Example #4);
# G2C is the project's model class, whose import path is not shown in these snippets
from features.featurization import construct_loader

parser = argparse.ArgumentParser()
parser.add_argument('--mini_batch', type=int, default=4)
parser.add_argument('--lr', type=float, default=1e-3)
parser.add_argument('--num_workers', type=int, default=2)

parser.add_argument('--hidden_dim', type=int, default=100)
parser.add_argument('--depth', type=int, default=3)
parser.add_argument('--n_layers', type=int, default=2)

parser.add_argument('--optimizer', type=str, default='adam')
parser.add_argument('--scheduler', type=str, default=None)
parser.add_argument('--verbose', action='store_true', default=False)

args = parser.parse_args()

# construct loader
train_loader, val_loader = construct_loader(args)

# set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# define paths
# note that these weights are for a model trained only on the training set
yaml_file_name = 'best_model/model_parameters.yml'
state_dict = 'best_model/epoch_95_state_dict'

# create the network with the best architecture from hyperopt and load the corresponding best weights
with open(yaml_file_name, 'r') as f:
    content = yaml.load(stream=f, Loader=yaml.FullLoader)
print(content)
model = G2C(**content).to(device)
model.load_state_dict(torch.load(state_dict, map_location=device))
model.eval()
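If the weights loaded here were saved from a DataParallel-wrapped model (as in Example #3 below), every key in the checkpoint carries a 'module.' prefix that the bare G2C instance will reject. A minimal sketch of stripping that prefix before loading, assuming the checkpoint otherwise matches the architecture:

state = torch.load(state_dict, map_location=device)
# torch.nn.DataParallel prepends 'module.' to every parameter name when saving
state = {k[len('module.'):] if k.startswith('module.') else k: v
         for k, v in state.items()}
model.load_state_dict(state)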
Example #2
def optimize(trial, args):

    setattr(args, 'hidden_size',
            int(trial.suggest_discrete_uniform('hidden_size', 300, 1200, 300)))
    setattr(args, 'depth',
            int(trial.suggest_discrete_uniform('depth', 2, 6, 1)))
    # dropout is a float in [0, 1], so it must not be cast to int
    setattr(args, 'dropout',
            trial.suggest_discrete_uniform('dropout', 0, 1, 0.2))
    setattr(args, 'lr', trial.suggest_loguniform('lr', 1e-5, 1e-3))
    setattr(args, 'batch_size',
            int(trial.suggest_categorical('batch_size', [25, 50, 100])))
    setattr(
        args, 'graph_pool',
        trial.suggest_categorical('graph_pool',
                                  ['sum', 'mean', 'max', 'attn', 'set2set']))

    setattr(args, 'log_dir',
            os.path.join(args.hyperopt_dir, str(trial._trial_id)))
    modify_train_args(args)

    torch.manual_seed(args.seed)
    train_logger = create_logger('train', args.log_dir)

    train_loader, val_loader = construct_loader(args)
    mean = train_loader.dataset.mean
    std = train_loader.dataset.std
    stdzer = Standardizer(mean, std, args.task)

    # create model, optimizer, scheduler, and loss fn
    model = GNN(args, train_loader.dataset.num_node_features,
                train_loader.dataset.num_edge_features).to(args.device)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    scheduler = build_lr_scheduler(optimizer, args, len(train_loader.dataset))
    loss = get_loss_func(args)
    best_val_loss = math.inf
    best_epoch = 0

    # record args, optimizer, and scheduler info
    train_logger.info('Arguments are...')
    for arg in vars(args):
        train_logger.info(f'{arg}: {getattr(args, arg)}')
    train_logger.info(f'\nOptimizer parameters are:\n{optimizer}\n')
    train_logger.info('Scheduler state dict is:')
    for key, value in scheduler.state_dict().items():
        train_logger.info(f'{key}: {value}')
    train_logger.info('')

    # train
    train_logger.info("Starting training...")
    for epoch in range(0, args.n_epochs):
        train_loss, train_acc = train(model, train_loader, optimizer, loss,
                                      stdzer, args.device, scheduler,
                                      args.task)
        train_logger.info(f"Epoch {epoch}: Training Loss {train_loss}")

        val_loss, val_acc = eval(model, val_loader, loss, stdzer, args.device,
                                 args.task)
        train_logger.info(f"Epoch {epoch}: Validation Loss {val_loss}")

        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(model.state_dict(),
                       os.path.join(args.log_dir, 'best_model'))

        # report intermediate results for early stopping
        trial.report(val_loss, epoch)

        # handle pruning based on the intermediate value
        if trial.should_prune():
            train_logger.handlers = []
            raise optuna.TrialPruned()

    train_logger.info(
        f"Best Validation Loss {best_val_loss} on Epoch {best_epoch}")

    # load best model
    model = GNN(args, train_loader.dataset.num_node_features,
                train_loader.dataset.num_edge_features).to(args.device)
    state_dict = torch.load(os.path.join(args.log_dir, 'best_model'),
                            map_location=args.device)
    model.load_state_dict(state_dict)

    # predict test data
    test_loader = construct_loader(args, modes='test')
    preds, test_loss, test_acc, test_auc = test(model, test_loader, loss,
                                                stdzer, args.device, args.task)
    train_logger.info(f"Test Loss {test_loss}")

    # save predictions
    smiles = test_loader.dataset.smiles
    preds_path = os.path.join(args.log_dir, 'preds.csv')
    pd.DataFrame(list(zip(smiles, preds)),
                 columns=['smiles', 'prediction']).to_csv(preds_path,
                                                          index=False)

    train_logger.handlers = []
    return best_val_loss
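The optimize function above is written to be handed to an Optuna study, which calls it once per trial and uses the reported intermediate values for pruning. A minimal sketch of that wiring, assuming args comes from the project's argument parser and the trial count is chosen by the caller:

import functools
import optuna

# MedianPruner consumes the trial.report(...) calls made inside optimize
study = optuna.create_study(direction='minimize',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(functools.partial(optimize, args=args), n_trials=20)
print(study.best_trial.params, study.best_value)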
Example #3
def optimize(trial, args):

    setattr(args, 'hidden_dim',
            int(trial.suggest_categorical('d_model', [128, 256, 512])))
    setattr(args, 'depth',
            int(trial.suggest_discrete_uniform('n_enc', 2, 6, 1)))
    # distinct Optuna parameter name; reusing 'n_enc' from the line above would clash
    setattr(args, 'n_layers',
            int(trial.suggest_discrete_uniform('n_layers', 1, 3, 1)))
    setattr(args, 'lr', trial.suggest_loguniform('lr', 1e-5, 1e-2))
    setattr(args, 'batch_size',
            int(trial.suggest_categorical('batch_size', [16, 32, 64, 128])))

    setattr(args, 'log_dir',
            os.path.join(args.hyperopt_dir, str(trial._trial_id)))

    torch.manual_seed(0)
    train_logger = create_logger('train', args.log_dir)

    train_logger.info('Arguments are...')
    for arg in vars(args):
        train_logger.info(f'{arg}: {getattr(args, arg)}')

    # construct loader and set device
    train_loader, val_loader = construct_loader(args)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # build model
    model_parameters = {
        'node_dim': train_loader.dataset.num_node_features,
        'edge_dim': train_loader.dataset.num_edge_features,
        'hidden_dim': args.hidden_dim,
        'depth': args.depth,
        'n_layers': args.n_layers
    }
    model = G2C(**model_parameters).to(device)

    # multi gpu training
    if torch.cuda.device_count() > 1:
        train_logger.info(
            f'Using {torch.cuda.device_count()} GPUs for training...')
        model = torch.nn.DataParallel(model)

    # get optimizer and scheduler
    optimizer, scheduler = get_optimizer_and_scheduler(
        args, model, len(train_loader.dataset))
    loss = torch.nn.MSELoss(reduction='sum')

    # record parameters
    train_logger.info(
        f'\nModel parameters are:\n{dict_to_str(model_parameters)}\n')
    save_yaml_file(os.path.join(args.log_dir, 'model_parameters.yml'),
                   model_parameters)
    train_logger.info(f'Optimizer parameters are:\n{optimizer}\n')
    train_logger.info('Scheduler state dict is:')
    if scheduler:
        for key, value in scheduler.state_dict().items():
            train_logger.info(f'{key}: {value}')
        train_logger.info('')

    best_val_loss = math.inf
    best_epoch = 0

    model.to(device)
    train_logger.info("Starting training...")
    for epoch in range(1, args.n_epochs + 1):
        train_loss = train(model, train_loader, optimizer, loss, device,
                           scheduler, train_logger if args.verbose else None)
        train_logger.info("Epoch {}: Training Loss {}".format(
            epoch, train_loss))

        val_loss = test(model, val_loader, loss, device, args.log_dir, epoch)
        train_logger.info("Epoch {}: Validation Loss {}".format(
            epoch, val_loss))
        if scheduler and not isinstance(scheduler, NoamLR):
            scheduler.step(val_loss)

        if val_loss <= best_val_loss:
            best_val_loss = val_loss
            best_epoch = epoch
            torch.save(model.state_dict(),
                       os.path.join(args.log_dir, f'epoch_{epoch}_state_dict'))
    train_logger.info("Best Validation Loss {} on Epoch {}".format(
        best_val_loss, best_epoch))

    train_logger.handlers = []
    return best_val_loss
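The model_parameters.yml written by this script is the same file Example #1 reads back to rebuild the network for inference. A minimal sketch of that round trip, using the key names from the model_parameters dict above:

import yaml

with open(os.path.join(args.log_dir, 'model_parameters.yml')) as f:
    params = yaml.load(f, Loader=yaml.FullLoader)
# params holds node_dim, edge_dim, hidden_dim, depth and n_layers,
# so the architecture can be rebuilt exactly as Example #1 does
model = G2C(**params).to(device)
model.load_state_dict(torch.load(
    os.path.join(args.log_dir, f'epoch_{best_epoch}_state_dict'),
    map_location=device))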
Example #4
import math
import torch
import pandas as pd

from features.featurization import construct_loader
from utils import Standardizer, create_logger, get_loss_func

from model.gnn import GNN
from model.training import train, eval, test, build_lr_scheduler
from model.parsing import parse_train_args

args = parse_train_args()
torch.manual_seed(args.seed)
logger = create_logger('train', args.log_dir)

train_loader, val_loader = construct_loader(args)
mean = train_loader.dataset.mean
std = train_loader.dataset.std
stdzer = Standardizer(mean, std, args.task)

# create model, optimizer, scheduler, and loss fn
model = GNN(args, train_loader.dataset.num_node_features,
            train_loader.dataset.num_edge_features).to(args.device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
scheduler = build_lr_scheduler(optimizer, args, len(train_loader.dataset))
loss = get_loss_func(args)
best_val_loss = math.inf
best_epoch = 0

# record args, optimizer, and scheduler info
logger.info('Arguments are...')