def main():
    global args, best_mae_error

    # load data
    dataset = CIFData(*args.data_options)
    collate_fn = collate_pool
    train_loader, val_loader, test_loader = get_train_val_test_loader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=args.batch_size,
        train_ratio=args.train_ratio,
        num_workers=args.workers,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        pin_memory=args.cuda,
        train_size=args.train_size,
        val_size=args.val_size,
        test_size=args.test_size,
        return_test=True)

    # obtain target value normalizer
    if args.task == 'classification':
        normalizer = Normalizer(torch.zeros(2))
        normalizer.load_state_dict({'mean': 0., 'std': 1.})
    else:
        if len(dataset) < 500:
            warnings.warn('Dataset has fewer than 500 data points. '
                          'Lower accuracy is expected. ')
            sample_data_list = [dataset[i] for i in range(len(dataset))]
        else:
            sample_data_list = [dataset[i] for i in
                                sample(range(len(dataset)), 500)]
        _, sample_target, _ = collate_pool(sample_data_list)
        normalizer = Normalizer(sample_target)

    # build model
    structures, _, _ = dataset[0]
    orig_atom_fea_len = structures[0].shape[-1]
    nbr_fea_len = structures[1].shape[-1]
    model = CrystalGraphConvNet(orig_atom_fea_len, nbr_fea_len,
                                atom_fea_len=args.atom_fea_len,
                                n_conv=args.n_conv,
                                h_fea_len=args.h_fea_len,
                                n_h=args.n_h,
                                classification=(args.task == 'classification'))
    if args.cuda:
        model.cuda()

    # define loss func and optimizer
    if args.task == 'classification':
        criterion = nn.NLLLoss()
    else:
        criterion = nn.MSELoss()
    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(), args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NameError('Only SGD or Adam is allowed as --optim')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_mae_error = checkpoint['best_mae_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            normalizer.load_state_dict(checkpoint['normalizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones,
                            gamma=0.1)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, normalizer)

        # evaluate on validation set
        mae_error = validate(val_loader, model, criterion, normalizer)

        # NaN is the only value that compares unequal to itself
        if mae_error != mae_error:
            print('Exit due to NaN')
            sys.exit(1)

        scheduler.step()

        # remember the best mae_error and save checkpoint
        if args.task == 'regression':
            is_best = mae_error < best_mae_error
            best_mae_error = min(mae_error, best_mae_error)
        else:
            is_best = mae_error > best_mae_error
            best_mae_error = max(mae_error, best_mae_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_mae_error': best_mae_error,
            'optimizer': optimizer.state_dict(),
            'normalizer': normalizer.state_dict(),
            'args': vars(args)
        }, is_best)

    # test best model
    print('---------Evaluate Model on Test Set---------------')
    best_checkpoint = torch.load('model_best.pth.tar')
    model.load_state_dict(best_checkpoint['state_dict'])
    validate(test_loader, model, criterion, normalizer, test=True)
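# `Normalizer` is used above but not defined in this file. Below is a minimal
# sketch consistent with the calls made here -- Normalizer(tensor),
# state_dict(), and load_state_dict() -- plus the norm/denorm pair that
# target normalization implies; classification uses an identity normalizer
# (mean 0, std 1). Treat it as an illustration of the interface, not the
# source's exact implementation.
import torch


class Normalizer:
    """Normalize a tensor and restore it later."""

    def __init__(self, tensor):
        # mean/std are fixed at construction from the sampled targets
        self.mean = torch.mean(tensor)
        self.std = torch.std(tensor)

    def norm(self, tensor):
        return (tensor - self.mean) / self.std

    def denorm(self, normed_tensor):
        return normed_tensor * self.std + self.mean

    def state_dict(self):
        return {'mean': self.mean, 'std': self.std}

    def load_state_dict(self, state_dict):
        self.mean = state_dict['mean']
        self.std = state_dict['std']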
def main():
    global args, best_mae_error

    # load data
    dataset = CIFData(*args.data_options)
    collate_fn = collate_pool
    train_loader, val_loader, test_loader = get_train_val_test_loader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=args.batch_size,
        train_ratio=args.train_ratio,
        num_workers=args.workers,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        pin_memory=args.cuda,
        train_size=args.train_size,
        val_size=args.val_size,
        test_size=args.test_size,
        return_val=True,
        return_test=True,
    )

    # obtain target value normalizer
    if args.task == "classification":
        normalizer = Normalizer(torch.zeros(2))
        normalizer.load_state_dict({"mean": 0.0, "std": 1.0})
    else:
        if len(dataset) < 500:
            warnings.warn(
                "Dataset has fewer than 500 data points. "
                "Lower accuracy is expected. "
            )
            sample_data_list = [dataset[i] for i in range(len(dataset))]
        else:
            sample_data_list = [dataset[i] for i in sample(range(len(dataset)), 500)]
        _, sample_target, _ = collate_pool(sample_data_list)
        normalizer = Normalizer(sample_target)

    # build model
    structures, _, _ = dataset[0]
    orig_atom_fea_len = structures[0].shape[-1]
    nbr_fea_len = structures[1].shape[-1]
    model = CrystalGraphConvNet(
        orig_atom_fea_len,
        nbr_fea_len,
        atom_fea_len=args.atom_fea_len,
        n_conv=args.n_conv,
        h_fea_len=args.h_fea_len,
        n_h=args.n_h,
        classification=(args.task == "classification"),
        dropout_rate=args.dropout_rate,
    )
    if args.cuda:
        model.cuda()

    # define loss func and optimizer
    if args.task == "classification":
        criterion = nn.NLLLoss()
    else:
        criterion = nn.MSELoss()
    if args.optim == "SGD":
        optimizer = optim.SGD(
            model.parameters(),
            args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
        )
    elif args.optim == "Adam":
        optimizer = optim.Adam(
            model.parameters(), args.lr, weight_decay=args.weight_decay
        )
    else:
        raise NameError("Only SGD or Adam is allowed as --optim")

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint["epoch"]
            best_mae_error = checkpoint["best_mae_error"]
            model.load_state_dict(checkpoint["state_dict"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            normalizer.load_state_dict(checkpoint["normalizer"])
            print(
                "=> loaded checkpoint '{}' (epoch {})".format(
                    args.resume, checkpoint["epoch"]
                )
            )
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones, gamma=0.1)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, normalizer)

        # evaluate on validation set
        mae_error = validate(val_loader, model, criterion, normalizer)

        # NaN is the only value that compares unequal to itself
        if mae_error != mae_error:
            print("Exit due to NaN")
            sys.exit(1)

        scheduler.step()

        # remember the best mae_error and save checkpoint
        if args.task == "regression":
            is_best = mae_error < best_mae_error
            best_mae_error = min(mae_error, best_mae_error)
        else:
            is_best = mae_error > best_mae_error
            best_mae_error = max(mae_error, best_mae_error)
        save_checkpoint(
            {
                "epoch": epoch + 1,
                "state_dict": model.state_dict(),
                "best_mae_error": best_mae_error,
                "optimizer": optimizer.state_dict(),
                "normalizer": normalizer.state_dict(),
                "args": vars(args),
            },
            is_best,
        )

    # test best model
    best_checkpoint = torch.load("model_best.pth.tar")
    model.load_state_dict(best_checkpoint["state_dict"])
    validate(
        train_loader, model, criterion, normalizer, test=True, fname="train_results"
    )
    validate(val_loader, model, criterion, normalizer, test=True, fname="val_results")
    validate(test_loader, model, criterion, normalizer, test=True, fname="test_results")
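# `save_checkpoint` is called above with (state, is_best) but not defined in
# this file. A minimal sketch under the assumption that the best state is
# copied to model_best.pth.tar, which matches the torch.load calls above;
# the checkpoint filename default is an assumption, not from the source.
import shutil

import torch


def save_checkpoint(state, is_best, filename="checkpoint.pth.tar"):
    # always persist the latest state; copy it aside when it is the best so far
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, "model_best.pth.tar")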
def main():
    global args, best_mae_error

    # load data
    dataset = CIFData(*args.data_options,
                      disable_save_torch=args.disable_save_torch)
    collate_fn = collate_pool
    train_loader, val_loader, test_loader = get_train_val_test_loader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=args.batch_size,
        train_ratio=args.train_ratio,
        num_workers=args.workers,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        pin_memory=args.cuda,
        train_size=args.train_size,
        val_size=args.val_size,
        test_size=args.test_size,
        return_test=True)

    # make sure more than one class is present in every split
    if args.task == 'classification':
        total_train, n_train = 0, 0
        total_val, n_val = 0, 0
        total_test, n_test = 0, 0
        # a split is single-class if its label sum is 0 (all 0s) or equals
        # its label count (all 1s)
        for _, target, _ in train_loader:
            for target_i in target.squeeze():
                total_train += target_i
                n_train += 1
        if bool(total_train == 0):
            raise ValueError('All 0s in train')
        elif bool(total_train == n_train):
            raise ValueError('All 1s in train')
        for _, target, _ in val_loader:
            # a length-1 batch would squeeze to a 0-d tensor
            if len(target) == 1:
                raise ValueError('Only single entry in val')
            for target_i in target.squeeze():
                total_val += target_i
                n_val += 1
        if bool(total_val == 0):
            raise ValueError('All 0s in val')
        elif bool(total_val == n_val):
            raise ValueError('All 1s in val')
        for _, target, _ in test_loader:
            if len(target) == 1:
                raise ValueError('Only single entry in test')
            for target_i in target.squeeze():
                total_test += target_i
                n_test += 1
        if bool(total_test == 0):
            raise ValueError('All 0s in test')
        elif bool(total_test == n_test):
            raise ValueError('All 1s in test')

    # make output folder if needed
    if not os.path.exists('output'):
        os.mkdir('output')

    # make and clean torch files if needed
    torch_data_path = os.path.join(args.data_options[0], 'cifdata')
    if args.clean_torch and os.path.exists(torch_data_path):
        shutil.rmtree(torch_data_path)
    if os.path.exists(torch_data_path):
        if not args.clean_torch:
            warnings.warn('Found cifdata folder at ' + torch_data_path +
                          '. Will read in .jsons as available.')
    else:
        os.mkdir(torch_data_path)

    # obtain target value normalizer
    if args.task == 'classification':
        normalizer = Normalizer(torch.zeros(2))
        normalizer.load_state_dict({'mean': 0., 'std': 1.})
    else:
        if len(dataset) < 500:
            warnings.warn('Dataset has fewer than 500 data points. '
                          'Lower accuracy is expected. ')
            sample_data_list = [dataset[i] for i in range(len(dataset))]
        else:
            sample_data_list = [dataset[i] for i in
                                sample(range(len(dataset)), 500)]
        _, sample_target, _ = collate_pool(sample_data_list)
        normalizer = Normalizer(sample_target)

    # build model
    structures, _, _ = dataset[0]
    orig_atom_fea_len = structures[0].shape[-1]
    nbr_fea_len = structures[1].shape[-1]
    model = CrystalGraphConvNet(orig_atom_fea_len, nbr_fea_len,
                                atom_fea_len=args.atom_fea_len,
                                n_conv=args.n_conv,
                                h_fea_len=args.h_fea_len,
                                n_h=args.n_h,
                                classification=(args.task == 'classification'))
    if args.cuda:
        model.cuda()

    # define loss func and optimizer
    if args.task == 'classification':
        criterion = nn.NLLLoss()
    else:
        criterion = nn.MSELoss()
    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(), args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NameError('Only SGD or Adam is allowed as --optim')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_mae_error = checkpoint['best_mae_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            normalizer.load_state_dict(checkpoint['normalizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones,
                            gamma=0.1)

    for epoch in range(args.start_epoch, args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, normalizer)

        # evaluate on validation set
        mae_error = validate(val_loader, model, criterion, normalizer)

        # NaN is the only value that compares unequal to itself
        if mae_error != mae_error:
            print('Exit due to NaN')
            sys.exit(1)

        scheduler.step()

        # remember the best mae_error and save checkpoint
        if args.task == 'regression':
            is_best = mae_error < best_mae_error
            best_mae_error = min(mae_error, best_mae_error)
        else:
            is_best = mae_error > best_mae_error
            best_mae_error = max(mae_error, best_mae_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_mae_error': best_mae_error,
            'optimizer': optimizer.state_dict(),
            'normalizer': normalizer.state_dict(),
            'args': vars(args)
        }, is_best)

    # test best model
    best_checkpoint = torch.load(os.path.join('output', 'model_best.pth.tar'))
    model.load_state_dict(best_checkpoint['state_dict'])
    print('---------Evaluate Best Model on Train Set---------------')
    validate(train_loader, model, criterion, normalizer, test=True,
             csv_name='train_results.csv')
    print('---------Evaluate Best Model on Val Set---------------')
    validate(val_loader, model, criterion, normalizer, test=True,
             csv_name='val_results.csv')
    print('---------Evaluate Best Model on Test Set---------------')
    validate(test_loader, model, criterion, normalizer, test=True,
             csv_name='test_results.csv')
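# `get_train_val_test_loader` comes from the data module and is not shown in
# this file. A minimal sketch of the ratio-based path, assuming contiguous
# index splits wrapped in SubsetRandomSampler; the real helper also honors
# train_size/val_size/test_size and a return_val flag, omitted here for
# brevity.
from torch.utils.data import DataLoader, SubsetRandomSampler


def get_train_val_test_loader(dataset, collate_fn, batch_size=64,
                              train_ratio=0.6, val_ratio=0.2, test_ratio=0.2,
                              num_workers=1, pin_memory=False,
                              return_test=False, **kwargs):
    total = len(dataset)
    indices = list(range(total))
    n_train = int(train_ratio * total)
    n_val = int(val_ratio * total)
    n_test = int(test_ratio * total)

    def make_loader(idx):
        # each loader draws from the same dataset but samples disjoint indices
        return DataLoader(dataset, batch_size=batch_size,
                          sampler=SubsetRandomSampler(idx),
                          num_workers=num_workers, collate_fn=collate_fn,
                          pin_memory=pin_memory)

    train_loader = make_loader(indices[:n_train])
    val_loader = make_loader(indices[n_train:n_train + n_val])
    if not return_test:
        return train_loader, val_loader
    test_loader = make_loader(indices[-n_test:])
    return train_loader, val_loader, test_loader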
def cv():
    global args, best_mae_error

    if not os.path.exists("./checkpoints"):
        os.mkdir("./checkpoints")

    # load data
    dataset = CIFData(*args.data_options)
    collate_fn = collate_pool

    i = 0
    train_maes = []
    val_maes = []
    test_maes = []
    for train_loader, val_loader, test_loader in get_cv_loader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=args.batch_size,
        train_ratio=args.train_ratio,
        num_workers=args.workers,
        test_ratio=args.test_ratio,
        pin_memory=args.cuda,
        train_size=args.train_size,
        val_size=args.val_size,
        test_size=args.test_size,
        cross_validation=args.cross_validation,
    ):
        i += 1

        # obtain target value normalizer
        if args.task == "classification":
            normalizer = Normalizer(torch.zeros(2))
            normalizer.load_state_dict({"mean": 0.0, "std": 1.0})
        else:
            if len(dataset) < 500:
                warnings.warn(
                    "Dataset has fewer than 500 data points. "
                    "Lower accuracy is expected. "
                )
                sample_data_list = [dataset[i] for i in range(len(dataset))]
            else:
                sample_data_list = [
                    dataset[i] for i in sample(range(len(dataset)), 500)
                ]
            _, sample_target, _ = collate_pool(sample_data_list)
            normalizer = Normalizer(sample_target)

        # build model
        structures, _, _ = dataset[0]
        orig_atom_fea_len = structures[0].shape[-1]
        nbr_fea_len = structures[1].shape[-1]
        model = CrystalGraphConvNet(
            orig_atom_fea_len,
            nbr_fea_len,
            atom_fea_len=args.atom_fea_len,
            n_conv=args.n_conv,
            h_fea_len=args.h_fea_len,
            n_h=args.n_h,
            classification=(args.task == "classification"),
            dropout_rate=args.dropout_rate,
        )
        if args.cuda:
            model.cuda()

        # define loss func and optimizer
        if args.task == "classification":
            criterion = nn.NLLLoss()
        else:
            criterion = nn.MSELoss()
        if args.optim == "SGD":
            optimizer = optim.SGD(
                model.parameters(),
                args.lr,
                momentum=args.momentum,
                weight_decay=args.weight_decay,
            )
        elif args.optim == "Adam":
            optimizer = optim.Adam(
                model.parameters(), args.lr, weight_decay=args.weight_decay
            )
        else:
            raise NameError("Only SGD or Adam is allowed as --optim")

        scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones, gamma=0.1)

        print(f"Split {i}")
        # reset the running best for each split
        if args.task == "regression":
            best_mae_error = 1e10
        else:
            best_mae_error = 0.0

        for epoch in range(args.start_epoch, args.epochs):
            # train for one epoch
            train(train_loader, model, criterion, optimizer, epoch, normalizer)

            # evaluate on validation set
            mae_error = validate(val_loader, model, criterion, normalizer)

            # NaN is the only value that compares unequal to itself
            if mae_error != mae_error:
                print("Exit due to NaN")
                sys.exit(1)

            scheduler.step()

            # remember the best mae_error and save checkpoint
            if args.task == "regression":
                is_best = mae_error < best_mae_error
                best_mae_error = min(mae_error, best_mae_error)
            else:
                is_best = mae_error > best_mae_error
                best_mae_error = max(mae_error, best_mae_error)
            save_checkpoint(
                {
                    "epoch": epoch + 1,
                    "state_dict": model.state_dict(),
                    "best_mae_error": best_mae_error,
                    "optimizer": optimizer.state_dict(),
                    "normalizer": normalizer.state_dict(),
                    "args": vars(args),
                },
                is_best,
                pad_string=f"./checkpoints/{i}",
            )

        # test the best model for this split
        best_checkpoint = torch.load(f"./checkpoints/{i}_model_best.pth.tar")
        model.load_state_dict(best_checkpoint["state_dict"])
        train_mae = validate(
            train_loader,
            model,
            criterion,
            normalizer,
            test=True,
            split=i,
            fname="train",
            to_save=False,
        )
        val_mae = validate(
            val_loader,
            model,
            criterion,
            normalizer,
            test=True,
            fname="val",
            to_save=False,
        )
        test_mae = validate(
            test_loader,
            model,
            criterion,
            normalizer,
            test=True,
            fname="test",
            to_save=False,
        )
        train_maes.append(train_mae.detach().item())
        val_maes.append(val_mae.detach().item())
        test_maes.append(test_mae.detach().item())

    with open("results.out", "a+") as fw:
        fw.write("\n")
        fw.write(f"Avg Train MAE: {np.mean(train_maes):.4f}\n")
        fw.write(f"Avg Val MAE: {np.mean(val_maes):.4f}\n")
        fw.write(f"Avg Test MAE: {np.mean(test_maes):.4f}\n")
def main():
    global args, best_mae_error

    # load dataset: (atom_fea, nbr_fea, nbr_fea_idx), target, cif_id
    dataset = CIFData(args.root + args.target)
    collate_fn = collate_pool
    train_loader, val_loader, test_loader = get_train_val_test_loader(
        dataset=dataset,
        collate_fn=collate_fn,
        batch_size=args.batch_size,
        train_ratio=args.train_ratio,
        num_workers=args.workers,
        val_ratio=args.val_ratio,
        test_ratio=args.test_ratio,
        pin_memory=args.cuda,
        return_test=True)

    # obtain target value normalizer
    if args.task == 'classification':
        normalizer = Normalizer(torch.zeros(2))
        normalizer.load_state_dict({'mean': 0., 'std': 1.})
    else:
        # NOTE: sample() raises ValueError for datasets with fewer than
        # 500 entries; this variant assumes a sufficiently large dataset
        sample_data_list = [dataset[i] for i in
                            sample(range(len(dataset)), 500)]
        _, sample_target, _ = collate_pool(sample_data_list)
        normalizer = Normalizer(sample_target)

    # build model
    structures, _, _ = dataset[0]
    orig_atom_fea_len = structures[0].shape[-1]
    nbr_fea_len = structures[1].shape[-1]
    model = CrystalGraphConvNet(orig_atom_fea_len, nbr_fea_len,
                                atom_fea_len=args.atom_fea_len,
                                n_conv=args.n_conv,
                                h_fea_len=args.h_fea_len,
                                n_h=args.n_h,
                                classification=(args.task == 'classification'))

    # print number of trainable model parameters
    trainable_params = sum(p.numel() for p in model.parameters()
                           if p.requires_grad)
    print('=> number of trainable model parameters: {:d}'.format(
        trainable_params))

    if args.cuda:
        model.cuda()

    # define loss func and optimizer
    if args.task == 'classification':
        criterion = nn.NLLLoss()
    else:
        criterion = nn.MSELoss()
    if args.optim == 'SGD':
        optimizer = optim.SGD(model.parameters(), args.lr,
                              momentum=args.momentum,
                              weight_decay=args.weight_decay)
    elif args.optim == 'Adam':
        optimizer = optim.Adam(model.parameters(), args.lr,
                               weight_decay=args.weight_decay)
    else:
        raise NameError('Only SGD or Adam is allowed as --optim')

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=torch.device('cpu'))
            args.start_epoch = checkpoint['epoch']
            best_mae_error = checkpoint['best_mae_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            normalizer.load_state_dict(checkpoint['normalizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # TensorBoard writer
    summary_root = './runs/'
    if not os.path.exists(summary_root):
        os.mkdir(summary_root)
    summary_file = summary_root + args.target
    if os.path.exists(summary_file):
        shutil.rmtree(summary_file)
    writer = SummaryWriter(summary_file)

    scheduler = MultiStepLR(optimizer, milestones=args.lr_milestones,
                            gamma=0.1)

    # run args.epochs additional epochs starting from the resumed epoch
    for epoch in range(args.start_epoch, args.start_epoch + args.epochs):
        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, normalizer,
              writer)

        # evaluate on validation set
        mae_error = validate(val_loader, model, criterion, epoch, normalizer,
                             writer)

        scheduler.step()

        # remember the best mae_error and save checkpoint
        if args.task == 'regression':
            is_best = mae_error < best_mae_error
            best_mae_error = min(mae_error, best_mae_error)
        else:
            is_best = mae_error > best_mae_error
            best_mae_error = max(mae_error, best_mae_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_mae_error': best_mae_error,
            'optimizer': optimizer.state_dict(),
            'normalizer': normalizer.state_dict(),
            'args': vars(args)
        }, args.target, is_best)
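# The train/validate functions above receive `writer`, but their logging calls
# are not shown in this file. A hypothetical per-epoch logging helper, for
# illustration only; the tag names and the helper itself are assumptions, not
# from the source.
def log_epoch(writer, epoch, train_loss, val_mae):
    # add_scalar(tag, value, global_step) appends one point per epoch,
    # grouped in the TensorBoard UI by the prefix before the slash
    writer.add_scalar('train/loss', train_loss, epoch)
    writer.add_scalar('val/mae_error', val_mae, epoch)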