def test_pdbloader_ligand_coordinates(testdata, testdir):
    """Check padded ligand coordinates returned by the PDB loader."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    data.atomicnums_to_idxs(loaders.anummap(data.species))

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    assert (ids == np.array(["1a4r", "1a4w"])).all()

    # Ligand 1a4w is the largest (42 atoms); 1a4r is padded up to it
    assert species.shape == (batch_size, 42)
    assert coordinates.shape == (batch_size, 42, 3)

    assert np.allclose(coordinates[0, 0], [102.486, 24.870, -2.909])
    assert np.allclose(coordinates[0, -1], [0.0, 0.0, 0.0])  # 1a4r is padded
    assert np.allclose(coordinates[1, 0], [17.735, -17.178, 22.612])
    assert np.allclose(coordinates[1, -1], [18.049, -13.554, 14.106])
def test_pdbloader_batch(testdata, testdir, distance, n1_atoms, n2_atoms):
    """Check types and shapes of one batch from the PDB loader."""
    data = loaders.PDBData(testdata, distance, testdir)

    # Transform atomic numbers to species indices
    data.atomicnums_to_idxs(loaders.anummap(data.species))

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    # Both systems in the batch are padded to the size of the largest one
    max_atoms = max(n1_atoms, n2_atoms)

    assert isinstance(ids, np.ndarray)
    assert ids.shape == (batch_size,)
    assert isinstance(labels, torch.Tensor)
    assert labels.shape == (batch_size,)
    assert isinstance(species, torch.Tensor)
    assert species.shape == (batch_size, max_atoms)
    assert isinstance(coordinates, torch.Tensor)
    assert coordinates.shape == (batch_size, max_atoms, 3)
def test_pdbloader_removeHs(testdata, testdir, distance, n1_atoms, n2_atoms):
    """Check batch shapes when hydrogens are stripped from the systems."""
    data = loaders.PDBData(testdata, distance, testdir, removeHs=True)

    # Transform atomic numbers to species indices
    data.atomicnums_to_idxs(loaders.anummap(data.species))

    batch_size = 1
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Expected atom counts for each one-system batch, in loader order
    expected_atoms = iter([n1_atoms, n2_atoms])

    for ids, label, (species, coordinates) in loader:
        n_atoms = next(expected_atoms)

        assert isinstance(ids, np.ndarray)
        assert ids.shape == (batch_size,)
        assert isinstance(label, torch.Tensor)
        assert label.shape == (batch_size,)
        assert isinstance(species, torch.Tensor)
        assert species.shape == (batch_size, n_atoms)
        assert isinstance(coordinates, torch.Tensor)
        assert coordinates.shape == (batch_size, n_atoms, 3)
def test_predict(testdata, testdir):
    """Smoke-test predict.predict on a two-system loader."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    model = models.AffinityModel(n_species, AEVC.aev_length)

    ids, true, predicted = predict.predict(model, AEVC, loader)

    assert isinstance(true, np.ndarray)
    assert len(true) == batch_size
    assert isinstance(predicted, np.ndarray)
    assert len(predicted) == batch_size
def test_train_small_save(testdata, testdir, modelidx, tmpdir):
    """Train a tiny model and check that the best model is saved to disk."""
    with mlflow.start_run():
        # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
        data = loaders.PDBData(testdata, 0.1, testdir)

        # Transform atomic numbers to species indices
        amap = loaders.anummap(data.species)
        data.atomicnums_to_idxs(amap)
        n_species = len(amap)

        batch_size = 2
        loader = torch.utils.data.DataLoader(
            data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
        )

        # Radial functions: 1 | Angular functions: 1 | Species: 5
        # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
        AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
        assert AEVC.aev_length == 20

        model = models.AffinityModel(n_species, AEVC.aev_length, layers_sizes=[1])
        optimizer = optim.SGD(model.parameters(), lr=0.01)
        mse = nn.MSELoss()

        # One ANN per species
        assert len(model) == n_species

        train_losses, valid_losses = train.train(
            model,
            optimizer,
            mse,
            AEVC,
            loader,
            loader,
            epochs=15,
            savepath=tmpdir,
            idx=modelidx,
        )

        # The best model file name depends on the (optional) model index
        best_name = "best.pth" if modelidx is None else f"best_{modelidx}.pth"
        assert os.path.isfile(os.path.join(tmpdir, best_name))

        # Validation loss is shifted when trainloader and testloader are the same
        assert np.allclose(train_losses[1:], valid_losses[:-1])
def test_atomicnum_map(testdata, testdir):
    """Check the atomic-number-to-index map built from the dataset."""
    data = loaders.PDBData(testdata, 2.0, testdir)

    amap = loaders.anummap(data.species)

    # Elements present: H, C, N, O, P, S -> 0-based contiguous indices
    assert len(amap) == 6
    assert list(amap.keys()) == [1, 6, 7, 8, 15, 16]
    assert list(amap.values()) == list(range(6))
def test_forward_atomic(testdata, testdir):
    """Check that per-atom contributions sum up to the model output."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    _, labels, (species, coordinates) = next(iter(loader))

    # Move everything to device
    labels = labels.to(device)
    species = species.to(device)
    coordinates = coordinates.to(device)

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    aev = AEVC.forward((species, coordinates))
    assert aev.species.shape == species.shape
    assert aev.aevs.shape == (batch_size, 42, 20)

    model = models.AffinityModel(n_species, AEVC.aev_length)
    model.to(device)

    output = model(aev.species, aev.aevs)
    assert output.shape == (batch_size,)

    atomic_contributions = model._forward_atomic(aev.species, aev.aevs)
    assert atomic_contributions.shape == species.shape

    # Summing atomic contributions over atoms reproduces the prediction
    total = torch.sum(atomic_contributions, dim=1)
    assert np.allclose(output.cpu().detach().numpy(), total.cpu().detach().numpy())
def test_train_small_cmap(testdata, testdir):
    """Train a tiny model with an element map merging N and O into C."""
    cmap = {"C": ["N", "O"]}  # Map N and O to C; P and S stay distinct

    with mlflow.start_run():
        # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
        data = loaders.PDBData(testdata, 0.1, testdir, cmap)

        # Transform atomic numbers to species indices
        amap = loaders.anummap(data.species)
        data.atomicnums_to_idxs(amap)
        n_species = len(amap)

        # Only C (with N and O merged in), P and S remain
        assert n_species == 3

        batch_size = 2
        loader = torch.utils.data.DataLoader(
            data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
        )

        # Radial functions: 1 | Angular functions: 1 | Species: 3
        # AEV: 1 * 3 + 1 * 3 * (3 + 1) // 2 = 3 (R) + 6 (A) = 9
        AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
        assert AEVC.aev_length == 9

        model = models.AffinityModel(n_species, AEVC.aev_length)
        optimizer = optim.SGD(model.parameters(), lr=0.0001)
        mse = nn.MSELoss()

        # One ANN per species
        assert len(model) == n_species

        train_losses, valid_losses = train.train(
            model, optimizer, mse, AEVC, loader, loader, epochs=15
        )

        # Validation loss is shifted when trainloader and testloader are the same
        assert np.allclose(train_losses[1:], valid_losses[:-1])
def test_predict_baseline(testdata, testdir):
    """Check that baseline deltas are matched to the correct system IDs.

    The baseline arrays are deliberately given in a different order than
    the loader output to verify that each delta is added to the matching
    system, not applied positionally.
    """
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    model = models.AffinityModel(n_species, AEVC.aev_length)

    ids, true, predicted = predict.predict(model, AEVC, loader)

    assert isinstance(true, np.ndarray)
    assert len(true) == batch_size
    assert isinstance(predicted, np.ndarray)
    assert len(predicted) == batch_size

    # Systems are the other way around with respect to file order
    # This is to test that deltas are added to the correct ID
    delta_ids = np.array(["1a4w", "1a4r"])
    delta_baseline = np.array([500, 600])
    delta = np.array([5.92, 6.66])

    s = np.argsort(delta_ids)

    ids_b, true_b, predicted_b = predict.predict(
        model, AEVC, loader, baseline=(delta_ids, delta_baseline, delta)
    )

    # Compare the two runs in a common (sorted-by-ID) order
    sort = np.argsort(ids)
    bsort = np.argsort(ids_b)

    assert (ids[sort] == ids_b[bsort]).all()
    # Fixed: compare against the baseline run's labels (was `true[bsort]`,
    # which compared `true` to itself with an index array from the other run)
    assert np.allclose(true[sort], true_b[bsort])
    assert np.allclose(predicted[sort], predicted_b[bsort] - delta_baseline[s])
def test_atomic(testdata, testdir):
    """Check that grad.atomic contributions sum to the model prediction."""
    with mlflow.start_run():
        # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
        data = loaders.PDBData(testdata, 0.1, testdir)

        n_systems = len(data)
        assert n_systems == 2

        # Transform atomic numbers to species indices
        amap = loaders.anummap(data.species)
        data.atomicnums_to_idxs(amap)
        n_species = len(amap)

        # Radial functions: 1 | Angular functions: 1 | Species: 5
        # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
        AEVC = torchani.AEVComputer(
            RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species
        )
        assert AEVC.aev_length == 20

        model = models.AffinityModel(n_species, AEVC.aev_length)

        # Move model and AEVComputer to device
        model.to(device)
        AEVC.to(device)

        # Model in evaluation mode
        model.eval()

        for pdbid, _, (species, coordinates) in data:
            atomic = grad.atomic(species, coordinates, model, AEVC, device)

            # Add fictitious batch dimension
            species = species.unsqueeze(0)
            coordinates = coordinates.unsqueeze(0)

            assert atomic.shape == species.shape

            aevs = AEVC.forward((species, coordinates)).aevs
            prediction = model(species, aevs)

            # Atomic contributions must add up to the total prediction
            assert np.allclose(
                torch.sum(atomic, dim=1).cpu().detach().numpy(),
                prediction.cpu().detach().numpy(),
            )
def test_pdbloader_species_cmap_toX(testdata, testdir):
    """Check species indices when all elements map to a single dummy atom."""
    # Map all elements to dummy atom X
    cmap = {"X": ["C", "N", "O", "S", "P"]}

    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir, cmap)

    # TODO: Access to data loader is quite ugly... NamedTuple?
    # Element X maps to 0 for every atom of both ligands
    assert np.allclose(data[0][2][0], np.zeros(28))  # Species for first ligand
    assert np.allclose(data[1][2][0], np.zeros(42))  # Species for second ligand

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    assert len(amap) == 1

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    assert (ids == np.array(["1a4r", "1a4w"])).all()
    assert species.shape == (batch_size, 42)  # Ligand 1a4w is the largest

    # Test ligand 1a4r (padded with -1)
    assert torch.allclose(species[0, :], torch.tensor([0] * 28 + 14 * [-1]))

    # Test ligand 1a4w (no padding)
    assert torch.allclose(species[1, :], torch.zeros(42, dtype=int))
def test_pdbloader_species_cmap_OtoS(testdata, testdir):
    """Check species indices when element O is mapped to S."""
    cmap = {"S": "O"}  # Map O to S

    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir, cmap)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)

    # O (atomic number 8) must not appear in the map anymore
    with pytest.raises(KeyError):
        amap[8]

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    assert (ids == np.array(["1a4r", "1a4w"])).all()
    assert species.shape == (batch_size, 42)  # Ligand 1a4w is the largest

    # Test ligand 1a4r (padded with -1); every original O reads as S
    assert torch.allclose(
        species[0, :],
        torch.tensor(
            elements_to_idxs("NPSSSPSSSCCSCSCSCNCNCCSNCNNC", amap) + 14 * [-1]
        ),
    )

    # Test ligand 1a4w (no padding)
    assert torch.allclose(
        species[1, :],
        torch.tensor(
            elements_to_idxs("CCCCCCCCCCNCCSSSNCCSCCCNCNNNCCCCCCCSSCCCCN", amap)
        ),
    )
def test_predict_scaling(testdata, testdir):
    """Check that predictions are transformed back to the original scale."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    original_labels = data.labels.copy()

    # Scale labels in place; keep the scaler for the inverse transform
    scaler = utils.labels_scaler(data)
    assert np.allclose(data.labels, [1.0, -1.0])

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    model = models.AffinityModel(n_species, AEVC.aev_length)

    # Without a scaler, predictions stay in the scaled space
    ids, true_scaled, predicted_scaled = predict.predict(model, AEVC, loader)
    assert np.allclose(true_scaled, data.labels)
    assert (-1 <= true_scaled).all() and (true_scaled <= 1).all()

    # With the scaler, labels and predictions are back in the original space
    ids, true, predicted = predict.predict(model, AEVC, loader, scaler=scaler)
    assert np.allclose(true, original_labels)
    assert np.allclose(predicted, scaler.inverse_transform(predicted_scaled))
def test_grad(testdata, testdir):
    """Check the shape of the gradient with respect to coordinates."""
    with mlflow.start_run():
        # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
        data = loaders.PDBData(testdata, 0.1, testdir)

        n_systems = len(data)
        assert n_systems == 2

        # Transform atomic numbers to species indices
        amap = loaders.anummap(data.species)
        data.atomicnums_to_idxs(amap)
        n_species = len(amap)

        # Radial functions: 1 | Angular functions: 1 | Species: 5
        # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
        AEVC = torchani.AEVComputer(
            RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species
        )
        assert AEVC.aev_length == 20

        model = models.AffinityModel(n_species, AEVC.aev_length)
        loss = nn.MSELoss()

        # Move model and AEVComputer to device
        model.to(device)
        AEVC.to(device)

        # Model in evaluation mode
        model.eval()

        for i in range(n_systems):
            pdbid, label, (species, coordinates) = data[i]

            gradient = grad.gradient(
                species, coordinates, label, model, AEVC, loss, device
            )

            # One gradient component per coordinate
            assert gradient.shape == coordinates.shape
def test_vsloader(testvsdata, testdir, distance, n_atoms, f_label, l_label):
    """Check batches from the virtual-screening loader (one target each)."""
    data = loaders.VSData(testvsdata, distance, testdir, labelspath=testdir)

    # Transform atomic numbers to species indices
    data.atomicnums_to_idxs(loaders.anummap(data.species))

    # One batch here corresponds to one target
    batch_size = 10
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Per-batch expectations, in loader order (avoid shadowing the parameters)
    atoms_iter = iter(n_atoms)
    first_label_iter = iter(f_label)  # Iterator over first label (in batch)
    last_label_iter = iter(l_label)  # Iterator over last label (in batch)

    for ids, label, (species, coordinates) in loader:
        expected_atoms = next(atoms_iter)
        expected_first = next(first_label_iter)
        expected_last = next(last_label_iter)

        assert isinstance(ids, np.ndarray)
        assert ids.shape == (batch_size,)
        assert ids[0][4:] == "_pose_1"
        assert ids[-2][4:] == "_pose_9"
        assert ids[-1][4:] == "_ligand"

        assert isinstance(label, torch.Tensor)
        assert label.shape == (batch_size,)
        assert label[0].item() == pytest.approx(expected_first)
        assert label[-1].item() == pytest.approx(expected_last)

        assert isinstance(species, torch.Tensor)
        assert species.shape == (batch_size, expected_atoms)
        assert isinstance(coordinates, torch.Tensor)
        assert coordinates.shape == (batch_size, expected_atoms, 3)
def test_pdbloader_labels(testdata, testdir):
    """Check IDs and labels of a batch from the PDB loader."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    data.atomicnums_to_idxs(loaders.anummap(data.species))

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    assert ids.shape == (batch_size,)
    assert (ids == np.array(["1a4r", "1a4w"])).all()

    assert labels.shape == (batch_size,)
    assert torch.allclose(labels, torch.tensor([6.66, 5.92]))
def test_aev_from_loader(testdata, testdir):
    """Check AEV shapes computed from a loader batch."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Compute map of atomic numbers to indices, then apply it to the data
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    _, labels, (species, coordinates) = next(iter(loader))

    # Move everything to device
    labels = labels.to(device)
    species = species.to(device)
    coordinates = coordinates.to(device)

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    aev = AEVC.forward((species, coordinates))

    assert aev.species.shape == species.shape
    assert aev.aevs.shape == (batch_size, 42, 20)
def test_pdbloader_ligand_species(testdata, testdir):
    """Check padded species indices returned by the PDB loader."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    ids, labels, (species, coordinates) = next(iter(loader))

    assert (ids == np.array(["1a4r", "1a4w"])).all()
    assert species.shape == (batch_size, 42)  # Ligand 1a4w is the largest

    # Test ligand 1a4r (padded with -1)
    assert torch.allclose(
        species[0, :],
        torch.tensor(
            elements_to_idxs("NPOOOPOOOCCOCOCOCNCNCCONCNNC", amap) + 14 * [-1]
        ),
    )

    # Test ligand 1a4w (no padding)
    assert torch.allclose(
        species[1, :],
        torch.tensor(
            elements_to_idxs("CCCCCCCCCCNCCSOONCCOCCCNCNNNCCCCCCCSOCCCCN", amap)
        ),
    )
def test_evaluate(testdata, testdir, tmpdir):
    """Run predict.evaluate with two models and check the output files."""
    # Distance 0.0 produces a segmentation fault (see MDAnalysis#2656)
    data = loaders.PDBData(testdata, 0.1, testdir)

    # Transform atomic numbers to species indices
    amap = loaders.anummap(data.species)
    data.atomicnums_to_idxs(amap)
    n_species = len(amap)

    batch_size = 2
    loader = torch.utils.data.DataLoader(
        data, batch_size=batch_size, shuffle=False, collate_fn=loaders.pad_collate
    )

    # Radial functions: 1 | Angular functions: 1 | Species: 5
    # AEV: 1 * 5 + 1 * 5 * (5 + 1) // 2 = 5 (R) + 15 (A) = 20
    AEVC = torchani.AEVComputer(RcR, RcA, EtaR, RsR, EtaA, Zeta, RsA, TsA, n_species)
    assert AEVC.aev_length == 20

    # Evaluate with an ensemble of two (untrained) models
    mods = [
        models.AffinityModel(n_species, AEVC.aev_length),
        models.AffinityModel(n_species, AEVC.aev_length),
    ]

    with mlflow.start_run():
        predict.evaluate(mods, loader, AEVC, outpath=tmpdir)

    for fname in ("predict.csv", "regplot-predict.pdf", "regplot-predict.png"):
        assert os.path.isfile(os.path.join(tmpdir, fname))
cmap, desc="Test set", removeHs=args.removeHs, ) else: testdata = loaders.VSData( args.testfile, args.distance, args.datapaths, cmap, desc="Test set", removeHs=args.removeHs, labelspath=args.vscreening, ) amap = loaders.anummap(traindata.species, validdata.species, testdata.species) else: amap = loaders.anummap(traindata.species, validdata.species) n_species = len(amap) if args.scale: if args.testfile is None: scaler = utils.labels_scaler(traindata, validdata) else: scaler = utils.labels_scaler(traindata, validdata, testdata) else: scaler = None # Transform atomic numbers to 0-based indices traindata.atomicnums_to_idxs(amap)