def test_custom_typer_example_provider():
    """A provider built with an 80-type element typer yields coordinate sets with max_type 80."""
    element_typer = molgrid.ElementIndexTyper(80)
    provider = molgrid.ExampleProvider(element_typer, data_root=datadir + "/structs")
    provider.populate(datadir + "/small.types")

    first_example = provider.next_batch(10)[0]
    first_coords = first_example.coord_sets[0]
    assert first_coords.max_type == 80
def test_type_sizing():
    """The provider and the examples it produces must report the same number of types."""
    provider = molgrid.ExampleProvider(data_root=datadir + "/structs",
                                       make_vector_types=True)
    provider.populate(datadir + "/ligonly.types")

    examples = provider.next_batch(10)
    # provider and example should agree on number of types,
    # even if one coordset is empty
    assert provider.num_types() == examples[0].num_types()
def test_a_grid():
    """Grid one coordinate set into each supported output container and compare the peaks."""
    provider = molgrid.ExampleProvider(data_root=datadir + "/structs")
    provider.populate(datadir + "/small.types")
    example = provider.next()
    coords = example.coord_sets[1]
    assert np.min(coords.type_index.tonumpy()) >= 0

    gmaker = molgrid.GridMaker()
    # this should be grid_dims or get_grid_dims
    dims = gmaker.grid_dimensions(coords.max_type)
    center = tuple(coords.center())

    cpu_grid = molgrid.MGrid4f(*dims)
    gpu_grid = molgrid.MGrid4f(*dims)
    numpy_out = np.zeros(dims, dtype=np.float32)
    torch_out = torch.zeros(dims, dtype=torch.float32)
    cuda_out = torch.zeros(dims, dtype=torch.float32, device='cuda')

    # write the same density into every kind of destination
    gmaker.forward(center, coords, cpu_grid.cpu())
    gmaker.forward(center, coords, gpu_grid.gpu())
    gmaker.forward(center, coords, numpy_out)
    gmaker.forward(center, coords, torch_out)
    gmaker.forward(center, coords, cuda_out)

    new_tensor = gmaker.make_tensor(center, coords)
    new_array = gmaker.make_ndarray(center, coords)

    single_set_peak = 1.438691
    assert single_set_peak == approx(cpu_grid.tonumpy().max())
    assert single_set_peak == approx(gpu_grid.tonumpy().max())
    assert single_set_peak == approx(numpy_out.max())
    assert single_set_peak == approx(torch_out.numpy().max())
    assert single_set_peak == approx(cuda_out.cpu().numpy().max())
    assert single_set_peak == approx(new_tensor.cpu().numpy().max())
    assert single_set_peak == approx(new_array.max())

    # should overwrite by default, yes?
    gmaker.forward(center, coords, cpu_grid.cpu())
    gmaker.forward(center, coords, gpu_grid.gpu())
    assert single_set_peak == approx(cpu_grid.tonumpy().max())
    assert single_set_peak == approx(gpu_grid.tonumpy().max())

    # now grid the whole example, twice, into grids sized for all provider types
    dims = gmaker.grid_dimensions(provider.num_types())
    cpu_grid = molgrid.MGrid4f(*dims)
    gpu_grid = molgrid.MGrid4f(*dims)
    for _ in range(2):
        gmaker.forward(example, cpu_grid.cpu())
        gmaker.forward(example, gpu_grid.gpu())

    whole_example_peak = 2.094017
    assert whole_example_peak == approx(cpu_grid.tonumpy().max())
    assert whole_example_peak == approx(gpu_grid.tonumpy().max())
def _load_source_module(name, path):
    """Execute the Python source file at ``path`` and return it as module ``name``.

    Stands in for ``imp.load_source``, which is unavailable on Python 3.12+
    because the ``imp`` module was removed. A ``SourceFileLoader`` is used
    explicitly so that, as with ``imp.load_source``, the file does not need
    a ``.py`` extension.
    """
    import importlib.machinery
    import importlib.util
    import sys

    loader = importlib.machinery.SourceFileLoader(name, path)
    spec = importlib.util.spec_from_loader(name, loader)
    module = importlib.util.module_from_spec(spec)
    # register before executing, mirroring what the import system does
    sys.modules[name] = module
    loader.exec_module(module)
    return module


def get_model_gmaker_eproviders(
        args,
        data_root='/net/pulsar/home/koes/rishal/rmsd_paper/pdbbind/general_minus_refined'):
    """Build the model, grid maker and train/test example providers.

    Args:
        args: namespace providing ``recmolcache``, ``ligmolcache``,
            ``train_types``, ``test_types``, ``model`` (path to a Python
            file defining ``Model``) and ``seed``.
        data_root: directory passed to both example providers as
            ``data_root``. Defaults to the path that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        tuple: ``(model, gmaker, eptrain, eptest)``.
    """
    # both providers share one configuration; only the populated file differs
    provider_kwargs = dict(
        shuffle=True,
        stratify_receptor=True,
        labelpos=0,
        stratify_pos=0,
        stratify_min=0,
        stratify_max=12,
        stratify_step=2,
        recmolcache=args.recmolcache,
        ligmolcache=args.ligmolcache,
        data_root=data_root,
    )

    # train example provider
    eptrain = molgrid.ExampleProvider(**provider_kwargs)
    eptrain.populate(args.train_types)

    # test example provider
    eptest = molgrid.ExampleProvider(**provider_kwargs)
    eptest.populate(args.test_types)

    # gridmaker with defaults
    gmaker = molgrid.GridMaker()
    dims = gmaker.grid_dimensions(eptrain.num_types())

    model_file = _load_source_module("model", args.model)

    # load model with seed
    torch.manual_seed(args.seed)
    model = model_file.Model(dims)

    return model, gmaker, eptrain, eptest
def test_cached_with_typer_example_provider():
    """A custom typer combined with a ligand molcache still produces typed coordinates."""
    element_typer = molgrid.ElementIndexTyper(80)
    provider = molgrid.ExampleProvider(element_typer,
                                       ligmolcache=datadir + '/lig.molcache2')
    provider.populate(datadir + "/ligonly.types")

    first_example = provider.next_batch(10)[0]
    second_coords = first_example.coord_sets[1]
    assert second_coords.max_type == 80
    assert second_coords.type_index[0] == 7
def test_example_provider_iterator_interface():
    """Batches obtained by iterating a provider carry the same labels as next_batch()."""
    BSIZE = 25
    fname = datadir + "/small.types"

    def fresh_provider():
        # two identically configured providers are compared against each other
        provider = molgrid.ExampleProvider(data_root=datadir + "/structs",
                                           default_batch_size=BSIZE)
        provider.populate(fname)
        return provider

    iterated = fresh_provider()
    stepped = fresh_provider()

    nlabels = iterated.num_labels()
    iterated_labels = molgrid.MGrid2f(BSIZE, nlabels)
    stepped_labels = molgrid.MGrid2f(BSIZE, nlabels)

    for index, iterated_batch in enumerate(iterated):
        stepped_batch = stepped.next_batch()
        iterated_batch.extract_labels(iterated_labels.cpu())
        stepped_batch.extract_labels(stepped_labels.cpu())
        np.testing.assert_allclose(iterated_labels, stepped_labels)
        if index > 10:
            break
def test_gnina_example_provider():
    """Check labels, coordinates, radii and type indices from a default provider."""
    provider = molgrid.ExampleProvider(data_root=datadir + "/structs")
    provider.populate(datadir + "/small.types")

    batch_size = 100
    batch = provider.next_batch(batch_size)

    # extract labels
    nlabels = provider.num_labels()
    assert nlabels == 3

    labels = molgrid.MGrid2f(batch_size, nlabels)
    gpulabels = molgrid.MGrid2f(batch_size, nlabels)
    batch.extract_labels(labels.cpu())
    batch.extract_labels(gpulabels.gpu())
    assert np.array_equal(labels.tonumpy(), gpulabels.tonumpy())

    # one 1-D grid per label column; the third is filled through its GPU view
    columns = [molgrid.MGrid1f(batch_size) for _ in range(3)]
    batch.extract_label(0, columns[0].cpu())
    batch.extract_label(1, columns[1].cpu())
    batch.extract_label(2, columns[2].gpu())

    assert columns[0][0] == 1
    assert columns[1][0] == approx(6.05)
    assert columns[2][0] == approx(0.162643)

    assert labels[0, 0] == 1
    assert labels[0][1] == approx(6.05)
    assert labels[0][2] == approx(0.162643)

    for row in range(nlabels):
        for column_index, column in enumerate(columns):
            assert column[row] == labels[row][column_index]

    first_example = batch[0]

    rec_coords = first_example.coord_sets[0]
    assert rec_coords.size() == 1781
    assert list(rec_coords.coords[0]) == approx([45.042, 12.872, 13.001])
    assert rec_coords.radii[0] == approx(1.8)
    expected_rec_types = [6.0, 1.0, 1.0, 7.0, 0.0, 6.0, 1.0, 1.0, 7.0, 1.0]
    assert list(rec_coords.type_index)[:10] == expected_rec_types

    lig_coords = first_example.coord_sets[1]
    assert lig_coords.size() == 10
    assert list(lig_coords.coords[9]) == approx([27.0536, 3.2453, 32.4511])
    expected_lig_types = [8.0, 1.0, 1.0, 9.0, 10.0, 0.0, 0.0, 1.0, 9.0, 8.0]
    assert list(lig_coords.type_index) == expected_lig_types

    # a single label can also be extracted into a plain numpy array
    batch = provider.next_batch(1)
    single_label = np.array([0], dtype=np.float32)
    batch.extract_label(1, single_label)
def test_pytorch_dataset():
    """MolDataset items and collated batches must agree with ExampleProvider output.

    The first part compares dataset item 0 against the first example of an
    ExampleProvider populated from the same file. The second part runs the
    dataset through a ``torch.utils.data.DataLoader`` using
    ``MolDataset.collateMolDataset`` and compares one collated row against
    the corresponding dataset item.
    """
    fname = datadir + "/small.types"
    e = molgrid.ExampleProvider(data_root=datadir + "/structs")
    e.populate(fname)
    m = molgrid.MolDataset(fname, data_root=datadir + "/structs")
    assert len(m) == 1000

    ex = e.next()
    coordinates = ex.merge_coordinates()

    center, coords, types, radii, labels = m[0]

    assert list(center.shape) == [3]
    np.testing.assert_allclose(coords, coordinates.coords.tonumpy())
    np.testing.assert_allclose(types, coordinates.type_index.tonumpy())
    np.testing.assert_allclose(radii, coordinates.radii.tonumpy())

    assert len(labels) == 3
    assert labels[0] == 1
    np.testing.assert_allclose(labels[1], 6.05)
    np.testing.assert_allclose(labels[-1], 0.162643)

    center, coords, types, radii, labels = m[-1]
    assert labels[0] == 0
    np.testing.assert_allclose(labels[1], -10.3)

    # Testing out the collate_fn when used with torch.utils.data.DataLoader
    torch_loader = torch.utils.data.DataLoader(
        m, batch_size=8, collate_fn=molgrid.MolDataset.collateMolDataset)
    iterator = iter(torch_loader)
    next(iterator)  # skip the first batch; the second one is inspected below
    lengths, center, coords, types, radii, labels = next(iterator)

    # every collated tensor carries one row per example in the batch
    assert len(lengths) == 8
    for collated in (center, coords, types, radii, labels):
        assert collated.shape[0] == 8

    # row 2 of the second batch of 8 corresponds to dataset item 8 + 2 = 10
    mcenter, mcoords, mtypes, mradii, mlabels = m[10]
    np.testing.assert_allclose(center[2], mcenter)
    # collated rows are compared only up to the example's own length
    np.testing.assert_allclose(coords[2][:lengths[2]], mcoords)
    np.testing.assert_allclose(types[2][:lengths[2]], mtypes)
    np.testing.assert_allclose(radii[2][:lengths[2]], mradii.unsqueeze(1))

    assert len(labels[2]) == len(mlabels)
    assert labels[2][0] == mlabels[0]
    assert labels[2][1] == mlabels[1]
def test_duplicated_examples():
    '''This is for files with multiple ligands.

    Without ``duplicate_first`` each example holds one receptor followed by
    two ligands. With ``duplicate_first=True`` the layout is
    rec, lig, rec, lig and both receptor copies must be identical.
    '''
    def squared_difference(first, second):
        # summed squared coordinate difference between two coordinate sets
        return np.square(first.coords.tonumpy() - second.coords.tonumpy()).sum()

    fname = datadir + "/multilig.types"
    e = molgrid.ExampleProvider(data_root=datadir + "/structs")
    e.populate(fname)
    batch_size = 10
    b = e.next_batch(batch_size)
    for i in range(1, batch_size):
        sets = b[i].coord_sets
        assert len(sets) == 3  # one rec and two ligands
        # ligands should be different
        sqsum = squared_difference(sets[1], sets[2])
        assert sqsum > 0

    e = molgrid.ExampleProvider(data_root=datadir + "/structs", duplicate_first=True)
    e.populate(fname)
    batch_size = 10
    b = e.next_batch(batch_size)
    for i in range(1, batch_size):
        sets = b[i].coord_sets
        assert len(sets) == 4  # rec lig rec lig
        # ligands should be different
        sqsum = squared_difference(sets[1], sets[3])
        assert sqsum > 0
        # receptors should be the same; previously this value was computed
        # but never checked, so the test could not fail on it
        sqsum = squared_difference(sets[0], sets[2])
        assert sqsum == 0
def test_vector_sum_types():
    """sum_types must give the same per-type totals for MGrid, numpy and CUDA outputs."""
    fname = datadir + "/ligonly.types"
    batch_size = 10
    # expected totals for the first example's ligand types
    ligand_counts = [2., 3., 0., 0., 0., 0., 0., 0., 2., 2., 1., 0., 0., 0.]

    provider = molgrid.ExampleProvider(data_root=datadir + "/structs",
                                       make_vector_types=True)
    provider.populate(fname)
    batch = provider.next_batch(batch_size)

    grid_sums = molgrid.MGrid2f(batch_size, provider.num_types())
    batch.sum_types(grid_sums)
    numpy_sums = np.zeros(grid_sums.shape, np.float32)
    batch.sum_types(numpy_sums)
    cuda_sums = torch.empty(grid_sums.shape, dtype=torch.float32, device='cuda')
    batch.sum_types(cuda_sums)

    np.testing.assert_allclose(grid_sums.tonumpy(),
                               cuda_sums.detach().cpu().numpy(), atol=1e-5)
    np.testing.assert_allclose(grid_sums.tonumpy(), numpy_sums, atol=1e-5)
    # with the default typers the first 14 channels are all zero for this file
    np.testing.assert_allclose(grid_sums[0].tonumpy(),
                               [0.] * 14 + ligand_counts, atol=1e-5)

    # with a null first typer only the ligand channels remain
    provider = molgrid.ExampleProvider(molgrid.NullIndexTyper(),
                                       molgrid.defaultGninaLigandTyper,
                                       data_root=datadir + "/structs",
                                       make_vector_types=True)
    provider.populate(fname)
    batch = provider.next_batch(batch_size)
    grid_sums = molgrid.MGrid2f(batch_size, provider.num_types())
    batch.sum_types(grid_sums)
    np.testing.assert_allclose(grid_sums[0].tonumpy(), ligand_counts, atol=1e-5)
def setup_gmaker_eprov(resolution: float, radius: float, data_file: Path):
    """Create the molgrid GridMaker and ExampleProvider for ``data_file``.

    Args:
        resolution (float): Resolution of the grid
        radius (float): Radius of the grid in Angstrom
        data_file (Path): File listing the examples that make up the data set;
            it is passed to ``ExampleProvider.populate`` as a string

    Returns:
        tuple: GridMaker, ExampleProvider
    """
    # dim is 1 voxel length less than 2xradius to ensure that center is on
    # node between 8 voxels
    grid_dimension = 2 * radius - resolution
    grid_maker = molgrid.GridMaker(resolution=resolution, dimension=grid_dimension)

    # unbalanced and unshuffled provider with an empty data root
    provider = molgrid.ExampleProvider(data_root="", balanced=False, shuffle=False)
    provider.populate(str(data_file))

    return grid_maker, provider
def test_vector_types_duplicate():
    """CPU and CUDA gridding of a duplicate_first vector-typed batch must agree."""
    provider = molgrid.ExampleProvider(molgrid.GninaVectorTyper(),
                                       shuffle=False,
                                       duplicate_first=True,
                                       data_root=datadir + "/structs")
    provider.populate(datadir + "/smalldup.types")

    batch_size = 1
    gmaker = molgrid.GridMaker()
    channel_count = molgrid.GninaVectorTyper().num_types() * 4
    tensor_shape = (batch_size,) + gmaker.grid_dimensions(channel_count)

    cuda_grid = torch.zeros(tensor_shape, dtype=torch.float32, device='cuda')
    batch = provider.next_batch(batch_size)
    # no random transform, so both devices grid exactly the same pose
    gmaker.forward(batch, cuda_grid, random_translation=0.0, random_rotation=False)

    cpu_grid = torch.zeros(tensor_shape, dtype=torch.float32, device='cpu')
    gmaker.forward(batch, cpu_grid, random_translation=0.0, random_rotation=False)

    np.testing.assert_allclose(cuda_grid.cpu().detach().numpy(),
                               cpu_grid.detach().numpy(),
                               atol=1e-4)
    assert cuda_grid.cpu().detach().numpy().max() < 75
def test_dx():
    """Round-trip grids through the DX writers and readers and compare the values."""
    provider = molgrid.ExampleProvider(data_root=datadir + "/structs")
    provider.populate(datadir + "/small.types")
    example = provider.next()
    coords = example.coord_sets[1]
    assert np.min(coords.type_index.tonumpy()) >= 0

    gmaker = molgrid.GridMaker()
    # this should be grid_dims or get_grid_dims
    dims = gmaker.grid_dimensions(provider.type_size())
    # grid is centred on the mean atom position, as a tuple of Python floats
    center = tuple(coords.coord.tonumpy().mean(axis=0).astype(float))

    written = molgrid.MGrid4f(*dims)
    gmaker.forward(center, coords, written.cpu())

    # single channel: write, read back, then delete the temporary file
    single_path = "tmp.dx"
    molgrid.write_dx(single_path, written[0].cpu(), center, 0.5)
    loaded = molgrid.read_dx(single_path)
    os.remove(single_path)

    np.testing.assert_array_almost_equal(loaded.grid().tonumpy(),
                                         written[0].tonumpy(),
                                         decimal=5)
    assert center == approx(list(loaded.center()))
    assert loaded.resolution() == 0.5

    # dump everything
    prefix = "/tmp/tmp"
    type_names = provider.get_type_names()
    molgrid.write_dx_grids(prefix, type_names, written.cpu(), center,
                           gmaker.get_resolution(), 0.5)
    reread = molgrid.MGrid4f(*dims)
    molgrid.read_dx_grids(prefix, type_names, reread.cpu())

    # presumably the trailing 0.5 above is a scale factor; the 2.0 multiplier
    # below undoes it before comparing
    np.testing.assert_array_almost_equal(written.tonumpy(),
                                         2.0 * reread.tonumpy(),
                                         decimal=5)
def test_mol_example_provider(capsys):
    """Examples read from molecule files come back in file order with expected values."""
    provider = molgrid.ExampleProvider(data_root=datadir + "/structs")
    provider.populate(datadir + "/smallmol.types")

    with capsys.disabled():  # bunch openbabel garbage
        first = provider.next()
        batch = provider.next_batch(10)  # should wrap around

    # with defaults, file should be read in order
    assert first.labels[0] == 1
    assert first.labels[1] == approx(3.3747)

    assert first.coord_sets[0].size() == 1289
    assert first.coord_sets[1].size() == 8

    second_set_xyz = first.coord_sets[1].coord.tonumpy()
    assert tuple(second_set_xyz[0]) == approx((26.6450, 6.1410, 4.6680))

    assert len(first.coord_sets) == 2

    label0_values = tuple(item.labels[0] for item in batch)
    label1_values = tuple(item.labels[1] for item in batch)

    # labels should be in order
    assert (1, 1, 0, 0, 0, 1, 1, 1, 0, 0) == label0_values
    expected_label1 = (6.0000, 3.8697, -6.6990, -4.3010, -9.0000,
                       3.3747, 6.0000, 3.8697, -6.6990, -4.3010)
    assert expected_label1 == approx(label1_values)
# Tags derived from the arguments plus any passed explicitly.
tgs = make_tags(args) + args.tags
wandb.init(entity='andmcnutt',
           project='DDG_model_Regression',
           config=args,
           tags=tgs)

# Parameters that are not important for hyperparameter sweep
batch_size = args.batch_size
epochs = args.epoch

# print('ligtr={}, rectr={}'.format(args.ligtr,args.rectr))

# Settings common to the train and test providers; only the molcaches and
# class balancing differ between them.
_shared_provider_settings = dict(
    shuffle=True,
    duplicate_first=True,
    default_batch_size=batch_size,
    iteration_scheme=molgrid.IterationScheme.SmallEpoch,
)

traine = molgrid.ExampleProvider(ligmolcache=args.ligtr,
                                 recmolcache=args.rectr,
                                 balanced=True,
                                 **_shared_provider_settings)
traine.populate(args.trainfile)

teste = molgrid.ExampleProvider(ligmolcache=args.ligte,
                                recmolcache=args.recte,
                                **_shared_provider_settings)
teste.populate(args.testfile)

gmaker = molgrid.GridMaker(binary=args.binary_rep)
# only one rec+onelig per example
dims = gmaker.grid_dimensions(14 * 4)
def test_train_torch_cnn():
    """Train a small 3D CNN on gridded examples and require the loss to fall below 0.4.

    All random seeds are fixed. The mean of the last five of 100 training
    losses is the quantity that is checked.
    """
    batch_size = 50
    datadir = os.path.dirname(__file__) + '/data'
    fname = datadir + "/small.types"

    molgrid.set_random_seed(0)
    torch.manual_seed(0)
    np.random.seed(0)

    class Net(nn.Module):
        """Three pool/conv stages followed by one linear layer with two outputs."""

        def __init__(self, dims):
            super(Net, self).__init__()
            self.pool0 = nn.MaxPool3d(2)
            self.conv1 = nn.Conv3d(dims[0], 32, kernel_size=3, padding=1)
            self.pool1 = nn.MaxPool3d(2)
            self.conv2 = nn.Conv3d(32, 64, kernel_size=3, padding=1)
            self.pool2 = nn.MaxPool3d(2)
            self.conv3 = nn.Conv3d(64, 128, kernel_size=3, padding=1)

            # Three 2x poolings shrink each spatial dimension to dim // 8.
            # The parentheses matter: `a // 8 * b // 8 * c // 8` is evaluated
            # left to right and only matches this product when every
            # dimension is a multiple of 8.
            self.last_layer_size = (
                (dims[1] // 8) * (dims[2] // 8) * (dims[3] // 8) * 128)
            self.fc1 = nn.Linear(self.last_layer_size, 2)

        def forward(self, x):
            x = self.pool0(x)
            x = F.relu(self.conv1(x))
            x = self.pool1(x)
            x = F.relu(self.conv2(x))
            x = self.pool2(x)
            x = F.relu(self.conv3(x))
            # flatten everything but the batch dimension for the linear layer
            x = x.view(-1, self.last_layer_size)
            x = self.fc1(x)
            return x

    def weights_init(m):
        # Xavier-initialise the weights of convolutional and linear layers only
        if isinstance(m, nn.Conv3d) or isinstance(m, nn.Linear):
            init.xavier_uniform_(m.weight.data)

    e = molgrid.ExampleProvider(data_root=datadir + "/structs",
                                balanced=True,
                                shuffle=True)
    e.populate(fname)

    gmaker = molgrid.GridMaker()
    dims = gmaker.grid_dimensions(e.num_types())
    tensor_shape = (batch_size, ) + dims

    model = Net(dims).to('cuda')
    model.apply(weights_init)

    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

    # buffers reused for every batch: grids live on the GPU, labels on the CPU
    input_tensor = torch.zeros(tensor_shape, dtype=torch.float32, device='cuda')
    float_labels = torch.zeros(batch_size, dtype=torch.float32)

    losses = []
    for iteration in range(100):
        # load data
        batch = e.next_batch(batch_size)
        # not rotating since convergence is faster this way
        gmaker.forward(batch, input_tensor, 0, random_rotation=False)
        batch.extract_label(0, float_labels)
        labels = float_labels.long().to('cuda')

        optimizer.zero_grad()
        output = model(input_tensor)
        loss = F.cross_entropy(output, labels)
        loss.backward()
        optimizer.step()
        losses.append(float(loss))

    avefinalloss = np.array(losses[-5:]).mean()
    assert avefinalloss < .4
def main(args):
    """Train the selected CNN on libmolgrid examples, periodically saving and testing.

    Args:
        args: namespace providing ``seed``, ``data_root``, ``train_file``,
            ``test_file``, ``batch_size``, ``model`` ('Ragoza' or 'Imrie'),
            ``weights``, ``base_lr``, ``momentum``, ``weight_decay``,
            ``anneal_rate``, ``anneal_iter``, ``iterations``, ``rotate``,
            ``translate``, ``clip_gradients``, ``display_iter``,
            ``save_iter``, ``save_dir``, ``save_prefix``, ``test_iter`` and
            ``num_rotate``.

    Raises:
        SystemExit: with status 1 if ``args.model`` names no known architecture.
    """
    # Fix seeds
    molgrid.set_random_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # Set CuDNN options for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Set up libmolgrid
    e = molgrid.ExampleProvider(data_root=args.data_root,
                                balanced=True,
                                shuffle=True)
    e.populate(args.train_file)

    gmaker = molgrid.GridMaker()
    dims = gmaker.grid_dimensions(e.num_types())
    tensor_shape = (args.batch_size, ) + dims

    # Construct input tensors
    input_tensor = torch.zeros(tensor_shape, dtype=torch.float32, device='cuda')
    float_labels = torch.zeros(args.batch_size, dtype=torch.float32)

    # Initialise network - Two models currently available (see models.py for details)
    if args.model == 'Ragoza':
        model = Basic_CNN(dims).to('cuda')
    elif args.model == 'Imrie':
        model = DenseNet(dims, block_config=(4, 4, 4)).to('cuda')
    else:
        print("Please specify a valid architecture")
        # the `exit` builtin is injected by `site` and is not guaranteed to
        # exist; a non-zero status also marks the run as failed
        raise SystemExit(1)

    # Set weights for network
    if args.weights:
        model.load_state_dict(torch.load(args.weights))
        print("Loaded model parameters")
    else:
        model.apply(weights_init)
        print("Randomly initialised model parameters")

    # Print number of parameters in model
    print("Number of model params: %dK" %
          (sum([x.nelement() for x in model.parameters()]) / 1000))

    # Train network

    # Construct optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    scheduler = lr_scheduler.ExponentialLR(optimizer, args.anneal_rate)
    # The learning rate is read from the optimizer, which the scheduler
    # updates in place. `scheduler.get_lr()` is not meant to be called
    # outside `step()` in recent PyTorch and can report a value that is not
    # the one in use.
    print("Initial learning rate: %.6f" % optimizer.param_groups[0]['lr'])

    # Train loop
    losses = []
    for it in range(1, args.iterations + 1):
        # Load data
        batch = e.next_batch(args.batch_size)
        gmaker.forward(batch,
                       input_tensor,
                       random_rotation=args.rotate,
                       random_translation=args.translate)
        batch.extract_label(0, float_labels)
        labels = float_labels.long().to('cuda')

        # Train
        optimizer.zero_grad()
        output = model(input_tensor)
        loss = F.cross_entropy(output, labels)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.clip_gradients)
        optimizer.step()
        losses.append(float(loss))

        # Anneal learning rate
        if it % args.anneal_iter == 0:
            scheduler.step()
            print("Current iteration: %d, Annealing learning rate: %.6f" %
                  (it, optimizer.param_groups[0]['lr']))

        # Progress
        if it % args.display_iter == 0:
            print("Current iteration: %d, Loss: %.3f" %
                  (it, float(np.mean(losses[-args.display_iter:]))))

        # Save model
        if it % args.save_iter == 0:
            print("Saving model after %d iterations." % it)
            torch.save(
                model.state_dict(),
                args.save_dir + "/" + args.save_prefix + ".iter-" + str(it))

        # Test model
        if args.test_file != '' and it % args.test_iter == 0:
            # Set to test mode
            model.eval()
            predictions = []
            labs = []
            # a fresh, unshuffled provider so every test pass starts at the
            # beginning of the test file
            e_test = molgrid.ExampleProvider(data_root=args.data_root,
                                             balanced=False,
                                             shuffle=False)
            e_test.populate(args.test_file)
            num_samples = e_test.size()
            # ceiling division: the last, partial batch is included
            num_batches = -(-num_samples // args.batch_size)
            # no gradients are needed for evaluation
            with torch.no_grad():
                for _ in range(num_batches):
                    # Load data
                    batch = e_test.next_batch(args.batch_size)
                    batch_predictions = []
                    batch.extract_label(0, float_labels)
                    labs.extend(list(float_labels.detach().cpu().numpy()))
                    for _ in range(args.num_rotate):
                        gmaker.forward(batch,
                                       input_tensor,
                                       random_rotation=args.rotate,
                                       random_translation=0.0)
                        # Predict
                        output = F.softmax(model(input_tensor), dim=1)
                        # Column 1 is the probability of label 1, which is
                        # the score roc_auc_score expects. Column 0 would
                        # report 1 - AUC.
                        batch_predictions.append(
                            list(output.detach().cpu().numpy()[:, 1]))
                    # average the predictions over the rotations
                    predictions.extend(list(np.mean(batch_predictions, axis=0)))
            # Print performance
            # drop the entries beyond num_samples that padded the last batch
            labs = labs[:num_samples]
            predictions = predictions[:num_samples]
            print("Current iter: %d, AUC: %.2f" %
                  (it, roc_auc_score(labs, predictions)),
                  flush=True)
            # Set to train mode
            model.train()
def main(args):
    """Score every example of a test file with a trained CNN and write the predictions.

    Prints the test AUC and writes one line per example to
    ``args.output_path``, consisting of the prediction followed by the
    original line of the test file.

    Args:
        args: namespace providing ``seed``, ``data_root``, ``test_file``,
            ``batch_size``, ``model`` ('Ragoza' or 'Imrie'), ``weights``,
            ``rotate``, ``translate``, ``num_rotate``, ``display_iter`` and
            ``output_path``.

    Raises:
        SystemExit: with status 1 if ``args.model`` names no known architecture.
    """
    # Fix seeds
    molgrid.set_random_seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # Set CuDNN options for reproducibility
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Set up libmolgrid
    e = molgrid.ExampleProvider(data_root=args.data_root,
                                balanced=False,
                                shuffle=False)
    e.populate(args.test_file)

    gmaker = molgrid.GridMaker()
    dims = gmaker.grid_dimensions(e.num_types())
    tensor_shape = (args.batch_size, ) + dims

    # Load test file examples (NOTE: not possible to do directly via molgrid)
    with open(args.test_file, 'r') as f:
        lines = f.readlines()

    # Construct input tensors
    input_tensor = torch.zeros(tensor_shape, dtype=torch.float32, device='cuda')
    float_labels = torch.zeros(args.batch_size, dtype=torch.float32)

    # Initialise network - Two models currently available (see models.py for details)
    if args.model == 'Ragoza':
        model = Basic_CNN(dims).to('cuda')
    elif args.model == 'Imrie':
        model = DenseNet(dims, block_config=(4, 4, 4)).to('cuda')
    else:
        print("Please specify a valid architecture")
        # the `exit` builtin is injected by `site` and is not guaranteed to
        # exist; a non-zero status also marks the run as failed
        raise SystemExit(1)

    # Load weights for network
    model.load_state_dict(torch.load(args.weights))
    print("Loaded model parameters")

    # Print number of parameters in model
    print("Number of model params: %dK" %
          (sum([x.nelement() for x in model.parameters()]) / 1000, ))

    # Test network

    # Ensure model in eval mode
    model.eval()

    # Test loop
    predictions = []
    labels = []
    num_samples = e.size()
    # ceiling division: the last, partial batch is included
    num_batches = -(-num_samples // args.batch_size)
    print("Number of examples: %d" % num_samples)
    # no gradients are needed for inference
    with torch.no_grad():
        for it in range(num_batches):
            # Load data
            batch = e.next_batch(args.batch_size)
            # NOTE(review): this grid is overwritten by the first rotation
            # below whenever num_rotate >= 1; kept so the sequence of random
            # transforms stays the same
            gmaker.forward(batch,
                           input_tensor,
                           random_rotation=args.rotate,
                           random_translation=args.translate)
            batch.extract_label(0, float_labels)
            labels.extend(list(float_labels.detach().cpu().numpy()))
            batch_predictions = []
            for _ in range(args.num_rotate):
                gmaker.forward(batch,
                               input_tensor,
                               random_rotation=args.rotate,
                               random_translation=args.translate)
                # Predict
                output = F.softmax(model(input_tensor), dim=1)
                # column 1 is the probability assigned to label 1
                batch_predictions.append(
                    list(output.detach().cpu().numpy()[:, 1]))
            # average the predictions over the rotations
            predictions.extend(list(np.mean(batch_predictions, axis=0)))

            # Progress
            if it % args.display_iter == 0:
                print("Processed: %d / %d examples" %
                      (it * args.batch_size, num_samples))

    # Print performance
    # drop the entries beyond num_samples that padded the last batch
    labels = labels[:num_samples]
    predictions = predictions[:num_samples]
    print("Test AUC: %.2f" % (roc_auc_score(labels, predictions)), flush=True)

    # Save predictions
    # each input line keeps its own newline, so none is added here
    with open(args.output_path, 'w') as f:
        f.writelines(
            str(pred) + ' ' + line for line, pred in zip(lines, predictions))
import sys
import molgrid
import numpy as np

# liGAN is imported from the current directory, so the path entry must be
# added before the import
sys.path.insert(0, '.')
import liGAN

rec_typer = molgrid.FileMappedGninaTyper('data/my_rec_map')
lig_typer = molgrid.FileMappedGninaTyper('data/my_lig_map')
lig_channels = liGAN.atom_types.get_channels_from_map(lig_typer)

# `<path> and ''` always evaluates to '', so both molcaches are passed as
# empty strings while the cache paths remain visible in the source
rec_molcache = 'data/molportFULL_rec.molcache2' and ''
lig_molcache = 'data/molportFULL_lig.molcache2' and ''

print('loading data')
ex_provider = molgrid.ExampleProvider(
    rec_typer,
    lig_typer,
    data_root='data/molport',
    recmolcache=rec_molcache,
    ligmolcache=lig_molcache,
    shuffle=True)
ex_provider.populate('data/molportFULL_rand_test0_1000.types')

batch_size = 1000
n_examples = ex_provider.size()
# only whole batches are processed; any remainder is skipped
n_batches = n_examples // batch_size

type_counts = np.zeros(lig_typer.num_types())
mol_count = 0

for i in range(n_batches):
    for ex in ex_provider.next_batch(batch_size):
        # accumulate type counts from the second coordinate set of each example
        struct = liGAN.atom_structs.AtomStruct.from_coord_set(
            ex.coord_sets[1], lig_channels)
        type_counts += struct.type_counts
def __init__(
    self,
    data_file,
    data_root,
    batch_size,
    rec_typer,
    lig_typer,
    use_rec_elems=True,
    resolution=0.5,
    dimension=None,
    grid_size=None,
    shuffle=False,
    random_rotation=False,
    random_translation=0.0,
    diff_cond_transform=False,
    diff_cond_structs=False,
    n_samples=1,
    rec_molcache=None,
    lig_molcache=None,
    cache_structs=True,
    device='cuda',
    debug=False,
):
    """Set up atom typers, example provider and grid maker, then load ``data_file``.

    Exactly one of ``dimension`` and ``grid_size`` must be given; a
    ``grid_size`` is converted to a dimension using ``resolution``.
    ``rec_typer`` and ``lig_typer`` are '-'-separated strings whose parts
    are passed to ``AtomTyper.get_typer``. When ``diff_cond_structs`` is
    set, the (receptor, ligand) typer pair is given to the example provider
    twice.
    """
    super().__init__()

    # truthiness check: exactly one of the two may be set
    assert bool(dimension) != bool(grid_size), \
        'must specify one of either dimension or grid_size'
    if grid_size:
        dimension = atom_grids.size_to_dimension(grid_size, resolution)

    # create receptor and ligand atom typers
    self.lig_typer = AtomTyper.get_typer(*lig_typer.split('-'), rec=False)
    self.rec_typer = AtomTyper.get_typer(
        *rec_typer.split('-'), rec=use_rec_elems
    )

    typer_sequence = [self.rec_typer, self.lig_typer]
    if diff_cond_structs:
        # duplicate atom typers
        typer_sequence = typer_sequence * 2

    # create example provider; a missing molcache is passed as ''
    self.ex_provider = molgrid.ExampleProvider(
        *typer_sequence,
        data_root=data_root,
        recmolcache=rec_molcache if rec_molcache else '',
        ligmolcache=lig_molcache if lig_molcache else '',
        cache_structs=cache_structs,
        shuffle=shuffle,
        num_copies=n_samples,
    )

    # create molgrid maker
    self.grid_maker = molgrid.GridMaker(
        resolution=resolution,
        dimension=dimension,
        gaussian_radius_multiple=-1.5,
    )
    self.batch_size = batch_size

    # transformation settings
    self.random_rotation = random_rotation
    self.random_translation = random_translation
    self.diff_cond_transform = diff_cond_transform
    self.diff_cond_structs = diff_cond_structs
    self.debug = debug
    self.device = device

    # transform interpolation state
    self.cond_interp = TransformInterpolation(n_samples=n_samples)

    # load data from file
    self.ex_provider.populate(data_file)
def test_example_provider_epoch_iteration():
    """Check epoch sizes and batch counts for several provider configurations."""
    fname = datadir + "/small.types"

    def populated(**options):
        # build a provider on the shared data root and load the test file
        provider = molgrid.ExampleProvider(data_root=datadir + "/structs",
                                           **options)
        provider.populate(fname)
        return provider

    def count_batches(provider):
        # number of batches produced by one pass of the iterator
        return sum(1 for _ in provider)

    def walk_epoch(provider, seen, require_unique):
        """Iterate once, recording example keys and checking the epoch counters.

        Both epoch counters must never decrease and may grow by at most one
        between consecutive batches. Returns the number of batches seen.
        """
        batches = 0
        small = 0
        large = 0
        for batch in provider:
            for ex in batch:
                key = ex.coord_sets[0].src + ":" + ex.coord_sets[1].src
                if require_unique:
                    # small epoch should see an example at _most_ once
                    assert key not in seen
                seen.add(key)
            batches += 1

            s = provider.get_small_epoch_num()
            assert s >= small
            if s > small:
                assert s == small + 1
                small = s

            l = provider.get_large_epoch_num()
            assert l >= large
            if l > large:
                assert l == large + 1
                large = l
        return batches

    e = populated(default_batch_size=10,
                  iteration_scheme=molgrid.IterationScheme.LargeEpoch)
    assert e.small_epoch_size() == 1000
    assert e.large_epoch_size() == 1000
    assert count_batches(e) == 100

    e = populated(default_batch_size=10,
                  balanced=True,
                  iteration_scheme=molgrid.IterationScheme.LargeEpoch)
    assert e.small_epoch_size() == 326
    assert e.large_epoch_size() == 1674
    assert count_batches(e) == 168

    e = populated(default_batch_size=10,
                  balanced=False,
                  stratify_receptor=True,
                  iteration_scheme=molgrid.IterationScheme.SmallEpoch)
    assert e.small_epoch_size() == 120
    assert e.large_epoch_size() == 1260
    assert count_batches(e) == 12

    values = set()
    e = populated(default_batch_size=8,
                  balanced=True,
                  stratify_receptor=True,
                  iteration_scheme=molgrid.IterationScheme.SmallEpoch)
    assert e.small_epoch_size() == 112
    assert e.large_epoch_size() == 2240
    assert walk_epoch(e, values, require_unique=True) == 14

    e = populated(default_batch_size=10,
                  balanced=True,
                  stratify_receptor=True,
                  iteration_scheme=molgrid.IterationScheme.LargeEpoch)
    assert e.small_epoch_size() == 112
    assert e.large_epoch_size() == 2240
    values = set()
    assert walk_epoch(e, values, require_unique=False) == 224
    # large epoch should see everything at least once
    assert len(values) == e.size()
def test_vector_types_mol():
    '''Test vector types with a real molecule'''
    fname = datadir + "/small.types"
    root = datadir + "/structs"
    close = np.testing.assert_allclose

    index_provider = molgrid.ExampleProvider(data_root=root)
    index_provider.populate(fname)
    index_ex = index_provider.next()

    vector_provider = molgrid.ExampleProvider(data_root=root,
                                              make_vector_types=True)
    vector_provider.populate(fname)
    vector_ex = vector_provider.next()

    assert vector_ex.has_vector_types()
    assert not index_ex.has_vector_types()

    gmaker = molgrid.GridMaker()
    # this should be grid_dims or get_grid_dims
    dims = gmaker.grid_dimensions(index_ex.num_types())

    index_cpu = molgrid.MGrid4f(*dims)
    index_gpu = molgrid.MGrid4f(*dims)
    vector_cpu = molgrid.MGrid4f(*dims)
    vector_gpu = molgrid.MGrid4f(*dims)

    # all-ones grid that is fed to every backward pass
    diff = molgrid.MGrid4f(*dims)
    diff.copyFrom(np.ones(dims, np.float32))

    # index types
    gmaker.forward(index_ex, index_cpu.cpu())
    gmaker.forward(index_ex, index_gpu.gpu())

    center = index_ex.coord_sets[-1].center()
    merged = index_ex.merge_coordinates()
    index_back_cpu = molgrid.MGrid2f(merged.size(), 3)
    index_back_gpu = molgrid.MGrid2f(merged.size(), 3)
    gmaker.backward(center, merged, diff.cpu(), index_back_cpu.cpu())
    gmaker.backward(center, merged, diff.gpu(), index_back_gpu.gpu())

    # vector types
    gmaker.set_radii_type_indexed(True)

    gmaker.forward(vector_ex, vector_cpu.cpu())
    gmaker.forward(vector_ex, vector_gpu.gpu())

    vector_merged = vector_ex.merge_coordinates()
    vector_back_cpu = molgrid.MGrid2f(vector_merged.size(), 3)
    vector_back_gpu = molgrid.MGrid2f(vector_merged.size(), 3)
    type_back_cpu = molgrid.MGrid2f(vector_merged.size(),
                                    vector_merged.num_types())
    type_back_gpu = molgrid.MGrid2f(vector_merged.size(),
                                    vector_merged.num_types())

    gmaker.backward(center, vector_merged, diff.cpu(),
                    vector_back_cpu.cpu(), type_back_cpu.cpu())
    gmaker.backward(center, vector_merged, diff.gpu(),
                    vector_back_gpu.gpu(), type_back_gpu.gpu())

    # grids: index vs vector typing, then CPU vs GPU
    close(index_cpu.tonumpy(), vector_cpu.tonumpy(), atol=1e-5)
    close(index_gpu.tonumpy(), vector_gpu.tonumpy(), atol=1e-5)
    close(vector_cpu.tonumpy(), vector_gpu.tonumpy(), atol=1e-5)

    # coordinate gradients: vector vs index typing on each device
    close(vector_back_cpu.tonumpy(), index_back_cpu.tonumpy(), atol=1e-5)
    close(vector_back_gpu.tonumpy(), index_back_gpu.tonumpy(), atol=1e-5)

    # CPU vs GPU gradients are compared with a looser tolerance
    close(vector_back_cpu.tonumpy(), vector_back_gpu.tonumpy(), atol=1e-4)
    close(type_back_cpu.tonumpy(), type_back_gpu.tonumpy(), atol=1e-4)