def test_init_sequential_layers_default():
    """A default ``SequentialLayers`` expands to linear -> ReLU -> zero dropout."""
    layers = SequentialLayers(in_feats=1, hidden_feats=[2])

    expected_types = [torch.nn.Linear, torch.nn.ReLU, torch.nn.Dropout]

    assert len(layers) == len(expected_types)

    for index, expected_type in enumerate(expected_types):
        assert isinstance(layers[index], expected_type)

    # The default dropout probability should be zero.
    assert numpy.isclose(layers[2].p, 0.0)
def test_init(self):
    """The constructor should wire up the convolution and readout sub-modules."""
    atom_readout = ReadoutModule(
        pooling_layer=PoolAtomFeatures(),
        readout_layers=SequentialLayers(
            in_feats=2, hidden_feats=[2], activation=["Identity"]
        ),
        postprocess_layer=ComputePartialCharges(),
    )
    bond_readout = ReadoutModule(
        pooling_layer=PoolBondFeatures(
            layers=SequentialLayers(in_feats=4, hidden_feats=[4])
        ),
        readout_layers=SequentialLayers(in_feats=4, hidden_feats=[8]),
    )

    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            "SAGEConv", in_feats=1, hidden_feats=[2, 2]
        ),
        readout_modules={"atom": atom_readout, "bond": bond_readout},
        learning_rate=0.01,
    )

    assert model.convolution_module is not None
    assert isinstance(model.convolution_module, ConvolutionModule)

    # Two hidden feature widths -> two stacked GCN layers.
    assert isinstance(model.convolution_module.gcn_layers, GCNStack)
    assert len(model.convolution_module.gcn_layers) == 2

    assert all(key in model.readout_modules for key in ("atom", "bond"))
    assert isinstance(model.readout_modules["atom"].pooling_layer, PoolAtomFeatures)
    assert isinstance(model.readout_modules["bond"].pooling_layer, PoolBondFeatures)

    assert numpy.isclose(model.learning_rate, 0.01)
def test_init_sequential_layers_invalid():
    """Mismatched ``hidden_feats`` / ``activation`` / ``dropout`` lengths must raise."""
    with pytest.raises(ValueError) as error_info:
        SequentialLayers(
            in_feats=1,
            hidden_feats=[2],
            activation=["ReLU", "LeakyReLU"],
            dropout=[0.0, 0.5],
        )

    expected_message = (
        "The `hidden_feats`, `activation`, and `dropout` lists must be the"
    )
    assert expected_message in str(error_info.value)
def mock_atom_model() -> DGLMoleculeLightningModel:
    """Build a minimal atom-only lightning model for use in tests."""
    atom_readout = ReadoutModule(
        pooling_layer=PoolAtomFeatures(),
        readout_layers=SequentialLayers(in_feats=4, hidden_feats=[2]),
        postprocess_layer=ComputePartialCharges(),
    )
    return DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            "SAGEConv", in_feats=4, hidden_feats=[4]
        ),
        readout_modules={"atom": atom_readout},
        learning_rate=0.01,
    )
def __init__(
    self,
    convolution_config: ConvolutionConfig,
    readout_configs: Dict[str, ReadoutConfig],
):
    """Construct the molecule SAGE model.

    Parameters
    ----------
    convolution_config
        The configuration of the graph-convolution (GraphSAGE) stage.
    readout_configs
        One readout configuration per named property (e.g. partial charges),
        keyed by the readout type.
    """
    super(MolSAGE, self).__init__()

    self.convolution = GraphSAGE(
        in_feats=convolution_config.in_feats,
        hidden_feats=convolution_config.hidden_feats,
        activation=convolution_config.activation,
        # One "mean" aggregator per hidden GCN layer.
        aggregator_type=["mean"] * len(convolution_config.hidden_feats),
    )

    self._pooling_layers: Dict[str, PoolingLayer] = {
        readout_type: readout_config.pooling_layer
        for readout_type, readout_config in readout_configs.items()
    }
    # Post-process layers are optional; keep only the ones that are defined.
    self._postprocess_layers: Dict[str, PostprocessLayer] = {
        readout_type: readout_config.postprocess_layer
        for readout_type, readout_config in readout_configs.items()
        if readout_config.postprocess_layer is not None
    }
    self._readouts: Dict[str, SequentialLayers] = {
        readout_type: SequentialLayers(
            # The readout consumes the final GCN features, widened by however
            # many feature columns the pooling layer produces.
            (
                convolution_config.hidden_feats[-1]
                * readout_config.pooling_layer.n_feature_columns()
            ),
            readout_config.hidden_feats,
            activation=readout_config.activation,
            dropout=readout_config.dropout,
        )
        for readout_type, readout_config in readout_configs.items()
    }

    # Add the layers directly to the model. This is required for pytorch to detect
    # the parameters of the child models.
    for readout_type, pooling_layer in self._pooling_layers.items():
        setattr(self, f"pooling_{readout_type}", pooling_layer)
    # Fix: the post-process layers were previously never attached as
    # attributes, so any parameters they carry were invisible to
    # ``model.parameters()`` and therefore never trained or serialized.
    for readout_type, postprocess_layer in self._postprocess_layers.items():
        setattr(self, f"postprocess_{readout_type}", postprocess_layer)
    for readout_type, readout_layer in self._readouts.items():
        setattr(self, f"readout_{readout_type}", readout_layer)
def test_init_sequential_layers_inputs():
    """Per-layer activations and dropout rates should be honoured in order."""
    layers = SequentialLayers(
        in_feats=1,
        hidden_feats=[2, 1],
        activation=["ReLU", "LeakyReLU"],
        dropout=[0.0, 0.5],
    )

    # Each hidden layer expands to a (Linear, activation, Dropout) triple.
    expected = [
        (torch.nn.Linear, None),
        (torch.nn.ReLU, None),
        (torch.nn.Dropout, 0.0),
        (torch.nn.Linear, None),
        (torch.nn.LeakyReLU, None),
        (torch.nn.Dropout, 0.5),
    ]

    assert len(layers) == len(expected)

    for index, (expected_type, expected_p) in enumerate(expected):
        assert isinstance(layers[index], expected_type)
        if expected_p is not None:
            assert numpy.isclose(layers[index].p, expected_p)
def test_forward(self, dgl_methane):
    """A forward pass over methane should yield one scalar output per atom."""
    model = MoleculeGCNModel(
        convolution_module=ConvolutionModule(
            "SAGEConv", in_feats=4, hidden_feats=[4]
        ),
        readout_modules={
            "atom": ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(in_feats=4, hidden_feats=[2]),
                postprocess_layer=ComputePartialCharges(),
            ),
        },
    )

    output = model.forward(dgl_methane)

    assert "atom" in output
    # Methane has five atoms, each mapped to a single value.
    assert output["atom"].shape == (5, 1)
def main():
    """Train a ``MolSAGE`` model on AM1 charges / WBOs and plot the results.

    Side effects: prints progress to stdout and writes ``train-losses.png``
    plus one scatter plot per label to the working directory.
    """
    # Define the features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl"]),
        AtomConnectivity(),
        AtomFormalCharge([0]),
        # AtomIsAromatic(),
        AtomIsInRing(),
    ]
    bond_features = [
        # BondIsAromatic(),
        BondIsInRing(),
    ]

    # Load in the pre-processed training and test molecules and store them in
    # featurized graphs.
    training_set, test_set, n_features = load_data_sets(atom_features, bond_features)

    # Define the model.
    model = MolSAGE(
        convolution_config=ConvolutionConfig(
            in_feats=n_features,
            hidden_feats=[128, 128, 128],
        ),
        readout_configs={
            "am1_charges": ReadoutConfig(
                pooling_layer=PoolAtomFeatures(),
                hidden_feats=[128, 128, 128, 2],
                postprocess_layer=ComputePartialCharges(),
            ),
            "am1_wbo": ReadoutConfig(
                pooling_layer=PoolBondFeatures(
                    # Bond pooling concatenates the two atom feature vectors,
                    # hence the doubled feature width.
                    layers=SequentialLayers(
                        in_feats=128 * 2,
                        hidden_feats=[128 * 2],
                    )
                ),
                hidden_feats=[256, 256, 256, 1],
            ),
        }
    )
    print(model)

    # Define the optimizer and the loss function.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
    criterion = torch.nn.MSELoss()

    losses = []

    for epoch in range(100):

        graph: dgl.DGLGraph

        for batch, (graph, features, labels) in enumerate(training_set):

            # Perform the models forward pass.
            y_pred = model(graph, features)

            # compute loss: the total loss is the sum of per-label RMSEs.
            loss = torch.zeros(1)

            for label_name, label in labels.items():
                loss += torch.sqrt(criterion(y_pred[label_name], label))

            losses.append(loss.detach().numpy().item())

            # backward propagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(
                f"epoch={epoch} "
                f"batch={batch} "
                f"loss={loss.item():.6f} "
                # f"q_tot={y_pred['am1_charges'].sum().detach().item():.4f} "
            )

    # Compute the test accuracy
    test_graph, test_features, test_labels = next(iter(test_set))

    model.eval()

    with torch.no_grad():
        test_pred = model(test_graph, test_features)

    test_loss = 0.0

    for label_name, label in test_labels.items():
        test_loss += torch.sqrt(criterion(test_pred[label_name], label))

    print("________________")
    print(f"test loss={test_loss}")

    # Plot the training losses.
    pyplot.plot(losses)
    pyplot.savefig("train-losses.png")
    pyplot.cla()

    # Plot the predicted vs reference values.
    # NOTE(review): ``labels`` / ``y_pred`` here are whatever the FINAL
    # training batch left behind, so the "train" scatter only shows that
    # batch — confirm this is intentional.
    for label in test_labels:

        pyplot.scatter(
            test_labels[label].flatten().numpy(),
            test_pred[label].flatten().numpy(),
            label="test"
        )
        pyplot.scatter(
            labels[label].flatten().numpy(),
            y_pred[label].flatten().detach().numpy(),
            label="train",
            alpha=0.3,
        )

        pyplot.legend()
        pyplot.gcf().set_size_inches(4, 4)

        pyplot.xlabel("OpenEye")
        pyplot.ylabel("Predicted")

        pyplot.tight_layout()
        pyplot.savefig(f"{label}.png")
        pyplot.cla()
def main(): print(torch.seed()) # Define the atom / bond features of interest. atom_features = [ AtomicElement(["C", "O", "H"]), AtomConnectivity(), ] bond_features = [ BondOrder(), ] # Compute the total length of the input atomic feature vector n_atom_features = sum(len(feature) for feature in atom_features) # Load in the training and test data training_smiles = ["CO", "CCO", "CCCO", "CCCCO"] training_data = DGLMoleculeDataset.from_smiles( training_smiles, atom_features, bond_features, label_function, ) training_loader = DGLMoleculeDataLoader(training_data, batch_size=len(training_smiles), shuffle=False) test_smiles = [ "CCCCCCCCCO", ] test_loader = DGLMoleculeDataLoader( DGLMoleculeDataset.from_smiles( test_smiles, atom_features, bond_features, label_function, ), batch_size=len(test_smiles), shuffle=False, ) # Define the model. n_gcn_layers = 5 n_gcn_hidden_features = 128 n_am1_layers = 2 n_am1_hidden_features = 64 learning_rate = 0.001 model = DGLMoleculeLightningModel( convolution_module=ConvolutionModule( architecture="SAGEConv", in_feats=n_atom_features, hidden_feats=[n_gcn_hidden_features] * n_gcn_layers, ), readout_modules={ # The keys of the readout modules should correspond to keys in the # label dictionary. "am1-charges": ReadoutModule( pooling_layer=PoolAtomFeatures(), readout_layers=SequentialLayers( in_feats=n_gcn_hidden_features, hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2], activation=["ReLU"] * n_am1_layers + ["Identity"], ), postprocess_layer=ComputePartialCharges(), ) }, learning_rate=learning_rate, ) print(model) # Train the model n_epochs = 100 n_gpus = 0 if not torch.cuda.is_available() else 1 print(f"Using {n_gpus} GPUs") trainer = pl.Trainer(gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs) trainer.fit(model, train_dataloaders=training_loader) trainer.test(model, test_dataloaders=test_loader)
def test_init(self):
    """Positional construction should store each layer on the module."""
    module = ReadoutModule(
        PoolAtomFeatures(),
        SequentialLayers(1, [1]),
        ComputePartialCharges(),
    )

    assert isinstance(module.pooling_layer, PoolAtomFeatures)
    assert isinstance(module.readout_layers, SequentialLayers)
    assert isinstance(module.postprocess_layer, ComputePartialCharges)
def main( train_set_path, train_batch_size, val_set_path, test_set_path, n_gcn_layers, n_gcn_hidden_features, n_am1_layers, n_am1_hidden_features, learning_rate, n_epochs, ): pprint(locals()) # pl.seed_everything(3992210414) # h-parameter sweep v1 # Define the features of interest. atom_features = [ AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl", "I", "P"]), AtomConnectivity(), AtomAverageFormalCharge(), ] bond_features = [ # BondIsInRing(), # BondOrder() ] # Load in the pre-processed training and test molecules and store them in # featurized graphs. data_module = DGLMoleculeDataModule( atom_features, bond_features, partial_charge_method="am1", bond_order_method=None, train_set_path=train_set_path, train_batch_size=train_batch_size, val_set_path=val_set_path, val_batch_size=None, test_set_path=test_set_path, test_batch_size=None, use_cached_data=True, ) n_atom_features = data_module.n_atom_features # Define the model. model = DGLMoleculeLightningModel( convolution_module=ConvolutionModule( architecture="SAGEConv", in_feats=n_atom_features, hidden_feats=[n_gcn_hidden_features] * n_gcn_layers, ), readout_modules={ "am1-charges": ReadoutModule( pooling_layer=PoolAtomFeatures(), readout_layers=SequentialLayers( in_feats=n_gcn_hidden_features, hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2], activation=["ReLU"] * n_am1_layers + ["Identity"], ), postprocess_layer=ComputePartialCharges(), ) }, learning_rate=learning_rate, ) print(model) # Train the model n_gpus = 0 if not torch.cuda.is_available() else 1 print(f"Using {n_gpus} GPUs") logger = TensorBoardLogger( "lightning-logs", version=( f"{train_batch_size}-" f"{n_gcn_layers}-" f"{n_gcn_hidden_features}-" f"{n_am1_layers}-" f"{n_am1_hidden_features}-" f"{learning_rate}" ), ) trainer = pl.Trainer( gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs, logger=logger ) trainer.fit(model, datamodule=data_module) trainer.test(model, data_module)