Example no. 1
def test_init_sequential_layers_default():

    sequential_layers = SequentialLayers(in_feats=1, hidden_feats=[2])

    assert len(sequential_layers) == 3

    assert isinstance(sequential_layers[0], torch.nn.Linear)
    assert isinstance(sequential_layers[1], torch.nn.ReLU)
    assert isinstance(sequential_layers[2], torch.nn.Dropout)
    assert numpy.isclose(sequential_layers[2].p, 0.0)
Example no. 2
    def test_init(self):

        model = DGLMoleculeLightningModel(
            convolution_module=ConvolutionModule("SAGEConv",
                                                 in_feats=1,
                                                 hidden_feats=[2, 2]),
            readout_modules={
                "atom":
                ReadoutModule(
                    pooling_layer=PoolAtomFeatures(),
                    readout_layers=SequentialLayers(in_feats=2,
                                                    hidden_feats=[2],
                                                    activation=["Identity"]),
                    postprocess_layer=ComputePartialCharges(),
                ),
                "bond":
                ReadoutModule(
                    pooling_layer=PoolBondFeatures(
                        layers=SequentialLayers(in_feats=4, hidden_feats=[4])),
                    readout_layers=SequentialLayers(in_feats=4,
                                                    hidden_feats=[8]),
                ),
            },
            learning_rate=0.01,
        )

        assert model.convolution_module is not None
        assert isinstance(model.convolution_module, ConvolutionModule)

        assert isinstance(model.convolution_module.gcn_layers, GCNStack)
        assert len(model.convolution_module.gcn_layers) == 2

        assert all(x in model.readout_modules for x in ["atom", "bond"])

        assert isinstance(model.readout_modules["atom"].pooling_layer,
                          PoolAtomFeatures)
        assert isinstance(model.readout_modules["bond"].pooling_layer,
                          PoolBondFeatures)

        assert numpy.isclose(model.learning_rate, 0.01)
Example no. 3
def test_init_sequential_layers_invalid():

    with pytest.raises(ValueError) as error_info:

        SequentialLayers(
            in_feats=1,
            hidden_feats=[2],
            activation=["ReLU", "LeakyReLU"],
            dropout=[0.0, 0.5],
        )

    assert "The `hidden_feats`, `activation`, and `dropout` lists must be the" in str(
        error_info.value)
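
For reference, a minimal sketch of the length check this test exercises; `_validate_layer_lengths` is a hypothetical helper, not the library's actual implementation:

def _validate_layer_lengths(hidden_feats, activation, dropout):
    # Mirror the error asserted above: two activations and two dropout rates
    # cannot be paired with a single hidden layer.
    if not (len(hidden_feats) == len(activation) == len(dropout)):
        raise ValueError(
            "The `hidden_feats`, `activation`, and `dropout` lists must be the "
            "same length."
        )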
Example no. 4
def mock_atom_model() -> DGLMoleculeLightningModel:

    return DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule("SAGEConv",
                                             in_feats=4,
                                             hidden_feats=[4]),
        readout_modules={
            "atom":
            ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(in_feats=4, hidden_feats=[2]),
                postprocess_layer=ComputePartialCharges(),
            ),
        },
        learning_rate=0.01,
    )
Example no. 5
    def __init__(
        self,
        convolution_config: ConvolutionConfig,
        readout_configs: Dict[str, ReadoutConfig],
    ):

        super().__init__()

        self.convolution = GraphSAGE(
            in_feats=convolution_config.in_feats,
            hidden_feats=convolution_config.hidden_feats,
            activation=convolution_config.activation,
            aggregator_type=["mean"] * len(convolution_config.hidden_feats),
        )

        self._pooling_layers: Dict[str, PoolingLayer] = {
            readout_type: readout_config.pooling_layer
            for readout_type, readout_config in readout_configs.items()
        }
        self._postprocess_layers: Dict[str, PostprocessLayer] = {
            readout_type: readout_config.postprocess_layer
            for readout_type, readout_config in readout_configs.items()
            if readout_config.postprocess_layer is not None
        }

        self._readouts: Dict[str, SequentialLayers] = {
            readout_type: SequentialLayers(
                (convolution_config.hidden_feats[-1] *
                 readout_config.pooling_layer.n_feature_columns()),
                readout_config.hidden_feats,
                activation=readout_config.activation,
                dropout=readout_config.dropout,
            )
            for readout_type, readout_config in readout_configs.items()
        }

        # Add the layers directly to the model. This is required for PyTorch to
        # detect the parameters of the child modules.
        for readout_type, pooling_layer in self._pooling_layers.items():
            setattr(self, f"pooling_{readout_type}", pooling_layer)

        for readout_type, readout_layer in self._readouts.items():
            setattr(self, f"readout_{readout_type}", readout_layer)
Example no. 6
def test_init_sequential_layers_inputs():

    sequential_layers = SequentialLayers(
        in_feats=1,
        hidden_feats=[2, 1],
        activation=["ReLU", "LeakyReLU"],
        dropout=[0.0, 0.5],
    )

    assert len(sequential_layers) == 6

    assert isinstance(sequential_layers[0], torch.nn.Linear)
    assert isinstance(sequential_layers[1], torch.nn.ReLU)
    assert isinstance(sequential_layers[2], torch.nn.Dropout)
    assert numpy.isclose(sequential_layers[2].p, 0.0)

    assert isinstance(sequential_layers[3], torch.nn.Linear)
    assert isinstance(sequential_layers[4], torch.nn.LeakyReLU)
    assert isinstance(sequential_layers[5], torch.nn.Dropout)
    assert numpy.isclose(sequential_layers[5].p, 0.5)
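
The interleaving asserted above (a Linear, activation, and Dropout module per hidden layer) can be reproduced with plain `torch.nn.Sequential`; an illustrative sketch, not the library's implementation:

import torch

feats = [1, 2, 1]  # in_feats followed by the hidden_feats
activations = [torch.nn.ReLU(), torch.nn.LeakyReLU()]
dropouts = [0.0, 0.5]

modules = []
for i, (activation, p) in enumerate(zip(activations, dropouts)):
    # One Linear -> activation -> Dropout triple per hidden layer.
    modules += [
        torch.nn.Linear(feats[i], feats[i + 1]),
        activation,
        torch.nn.Dropout(p),
    ]

stack = torch.nn.Sequential(*modules)
assert len(stack) == 6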
Example no. 7
    def test_forward(self, dgl_methane):

        model = MoleculeGCNModel(
            convolution_module=ConvolutionModule("SAGEConv",
                                                 in_feats=4,
                                                 hidden_feats=[4]),
            readout_modules={
                "atom":
                ReadoutModule(
                    pooling_layer=PoolAtomFeatures(),
                    readout_layers=SequentialLayers(in_feats=4,
                                                    hidden_feats=[2]),
                    postprocess_layer=ComputePartialCharges(),
                ),
            },
        )

        output = model.forward(dgl_methane)
        assert "atom" in output

        # Methane has five atoms, each with a single predicted partial charge.
        assert output["atom"].shape == (5, 1)
Example no. 8
def main():

    # Define the features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl"]),
        AtomConnectivity(),
        AtomFormalCharge([0]),
        # AtomIsAromatic(),
        AtomIsInRing(),
    ]
    bond_features = [
        # BondIsAromatic(),
        BondIsInRing(),
    ]

    # Load in the pre-processed training and test molecules and store them in
    # featurized graphs.
    training_set, test_set, n_features = load_data_sets(atom_features, bond_features)

    # Define the model.
    model = MolSAGE(
        convolution_config=ConvolutionConfig(
            in_feats=n_features,
            hidden_feats=[128, 128, 128],
        ),
        readout_configs={
            "am1_charges": ReadoutConfig(
                pooling_layer=PoolAtomFeatures(),
                hidden_feats=[128, 128, 128, 2],
                postprocess_layer=ComputePartialCharges(),
            ),
            "am1_wbo":  ReadoutConfig(
                pooling_layer=PoolBondFeatures(
                    layers=SequentialLayers(
                        in_feats=128 * 2,
                        hidden_feats=[128 * 2],
                    )
                ),
                hidden_feats=[256, 256, 256, 1],
            ),
        }
    )

    print(model)

    # Define the optimizer and the loss function.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0002)
    criterion = torch.nn.MSELoss()

    losses = []

    for epoch in range(100):

        graph: dgl.DGLGraph

        for batch, (graph, features, labels) in enumerate(training_set):

            # Perform the model's forward pass.
            y_pred = model(graph, features)

            # Compute the total loss as the sum of per-label RMSEs.
            loss = torch.zeros(1)

            for label_name, label in labels.items():
                loss += torch.sqrt(criterion(y_pred[label_name], label))

            losses.append(loss.detach().numpy().item())

            # Backward propagation and parameter update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(
                f"epoch={epoch} "
                f"batch={batch} "
                f"loss={loss.item():.6f} "
                # f"q_tot={y_pred['am1_charges'].sum().detach().item():.4f} "
            )

    # Compute the test loss.
    test_graph, test_features, test_labels = next(iter(test_set))
    model.eval()

    with torch.no_grad():

        test_pred = model(test_graph, test_features)

        test_loss = 0.0

        for label_name, label in test_labels.items():
            test_loss += torch.sqrt(criterion(test_pred[label_name], label))

        print("________________")
        print(f"test loss={test_loss}")

    # Plot the training losses.
    pyplot.plot(losses)
    pyplot.savefig("train-losses.png")
    pyplot.cla()

    # Plot the predicted vs reference values.
    for label in test_labels:

        pyplot.scatter(
            test_labels[label].flatten().numpy(),
            test_pred[label].flatten().numpy(),
            label="test"
        )
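        # Note: `labels` and `y_pred` still hold the final training batch from
        # the loop above, so the "train" series reflects only that batch.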
        pyplot.scatter(
            labels[label].flatten().numpy(),
            y_pred[label].flatten().detach().numpy(),
            label="train",
            alpha=0.3,
        )
        pyplot.legend()
        pyplot.gcf().set_size_inches(4, 4)
        pyplot.xlabel("OpenEye")
        pyplot.ylabel("Predicted")
        pyplot.tight_layout()
        pyplot.savefig(f"{label}.png")
        pyplot.cla()
Example no. 9
def main():

    # torch.seed() sets a fresh random seed and returns it; print it so the
    # run can be reproduced later.
    print(torch.seed())

    # Define the atom / bond features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H"]),
        AtomConnectivity(),
    ]
    bond_features = [
        BondOrder(),
    ]

    # Compute the total length of the input atomic feature vector
    n_atom_features = sum(len(feature) for feature in atom_features)

    # Load in the training and test data
    training_smiles = ["CO", "CCO", "CCCO", "CCCCO"]
    training_data = DGLMoleculeDataset.from_smiles(
        training_smiles,
        atom_features,
        bond_features,
        label_function,
    )
    training_loader = DGLMoleculeDataLoader(training_data,
                                            batch_size=len(training_smiles),
                                            shuffle=False)

    test_smiles = [
        "CCCCCCCCCO",
    ]
    test_loader = DGLMoleculeDataLoader(
        DGLMoleculeDataset.from_smiles(
            test_smiles,
            atom_features,
            bond_features,
            label_function,
        ),
        batch_size=len(test_smiles),
        shuffle=False,
    )

    # Define the model.
    n_gcn_layers = 5
    n_gcn_hidden_features = 128

    n_am1_layers = 2
    n_am1_hidden_features = 64

    learning_rate = 0.001

    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            # The keys of the readout modules should correspond to keys in the
            # label dictionary.
            "am1-charges":
            ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(
                    in_feats=n_gcn_hidden_features,
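                    # A trailing width of 2 presumably provides the two
                    # per-atom values that ComputePartialCharges consumes.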
                    hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
                    activation=["ReLU"] * n_am1_layers + ["Identity"],
                ),
                postprocess_layer=ComputePartialCharges(),
            )
        },
        learning_rate=learning_rate,
    )

    print(model)

    # Train the model
    n_epochs = 100

    n_gpus = 1 if torch.cuda.is_available() else 0
    print(f"Using {n_gpus} GPUs")

    trainer = pl.Trainer(gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs)

    trainer.fit(model, train_dataloaders=training_loader)
    trainer.test(model, test_dataloaders=test_loader)
Example no. 10
    def test_init(self):
        module = ReadoutModule(PoolAtomFeatures(), SequentialLayers(1, [1]),
                               ComputePartialCharges())
        assert isinstance(module.pooling_layer, PoolAtomFeatures)
        assert isinstance(module.readout_layers, SequentialLayers)
        assert isinstance(module.postprocess_layer, ComputePartialCharges)
Example no. 11
def main(
    train_set_path,
    train_batch_size,
    val_set_path,
    test_set_path,
    n_gcn_layers,
    n_gcn_hidden_features,
    n_am1_layers,
    n_am1_hidden_features,
    learning_rate,
    n_epochs,
):

    pprint(locals())

    # pl.seed_everything(3992210414)  # h-parameter sweep v1

    # Define the features of interest.
    atom_features = [
        AtomicElement(["C", "O", "H", "N", "S", "F", "Br", "Cl", "I", "P"]),
        AtomConnectivity(),
        AtomAverageFormalCharge(),
    ]
    bond_features = [
        # BondIsInRing(),
        # BondOrder()
    ]

    # Load in the pre-processed training and test molecules and store them in
    # featurized graphs.
    data_module = DGLMoleculeDataModule(
        atom_features,
        bond_features,
        partial_charge_method="am1",
        bond_order_method=None,
        train_set_path=train_set_path,
        train_batch_size=train_batch_size,
        val_set_path=val_set_path,
        val_batch_size=None,
        test_set_path=test_set_path,
        test_batch_size=None,
        use_cached_data=True,
    )
    n_atom_features = data_module.n_atom_features

    # Define the model.
    model = DGLMoleculeLightningModel(
        convolution_module=ConvolutionModule(
            architecture="SAGEConv",
            in_feats=n_atom_features,
            hidden_feats=[n_gcn_hidden_features] * n_gcn_layers,
        ),
        readout_modules={
            "am1-charges": ReadoutModule(
                pooling_layer=PoolAtomFeatures(),
                readout_layers=SequentialLayers(
                    in_feats=n_gcn_hidden_features,
                    hidden_feats=[n_am1_hidden_features] * n_am1_layers + [2],
                    activation=["ReLU"] * n_am1_layers + ["Identity"],
                ),
                postprocess_layer=ComputePartialCharges(),
            )
        },
        learning_rate=learning_rate,
    )
    print(model)

    # Train the model
    n_gpus = 1 if torch.cuda.is_available() else 0
    print(f"Using {n_gpus} GPUs")

    logger = TensorBoardLogger(
        "lightning-logs",
        version=(
            f"{train_batch_size}-"
            f"{n_gcn_layers}-"
            f"{n_gcn_hidden_features}-"
            f"{n_am1_layers}-"
            f"{n_am1_hidden_features}-"
            f"{learning_rate}"
        ),
    )

    trainer = pl.Trainer(
        gpus=n_gpus, min_epochs=n_epochs, max_epochs=n_epochs, logger=logger
    )

    trainer.fit(model, datamodule=data_module)
    trainer.test(model, datamodule=data_module)