Example #1
0
def test_BaseFeaturizer():
    """
    ``BaseFeaturizer`` must refuse to do any work.

    Both entry points exercised here -- calling the instance directly and
    calling ``.featurize()`` -- are expected to raise ``NotImplementedError``.
    """
    butane = SmilesLigand.from_smiles("CCCC")
    # Three independent systems that all share the same ligand object
    systems = tuple(System(components=[butane]) for _ in range(3))
    base = BaseFeaturizer()

    # Entry point 1: __call__
    with pytest.raises(NotImplementedError):
        base(systems)

    # Entry point 2: explicit .featurize()
    with pytest.raises(NotImplementedError):
        base.featurize(systems)
Example #2
0
def test_single_ligand_featurizer(LigandClass):
    """
    ``SingleLigandFeaturizer`` must reject systems holding two ligands.

    A one-ligand system is first passed to ``.supports()``; then a two-ligand
    system is passed to ``.featurize()``, which must raise ``ValueError``.

    Parameters
    ----------
    LigandClass
        Ligand type under test; only its ``from_smiles`` constructor is used.
        Presumably injected by a ``pytest.mark.parametrize`` decorator that is
        not visible here -- TODO confirm.
    """
    ligand1 = LigandClass.from_smiles("CCCC")
    single_ligand_system = System(components=[ligand1])
    featurizer = SingleLigandFeaturizer()
    # The return value is not asserted; this call only has to complete
    # without raising for the single-ligand case.
    featurizer.supports(single_ligand_system)

    # Build the second ligand from the class under test as well, so each
    # parametrized run exercises one ligand type consistently instead of
    # depending on an unrelated, hard-coded ``Ligand`` name.
    ligand2 = LigandClass.from_smiles("COCC")
    double_ligand_system = System(components=[ligand1, ligand2])
    with pytest.raises(ValueError):
        featurizer.featurize(double_ligand_system)
Example #3
0
def test_Pipeline():
    """
    Run a ``Pipeline`` made of two ``NullFeaturizer`` steps.

    After featurization, the ``"last"`` entry stored on every system is
    expected to compare equal to the system itself.
    """
    butane = SmilesLigand.from_smiles("CCCC")
    systems = [System(components=[butane]) for _ in range(3)]

    steps = (NullFeaturizer(), NullFeaturizer())
    chained = Pipeline(steps)
    chained.featurize(systems)

    last_outputs = [system.featurizations["last"] for system in systems]
    assert last_outputs == systems
Example #4
0
def test_system():
    """
    Check how ``System`` validates its ``components`` argument.

    - A non-empty component list is accepted.
    - An empty list is accepted when ``strict=False``.
    - An empty list with default arguments raises ``AssertionError``.
    """
    from kinoml.core.components import MolecularComponent
    from kinoml.core.systems import System
    from kinoml.core.measurements import BaseMeasurement
    from kinoml.core.conditions import AssayConditions

    # Happy path: one component, construction must simply succeed
    System(components=[MolecularComponent()])

    # Non-strict mode tolerates the absence of components...
    System(components=[], strict=False)

    # ...whereas the default mode does not
    with pytest.raises(AssertionError):
        System(components=[])
Example #5
0
def test_datasetprovider():
    """
    Measurements that share one ``AssayConditions`` object must yield a
    ``DatasetProvider`` whose ``conditions`` holds exactly one item, equal
    to that shared object.
    """
    shared_conditions = AssayConditions()
    measurements = [
        BaseMeasurement(
            value,
            conditions=shared_conditions,
            system=System([MolecularComponent()]),
        )
        for value in (50, 30)
    ]

    dataset = DatasetProvider(measurements=measurements)

    assert len(dataset.conditions) == 1
    only_conditions = next(iter(dataset.conditions))
    assert only_conditions == shared_conditions
Example #6
0
def test_ClearFeaturizations_removeall():
    """
    ``ClearFeaturizations(keys=tuple(), style="keep")`` must leave every
    system with empty (falsy) ``featurizations``.

    The systems are first populated by ``OneHotSMILESFeaturizer`` and
    ``PadFeaturizer`` so that there is something to clear.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)
    # Keeping an empty set of keys amounts to dropping everything
    cleaner = ClearFeaturizations(keys=tuple(), style="keep")
    cleaner.featurize(systems)

    for system in systems:
        assert not system.featurizations
Example #7
0
def test_PadFeaturizer():
    """
    One-hot encode three SMILES of lengths 1 to 3 and pad them.

    After ``PadFeaturizer`` runs, the ``"last"`` featurization of every
    system must have shape ``(53, 3)``.

    Returns
    -------
    tuple of System
        The featurized systems.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    expected_shape = (53, 3)
    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)

    for system in systems:
        assert system.featurizations["last"].shape == expected_shape

    return systems
Example #8
0
def test_ClearFeaturizations_keeplast():
    """
    ``ClearFeaturizations()`` built with default arguments must leave each
    system with exactly one featurization, stored under ``"last"``.
    """
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    systems = tuple(
        System([RDKitLigand.from_smiles(smi)]) for smi in ("C", "CC", "CCC")
    )

    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(systems)
    padder = PadFeaturizer()
    padder.featurize(systems)
    cleaner = ClearFeaturizations()
    cleaner.featurize(systems)

    for system in systems:
        assert len(system.featurizations) == 1
        assert "last" in system.featurizations
Example #9
0
def test_SmilesToLigandFeaturizer_rdkit():
    """
    Convert a ``SmilesLigand`` with ``ligand_type="openforcefield"`` and
    check that the stored result is exactly an ``OpenForceFieldLigand``.

    NOTE(review): the function name says "rdkit" although the OpenForceField
    conversion is what gets tested. If another test with this same name lives
    in the same module, the later definition shadows this one and pytest will
    never collect it -- confirm and consider renaming.
    """
    system = System([SmilesLigand.from_smiles("CCCCC")])
    converter = SmilesToLigandFeaturizer(ligand_type="openforcefield")
    converter.featurize([system])
    # The result is stored under the featurizer's own name
    converted = system.featurizations[converter.name]
    assert type(converted) == OpenForceFieldLigand
Example #10
0
def test_SmilesToLigandFeaturizer_rdkit():
    """
    Convert a ``SmilesLigand`` with ``ligand_type="rdkit"`` and check that
    the stored result is exactly an ``RDKitLigand``.
    """
    system = System([SmilesLigand.from_smiles("CCCCC")])
    converter = SmilesToLigandFeaturizer(ligand_type="rdkit")
    converter.featurize([system])
    # The result is stored under the featurizer's own name
    converted = system.featurizations[converter.name]
    assert type(converted) == RDKitLigand
Example #11
0
def test_SmilesToLigandFeaturizer_fails():
    """
    ``SmilesToLigandFeaturizer.featurize`` must raise ``ValueError`` when the
    system holds an ``RDKitLigand``.

    Presumably the featurizer only accepts ``SmilesLigand`` inputs -- the
    exact rule is not visible from this test.
    """
    ligand = RDKitLigand.from_smiles("CCCCC")
    system = System([ligand])
    featurizer = SmilesToLigandFeaturizer(ligand_type="openforcefield")
    # Keep only the raising call inside the context manager: statements after
    # it would never run once the exception fires, and if it did not fire
    # they could raise (or swallow) something unrelated to what is tested.
    with pytest.raises(ValueError):
        featurizer.featurize([system])
Example #12
0
def test_Concatenated():
    """
    Concatenate two 512-bit Morgan fingerprints with ``Concatenated`` and
    check that the first dimension of the ``"last"`` featurization is 1024.
    """
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    system = System([RDKitLigand.from_smiles("CCCC")])
    fingerprinters = [
        MorganFingerprintFeaturizer(radius=2, nbits=512),
        MorganFingerprintFeaturizer(radius=2, nbits=512),
    ]
    merged = Concatenated(fingerprinters, axis=1)
    merged.featurize([system])

    combined = system.featurizations["last"]
    assert combined.shape[0] == 1024
Example #13
0
def test_datasetprovider_awkward_exporter_single_tensor_same_shape():
    """
    Export a dataset in which every system is featurized as a single tensor
    and all tensors share the same shape.

    Each system receives a 512-bit and a 1024-bit Morgan fingerprint, merged
    by ``Concatenated`` and wrapped by ``TupleOfArrays``. The test checks:

    - ``DatasetProvider.to_numpy()`` returns an ``X, y`` pair whose first
      dimensions match, with ``X`` starting as ``(2, 1, 1536)``.
    - ``DatasetProvider.to_awkward()`` returns a one-item collection of
      feature arrays plus ``y``; converted to NumPy they have shapes
      ``(2, 1536)`` and ``(2,)`` respectively.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import Concatenated, TupleOfArrays
    import awkward as ak

    # Width of the concatenated fingerprint
    n_bits_total = 1024 + 512

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smi)]) for smi in ("CCCCC", "CCCCCCCC")
    ]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    short_fp = MorganFingerprintFeaturizer(radius=2, nbits=512)
    long_fp = MorganFingerprintFeaturizer(radius=2, nbits=1024)
    merged = Concatenated([short_fp, long_fp], axis=1)
    wrapped = TupleOfArrays([merged])
    wrapped.featurize(systems)
    for system in systems:
        assert system.featurizations["last"][0].shape[0] == n_bits_total

    # One tensor per system: the NumPy exporter builds a unified X whose
    # first dimension must agree with y. The middle dimension of size 1
    # comes from the TupleOfArrays aggregation.
    X, y = dataset.to_numpy()
    assert X.shape[:3] == (2, 1, n_bits_total)
    assert X.shape[0] == y.shape[0]

    # The awkward exporter hands back the features as a one-item collection
    (Xa,), ya = dataset.to_awkward()
    assert ak.to_numpy(Xa).shape == (2, n_bits_total)
    assert ak.to_numpy(ya).shape == (2,)
Example #14
0
def test_ligand_GraphLigandFeaturizer_RDKit(smiles, solution):
    """
    Featurize an ``RDKitLigand`` as a graph and compare it with ``solution``.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so its representation might differ from one
    computed directly. That is why ``RDKitLigand`` is used here.

    Parameters
    ----------
    smiles
        SMILES string passed to ``RDKitLigand.from_smiles``.
    solution
        Indexable whose item 0 is the expected connectivity and item 1 the
        expected features (compared with ``pytest.approx``).
    """
    system = System([RDKitLigand.from_smiles(smiles)])
    graph_featurizer = GraphLigandFeaturizer()
    graph_featurizer.featurize([system])

    bonds, node_features = system.featurizations["last"]
    assert (bonds == solution[0]).all()
    assert node_features == pytest.approx(solution[1])
Example #15
0
def test_ligand_MorganFingerprintFeaturizer(smiles, solution):
    """
    Compute a 512-bit, radius-2 Morgan fingerprint and compare it bit by bit
    with ``solution``.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so its representation might differ from one
    computed directly.

    Parameters
    ----------
    smiles
        SMILES string passed to ``RDKitLigand.from_smiles``.
    solution
        Iterable whose items are each converted with ``int`` to form the
        expected fingerprint.
    """
    system = System([RDKitLigand.from_smiles(smiles)])
    fingerprinter = MorganFingerprintFeaturizer(radius=2, nbits=512)
    fingerprinter.featurize(system)

    observed = system.featurizations[fingerprinter.name]
    expected = np.array([int(bit) for bit in solution], dtype="uint8")
    assert (observed == expected).all()
Example #16
0
def test_measurements():
    """
    Check the type and equality semantics of ``BaseMeasurement``.

    Two measurements built from the same value, conditions and system compare
    equal; changing the value makes them compare unequal.
    """
    from kinoml.core.measurements import BaseMeasurement, PercentageDisplacementMeasurement
    from kinoml.core.conditions import AssayConditions
    from kinoml.core.components import MolecularComponent
    from kinoml.core.systems import System

    conditions = AssayConditions()
    system = System([MolecularComponent()])
    reference = BaseMeasurement(50, conditions=conditions, system=system)

    assert isinstance(reference, BaseMeasurement)

    same_value = BaseMeasurement(50, conditions=conditions, system=system)
    assert reference == same_value

    other_value = BaseMeasurement(10, conditions=conditions, system=system)
    assert reference != other_value
Example #17
0
def test_datasetprovider():
    """
    Build a ``DatasetProvider`` from two measurements that share one
    ``AssayConditions`` object, plus a ``BaseFeaturizer``.

    The provider's ``conditions`` must hold exactly one item, equal to the
    shared conditions object.
    """
    from kinoml.datasets.core import DatasetProvider
    from kinoml.core.systems import System
    from kinoml.core.components import MolecularComponent
    from kinoml.core.measurements import BaseMeasurement
    from kinoml.core.conditions import AssayConditions
    from kinoml.features.core import BaseFeaturizer

    shared_conditions = AssayConditions()
    measurements = [
        BaseMeasurement(
            value,
            conditions=shared_conditions,
            system=System([MolecularComponent()]),
        )
        for value in (50, 30)
    ]

    provider = DatasetProvider(
        measurements=measurements,
        featurizers=[BaseFeaturizer()],
    )

    assert len(provider.conditions) == 1
    only_conditions = next(iter(provider.conditions))
    assert only_conditions == shared_conditions
Example #18
0
def test_TupleOfArrays():
    """
    Aggregate a 512-bit and a 1024-bit Morgan fingerprint with
    ``TupleOfArrays``.

    The ``"last"`` featurization must then hold two items whose first
    dimensions are 512 and 1024, in that order.
    """
    from kinoml.features.ligand import MorganFingerprintFeaturizer

    system = System([RDKitLigand.from_smiles("CCCC")])
    short_fp = MorganFingerprintFeaturizer(radius=2, nbits=512)
    long_fp = MorganFingerprintFeaturizer(radius=2, nbits=1024)
    bundled = TupleOfArrays([short_fp, long_fp])
    bundled.featurize([system])

    assert len(system.featurizations["last"]) == 2
    # Order of the outputs follows the order of the featurizers given above
    for position, expected_bits in enumerate((512, 1024)):
        assert system.featurizations["last"][position].shape[0] == expected_bits
Example #19
0
def test_ligand_OneHotSMILESFeaturizer(smiles, solution):
    """
    One-hot encode a SMILES string and compare the matrix with the transpose
    of ``solution``.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so its representation might differ from one
    computed directly. That is why ``RDKitLigand`` is used here.

    Parameters
    ----------
    smiles
        SMILES string passed to ``RDKitLigand.from_smiles``.
    solution
        Expected encoding; its ``.T`` attribute is what gets compared.
    """
    system = System([RDKitLigand.from_smiles(smiles)])
    one_hot = OneHotSMILESFeaturizer()
    one_hot.featurize(system)

    encoded = system.featurizations[one_hot.name]
    # Check the shape first, then the element-wise content
    assert encoded.shape == solution.T.shape
    assert (encoded == solution.T).all()
Example #20
0
def test_ligand_GraphLigandFeaturizer(smiles, solution):
    """
    Featurize an ``RDKitLigand`` as a graph and compare both parts of the
    result with ``solution`` element by element.

    OFFTK _will_ add hydrogens to all ingested SMILES and export a
    canonicalized output, so its representation might differ from one
    computed directly. That is why ``RDKitLigand`` is used here.

    Parameters
    ----------
    smiles
        SMILES string passed to ``RDKitLigand.from_smiles``.
    solution
        Indexable whose item 0 is the expected connectivity and item 1 the
        expected features.
    """
    system = System([RDKitLigand.from_smiles(smiles)])
    graph_featurizer = GraphLigandFeaturizer()
    graph_featurizer.featurize(system)

    result = system.featurizations[graph_featurizer.name]
    connectivity_matches = (result[0] == solution[0]).all()
    assert connectivity_matches
    features_match = (result[1] == solution[1]).all()
    assert features_match
Example #21
0
def test_datasetprovider_awkward_exporter_single_tensor_different_shapes():
    """
    Export featurizations whose shape differs from system to system through
    ``DatasetProvider.to_awkward()``.

    ``OneHotSMILESFeaturizer`` wrapped in ``TupleOfArrays`` gives each system
    a one-item featurization whose array has shape ``(53, len(smiles))``.
    The exported ``X`` must have as many entries as ``y`` and keep each
    system's own shape.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    one_hot = OneHotSMILESFeaturizer()
    wrapped = TupleOfArrays([one_hot])
    wrapped.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"][0].shape == (53, len(smi))

    # X comes back as a single-item collection thanks to TupleOfArrays
    (X,), y = dataset.to_awkward()
    assert X.type.length == len(y)
    first_smiles, second_smiles = smiles
    assert ak.to_numpy(X[0]).shape == (53, len(first_smiles))
    assert ak.to_numpy(X[1]).shape == (53, len(second_smiles))
Example #22
0
def test_datasetprovider_awkward_exporter_multiple_subtensors():
    """
    Export several arrays per system through ``DatasetProvider.to_awkward()``.

    ``TupleOfArrays`` over a 512-bit and a 1024-bit Morgan fingerprint breaks
    the one system -> one tensor assumption: each system carries two arrays.
    The exporter must return one feature array per featurizer, each with one
    row per system, alongside ``y``.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import MorganFingerprintFeaturizer
    from kinoml.features.core import TupleOfArrays
    import awkward as ak

    conditions = AssayConditions()
    systems = [
        System([RDKitLigand.from_smiles(smi)]) for smi in ("CCCCC", "CCCCCCCC")
    ]
    measurements = [
        BaseMeasurement(value, conditions=conditions, system=system)
        for value, system in zip((50, 30), systems)
    ]
    dataset = DatasetProvider(measurements=measurements)

    short_fp = MorganFingerprintFeaturizer(radius=2, nbits=512)
    long_fp = MorganFingerprintFeaturizer(radius=2, nbits=1024)
    bundled = TupleOfArrays([short_fp, long_fp])
    bundled.featurize(dataset.systems)
    for system in systems:
        assert len(system.featurizations["last"]) == 2
        assert system.featurizations["last"][0].shape[0] == 512
        assert system.featurizations["last"][1].shape[0] == 1024

    # One exported array per featurizer, in the order given to TupleOfArrays
    (x_short, x_long), y = dataset.to_awkward()
    assert len(x_short) == len(x_long) == len(y)
    assert ak.to_numpy(x_short).shape == (2, 512)
    assert ak.to_numpy(x_long).shape == (2, 1024)
Example #23
0
def test_datasetprovider_exporter_single_tensor_different_shapes():
    """
    Export featurizations whose shape differs from system to system through
    ``DatasetProvider.to_dict_of_arrays()``.

    When a featurizer returns arrays of a different shape for each system,
    one can either:

    A. Pad them to the same dimension with PadFeaturizer,
       and apply the Concatenated aggregator.
    B. Keep them separate and export them with `.to_dict_of_arrays()`.

    This test covers option B. The exporter returns a mapping with one
    ``X``-prefixed key per system; for the two systems built here those keys
    are ``"X_s0_"`` and ``"X_s1_"``, and each array keeps the per-system
    shape ``(53, len(smiles))``.
    """
    from kinoml.core.ligands import RDKitLigand
    from kinoml.features.ligand import OneHotSMILESFeaturizer

    conditions = AssayConditions()
    smiles = ("CCCCC", "CCCCCCCC")
    systems = [System([RDKitLigand.from_smiles(smi)]) for smi in smiles]
    measurements = [
        BaseMeasurement(50, conditions=conditions, system=systems[0]),
        BaseMeasurement(30, conditions=conditions, system=systems[1]),
    ]

    dataset = DatasetProvider(measurements=measurements)

    featurizer = OneHotSMILESFeaturizer()
    featurizer.featurize(dataset.systems)
    for system, smi in zip(systems, smiles):
        assert system.featurizations["last"].shape == (53, len(smi))

    arrays = dataset.to_dict_of_arrays()
    # Sort once and reuse the sorted keys below, so that pairing each key
    # with its SMILES does not depend on the iteration order of ``arrays``.
    X_keys = sorted(k for k in arrays.keys() if k.startswith("X"))
    assert X_keys == ["X_s0_", "X_s1_"]
    for X_key, smi in zip(X_keys, smiles):
        assert arrays[X_key].shape == (53, len(smi))