Beispiel #1
0
def test_filter_by_smiles():
    """Check that filtering a data set by the 'CO' smiles pattern retains
    the methanol entry and discards the ethanol entry."""

    def _single_component_substance(smiles):
        # A substance made of one component with a mole fraction of one.
        substance = Substance()
        substance.add_component(Substance.Component(smiles),
                                Substance.MoleFraction(1.0))
        return substance

    def _dummy_density_of(substance):
        # A dummy density whose substance is replaced by the one provided.
        dummy_density = create_dummy_property(Density)
        dummy_density.substance = substance
        return dummy_density

    methanol = _single_component_substance('CO')
    ethanol = _single_component_substance('CCO')

    methanol_density = _dummy_density_of(methanol)
    ethanol_density = _dummy_density_of(ethanol)

    data_set = PhysicalPropertyDataSet()

    for substance, density in ((methanol, methanol_density),
                               (ethanol, ethanol_density)):
        data_set.properties[substance.identifier] = [density]

    data_set.filter_by_smiles('CO')

    # Only the methanol entry should survive the filter.
    assert data_set.number_of_properties == 1
    assert methanol.identifier in data_set.properties
    assert ethanol.identifier not in data_set.properties
Beispiel #2
0
def test_to_pandas():
    """Check that a data set holding pure and binary properties can be
    exported with `to_pandas`, and that the export has the expected length."""

    dummy_source = CalculationSource('Dummy', {})

    pure = Substance.from_components('C')
    binary = Substance.from_components('C', 'O')

    data_set = PhysicalPropertyDataSet()

    data_set.properties[pure.identifier] = []
    data_set.properties[binary.identifier] = []

    kelvin_values = (298, 300, 302)

    def _state_at(kelvin_value):
        # A fresh state at the requested temperature and a pressure of 1 atm.
        return ThermodynamicState(temperature=kelvin_value * unit.kelvin,
                                  pressure=1.0 * unit.atmosphere)

    # Pure properties: a density and a dielectric constant per temperature.
    for kelvin_value in kelvin_values:

        shared_arguments = dict(thermodynamic_state=_state_at(kelvin_value),
                                phase=PropertyPhase.Liquid,
                                substance=pure,
                                source=dummy_source)

        density = Density(value=1 * unit.gram / unit.milliliter,
                          uncertainty=0.11 * unit.gram / unit.milliliter,
                          **shared_arguments)

        dielectric = DielectricConstant(value=1 * unit.dimensionless,
                                        uncertainty=0.11 * unit.dimensionless,
                                        **shared_arguments)

        data_set.properties[pure.identifier].extend([density, dielectric])

    # Binary properties: an enthalpy of mixing and an excess molar volume
    # per temperature.
    for kelvin_value in kelvin_values:

        shared_arguments = dict(thermodynamic_state=_state_at(kelvin_value),
                                phase=PropertyPhase.Liquid,
                                substance=binary,
                                source=dummy_source)

        enthalpy = EnthalpyOfMixing(
            value=1 * unit.kilojoules / unit.mole,
            uncertainty=0.11 * unit.kilojoules / unit.mole,
            **shared_arguments)

        excess_volume = ExcessMolarVolume(
            value=1 * unit.meter**3 / unit.mole,
            uncertainty=0.11 * unit.meter**3 / unit.mole,
            **shared_arguments)

        data_set.properties[binary.identifier].extend(
            [enthalpy, excess_volume])

    pandas_data = data_set.to_pandas()

    assert pandas_data is not None
    assert len(pandas_data) == 6
Beispiel #3
0
def create_filterable_data_set():
    """Build a dummy data set whose three entries differ in every
    filterable respect:

        - a liquid density at 298 K and 0.5 atm, for a one component
          substance built from the element list ['C'].
        - a gaseous dielectric constant at 288 K and 1 atm, for a two
          component substance built from the element list ['N'].
        - a solid enthalpy of mixing at 308 K and 1.5 atm, for a three
          component substance built from the element list ['O'].

    Returns
    -------
    PhysicalPropertyDataSet
        The created data set.
    """

    dummy_source = CalculationSource('Dummy', {})

    # The liquid density entry.
    carbon = create_dummy_substance(number_of_components=1, elements=['C'])
    carbon_state = ThermodynamicState(temperature=298 * unit.kelvin,
                                      pressure=0.5 * unit.atmosphere)

    density = Density(thermodynamic_state=carbon_state,
                      phase=PropertyPhase.Liquid,
                      substance=carbon,
                      value=1 * unit.gram / unit.milliliter,
                      uncertainty=0.11 * unit.gram / unit.milliliter,
                      source=dummy_source)

    # The gaseous dielectric constant entry.
    nitrogen = create_dummy_substance(number_of_components=2, elements=['N'])
    nitrogen_state = ThermodynamicState(temperature=288 * unit.kelvin,
                                        pressure=1 * unit.atmosphere)

    dielectric = DielectricConstant(thermodynamic_state=nitrogen_state,
                                    phase=PropertyPhase.Gas,
                                    substance=nitrogen,
                                    value=1 * unit.dimensionless,
                                    uncertainty=0.11 * unit.dimensionless,
                                    source=dummy_source)

    # The solid enthalpy of mixing entry.
    oxygen = create_dummy_substance(number_of_components=3, elements=['O'])
    oxygen_state = ThermodynamicState(temperature=308 * unit.kelvin,
                                      pressure=1.5 * unit.atmosphere)

    enthalpy = EnthalpyOfMixing(thermodynamic_state=oxygen_state,
                                phase=PropertyPhase.Solid,
                                substance=oxygen,
                                value=1 * unit.kilojoules / unit.mole,
                                uncertainty=0.11 * unit.kilojoules / unit.mole,
                                source=dummy_source)

    data_set = PhysicalPropertyDataSet()

    for substance, physical_property in ((carbon, density),
                                         (nitrogen, dielectric),
                                         (oxygen, enthalpy)):

        data_set.properties[substance.identifier] = [physical_property]

    return data_set
def test_estimate_request():
    """Submit a single dummy density to an estimation server created inside
    a temporary directory, and check that the synchronously retrieved result
    is not a `PropertyEstimatorException`."""

    from openforcefield.typing.engines import smirnoff

    with tempfile.TemporaryDirectory() as root_directory:

        density = create_dummy_property(Density)

        data_set = PhysicalPropertyDataSet()
        data_set.properties[density.substance.identifier] = [density]

        force_field_path = get_data_filename(
            'forcefield/smirnoff99Frosst.offxml')
        force_field = smirnoff.ForceField(force_field_path)

        compute_backend = DaskLocalClusterBackend(1, ComputeResources())
        file_storage = LocalFileStorage(path.join(root_directory, 'storage'))

        # The server object is created but deliberately not bound to a name.
        PropertyEstimatorServer(
            compute_backend,
            file_storage,
            working_directory=path.join(root_directory, 'working'))

        estimator_client = PropertyEstimatorClient()
        estimation_options = PropertyEstimatorOptions(
            allowed_calculation_layers=[TestCalculationLayer])

        estimate_request = estimator_client.request_estimate(
            data_set, force_field, estimation_options)

        outcome = estimate_request.results(synchronous=True,
                                           polling_interval=0)

        assert not isinstance(outcome, PropertyEstimatorException)
Beispiel #5
0
def main():
    """Estimate the dielectric constants of a data set twice and save both
    sets of results as JSON.

    The first request only allows the 'SimulationLayer' and is written to
    'dielectric_simulation.json'. The second only allows the
    'ReweightingLayer' (attempting to reweight the cached data) and is
    written to 'dielectric_reweight.json'. Both requests ask for gradients
    with respect to the same two vdW parameters.
    """

    setup_timestamp_logging()

    # The force field to estimate the properties with.
    force_field_source = SmirnoffForceFieldSource.from_path(
        'smirnoff99Frosst-1.1.0.offxml')

    # The data set to estimate, reduced to its dielectric constants.
    with open('pure_data_set.json') as file:
        data_set = PhysicalPropertyDataSet.parse_json(file.read())

    data_set.filter_by_property_types('DielectricConstant')

    # Start the server which will run the calculations.
    setup_server(backend_type=BackendType.LocalGPU,
                 max_number_of_workers=1,
                 port=8001)

    # Create the client used to talk to that server.
    connection_options = client.ConnectionOptions(server_port=8001)
    property_estimator = client.PropertyEstimatorClient(connection_options)

    options = PropertyEstimatorOptions()
    options.allowed_calculation_layers = ['SimulationLayer']

    # Use the `NoChecks` convergence mode for both calculation layers.
    options.workflow_options = {
        'DielectricConstant': {
            layer_name:
            WorkflowOptions(WorkflowOptions.ConvergenceMode.NoChecks)
            for layer_name in ('SimulationLayer', 'ReweightingLayer')
        }
    }

    parameter_gradient_keys = [
        ParameterGradientKey(tag='vdW',
                             smirks='[#6X4:1]',
                             attribute=attribute_name)
        for attribute_name in ('epsilon', 'rmin_half')
    ]

    def _estimate_and_save(output_file_name):
        # Request an estimate using the current `options`, block until the
        # results are returned, and write them to `output_file_name`.
        request = property_estimator.request_estimate(
            property_set=data_set,
            force_field_source=force_field_source,
            options=options,
            parameter_gradient_keys=parameter_gradient_keys)

        results = request.results(True, 5)

        with open(output_file_name, 'wb') as file:

            serialized_results = json.dumps(results,
                                            sort_keys=True,
                                            indent=2,
                                            separators=(',', ': '),
                                            cls=TypedJSONEncoder)

            file.write(serialized_results.encode('utf-8'))

    # First estimate by direct simulation...
    _estimate_and_save('dielectric_simulation.json')

    # ...then attempt to reweight the cached data.
    options.allowed_calculation_layers = ['ReweightingLayer']
    _estimate_and_save('dielectric_reweight.json')
def _create_data_set():
    """Create a small data set of three properties taken from the
    FreeSolv data set: https://github.com/mobleylab/FreeSolv.

    Each entry is the solvation free energy of a single solute molecule
    (butan-1-ol, methyl propanoate or benzamide) in a solvent with the
    smiles 'O', at 298.15 K and 1 atm.

    Returns
    -------
    PhysicalPropertyDataSet
        The data set of three select FreeSolv properties.
    """

    data_set = PhysicalPropertyDataSet()

    def _add_solvation_free_energy(solute_smiles, value, uncertainty):
        """Add the solvation free energy of one solute to `data_set`.

        Parameters
        ----------
        solute_smiles: str
            The smiles pattern of the solute.
        value: float
            The free energy in kcal / mol.
        uncertainty: float
            The uncertainty in the free energy in kcal / mol.
        """

        substance = Substance()

        # One solute molecule...
        substance.add_component(
            Substance.Component(solute_smiles,
                                role=Substance.ComponentRole.Solute),
            Substance.ExactAmount(1))

        # ...in a solvent made up entirely of 'O'.
        substance.add_component(
            Substance.Component('O', role=Substance.ComponentRole.Solvent),
            Substance.MoleFraction(1.0))

        physical_property = SolvationFreeEnergy(
            thermodynamic_state=ThermodynamicState(298.15 * unit.kelvin,
                                                   1.0 * unit.atmosphere),
            phase=PropertyPhase.Liquid,
            substance=substance,
            value=value * unit.kilocalorie / unit.mole,
            uncertainty=uncertainty * unit.kilocalorie / unit.mole,
            # The DOI is stored without any surrounding whitespace so that it
            # is a valid DOI name which can be compared and resolved.
            source=MeasurementSource(doi='10.1021/ct050097l'))

        data_set.properties[substance.identifier] = [physical_property]

    # Butan-1-ol
    _add_solvation_free_energy('CCCCO', -4.72, 0.6)
    # Methyl propanoate
    _add_solvation_free_energy('CCC(=O)OC', -2.93, 0.6)
    # Benzamide
    _add_solvation_free_energy('c1ccc(cc1)C(=O)N', -11.0, 0.2)

    return data_set
    def _initialize(self):
        """Set up the property estimator target from its JSON options file.

        The steps performed are:

        1. Read the options file named by `self.prop_est_input` from the
           target directory.
        2. Create a `PropertyEstimatorClient` from the loaded connection
           options.
        3. Load the reference data set named in the options.
        4. Print the reference data and store, for each property type, the
           user supplied weight divided by the number of reference values.

        Raises
        ------
        ValueError
            If the loaded reference data set contains no properties.
        """

        # Read the user provided options, echoing the path being read.
        options_file_path = os.path.join(self.tgtdir, self.prop_est_input)
        print(options_file_path)

        self._options = self.OptionsFile.from_json(options_file_path)

        # Create the client using the requested connection options.
        self._client = PropertyEstimatorClient(
            connection_options=self._options.connection_options)

        # Read the experimental data to optimise against.
        with open(os.path.join(self.tgtdir, self._options.data_set_path),
                  'r') as file:

            self._data_set = PhysicalPropertyDataSet.parse_json(file.read())

        if len(self._data_set.properties) == 0:
            raise ValueError(
                'The physical property data set to optimise against is empty.')

        # Re-arrange the reference data so that it is indexed by property
        # name, then substance id, then state.
        self._reference_properties = self._refactor_properties_dictionary(
            self._data_set.properties)

        printcool("Loaded experimental data from property estimator")

        # Print one table per property type and substance, tallying up the
        # number of reference values of each property type along the way.
        data_point_counts = dict.fromkeys(self._reference_properties, 0.0)

        for property_name, per_substance in self._reference_properties.items():

            for substance_id, per_state in per_substance.items():

                table_rows = {}

                for state_tuple, entry in per_state.items():

                    value = entry['value']
                    uncertainty = entry['uncertainty']

                    row_label = "%sK-%satm" % state_tuple
                    table_rows[row_label] = "%f+/-%f" % (value, uncertainty)

                    data_point_counts[property_name] += 1.0

                table_title = "Reference %s (%s) data" % (property_name,
                                                          substance_id)

                printcool_dictionary(table_rows, title=table_title)

        # Spread the weight of each property type evenly over its values.
        self._normalised_weights = {}

        for property_name, data_point_count in data_point_counts.items():

            property_weight = self._options.weights[property_name]

            self._normalised_weights[property_name] = (property_weight /
                                                       data_point_count)
Beispiel #8
0
def curate(processed_data_directory, output_path):
    """Curate the benchmarking data set and write it to disk as JSON.

    Non-aqueous mixture, aqueous mixture and pure data sets are curated in
    turn with `curate_data_set` and merged into a single data set.

    Parameters
    ----------
    processed_data_directory
        Passed to each `curate_data_set` call as its first argument.
    output_path
        The path of the file to write the merged data set to.
    """

    def _smiles_in(data_set):
        # Collect the smiles of every component of every property in a set.
        found_smiles = set()

        for properties in data_set.properties.values():

            for physical_property in properties:

                found_smiles.update(
                    component.smiles
                    for component in physical_property.substance.components)

        return found_smiles

    # The number of unique substances which should have data points for
    # each of the properties of interest.
    desired_substances_per_property = {
        (EnthalpyOfMixing, SubstanceType.Binary): 10,
        (ExcessMolarVolume, SubstanceType.Binary): 10,
        (Density, SubstanceType.Pure): 30,
        (DielectricConstant, SubstanceType.Pure): 30,
        (EnthalpyOfVaporization, SubstanceType.Pure): 30
    }

    binary_enthalpy = (EnthalpyOfMixing, SubstanceType.Binary)
    binary_volume = (ExcessMolarVolume, SubstanceType.Binary)

    # The order of preference for the data a binary substance should have:
    # both excess molar volumes and enthalpies of mixing, failing that only
    # enthalpies of mixing, and finally only excess molar volumes.
    mixture_property_order = [
        [binary_volume, binary_enthalpy],
        [binary_enthalpy],
        [binary_volume],
    ]

    # Curate the mixtures which do not contain water.
    mixture_data_set = curate_data_set(
        processed_data_directory,
        mixture_property_order,
        desired_substances_per_property,
        required_smiles_to_include=None,
        smiles_to_exclude=[*test_set_smiles, 'O'],
        vdw_smirks_to_exercise=optimized_vdw_smirks,
        output_data_set_path='mixture_data_set.json')

    # Aqueous mixture data is asked for explicitly.
    water_mixture_data_set = curate_data_set(
        processed_data_directory,
        mixture_property_order,
        desired_substances_per_property,
        required_smiles_to_include=['O'],
        smiles_to_exclude=test_set_smiles,
        vdw_smirks_to_exercise=optimized_vdw_smirks,
        output_data_set_path='water_mixture_data_set.json')

    full_data_set = PhysicalPropertyDataSet()

    full_data_set.merge(mixture_data_set)
    full_data_set.merge(water_mixture_data_set)

    # Collate the molecules which have been chosen so far.
    chosen_mixture_smiles = _smiles_in(full_data_set)

    pure_density = (Density, SubstanceType.Pure)
    pure_dielectric = (DielectricConstant, SubstanceType.Pure)
    pure_vaporization = (EnthalpyOfVaporization, SubstanceType.Pure)

    # The order of preference for the data a pure substance should have.
    pure_property_order = [
        [pure_density, pure_dielectric],
        [pure_density],
        [pure_dielectric],
        [pure_vaporization],
    ]

    # Ideally pick pure data for the molecules which also have binary data.
    # Water is excluded as it was not refit in this release.
    pure_data_set = curate_data_set(
        processed_data_directory,
        pure_property_order,
        desired_substances_per_property,
        required_smiles_to_include=chosen_mixture_smiles,
        smiles_to_exclude=[*test_set_smiles, 'O'],
        vdw_smirks_to_exercise=optimized_vdw_smirks,
        minimum_data_points_per_property_per_smirks=3,
        output_data_set_path='pure_data_set_binary_compounds.json')

    chosen_pure_smiles = _smiles_in(pure_data_set)

    # Relax the criteria to allow any other molecule which has not already
    # been chosen (again excluding water).
    remaining_pure_data_set = curate_data_set(
        processed_data_directory,
        pure_property_order,
        desired_substances_per_property,
        required_smiles_to_include=None,
        smiles_to_exclude=[*test_set_smiles, 'O', *chosen_pure_smiles],
        vdw_smirks_to_exercise=optimized_vdw_smirks,
        minimum_data_points_per_property_per_smirks=3,
        output_data_set_path='pure_data_set.json')

    pure_data_set.merge(remaining_pure_data_set)
    full_data_set.merge(pure_data_set)

    with open(output_path, 'w') as file:
        file.write(full_data_set.json())
def main():
    """The main script which will create an estimation server, request
    the curated data set be estimated for each force field of interest,
    wait for the calculations to be complete, and save the results.

    The server is polled once a minute; the results of each force field
    are passed to `save_results` as soon as none of its properties are
    reported as still being queued.
    """

    setup_timestamp_logging()
    logger = logging.getLogger()

    # Define those force fields to use in the calculations
    force_field_sources = {
        'smirnoff99frosst 1.1.0':
        SmirnoffForceFieldSource.from_path('smirnoff99Frosst-1.1.0.offxml'),
        'parsley 0.0.9':
        SmirnoffForceFieldSource.from_path('smirnoff_release_1_v0_0_9.offxml'),
        'parsley rc 1':
        SmirnoffForceFieldSource.from_path('openff_hbonds-1.0.0-RC1.offxml'),
        'gaff 1.81':
        TLeapForceFieldSource(leap_source='leaprc.gaff'),
        'gaff 2.11':
        TLeapForceFieldSource(leap_source='leaprc.gaff2')
    }

    # Set up the server object which will run the calculations.
    setup_server(max_number_of_workers=50)

    # Set up the client which will request the estimates.
    estimator_client = PropertyEstimatorClient()

    # Load in the data set to estimate.
    with open('curated_data_set.json') as file:
        data_set = PhysicalPropertyDataSet.parse_json(file.read())

    # Specify which protocols should be swapped out for which force fields.
    # The keys MUST match those of `force_field_sources`, as that is what
    # the replacements are looked up by below - otherwise the GAFF force
    # fields would silently be given no replacements at all.
    protocol_replacements = {
        'gaff 1.81': {
            'BuildSmirnoffSystem': 'BuildTLeapSystem'
        },
        'gaff 2.11': {
            'BuildSmirnoffSystem': 'BuildTLeapSystem'
        }
    }

    requests = {}

    # Request estimates using each force field, storing the request
    # object used to query the status of the results.
    for force_field_key, force_field_source in force_field_sources.items():

        options = get_estimation_options(
            protocol_replacements.get(force_field_key, {}))

        requests[force_field_key] = estimator_client.request_estimate(
            property_set=data_set,
            force_field_source=force_field_source,
            options=options)

    # Wait for the results.
    finished_force_fields = set()

    while len(finished_force_fields) < len(force_field_sources):

        sleep(60)

        for force_field_key, request in requests.items():

            if force_field_key in finished_force_fields:
                continue

            # Query the current state of the request without blocking.
            results = request.results(False)

            if isinstance(results, PropertyEstimatorResult) and len(
                    results.queued_properties) > 0:
                # The server is still working on this force field.
                continue

            logger.info(f'The server has completed {force_field_key}.')

            # Save the result to file.
            save_results(force_field_key, results)
            finished_force_fields.add(force_field_key)
def save_results(force_field_key, results):
    """Write the results of an estimation request to a set of files in the
    current directory, each named after the force field key.

    The files written are:

        - '<key> results.json': the full results object.
        - '<key>.csv' and '<key>.json': the estimated properties.
        - '<key> unsuccessful.json': the properties which were not estimated.
        - '<key> exceptions.txt': the directory and message of each exception.

    Parameters
    ----------
    force_field_key: str
        The key of the force field which these results were
        estimated for.
    results: PropertyEstimatorResult
        The results of an estimation request.
    """

    def _write_json(payload, file_name):
        # Serialize `payload` to `file_name` using the typed JSON encoder.
        with open(file_name, 'w') as file:

            json.dump(payload,
                      file,
                      sort_keys=True,
                      indent=2,
                      separators=(',', ': '),
                      cls=TypedJSONEncoder)

    # Start with the complete, unmodified results.
    _write_json(results, f'{force_field_key} results.json')

    # Collect the estimated properties into a data set of their own,
    # replacing their provenance with an empty dictionary.
    estimated_data_set = PhysicalPropertyDataSet()

    for substance_id, estimated in results.estimated_properties.items():

        for estimated_property in estimated:

            estimated_property.source.provenance = {}

            estimated_data_set.properties.setdefault(
                substance_id, []).append(estimated_property)

    estimated_data_set.to_pandas().to_csv(f'{force_field_key}.csv')
    _write_json(estimated_data_set, f'{force_field_key}.json')

    # Do the same for the properties which could not be estimated, which
    # are stored as the first item of each entry. Their provenance is
    # cleared by setting it to `None`.
    unsuccessful_data_set = PhysicalPropertyDataSet()

    for substance_id, entry in results.unsuccessful_properties.items():

        for unsuccessful_property in entry[0]:

            unsuccessful_property.source.provenance = None

            unsuccessful_data_set.properties.setdefault(
                substance_id, []).append(unsuccessful_property)

    _write_json(unsuccessful_data_set, f'{force_field_key} unsuccessful.json')

    # Finally list any exceptions in a plain text, human readable file.
    with open(f'{force_field_key} exceptions.txt', 'w') as file:

        for exception in results.exceptions:

            file.write(f'\n{exception.directory}\n')
            file.write(exception.message.replace('\\n', '\n'))