Exemple #1
0
    def test_batch_fit(self, tmp_path):
        """Run a batch fit and an aligned batch fit and compare dG values to stored references."""
        from pandas.testing import assert_series_equal

        state_names = ('SecB WT apo', 'SecB his dimer apo')

        def check_dG(fitted, reference_csv):
            # Compare the fitted 'dG' column of each state against the
            # reference file in `output_dir`, using a relative tolerance of 0.1
            reference = csv_to_protein(output_dir / reference_csv)
            for state_name in state_names:
                assert_series_equal(fitted[state_name]['dG'],
                                    reference[state_name]['dG'],
                                    rtol=0.1)

        hdx_set = HDXMeasurementSet([self.hdxm_apo, self.hdxm_dimer])
        guess = csv_to_dataframe(output_dir / 'ecSecB_guess.csv')
        rate_column = guess['rate']

        # The same rate guesses are used for every measurement in the set
        rates_df = pd.DataFrame({name: rate_column for name in hdx_set.names})

        batch_fit = fit_gibbs_global_batch(hdx_set,
                                           hdx_set.guess_deltaG(rates_df),
                                           epochs=1000)

        # Round-trip through a file: metadata must survive writing and reading
        csv_path = Path(tmp_path) / 'fit_result_batch.csv'
        batch_fit.to_file(csv_path)
        reloaded = csv_to_dataframe(csv_path)
        assert reloaded.attrs['metadata'] == batch_fit.metadata

        check_dG(batch_fit.output, 'ecSecB_batch.csv')

        squared_errors = batch_fit.get_squared_errors()
        expected_shape = (hdx_set.Ns, hdx_set.Np, hdx_set.Nt)
        assert squared_errors.shape == expected_shape

        mock_alignment = {
            'apo':
            'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
            'dimer':
            'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
        }
        hdx_set.add_alignment(list(mock_alignment.values()))

        # The aligned fit takes its guesses from the first measurement only
        aligned_guess = hdx_set[0].guess_deltaG(rate_column)
        aligned_fit = fit_gibbs_global_batch_aligned(
            hdx_set,
            aligned_guess,
            r1=2,
            r2=5,
            epochs=1000,
        )

        check_dG(aligned_fit.output, 'ecSecB_batch_aligned.csv')
Exemple #2
0
def yaml_to_hdxmset(yaml_dict, data_dir=None, **kwargs):
    """Create an HDXMeasurementSet from a mapping of measurement specifications.

    Every ``name: spec`` item in `yaml_dict` is converted with ``yaml_to_hdxm``,
    which receives `data_dir` and the item's key as ``name``. The resulting
    measurements are collected in iteration order of `yaml_dict`.

    Additional ``**kwargs`` are accepted but are not used by this function.
    """
    measurements = [
        yaml_to_hdxm(spec, data_dir=data_dir, name=name)
        for name, spec in yaml_dict.items()
    ]
    return HDXMeasurementSet(measurements)
Exemple #3
0
    def test_batch_fit(self):
        """Run a batch fit and an aligned batch fit and compare deltaG values to stored references."""
        from pandas.testing import assert_series_equal

        test_data_dir = os.path.join(directory, 'test_data')
        state_names = ('SecB WT apo', 'SecB his dimer apo')

        def check_deltaG(fitted, reference_name):
            # Compare the fitted 'deltaG' column of each state against the
            # two-level-header reference file, with a relative tolerance of 0.1
            reference = csv_to_protein(os.path.join(test_data_dir,
                                                    reference_name),
                                       column_depth=2)
            for state_name in state_names:
                assert_series_equal(fitted[state_name]['deltaG'],
                                    reference[state_name]['deltaG'],
                                    rtol=0.1)

        hdx_set = HDXMeasurementSet([self.series_apo, self.series_dimer])
        guess = csv_to_protein(os.path.join(test_data_dir, 'ecSecB_guess.txt'))

        # The same rate guesses are supplied for both measurements
        batch_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
        batch_fit = fit_gibbs_global_batch(hdx_set, batch_guess, epochs=1000)
        check_deltaG(batch_fit.output, 'ecSecB_batch.csv')

        mock_alignment = {
            'apo':
            'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
            'dimer':
            'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
        }
        hdx_set.add_alignment(list(mock_alignment.values()))

        # Guesses are recomputed after the alignment has been added
        aligned_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
        aligned_fit = fit_gibbs_global_batch_aligned(
            hdx_set,
            aligned_guess,
            r1=2,
            r2=5,
            epochs=1000,
        )
        check_deltaG(aligned_fit.output, 'ecSecB_batch_aligned.csv')
# Example script: batch-fit two measurements and save the model history
# recorded by a CheckPoint callback.

# Directory containing this script; all paths below are derived from it
current_dir = Path(__file__).parent
#current_dir = Path().cwd() / 'templates'  # pycharm scientific compat

# Results are written to an 'output' folder next to this script (created if missing)
output_dir = current_dir / 'output'
output_dir.mkdir(exist_ok=True)
# Input files are read from '<parent of this script's folder>/tests/test_data'
data_dir = current_dir.parent / 'tests' / 'test_data'
data = read_dynamx(data_dir / 'input' / 'ecSecB_apo.csv', data_dir / 'input' / 'ecSecB_dimer.csv')

pmt = PeptideMasterTable(data)
# NOTE(review): the second tuple element is presumably the control's exposure time
# (0.167 scaled by 60) -- confirm the expected units
pmt.set_control(('Full deuteration control', 0.167*60))

# One HDXMeasurement per state, both at pH 8 and temperature 273.15 + 30
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'), pH=8, temperature=273.15 + 30)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'), pH=8, temperature=273.15 + 30)

hdx_set = HDXMeasurementSet([st1, st2])
guess = csv_to_protein(data_dir / 'output' / 'ecSecB_guess.csv')
# Initial guesses are derived from the first measurement in the set only
gibbs_guess = hdx_set[0].guess_deltaG(guess['rate'])


# Example fit with only 5000 epochs and high learning rate
# Checkpoint stores model history every `epoch_step` epochs
checkpoint = CheckPoint(epoch_step=250)
result = fit_gibbs_global_batch(hdx_set, gibbs_guess, r1=0.5, r2=0.1, epochs=5000, lr=1e5, callbacks=[checkpoint])
# Report the loss terms of the finished fit
print(f"MSE loss: {result.mse_loss:.2f}, "
      f"Reg loss: {result.reg_loss:.2f}, "
      f"Reg percent: {result.regularization_percentage:.0f}%")


# Convert the checkpointed history to a dataframe and write it to the output folder
df = checkpoint.to_dataframe(hdx_set.names)
dataframe_to_file(output_dir / 'model_history.csv', df)
Exemple #5
0
# Setup for a series of batch fits: load measurements and rate guesses as
# described by a yaml file, then prepare the log header lines.

# Input files are read from '<parent of current_dir>/tests/test_data'
data_dir = current_dir.parent / 'tests' / 'test_data'
yaml_stream = Path(current_dir / 'yaml_files' / 'SecB.yaml').read_text()
# Mapping of measurement name -> measurement specification
data_dict = yaml.safe_load(yaml_stream)

# Fit results are written to a 'fit' folder under current_dir (created if missing)
output_dir = current_dir / 'fit'
output_dir.mkdir(exist_ok=True)

hdxm_list = [
    load_from_yaml(dic, data_dir=data_dir, name=name)
    for name, dic in data_dict.items()
]
# One 'rate' column per measurement, in the same order as `hdxm_list`
rates_list = [
    csv_to_protein(current_dir / 'guesses' / f'{name}_rates_guess.txt')['rate']
    for name in data_dict
]
hdx_set = HDXMeasurementSet(hdxm_list)

gibbs_guess = hdx_set.guess_deltaG(rates_list)

log_file = output_dir / "fitting_log.txt"
now = datetime.now()
# Header line with a human-readable date followed by the integer unix timestamp
date = f'# {now.strftime("%Y/%m/%d %H:%M:%S")} ({int(now.timestamp())})'

lines = [VERSION_STRING, date]

r2 = 0.5
for r1 in [0, 0.01, 0.25, 0.5, 1]:
    t0 = time.time()
    result = fit_gibbs_global_batch(hdx_set,
                                    gibbs_guess,
                                    epochs=1000,
Exemple #6
0
    'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
}

# Example script: aligned batch fit of two measurements using `mock_alignment`.

# Directory containing this script; the data folder is resolved relative to it
current_dir = Path(__file__).parent

data_dir = current_dir.parent / 'tests' / 'test_data'
data = read_dynamx(data_dir / 'ecSecB_apo.csv', data_dir / 'ecSecB_dimer.csv')

pmt = PeptideMasterTable(data)
# NOTE(review): the second tuple element is presumably the control's exposure time -- confirm units
pmt.set_control(('Full deuteration control', 0.167))

# One HDXMeasurement per state, both at pH 8 and temperature 273.15 + 30
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'),
                     pH=8,
                     temperature=273.15 + 30)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'),
                     pH=8,
                     temperature=273.15 + 30)

guess = csv_to_protein(data_dir / 'ecSecB_guess.txt')

hdx_set = HDXMeasurementSet([st1, st2])
# The same rate guesses are supplied for both measurements
gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])
# Alignment strings are passed in the dict's insertion order
hdx_set.add_alignment(list(mock_alignment.values()))
result = fit_gibbs_global_batch_aligned(hdx_set,
                                        gibbs_guess,
                                        r1=2,
                                        r2=5,
                                        epochs=1000)

print(result.output)
# Script section: produce rate guesses, a single fit and a batch fit, and write
# the results into the 'test_data' folder under `directory`.

# When `guess` is truthy the rate guesses are recomputed and written to file;
# otherwise previously written guesses are loaded from the same file.
if guess:
    # NOTE(review): `client` is not used in the visible code; presumably the call
    # is needed so a default client is available for the rate fit -- confirm
    client = default_client()
    wt_avg_result = fit_rates_weighted_average(hdxm, bounds=(1e-2, 800))
    output = wt_avg_result.output
    output.to_file(directory / 'test_data' / 'ecSecB_guess.txt')
else:
    output = csv_to_protein(directory / 'test_data' / 'ecSecB_guess.txt')

# Single-measurement fit starting from the rate-derived guesses
gibbs_guess = hdxm.guess_deltaG(output['rate'])
fr_torch = fit_gibbs_global(hdxm, gibbs_guess, epochs=epochs, r1=2)
fr_torch.output.to_file(directory / 'test_data' / 'ecSecB_torch_fit.txt')

hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'), sequence=sequence_dimer,
                            temperature=temperature, pH=pH)

hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm])

# Batch fit; the same rate guesses are supplied for both measurements
gibbs_guess = hdx_set.guess_deltaG([output['rate'], output['rate']])
batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs)

# The batch output is written twice: once as .csv and once in 'pprint' format
batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.csv')
batch_result.output.to_file(directory / 'test_data' / 'ecSecB_batch.txt', fmt='pprint')

# Order is inverted compared to test!
mock_alignment = {
    'dimer':   'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    'apo':     'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
}

# Alignment strings are passed in the dict's insertion order (dimer first, then apo)
hdx_set.add_alignment(list(mock_alignment.values()))
Exemple #8
0
 def hdx_set(self):
     """Return an HDXMeasurementSet built from all values in ``self.hdxm_objects``."""
     measurements = [*self.hdxm_objects.values()]
     return HDXMeasurementSet(measurements)
Exemple #9
0
def fit_gibbs_global(
    hdxm,
    initial_guess,
    r1=R1,
    epochs=EPOCHS,
    patience=PATIENCE,
    stop_loss=STOP_LOSS,
    optimizer="SGD",
    callbacks=None,
    **optimizer_kwargs,
):
    """
    Fit Gibbs free energies globally to all D-uptake data in the supplied hdxm

    Parameters
    ----------
    hdxm : :class:`~pyhdx.models.HDXMeasurement`
        Input HDX measurement
    initial_guess : :class:`~pandas.Series` or :class:`~numpy.ndarray`
        Gibbs free energy initial guesses (shape Nr, units J/mol). A Series must
        have an integer index; it is reindexed to the covered residue numbers
        and gaps are filled by interpolation.
    r1 : :obj:`float`
        Regularizer value r1 (along residues)
    epochs: :obj:`int`
        Maximum number of fitting iterations
    patience: :obj:`int`
        Number of epochs to wait until termination when progress between epochs is below `stop_loss`
    stop_loss: :obj:`float`
        Threshold for difference in loss between epochs when an epoch is considered to make no more progress.
    optimizer : :obj:`str`
        Name of the optimizer class in :mod:`torch.optim` to use. Default is
        Stochastic Gradient Descent. See PyTorch documentation for information.
    callbacks: :obj:`list` or None
        List of callback objects. Call signature is callback(epoch, model, optimizer)
    **optimizer_kwargs
        Additional keyword arguments passed to the optimizer. These override
        the defaults registered for `optimizer` in ``optimizer_defaults``.

    Returns
    -------
    result: :class:`~pyhdx.fitting_torch.TorchFitResult`
        Fit result wrapping a single-measurement :class:`HDXMeasurementSet`,
        the fitted model, the losses and the fit settings.

    Raises
    ------
    AssertionError
        If a Series `initial_guess` has a non-integer index, if the number of
        guesses does not equal ``hdxm.Nr``, or if the guesses contain NaN.

    """

    # Fit settings stored on the result; optimizer kwargs are merged in later
    fit_kwargs = {
        "r1": r1,
        "epochs": epochs,
        "patience": patience,
        "stop_loss": stop_loss,
        "optimizer": optimizer,
    }

    tensors = hdxm.get_tensors()
    inputs = [
        tensors[key] for key in ["temperature", "X", "k_int", "timepoints"]
    ]
    output_data = tensors["d_exp"]

    if isinstance(initial_guess, pd.Series):
        assert (initial_guess.index.inferred_type == "integer"
                ), "Invalid dtype for initial guess index, must be 'integer'"
        # Map guesses to covered residue range and fill NaN gaps
        initial_guess = initial_guess.reindex(
            hdxm.coverage.r_number).interpolate(limit_direction="both")
        initial_guess = initial_guess.to_numpy()

    assert len(initial_guess) == hdxm.Nr, "Invalid length of initial guesses"
    assert not np.any(np.isnan(initial_guess)), "Initial guess has NaN entries"

    # dtype and device come from the package configuration
    dG_par = torch.nn.Parameter(
        torch.tensor(initial_guess,
                     dtype=cfg.TORCH_DTYPE,
                     device=cfg.TORCH_DEVICE).unsqueeze(-1))  # reshape (nr, 1)

    model = DeltaGFit(dG_par)
    criterion = torch.nn.MSELoss(reduction="mean")

    # Take default optimizer kwargs and override them with user-specified ones
    optimizer_kwargs = {
        **optimizer_defaults.get(optimizer, {}),
        **optimizer_kwargs,
    }
    optimizer_klass = getattr(torch.optim, optimizer)

    reg_func = partial(regularizer_1d, r1)

    # returned_model is the same object as model
    losses_array, returned_model = run_optimizer(
        inputs,
        output_data,
        optimizer_klass,
        optimizer_kwargs,
        model,
        criterion,
        reg_func,
        epochs=epochs,
        patience=patience,
        stop_loss=stop_loss,
        callbacks=callbacks,
    )
    losses = _loss_df(losses_array)
    fit_kwargs.update(optimizer_kwargs)
    # The result type expects a measurement set, so wrap the single measurement
    hdxm_set = HDXMeasurementSet([hdxm])
    result = TorchFitResult(hdxm_set, model, losses=losses, **fit_kwargs)

    return result
Exemple #10
0
from pyhdx.fileIO import csv_to_protein

# Example script: batch fit of two measurements, written out in two formats.

# Directory containing this script; the data folder is resolved relative to it
current_dir = Path(__file__).parent

data_dir = current_dir.parent / 'tests' / 'test_data'
data = read_dynamx(data_dir / 'ecSecB_apo.csv', data_dir / 'ecSecB_dimer.csv')

pmt = PeptideMasterTable(data)
# NOTE(review): the second tuple element is presumably the control's exposure time -- confirm units
pmt.set_control(('Full deuteration control', 0.167))

# One HDXMeasurement per state, both at pH 8 and temperature 273.15 + 30
st1 = HDXMeasurement(pmt.get_state('SecB his dimer apo'),
                     pH=8,
                     temperature=273.15 + 30)
st2 = HDXMeasurement(pmt.get_state('SecB WT apo'),
                     pH=8,
                     temperature=273.15 + 30)

hdx_set = HDXMeasurementSet([st1, st2])
guess = csv_to_protein(data_dir / 'ecSecB_guess.txt')

# The same rate guesses are supplied for both measurements
gibbs_guess = hdx_set.guess_deltaG([guess['rate'], guess['rate']])

# Example fit with only 1000 epochs and high regularizers
# For real data start with parameters r1=0.05, r2=0.5, epochs=100000
result = fit_gibbs_global_batch(hdx_set, gibbs_guess, r1=2, r2=5, epochs=1000)

# Both files below are written with relative paths

#Human readable output
result.output.to_file('Batch_fit_result.txt', fmt='pprint')

#Machine readable output
result.output.to_file('Batch_fit_result.csv', fmt='csv')
# Write the single-measurement fit output twice: once with the default format
# and once with fmt='pprint'; the epoch count is part of both file names
dataframe_to_file(output_dir / f'ecSecB_torch_fit_epochs_{epochs_long}.csv',
                  fr_torch.output)
dataframe_to_file(output_dir / f'ecSecB_torch_fit_epochs_{epochs_long}.txt',
                  fr_torch.output,
                  fmt='pprint')

# ----------
# Batch fits
# ----------

hdxm_dimer = HDXMeasurement(pmt.get_state('SecB his dimer apo'),
                            sequence=sequence_dimer,
                            temperature=temperature,
                            pH=pH)

hdx_set = HDXMeasurementSet([hdxm_dimer, hdxm])

# Initial guesses are derived from the first measurement in the set only
gibbs_guess = hdx_set[0].guess_deltaG(guess_output['rate'])
batch_result = fit_gibbs_global_batch(hdx_set, gibbs_guess, epochs=epochs)

# Batch output is likewise written in the default format and in 'pprint' format
dataframe_to_file(output_dir / 'ecSecB_batch.csv', batch_result.output)
dataframe_to_file(output_dir / 'ecSecB_batch.txt',
                  batch_result.output,
                  fmt='pprint')

# Order is inverted compared to test!
mock_alignment = {
    'dimer':
    'MSEQNNTEMTFQIQRIYTKDISFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVY--------------EVVLRVTVTASLGEETAFLCEVQQGGIFSIAGIEGTQMAHCLGA----YCPNILFPAARECIASMVARGTFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA-----------------',
    'apo':
    'MSEQNNTEMTFQIQRIYTKDI------------SFEAPNAPHVFQKDWQPEVKLDLDTASSQLADDVYEVVLRVTVTASLG-------------------EETAFLCEVQQGGIFSIAGIEGTQMAHCLGAYCPNILFPYARECITSMVSRG----TFPQLNLAPVNFDALFMNYLQQQAGEGTEEHQDA',
Exemple #12
0
    def hdx_set(self):
        """Combine every object currently in ``self.data_objects`` into one HDXMeasurementSet."""
        # TODO: once alignments are added in, turn this into a (fixed) attribute
        measurements = [*self.data_objects.values()]
        return HDXMeasurementSet(measurements)