def setUp(self):
    """
    Build n_runs independent synthetic datasets plus matching fit/personalize
    algorithm settings, all driven by a single fixed seed for reproducibility.
    """
    super().setUp()
    self.n_runs = 9
    seed = 42
    np.random.seed(seed)
    p = 5  # number of patients
    v = 3  # number of visits per patient
    f = 2  # number of features per visit
    # Sorting along the visit axis makes timepoints (and values) increase per patient
    t = np.sort(np.random.rand(self.n_runs, p, v), axis=2)
    y = np.sort(np.random.rand(self.n_runs, p, v, f), axis=2)
    # One Data object per run, each built from that run's slice of t and y
    self.datas = [
        Data.from_individuals(np.arange(p), t[r, :], y[r, :],
                              [f'feature_{i}' for i in range(f)])
        for r in range(self.n_runs)
    ]
    self.model_type = 'logistic'
    # Same settings object repeated for every run (settings are read-only here)
    self.settings_algos_fit = [
        AlgorithmSettings('mcmc_saem', n_iter=10, seed=seed)
    ] * self.n_runs
    self.settings_algos_perso = [
        AlgorithmSettings('mode_real', seed=seed)
    ] * self.n_runs
def test_fit_logistic_diag_noise(self):
    """
    Fit a multivariate logistic model with per-feature (diagonal) noise and
    regression-check the estimated parameters against frozen reference values.
    """
    # Inputs — 'MSE_diag_noise' yields a per-feature noise_std vector (see asserts below)
    data = Data.from_csv_file(example_data_path)
    algo_settings = AlgorithmSettings('mcmc_saem', loss='MSE_diag_noise', n_iter=10, seed=0)

    # Initialize
    leaspy = Leaspy("logistic")
    leaspy.model.load_hyperparameters({'source_dimension': 2})

    # Fit the model on the data
    leaspy.fit(data, algorithm_settings=algo_settings)
    #leaspy.save("../../fitted_multivariate_model_diag_noise.json")

    # Scalar parameters: direct tolerance comparison
    self.assertAlmostEqual(leaspy.model.parameters['tau_mean'], 78.0697, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_std'], 1.0275, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['xi_mean'], 0.0, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['xi_std'], 0.1634, delta=0.001)

    # Vector parameters: squared L2 distance to reference tensors must be ~0
    diff_noise = leaspy.model.parameters['noise_std'] - torch.tensor([0.3287, 0.2500, 0.2591, 0.2913])
    diff_g = leaspy.model.parameters['g'] - torch.tensor([1.9405, 2.5914, 2.5199, 2.2495])
    diff_v = leaspy.model.parameters['v0'] - torch.tensor([-3.5214, -3.5387, -3.5215, -3.5085])
    self.assertAlmostEqual(torch.sum(diff_noise**2).item(), 0.0, delta=0.01)
    self.assertAlmostEqual(torch.sum(diff_g**2).item(), 0.0, delta=0.01)
    self.assertAlmostEqual(torch.sum(diff_v**2).item(), 0.0, delta=0.02)
def test_fit_logistic_big_setup(n_patients, n_visits_per_patient, n_modalities):
    """
    Build a large random dataset and prepare (but do not run) a logistic model
    fit: create the algorithm, the Dataset and initialize the model.

    Parameters
    ----------
    n_patients : int
        Number of simulated individuals.
    n_visits_per_patient : int
        Number of visits generated per individual.
    n_modalities : int
        Number of feature columns per visit.

    Side effects
    ------------
    Sets the module-level globals ``algorithm``, ``dataset`` and ``leaspy``
    so callers can reuse them, and prints the setup wall-clock time.
    """
    from leaspy import Data, AlgorithmSettings, Leaspy
    from algo.algo_factory import AlgoFactory
    from inputs.data.dataset import Dataset
    import pandas as pd
    import numpy as np
    import time

    global algorithm, dataset, leaspy

    start = time.time()

    # Build one frame per patient, then concatenate once at the end:
    # pd.concat inside the loop is quadratic in n_patients.
    patient_frames = []
    for i in range(n_patients):
        patient = np.random.uniform(low=0.01, high=0.99, size=(n_visits_per_patient, n_modalities))
        times = np.random.uniform(low=0.01, high=0.99, size=(n_visits_per_patient, 1))
        patient_df = pd.DataFrame(patient)
        patient_df.columns = [str(col) for col in patient_df.columns]
        patient_df['ID'] = i
        patient_df['TIME'] = times
        patient_frames.append(patient_df)
    df = pd.concat(patient_frames)
    # Reorder columns: ID and TIME first, then the modality columns
    df = df[["ID", "TIME"] + [str(i) for i in range(n_modalities)]]

    data = Data.from_dataframe(df)
    algo_settings = AlgorithmSettings('mcmc_saem', n_iter=200, seed=0)

    # Initialize the model (fit itself is intentionally not run here)
    leaspy = Leaspy("logistic")
    leaspy.model.load_hyperparameters({'source_dimension': 2})
    # leaspy.fit(data, algorithm_settings=algo_settings)

    # Check algorithm compatibility, then build algorithm + dataset
    Leaspy.check_if_algo_is_compatible(algo_settings, "fit")
    algorithm = AlgoFactory.algo(algo_settings)
    dataset = Dataset(data, algo=algorithm, model=leaspy.model)
    if not leaspy.model.is_initialized:
        leaspy.model.initialize(dataset)

    print("test_fit_logistic_big_setup execution time: %.2f s" % (time.time() - start))
def test_check_cofactors(self, get_result=False):
    """
    Test Leaspy.simulate return a ``ValueError`` if the ``cofactor`` and
    ``cofactor_state`` parameters given in the ``AlgorithmSettings`` are invalid.

    Parameters
    ----------
    get_result : bool
        If set to ``True``, return the leaspy model, the individual parameters
        and the data used to do the test. Else return nothing.

    Returns
    -------
    model : leaspy.Leaspy
    individual_parameters : leaspy individual parameters (output of personalize)
    data : leaspy.Data
        Only returned when ``get_result`` is ``True``.
    """
    data = Data.from_csv_file(example_data_path)
    cofactors = pd.read_csv(
        os.path.join(test_data_dir, "io/data/data_tiny_covariate.csv"))
    cofactors.columns = ("ID", "Treatments")
    # Cast IDs to str — presumably to match the string IDs used by Data; verify against loader
    cofactors['ID'] = cofactors['ID'].apply(lambda x: str(x))
    cofactors = cofactors.set_index("ID")
    data.load_cofactors(cofactors, ["Treatments"])

    model = Leaspy.load(
        os.path.join(test_data_dir, "model_parameters/multivariate_model_sampler.json"))
    settings = AlgorithmSettings('mode_real')
    individual_parameters = model.personalize(data, settings)

    # Unknown cofactor name must raise ValueError
    settings = AlgorithmSettings('simulation', cofactor="dummy")
    self.assertRaises(ValueError, model.simulate, individual_parameters, data, settings)
    # Known cofactor with unknown state must raise ValueError
    settings = AlgorithmSettings('simulation', cofactor="Treatments", cofactor_state="dummy")
    self.assertRaises(ValueError, model.simulate, individual_parameters, data, settings)

    if get_result:
        return model, individual_parameters, data
def test_all_model_run_crossentropy(self):
    """
    Check if the following models run with the following algorithms.

    Smoke test: every model type is fit with the crossentropy loss on binary
    data, then personalized; no numerical values are asserted.
    """
    for model_name in ('linear', 'univariate', 'logistic', 'logistic_parallel'):
        logistic_leaspy = Leaspy(model_name)
        settings = AlgorithmSettings('mcmc_saem', n_iter=200, seed=0, loss="crossentropy")

        df = pd.read_csv(binary_data_path)
        if model_name == 'univariate':
            # Univariate models accept a single feature: keep ID, TIME + 1 column
            df = df.iloc[:, :3]
        data = Data.from_dataframe(df)

        logistic_leaspy.fit(data, settings)

        for method in ['scipy_minimize']:
            burn_in_kw = dict()  # not for all algos
            if '_real' in method:
                burn_in_kw = dict(n_burn_in_iter=90, )
            settings = AlgorithmSettings(method, n_iter=100, seed=0, loss="crossentropy", **burn_in_kw)
            logistic_result = logistic_leaspy.personalize(data, settings)
def test_fit_logistic_small():
    """Smoke test: calibrate a 2-source logistic model on the example dataset."""
    from leaspy import Leaspy, Data, AlgorithmSettings
    from tests import example_data_path

    # Load the observations and configure a short seeded MCMC-SAEM run
    observations = Data.from_csv_file(example_data_path)
    fit_settings = AlgorithmSettings('mcmc_saem', n_iter=200, seed=0)

    # Build the model and run the calibration
    model_api = Leaspy("logistic")
    model_api.model.load_hyperparameters({'source_dimension': 2})
    model_api.fit(observations, algorithm_settings=fit_settings)
def test_personalize_scipy(self):
    """
    Load logistic model from file, and personalize it to data from ...
    :return:
    """
    # Load the observations and a pre-fitted logistic model
    observations = Data.from_csv_file(example_data_path)
    fitted_model = Leaspy.load(example_logisticmodel_path)

    # Personalize with scipy_minimize and check the residual noise level
    perso_settings = AlgorithmSettings('scipy_minimize', seed=0)
    _ips, noise = fitted_model.personalize(observations, settings=perso_settings, return_noise=True)
    self.assertAlmostEqual(noise.item(), 0.1169, delta=0.01)
def test_personalize_scipy_diag_noise(self):
    """
    Load logistic model (diag noise) from file, and personalize it to data from ...
    :return:
    """
    # Inputs
    data = Data.from_csv_file(example_data_path)

    # Initialize
    leaspy = Leaspy.load(example_logisticmodel_diag_noise_path)

    # Launch algorithm
    algo_personalize_settings = AlgorithmSettings('scipy_minimize', seed=0)
    ips, noise_std = leaspy.personalize(data, settings=algo_personalize_settings, return_noise=True)
    # Per-feature noise: compare squared L2 distance to the reference vector.
    # .item() extracts a plain float so assertAlmostEqual compares scalars,
    # consistent with the other tests in this file.
    diff_noise = noise_std - torch.tensor([0.3299, 0.1236, 0.1642, 0.2582])
    self.assertAlmostEqual((diff_noise**2).sum().item(), 0., delta=0.01)
def test_fit_univariate_logistic(self):
    """
    Fit a univariate logistic model on a single feature and regression-check
    the estimated parameters against frozen reference values.
    """
    # Inputs
    df = pd.read_csv(example_data_path)
    data = Data.from_dataframe(df.iloc[:,:3])  # one feature column (ID, TIME + 1 feature)
    algo_settings = AlgorithmSettings('mcmc_saem', n_iter=10, seed=0)

    # Initialize
    leaspy = Leaspy("univariate")

    # Fit the model on the data
    leaspy.fit(data, algorithm_settings=algo_settings)

    # Univariate model: every parameter is a scalar
    self.assertAlmostEqual(leaspy.model.parameters['noise_std'], 0.1780, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_mean'], 70.2322, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_std'], 2.0974, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['xi_mean'], -2.8940, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['xi_std'], 0.1063, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['g'], 0.9939, delta=0.001)
def test_personalize_modereal(self): """ Load logistic model from file, and personalize it to data from ... :return: """ # Inputs data = Data.from_csv_file(example_data_path) # Initialize leaspy = Leaspy.load(example_logisticmodel_path) # Launch algorithm path_settings = os.path.join(os.path.dirname(__file__), "data/settings_mode_real.json") algo_personalize_settings = AlgorithmSettings.load(path_settings) ips, noise_std = leaspy.personalize(data, settings=algo_personalize_settings, return_noise=True) self.assertAlmostEqual(noise_std.item(), 0.12152, delta=0.01)
def test_construtor(self):  # NOTE(review): typo in name ("construtor"); kept so test IDs stay stable
    """
    Test the initialization.
    """
    # Every simulation parameter from the settings must be copied onto the algorithm
    self.assertEqual(self.settings.parameters['bandwidth_method'], self.algo.bandwidth_method)
    self.assertEqual(self.settings.parameters['noise'], self.algo.noise)
    self.assertEqual(self.settings.parameters['number_of_subjects'], self.algo.number_of_subjects)
    self.assertEqual(self.settings.parameters['mean_number_of_visits'], self.algo.mean_number_of_visits)
    self.assertEqual(self.settings.parameters['std_number_of_visits'], self.algo.std_number_of_visits)
    self.assertEqual(self.settings.parameters['cofactor'], self.algo.cofactor)
    self.assertEqual(self.settings.parameters['cofactor_state'], self.algo.cofactor_state)

    # An unknown sources_method must be rejected at construction time
    settings = AlgorithmSettings('simulation', sources_method="dummy")
    self.assertRaises(ValueError, SimulationAlgorithm, settings)
def test_fit_logisticparallel(self):
    """
    Fit a logistic_parallel model and regression-check the estimated
    parameters against frozen reference values.
    """
    # Inputs
    data = Data.from_csv_file(example_data_path)
    algo_settings = AlgorithmSettings('mcmc_saem', n_iter=10, seed=0)

    # Initialize
    leaspy = Leaspy("logistic_parallel")
    leaspy.model.load_hyperparameters({'source_dimension': 2})

    # Fit the model on the data
    leaspy.fit(data, algorithm_settings=algo_settings)

    # Scalar parameters: direct tolerance comparison
    self.assertAlmostEqual(leaspy.model.parameters['noise_std'], 0.2641, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_mean'], 70.4093, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_std'], 2.2325, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['xi_mean'], -3.1897, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['xi_std'], 0.1542, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['g'], 1.0160, delta=0.001)

    # Vector parameter: squared L2 distance to the reference deltas must be ~0
    diff_deltas = leaspy.model.parameters['deltas'] - torch.tensor([-0.0099, -0.0239, -0.0100])
    self.assertAlmostEqual(torch.sum(diff_deltas ** 2).item(), 0.0, delta=0.01)
def test_fit_logistic(self):
    """
    Fit a multivariate logistic model (scalar noise) and regression-check the
    estimated parameters against frozen reference values.
    """
    # Inputs
    data = Data.from_csv_file(example_data_path)
    algo_settings = AlgorithmSettings('mcmc_saem', n_iter=10, seed=0)

    # Initialize
    leaspy = Leaspy("logistic")
    leaspy.model.load_hyperparameters({'source_dimension': 2})

    # Fit the model on the data
    leaspy.fit(data, algorithm_settings=algo_settings)

    # Scalar parameters: direct tolerance comparison
    self.assertAlmostEqual(leaspy.model.parameters['noise_std'], 0.2986, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_mean'], 78.0270, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_std'], 0.9494, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['xi_mean'], 0.0, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['xi_std'], 0.1317, delta=0.001)

    # Vector parameters: squared L2 distance to reference tensors must be ~0
    diff_g = leaspy.model.parameters['g'] - torch.tensor([1.9557, 2.5899, 2.5184, 2.2369])
    diff_v = leaspy.model.parameters['v0'] - torch.tensor([-3.5714, -3.5820, -3.5811, -3.5886])
    self.assertAlmostEqual(torch.sum(diff_g**2).item(), 0.0, delta=0.01)
    self.assertAlmostEqual(torch.sum(diff_v**2).item(), 0.0, delta=0.02)
def test_fit_logisticparallel_diag_noise(self):
    """
    Fit a logistic_parallel model with per-feature (diagonal) noise and
    regression-check the estimated parameters against frozen reference values.
    """
    # Inputs — 'MSE_diag_noise' yields a per-feature noise_std vector (see asserts below)
    data = Data.from_csv_file(example_data_path)
    algo_settings = AlgorithmSettings('mcmc_saem', loss='MSE_diag_noise', n_iter=10, seed=0)

    # Initialize
    leaspy = Leaspy("logistic_parallel")
    leaspy.model.load_hyperparameters({'source_dimension': 2})

    # Fit the model on the data
    leaspy.fit(data, algorithm_settings=algo_settings)

    # Scalar parameters: direct tolerance comparison
    self.assertAlmostEqual(leaspy.model.parameters['tau_mean'], 70.3955, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['tau_std'], 2.2052, delta=0.01)
    self.assertAlmostEqual(leaspy.model.parameters['xi_mean'], -3.1508, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['xi_std'], 0.1296, delta=0.001)
    self.assertAlmostEqual(leaspy.model.parameters['g'], 1.0097, delta=0.001)

    # Vector parameters: squared L2 distance to reference tensors must be ~0
    diff_noise = leaspy.model.parameters['noise_std'] - torch.tensor([0.1917, 0.2906, 0.2802, 0.2785])
    diff_deltas = leaspy.model.parameters['deltas'] - torch.tensor([-0.0372, -0.0024, -0.0329])
    self.assertAlmostEqual(torch.sum(diff_noise**2).item(), 0.0, delta=0.01)
    self.assertAlmostEqual(torch.sum(diff_deltas ** 2).item(), 0.0, delta=0.01)
def get_individual_parameters(data):
    """
    Personalize a stored model to the given request payload and return the
    individual parameters of the single patient plus the raw score matrix.

    Parameters
    ----------
    data : dict
        Request payload; must carry a 'model' entry loadable by Leaspy.load
        and observation data accepted by convert_data.

    Returns
    -------
    dict
        {'individual_parameters': ..., 'scores': list of per-feature value lists}
    """
    # Data
    leaspy_data = convert_data(data)

    # Algorithm
    settings = AlgorithmSettings('scipy_minimize')

    # Leaspy
    # NOTE(review): dead alternative kept from a previous model-dispatch attempt
    #leaspy = Leaspy.load(data['model'])
    # TO CORRECT
    #if data['model']['name'] == 'logistic_parallel':
    leaspy = Leaspy.load(data['model'])
    #elif data['model']['name'] == 'logistic':
    #    leaspy = Leaspy.load(os.path.join(os.getcwd(), 'data', 'example', 'parkinson_model.json'))

    individual_parameters = leaspy.personalize(leaspy_data, settings=settings)

    output = {
        'individual_parameters' : individual_parameters["patient"],
        # Transpose so each inner list holds one feature's values across visits
        'scores': leaspy_data.to_dataframe().values.T.tolist()
    }
    return output
raise NotImplementedError( "Other resampling methods than RepCV not yet implemented") #%% Run Leaspy # Data as list data_iter = [] for cv_iter in range(n_folds * n_rep): df_split = df.loc[resampling_indices[cv_iter][0]] data = Data.from_dataframe(df_split.reset_index()) data_iter.append(data) # Also settings as list algo_settings_iter = [] algo_settings = AlgorithmSettings('mcmc_saem', n_iter=n_iter, initialization_method="random", seed=seed) for i in range(n_rep * n_folds): algo_settings_iter.append(algo_settings) # Save experiment infos df.to_csv(os.path.join(output_directory, "df.csv")) with open(os.path.join(path_output, "resampling_indices.json"), "w") as json_file: json.dump(resampling_indices, json_file) algo_settings.save(os.path.join(path_output, "calibrate", "algo_settings.json")) def leaspy_factory(i): ll = Leaspy(leaspy_model)
def test_simulation_run(self):
    """
    Test if the simulation run properly with different settings.

    Runs simulate under several configurations (kde sources, normal sources,
    feature bounds, reparametrized age bounds) and finally checks that the
    simulated baseline ages respect the requested bounds up to a tolerance.
    """
    leaspy_session, individual_parameters, data = self.test_check_cofactors(
        get_result=True)

    # Default kde-based sources
    settings = AlgorithmSettings('simulation', seed=0, number_of_subjects=1000,
                                 mean_number_of_visits=3, std_number_of_visits=0,
                                 sources_method="full_kde", bandwidth_method=.2)
    new_results = leaspy_session.simulate(
        individual_parameters, data, settings)  # just test if run without error

    # Normally-distributed sources
    settings = AlgorithmSettings('simulation', seed=0, number_of_subjects=1000,
                                 mean_number_of_visits=3, std_number_of_visits=0,
                                 sources_method="normal_sources", bandwidth_method=.2)
    new_results = leaspy_session.simulate(
        individual_parameters, data, settings)  # just test if run without error

    # Bounds derived automatically from the features
    settings = AlgorithmSettings(
        'simulation', seed=0, number_of_subjects=1000, mean_number_of_visits=3,
        std_number_of_visits=0, sources_method="full_kde", bandwidth_method=.2,
        features_bounds=True)  # idem + test scores bounds
    # self.test_bounds_behaviour(leaspy_session, results, settings)

    # Explicit per-feature bounds, checked by the _bounds_behaviour helper
    bounds = {
        'Y0': (0., .5),
        'Y1': (0., .1),
        'Y2': (0., .1),
        'Y3': (0., .1)
    }
    settings = AlgorithmSettings(
        'simulation', seed=0, number_of_subjects=1000, mean_number_of_visits=3,
        std_number_of_visits=0, sources_method="full_kde", bandwidth_method=.2,
        features_bounds=bounds)  # idem + test scores bounds
    self._bounds_behaviour(leaspy_session, individual_parameters, data, settings)

    # Constrain the reparametrized ages of the simulated subjects
    settings = AlgorithmSettings('simulation', seed=0, number_of_subjects=200,
                                 mean_number_of_visits=3, std_number_of_visits=0,
                                 sources_method="full_kde", bandwidth_method=.2,
                                 reparametrized_age_bounds=(65, 75))
    new_results = leaspy_session.simulate(
        individual_parameters, data, settings)  # just test if run without error
    # Test if the reparametrized ages are within (65, 75) up to a tolerance of 2.
    # Reparametrization: exp(xi) * (age - tau) + tau_mean, applied to baseline visits
    repam_age = new_results.data.to_dataframe().groupby(
        'ID').first()['TIME'].values
    repam_age -= new_results.individual_parameters['tau'].squeeze().numpy()
    repam_age *= np.exp(
        new_results.individual_parameters['xi'].squeeze().numpy())
    repam_age += leaspy_session.model.parameters['tau_mean'].item()
    self.assertTrue(all(repam_age > 63) & all(repam_age < 77))
def setUp(self):
    """Create fresh default simulation settings and algorithm for each test."""
    self.settings = AlgorithmSettings('simulation')
    self.algo = SimulationAlgorithm(self.settings)
def test_usecase(self):
    """
    Functional test of a basic analysis using leaspy package

    1 - Data loading
    2 - Fit logistic model with MCMC algorithm
    3 - Save paramaters & reload (remove created files to keep the repo clean)
    4 - Personalize model with 'mode_real' algorithm
    5 - Plot results
    6 - Simulate new patients
    """
    data = Data.from_csv_file(example_data_path)

    # Fit
    algo_settings = AlgorithmSettings('mcmc_saem', n_iter=10, seed=0)
    leaspy = Leaspy("logistic")
    leaspy.model.load_hyperparameters({'source_dimension': 2})
    leaspy.fit(data, algorithm_settings=algo_settings)
    self.model_values_test(leaspy.model)

    # Save parameters and check its consistency
    path_to_saved_model = os.path.join(test_data_dir, 'model_parameters',
                                       'test_api-copy.json')
    leaspy.save(path_to_saved_model)

    with open(
            os.path.join(test_data_dir, "model_parameters",
                         'test_api.json'), 'r') as f1:
        model_parameters = json.load(f1)
    with open(path_to_saved_model) as f2:
        model_parameters_new = json.load(f2)
    # self.assertTrue(ordered(model_parameters) == ordered(model_parameters_new))
    self.assertTrue(
        dict_compare_and_display(model_parameters, model_parameters_new))

    # Load data and check its consistency
    leaspy = Leaspy.load(path_to_saved_model)
    os.remove(path_to_saved_model)
    self.assertTrue(leaspy.model.is_initialized)
    self.model_values_test(leaspy.model)

    # Personalize
    algo_personalize_settings = AlgorithmSettings('mode_real', seed=0)
    individual_parameters = leaspy.personalize(
        data, settings=algo_personalize_settings)
    # TODO REFORMAT: compute the noise std afterwards
    #self.assertAlmostEqual(result.noise_std, 0.21146, delta=0.01)

    ## Plot TODO
    #path_output = os.path.join(os.path.dirname(__file__), '../../_data', "_outputs")
    #plotter = Plotter(path_output)
    # plotter.plot_mean_trajectory(leaspy.model, save_as="mean_trajectory_plot")
    #plt.close()

    # Simulate
    simulation_settings = AlgorithmSettings('simulation', seed=0)
    simulation_results = leaspy.simulate(individual_parameters, data,
                                         simulation_settings)

    # isinstance (not type ==) is the idiomatic type check; assertEqual gives
    # an informative diff on header mismatch.
    self.assertTrue(isinstance(simulation_results, Result))
    self.assertEqual(simulation_results.data.headers, data.headers)
    n = simulation_settings.parameters['number_of_subjects']
    self.assertEqual(simulation_results.data.n_individuals, n)
    self.assertEqual(
        len(simulation_results.get_parameter_distribution('xi')), n)
    self.assertEqual(
        len(simulation_results.get_parameter_distribution('tau')), n)
    self.assertEqual(
        len(
            simulation_results.get_parameter_distribution('sources')
            ['sources0']), n)
    # simulation_results.data.to_dataframe().to_csv(os.path.join(
    #     test_data_dir, "_outputs/simulation/test_api_simulation_df-post_merge-result_fix.csv"), index=False)

    # Test the reproducibility of simulate
    # round is necessary, writing and reading induces numerical errors of magnitude ~ 1e-13
    # BUT ON DIFFERENT MACHINE I CAN SEE ERROR OF MAGNITUDE 1e-5 !!!
    # TODO: Can we improve this??
    simulation_df = pd.read_csv(
        os.path.join(
            test_data_dir,
            "_outputs/simulation/test_api_simulation_df-post_merge-result_fix.csv"
        ))
    id_simulation_is_reproducible = simulation_df['ID'].equals(
        simulation_results.data.to_dataframe()['ID'])
    # Check ID before - str doesn't seem to work with numpy.allclose
    self.assertTrue(id_simulation_is_reproducible)

    round_decimal = 5
    simulation_is_reproducible = allclose(
        simulation_df.loc[:, simulation_df.columns != 'ID'].values,
        simulation_results.data.to_dataframe().
        loc[:, simulation_results.data.to_dataframe().columns != 'ID'].values,
        atol=10**(-round_decimal),
        rtol=10**(-round_decimal))
    # Use of numpy.allclose instead of pandas.testing.assert_frame_equal because of buggy behaviour reported
    # in https://github.com/pandas-dev/pandas/issues/22052

    # If reproducibility error > 1e-5 => display it + visit with the biggest reproducibility error
    error_message = ''
    if not simulation_is_reproducible:
        max_diff = 0.
        value_v1 = 0.
        value_v2 = 0.
        count = 0
        tol = 10**(-round_decimal)
        actual_simu_df = simulation_results.data.to_dataframe()
        for v1, v2 in zip(
                simulation_df.loc[:, simulation_df.columns != 'ID'].values.
                tolist(),
                actual_simu_df.
                loc[:, actual_simu_df.columns != 'ID'].values.tolist()):
            diff = [abs(val1 - val2) for val1, val2 in zip(v1, v2)]
            if max(diff) > tol:
                count += 1
            if max(diff) > max_diff:
                value_v1 = v1
                value_v2 = v2
                max_diff = max(diff)
        error_message += '\nTolerance error = %.1e' % tol
        error_message += '\nMaximum error = %.3e' % max_diff
        error_message += '\n' + str(
            [round(v, round_decimal + 1) for v in value_v1])
        error_message += '\n' + str(
            [round(v, round_decimal + 1) for v in value_v2])
        error_message += '\nNumber of simulated visits above tolerance error = %d / %d \n' \
            % (count, simulation_df.shape[0])
    # For loop before the last self.assert - otherwise no display is made
    self.assertTrue(simulation_is_reproducible, error_message)