def test_flatten_model(self):
    """flatten_model returns a pandas.Series with all the params to recreate a model."""
    # Setup: fit a Gaussian copula on the 3x3 identity matrix.
    model = GaussianMultivariate()
    model.fit(np.eye(3))

    expected_result = pd.Series({
        'covariance__0__0': 1.5000000000000004,
        'covariance__1__0': -0.7500000000000003,
        'covariance__1__1': 1.5000000000000004,
        'covariance__2__0': -0.7500000000000003,
        'covariance__2__1': -0.7500000000000003,
        'covariance__2__2': 1.5000000000000007,
        'distribs__0__mean': 0.33333333333333331,
        'distribs__0__std': -0.7520386983881371,
        'distribs__1__mean': 0.33333333333333331,
        'distribs__1__std': -0.7520386983881371,
        'distribs__2__mean': 0.33333333333333331,
        'distribs__2__std': -0.7520386983881371,
    })

    # The data navigator is irrelevant to flattening, so a mock suffices.
    modeler = Modeler(MagicMock())

    # Run
    result = modeler.flatten_model(model)

    # Check
    assert np.isclose(result, expected_result).all()
def test_fit_sample_distribution_dict(self):
    """Fitting with a per-column distribution dict still samples correctly."""
    dataset = sample_trivariate_xyz()
    copula = GaussianMultivariate(distribution={'x': GaussianKDE()})
    copula.fit(dataset)

    samples = copula.sample(10)

    assert samples.shape == (10, 3)
def fit_copula_to_z2_data(name=None, do_plot=False):
    """
    Example of fitting a copula to z2 stream data

    :param name: stream name
    :param do_plot: if True, scatter-plot the synthetic sample
    :return: copula obj
    """
    if name is None:
        name = 'z2~helicopter_psi~helicopter_theta~70.json'
    assert 'z2~' in name, "Expecting a bivariate stream"

    # Map the lagged z-curve values back to (roughly) normal bivariate points.
    lagged_values = get_stream_lagged_values(name=name)
    normalized_points = []
    for z in lagged_values:
        normalized_points.append(mr.norminv(mr.from_zcurve(zvalue=z, dim=2)))
    npitch, nyaw = zip(*normalized_points)

    # Fit a Gaussian copula to the (pitch, yaw) pairs.
    copula = GaussianMultivariate()
    observations = np.array([npitch, nyaw]).transpose()
    copula.fit(observations)

    synthetic_points = copula.sample(len(observations))
    spitch = synthetic_points[0]
    syaw = synthetic_points[1]
    if do_plot:
        plt.scatter(spitch, syaw)
        plt.xlabel('Simulated Pitch - normalized')
        plt.ylabel('Simulated Yaw - normalized')
        plt.show()
    return copula
def test_fit_sample_distribution_name(self):
    """Passing a distribution as a fully-qualified class path works end to end."""
    dataset = sample_trivariate_xyz()
    copula = GaussianMultivariate(
        'copulas.univariate.gaussian_kde.GaussianKDE')
    copula.fit(dataset)

    samples = copula.sample(10)

    assert samples.shape == (10, 3)
def __init__(self, load, mode, pv_connection, ev_connection, ev_max_connection, pld_pred):
    """Store the configuration and prepare an (unfitted) Gaussian copula.

    Args:
        load: load profile; coerced to a pandas.DataFrame.
        mode: operating mode flag (stored as-is).
        pv_connection: PV connection parameter (stored as-is).
        ev_connection: EV connection parameter (stored as-is).
        ev_max_connection: maximum EV connection parameter (stored as-is).
        pld_pred: PLD prediction data (stored as-is).
    """
    # Normalize the load input to a DataFrame regardless of the container type.
    self.load = pd.DataFrame(load)
    self.mode = mode
    self.pv_connection = pv_connection
    self.ev_connection = ev_connection
    self.ev_max_connection = ev_max_connection
    self.pld_pred = pld_pred
    # Copula is created here but only fitted later.
    self.copula = GaussianMultivariate()
def test_gaussiankde_arguments(self):
    """Keyword arguments are forwarded to the underlying GaussianKDE."""
    size = 1000
    # Discrete uniform values plus narrow Gaussian noise.
    data = randint.rvs(0, 9, size=size) + norm.rvs(0, 0.1, size=size)

    dist = GaussianMultivariate(distribution=GaussianKDE(bw_method=0.01))
    dist.fit(data)
    samples = dist.sample(size).to_numpy()[0]

    # A two-sample KS test should not reject equality of the distributions.
    d, p = ks_2samp(data, samples)
    assert p >= 0.05
def test_get_instance_instance_fitted(self):
    """Try to get a new instance from a fitted instance"""
    # Run: fit an instance, then request a fresh one from it.
    fitted = GaussianMultivariate()
    fitted.fit(pd.DataFrame({'a_field': list(range(10))}))
    instance = get_instance(fitted)

    # Asserts: the new instance is unfitted but shares the class.
    assert not instance.fitted
    assert isinstance(instance, GaussianMultivariate)
def test_fit_sample_distribution_dict_multiple(self):
    """A distribution dict may mix parametric, beta and KDE univariates."""
    dataset = sample_trivariate_xyz()
    per_column = {
        'x': Univariate(parametric=ParametricType.PARAMETRIC),
        'y': BetaUnivariate(),
        'z': GaussianKDE(),
    }
    copula = GaussianMultivariate(distribution=per_column)
    copula.fit(dataset)

    samples = copula.sample(10)

    assert samples.shape == (10, 3)
def fit(self, table_data):
    """Fit the model to the table.

    Impute the table data before fit the model.

    Args:
        table_data (pandas.DataFrame):
            Data to be fitted.
    """
    # Missing values are imputed first; the copula cannot handle NaNs.
    imputed = impute(table_data)
    model = GaussianMultivariate(distribution=self.distribution)
    model.fit(imputed)
    self.model = model
def get_CB_preds(errors, pred1, pred2, time_serie, copulas_family, method, error_type):
    """Sample synthetic errors from a fitted copula and build the combined forecast.

    Args:
        errors: paired forecast errors used to fit the copula.
        pred1, pred2: the two individual forecasts to be corrected.
        time_serie: series identifier passed through to the combiner.
        copulas_family: 'gumbel' selects a Gumbel copula, anything else Gaussian.
        method: combination method passed through to the combiner.
        error_type: error metric passed through to the combiner.

    Returns:
        The combined forecast produced by ``get_combined_forecast``.
    """
    is_gumbel = copulas_family == 'gumbel'
    if is_gumbel:
        # Gumbel fitting expects uniformised margins.
        errors = uniformise_normal_data(errors)
        copula = Gumbel()
    else:
        copula = GaussianMultivariate()

    copula.fit(errors)
    synthetic_errors = copula.sample(len(errors))

    if is_gumbel:
        # Gumbel sampling yields an array; index positionally.
        cb_pred1 = pred1 - synthetic_errors[:, 0]
        cb_pred2 = pred2 - synthetic_errors[:, 1]
    else:
        # GaussianMultivariate sampling yields a DataFrame; use iloc.
        cb_pred1 = pred1 - synthetic_errors.iloc[:, 0]
        cb_pred2 = pred2 - synthetic_errors.iloc[:, 1]

    return get_combined_forecast(cb_pred1, cb_pred2, time_serie, error_type, method)
def fit_and_sample(lagged_zvalues: [[float]], num: int, copula=None):
    """
    Example of fitting a copula function, and sampling

       lagged_zvalues: [ [z1,z2,z3] ] distributed N(0,1) margins, roughly
       copula : Something from https://pypi.org/project/copulas/
       returns: [ [z1, z2, z3] ] representative sample

    """
    # Remark: It's lazy to just sample synthetic data; some more evenly
    # spaced sampling would be preferable.
    # See https://www.microprediction.com/blog/lottery for discussion
    frame = pd.DataFrame(data=lagged_zvalues)
    if copula is None:
        copula = GaussianMultivariate()
    copula.fit(frame)
    return copula.sample(num).values.tolist()
def test_get_instance_instance(self):
    """Try to get a new instance from a instance"""
    # Run: request a fresh instance from an existing (unfitted) one.
    instance = get_instance(GaussianMultivariate())

    # Asserts
    assert not instance.fitted
    assert isinstance(instance, GaussianMultivariate)
def test_cdf(self):
    """CDF values lie in [0, 1] and are non-decreasing along each column."""
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)
    sampled_data = model.sample(10)

    # Test CDF
    cdf = model.cumulative_distribution(sampled_data)
    assert (0 <= cdf).all() and (cdf <= 1).all()

    # Test CDF increasing function: pin every other column to one row's
    # values and check the CDF grows along the sorted column.
    for target in sampled_data.columns:
        ordered = sampled_data.sort_values(target)
        others = data.columns.to_list()
        others.remove(target)
        pinned = ordered.sample(1).iloc[0]
        for other in others:
            ordered[other] = pinned[other]
        cdf = model.cumulative_distribution(ordered)
        # Add tolerance to avoid floating precision issues.
        assert (np.diff(cdf) + 0.001 >= 0).all()
def test_save_load(self):
    """A model saved to disk and loaded back gives the same pdf/cdf."""
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)
    sampled_data = model.sample(10)

    # Round-trip the model through a pickle file.
    path_to_model = os.path.join(self.test_dir.name, "model.pkl")
    model.save(path_to_model)
    restored = GaussianMultivariate.load(path_to_model)

    # Both models must agree (within tolerance) on density and CDF.
    pdf = model.probability_density(sampled_data)
    pdf2 = restored.probability_density(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))

    cdf = model.cumulative_distribution(sampled_data)
    cdf2 = restored.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))
def get_errors_sample(errors, copula="Gumbel"):
    '''
    Parameters
    ----------
    errors : numpy array of shape (n,2)
        The errors to fit with a copula
    copula : string
        "Gumbel": fit with a Gumbel copula
        "Normal": fit with a Normal copula

    Returns
    -------
    synthetic : a new sample of errors obtain with the JPD of the errors

    Raises
    ------
    ValueError
        If ``copula`` is neither "Gumbel" nor "Normal".
    '''
    # Transform the np.array to a dataframe.
    df = pd.DataFrame(errors, columns=['res1', 'res2'])

    # Transform the errors series such that there is no exact 0 or 1,
    # because those values lead to problems when using the copula.
    df = pd.DataFrame(np.where(df == 0, 0.00000001,
                               np.where(df == 1, 0.99999999, df)),
                      columns=df.columns)

    # Rescale to [0, 1]; inverted again after sampling.
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df.values), columns=df.columns)

    # Select the copula family used for fitting the errors series.
    if copula == "Gumbel":
        c = gumbel.Gumbel()
    elif copula == "Normal":
        c = GaussianMultivariate()
    else:
        # BUG FIX: an unknown family previously fell through with `c`
        # unbound, raising an opaque NameError below. Fail fast instead.
        raise ValueError("copula must be 'Gumbel' or 'Normal', got %r" % copula)

    # Fit the copula and generate a sample of the same size.
    c.fit(df.values)
    synthetic = c.sample(len(df))

    # Map the sample back to the original scale.
    synthetic = scaler.inverse_transform(synthetic)
    return synthetic
def test_get_instance_instance_distribution(self):
    """Try to get a new instance from a instance with distribution"""
    # Run: the source instance carries a distribution path.
    source = GaussianMultivariate(
        distribution='copulas.univariate.truncnorm.TruncNorm')
    instance = get_instance(source)

    # Asserts: the copy is unfitted and keeps the distribution setting.
    assert not instance.fitted
    assert isinstance(instance, GaussianMultivariate)
    assert instance.distribution == 'copulas.univariate.truncnorm.TruncNorm'
def test_to_dict_from_dict(self):
    """A model round-tripped through to_dict/from_dict gives the same pdf/cdf."""
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)
    sampled_data = model.sample(10)

    # Rebuild the model from its serialized parameters.
    restored = GaussianMultivariate.from_dict(model.to_dict())

    # Both models must agree (within tolerance) on density and CDF.
    pdf = model.probability_density(sampled_data)
    pdf2 = restored.probability_density(sampled_data)
    assert np.all(np.isclose(pdf, pdf2, atol=0.01))

    cdf = model.cumulative_distribution(sampled_data)
    cdf2 = restored.cumulative_distribution(sampled_data)
    assert np.all(np.isclose(cdf, cdf2, atol=0.01))
def fit_and_sample(lagged_zvalues: [[float]], num: int, copula=None):
    """
    Example of creating a "sample" of future values

       lagged_zvalues: [ [z1,z2,z3] ] distributed N(0,1) margins, roughly
       copula : Something from https://pypi.org/project/copulas/
       returns: [ [z1, z2, z3] ] representative sample

    Swap out this function for whatever you like.
    """
    # Remark 1: It's lazy to just sample synthetic data
    # Remark 2: Any multivariate density estimation could go here.
    # Remark 3: If you prefer uniform margin, use mw.get_lagged_copulas(name=name, count= 5000)
    # See https://www.microprediction.com/blog/lottery for discussion of this "game"
    frame = pd.DataFrame(data=lagged_zvalues)
    if copula is None:
        copula = GaussianMultivariate()
    copula.fit(frame)
    return copula.sample(num).values.tolist()
def testMITCopulas():
    """Smoke-test the MIT copulas package: fit, sample and plot."""
    import warnings
    warnings.filterwarnings('ignore')
    from copulas.datasets import sample_trivariate_xyz
    from copulas.multivariate import GaussianMultivariate
    from copulas.visualization import compare_3d

    # Load a dataset with 3 columns that are not independent
    real_data = sample_trivariate_xyz()

    # Fit a gaussian copula to the data
    model = GaussianMultivariate()
    model.fit(real_data)

    # Sample synthetic data of the same size
    synthetic_data = model.sample(len(real_data))

    # Plot the real and the synthetic data to compare
    compare_3d(real_data, synthetic_data)
    return True
def get_more_thermal_params(N=100, F_2x=3.84):
    """Draw N sets of thermal response parameters (d, q) for a 3-box model.

    Samples (d1, d2, q1) jointly from a pre-fitted CMIP6 copula, draws the
    remaining quantities (deep-ocean timescale, TCR, RWF) from truncated
    normal / lognormal priors, derives q[1], q[2] from the TCR/ECS constraint,
    and rejects parameter sets with non-positive q[2].

    Args:
        N (int): number of accepted parameter sets to return.
        F_2x (float): forcing from a doubling of CO2 (W/m^2) used in the
            TCR/ECS-to-q conversion.

    Returns:
        pandas.DataFrame: rows ['d', 'q'], columns 1..3, concatenated over N
        draws under keys 'therm0'..'therm{N-1}'.
    """
    from copulas.multivariate import GaussianMultivariate
    # Joint (d1, d2, q1) sampler fitted offline and shipped as a pickle.
    d1_d2_q1_copula = GaussianMultivariate.load(
        Path(__file__).parent / "./Parameter_Sets/d1_d2_q1_CMIP6_copula.pkl")
    # Oversample 10x so rejections below rarely exhaust the pool.
    d1_d2_q1_df = d1_d2_q1_copula.sample(10 * N)
    # Resample any rows containing a negative value until none remain.
    while (d1_d2_q1_df < 0).any(axis=1).sum() != 0:
        d1_d2_q1_df.loc[(d1_d2_q1_df < 0).any(axis=1)] = d1_d2_q1_copula.sample(
            (d1_d2_q1_df < 0).any(axis=1).sum()).values
    # NOTE(review): copula columns 'd2'/'d1' feed slots 2/3 respectively —
    # the apparent swap is presumably intentional; confirm against the
    # copula's fitting script.
    d2_samples = d1_d2_q1_df['d2'].values
    d3_samples = d1_d2_q1_df['d1'].values
    q3_samples = d1_d2_q1_df['q1'].values
    # Slowest response timescale: truncated normal, mean 283, sd 116, +/-2 sd.
    d1_samples = sp.stats.truncnorm(-2, 2, loc=283, scale=116).rvs(10 * N)
    # TCR lognormal prior; RWF truncated normal; ECS derived as TCR/RWF.
    TCR_samples = np.random.lognormal(np.log(2.5) / 2, np.log(2.5) / (2 * 1.645), 10 * N)
    RWF_samples = sp.stats.truncnorm(-2.75, 2.75, loc=0.582, scale=0.06).rvs(10 * N)
    ECS_samples = TCR_samples / RWF_samples
    d = np.array([d1_samples, d2_samples, d3_samples])
    # 70-year realised-warming fraction per box for timescale d.
    k = 1 - (d / 70) * (1 - np.exp(-70 / d))
    # Solve the 2x2 linear system tying (q1, q2) to TCR and ECS given q3.
    q = ((TCR_samples / F_2x - k[2] * q3_samples)[np.newaxis, :]
         - np.roll(k[:2], axis=0, shift=1) * (ECS_samples / F_2x - q3_samples)[np.newaxis, :]) \
        / (k[:2] - np.roll(k[:2], axis=0, shift=1))
    sample_df = pd.DataFrame(index=['d', 'q'], columns=[1, 2, 3]).apply(pd.to_numeric)
    df_list = []
    i = 0
    j = 0
    # Accept draws until N sets pass the q2 > 0 physicality check.
    while j < N:
        curr_df = sample_df.copy()
        curr_df.loc['d'] = d[:, i]
        curr_df.loc['q', 3] = q3_samples[i]
        curr_df.loc['q', [1, 2]] = q[:, i]
        if curr_df.loc['q', 2] <= 0:
            i += 1
            continue
        df_list += [curr_df]
        j += 1
        i += 1
    thermal_params = pd.concat(df_list, axis=1,
                               keys=['therm' + str(x) for x in np.arange(N)])
    return thermal_params
def fit_and_sample(lagged_zvalues: [[float]], num: int, copula=None):
    """
    Example of fitting a copula function, and sampling

       lagged_zvalues: [ [z1,z2,z3] ] distributed N(0,1) margins, roughly
       copula : Something from https://pypi.org/project/copulas/
       returns: [ [z1, z2, z3] ] representative sample

    """
    # This is the part you'll want to change.
    # Remark 1: It's lazy to just sample synthetic data; some more evenly
    # spaced sampling would be preferable.
    # Remark 2: Any multivariate density estimation could go here.
    # Remark 3: If you want to literally fit to a Copula (i.e. roughly uniform
    # margins) then you might want to use mw.get_lagged_copulas(name=name, count= 5000)
    # instead.
    # See https://www.microprediction.com/blog/lottery for discussion of why
    # evenly spaced samples are likely to serve you better.
    frame = pd.DataFrame(data=lagged_zvalues)
    if copula is None:
        copula = GaussianMultivariate()
    copula.fit(frame)
    return copula.sample(num).values.tolist()
def test_pdf(self):
    """The probability density is strictly positive on sampled points."""
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)
    sampled_data = model.sample(10)

    # Test PDF
    pdf = model.probability_density(sampled_data)
    assert (pdf > 0).all()
def set_parameters(self, parameters):
    """Set copula model parameters.

    Add additional keys after unflatte the parameters in order
    to set expected parameters for the copula.

    Args:
        dict:
            Copula flatten parameters.
    """
    unflattened = unflatten_dict(parameters)
    # Restore keys that were dropped during flattening.
    unflattened.setdefault('fitted', True)
    unflattened.setdefault('distribution', self.distribution)

    model_params = self._unflatten_gaussian_copula(unflattened)
    self.model = GaussianMultivariate.from_dict(model_params)
def test_fit_sample(self):
    """Fit on real data, then sample frames of the requested sizes."""
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)

    # Sample sizes are honoured exactly.
    for size in (10, 50, 100):
        assert len(model.sample(size)) == size

    # The sampled frame keeps every original column.
    sampled_data = model.sample(10)
    assert sampled_data.shape == (10, 3)
    for column in data.columns:
        assert column in sampled_data
def copula_based(X, Y):
    """
    Calculate joint PDF/CDF using copula
    """
    import pandas as pd
    from copulas.multivariate import GaussianMultivariate

    # fit gaussian copula to the paired observations
    frame = pd.DataFrame(list(zip(X, Y)), columns=['P', 'T'])
    dist = GaussianMultivariate()
    dist.fit(frame)

    # Evaluate at the mean point: draw one sample row and overwrite it.
    point = dist.sample(1)
    point.at[0, 'P'] = np.mean(X)
    point.at[0, 'T'] = np.mean(Y)

    # find pdf/cdf at mean value
    pdf = dist.pdf(point)
    cdf = dist.cumulative_distribution(point)
    return [pdf, cdf]
def test_cdf(self):
    """CDF values lie in [0, 1] and are non-decreasing along each column.

    Uses inclusive bounds and a small tolerance on the monotonicity check,
    matching the tolerant variant of this test elsewhere in the suite:
    strict ``0 < cdf < 1`` and raw ``np.diff(cdf) >= 0`` can fail purely
    from floating point precision.
    """
    data = sample_trivariate_xyz()
    model = GaussianMultivariate()
    model.fit(data)
    sampled_data = model.sample(10)

    # Test CDF: the CDF may legitimately reach 0 or 1, so bounds are inclusive.
    cdf = model.cumulative_distribution(sampled_data)
    assert (0 <= cdf).all() and (cdf <= 1).all()

    # Test CDF increasing function
    for column in sampled_data.columns:
        sorted_data = sampled_data.sort_values(column)
        other_columns = data.columns.to_list()
        other_columns.remove(column)
        row = sorted_data.sample(1).iloc[0]
        for other in other_columns:
            sorted_data[other] = row[other]
        cdf = model.cumulative_distribution(sorted_data)
        # Add tolerance to avoid floating precision issues.
        diffs = np.diff(cdf) + 0.001
        assert (diffs >= 0).all()
def test_conditional_sampling():
    """Conditioning on column values shifts the sampled means accordingly."""
    condition = np.random.randint(1, 4, size=3000)
    conditioned = np.random.normal(loc=1, scale=1, size=3000) * condition
    data = pd.DataFrame({
        'a': condition,
        'b': condition,
        'c': conditioned,
    })

    gm = GaussianMultivariate()
    gm.fit(data)

    # Conditioning on b == 1 should pull every column's mean towards 1.
    sampled = gm.sample(3000, conditions={'b': 1})
    for col in ('a', 'b', 'c'):
        np.testing.assert_allclose(sampled[col].mean(), 1, atol=.5)

    # Conditioning on a == b == 3 should pull every column's mean towards 3.
    sampled = gm.sample(3000, conditions={'a': 3, 'b': 3})
    for col in ('a', 'b', 'c'):
        np.testing.assert_allclose(sampled[col].mean(), 3, atol=.5)
#print(type(XR)) ########################################copula######################################### conv = [] scales = [] As = [] for i in range(len(XL)): sbl = XL[i].flatten() sbr = XR[i].flatten() conc = np.empty((sbl.shape[0], 2)) conc[:, 0] = sbr[:] conc[:, 1] = sbl[:] #print(conc.shape) copula = GaussianMultivariate(distribution=GammaUnivariate) copula.fit(conc) XX = np.array(copula.to_dict()['covariance']) xx = XX.flatten() conv.append(xx) UNI = copula.to_dict()['univariates'][0] #avec les sbr de droite scales.append(UNI["scale"]) As.append(UNI["a"]) A = np.array(As) B = np.array(scales) C = np.array(conv) distribution = np.empty((A.shape[0], 6)) distribution[:, 0] = A distribution[:, 1] = B distribution[:, 2: 6] = C # distribution[shape scale c[0] c[1] c[2] c[3] ] pour chaque sb
class GaussianCopula(SDVModel):
    """Model wrapping ``copulas.multivariate.GaussianMultivariate`` copula.

    Args:
        distribution (copulas.univariate.Univariate or str):
            Copulas univariate distribution to use.

    Example:
        The example below shows simple usage case where a ``GaussianMultivariate``
        is being created and its ``fit`` and ``sample`` methods are being called.

        >>> model = GaussianMultivariate()
        >>> model.fit(pd.DataFrame({'a_field': list(range(10))}))
        >>> model.sample(5)
            a_field
        0  4.796559
        1  7.395329
        2  7.400417
        3  2.794212
        4  1.925887
    """

    # Default univariate distribution when none is supplied.
    DISTRIBUTION = GaussianUnivariate
    # distribution: configured univariate distribution (class, instance or path).
    distribution = None
    # model: fitted GaussianMultivariate instance; None until fit() is called.
    model = None

    def __init__(self, distribution=None):
        self.distribution = distribution or self.DISTRIBUTION

    def fit(self, table_data):
        """Fit the model to the table.

        Impute the table data before fit the model.

        Args:
            table_data (pandas.DataFrame):
                Data to be fitted.
        """
        table_data = impute(table_data)
        self.model = GaussianMultivariate(distribution=self.distribution)
        self.model.fit(table_data)

    def sample(self, num_samples):
        """Sample ``num_samples`` rows from the model.

        Args:
            num_samples (int):
                Amount of rows to sample.

        Returns:
            pandas.DataFrame:
                Sampled data with the number of rows specified in ``num_samples``.
        """
        return self.model.sample(num_samples)

    def get_parameters(self):
        """Get copula model parameters.

        Compute model ``covariance`` and ``distribution.std``
        before it returns the flatten dict.

        Returns:
            dict:
                Copula flatten parameters.
        """
        values = list()
        # Keep only the lower triangle of the covariance (it is symmetric),
        # stored as a ragged list of rows of increasing length.
        triangle = np.tril(self.model.covariance)

        for index, row in enumerate(triangle.tolist()):
            values.append(row[:index + 1])

        self.model.covariance = np.array(values)
        params = self.model.to_dict()
        univariates = dict()
        for name, univariate in zip(params.pop('columns'), params['univariates']):
            univariates[name] = univariate
            if 'scale' in univariate:
                # Log-transform the scale so a later sampled value can be
                # mapped back to a positive number via exp().
                scale = univariate['scale']
                if scale == 0:
                    scale = EPSILON

                univariate['scale'] = np.log(scale)

        params['univariates'] = univariates

        return flatten_dict(params)

    def _prepare_sampled_covariance(self, covariance):
        """Prepare a covariance matrix.

        Args:
            covariance (list):
                covariance after unflattening model parameters.

        Result:
            list[list]:
                symmetric Positive semi-definite matrix.
        """
        # Mirror the lower triangle onto the upper one without doubling
        # the diagonal, then repair positive-definiteness if needed.
        covariance = np.array(square_matrix(covariance))
        covariance = (covariance + covariance.T
                      - (np.identity(covariance.shape[0]) * covariance))

        if not check_matrix_symmetric_positive_definite(covariance):
            covariance = make_positive_definite(covariance)

        return covariance.tolist()

    def _unflatten_gaussian_copula(self, model_parameters):
        """Prepare unflattened model params to recreate Gaussian Multivariate instance.

        The preparations consist basically in:

            - Transform sampled negative standard deviations from distributions into positive
              numbers

            - Ensure the covariance matrix is a valid symmetric positive-semidefinite matrix.

            - Add string parameters kept inside the class (as they can't be modelled),
              like ``distribution_type``.

        Args:
            model_parameters (dict):
                Sampled and reestructured model parameters.

        Returns:
            dict:
                Model parameters ready to recreate the model.
        """
        univariate_kwargs = {'type': model_parameters['distribution']}

        columns = list()
        univariates = list()
        for column, univariate in model_parameters['univariates'].items():
            columns.append(column)
            univariate.update(univariate_kwargs)
            # Invert the log-transform applied in get_parameters().
            univariate['scale'] = np.exp(univariate['scale'])
            univariates.append(univariate)

        model_parameters['univariates'] = univariates
        model_parameters['columns'] = columns

        covariance = model_parameters.get('covariance')
        model_parameters['covariance'] = self._prepare_sampled_covariance(covariance)

        return model_parameters

    def set_parameters(self, parameters):
        """Set copula model parameters.

        Add additional keys after unflatte the parameters in order
        to set expected parameters for the copula.

        Args:
            dict:
                Copula flatten parameters.
        """
        parameters = unflatten_dict(parameters)
        parameters.setdefault('fitted', True)
        parameters.setdefault('distribution', self.distribution)

        parameters = self._unflatten_gaussian_copula(parameters)
        self.model = GaussianMultivariate.from_dict(parameters)
from botocore import UNSIGNED from botocore.client import Config from scipy.stats import ks_2samp from copulas import get_instance from copulas.multivariate import GaussianMultivariate, VineCopula from copulas.univariate import GaussianUnivariate LOGGER = logging.getLogger(__name__) BUCKET_NAME = 'atm-data' # Bucket where the datasets are stored DATA_URL = 'http://{}.s3.amazonaws.com/'.format(BUCKET_NAME) AVAILABLE_MODELS = { 'GaussianMultivariate(GaussianUnivariate)': GaussianMultivariate(GaussianUnivariate), 'GaussianMultivariate()': GaussianMultivariate(), 'VineCopula("center")': VineCopula('center'), 'VineCopula("direct")': VineCopula('direct'), 'VineCopula("regular")': VineCopula('regular') } OUTPUT_COLUMNS = [ 'model_name', 'dataset_name', 'num_columns', 'num_rows', 'elapsed_time',