Code example #1 — file: copularnd.py, project: xiyueyiwan/copula-py
def _t(M, Rho, nu):
    """Draw ``M`` samples from a Student's t copula.

    A zero-mean multivariate t with correlation matrix ``Rho`` and ``nu``
    degrees of freedom is sampled, then each marginal is pushed through the
    univariate t CDF (probability-integral transform), producing uniforms
    that carry the t dependence structure.
    """
    dim = Rho.shape[0]
    # centred multivariate t draws: shape (M, dim)
    draws = mvt.multivariate_t_rvs(np.zeros(dim), Rho, nu, M)
    # transform every coordinate to Uniform(0, 1) marginals
    return t.cdf(draws, nu)
 def _sample(self, N):
     """Draw ``N`` samples from the configured multivariate distribution.

     Uses ``self.Q_star`` as the covariance/scale matrix and a zero mean of
     dimension ``self.get_dimension()``.  Returns an array transposed so the
     first axis indexes components and the second indexes samples.

     :param N: (int) Number of samples to draw.
     :return: (np.array) Samples of shape (dimension, N).
     :raises Exception: If ``self.dist`` is neither 'normal' nor 't'.
     """
     if self.dist == 'normal':
         return np.random.multivariate_normal(np.zeros(self.get_dimension()), self.Q_star, N).T
     elif self.dist == 't':
         # NOTE(review): T_DIST_NU is a module-level constant defined elsewhere
         # in this project; presumably the fixed degrees of freedom — confirm.
         return mdist.multivariate_t_rvs(np.zeros(self.get_dimension()), self.Q_star, df=T_DIST_NU, n=N).T
     else:
         raise Exception('Unknown distribution: ' + str(self.dist))
Code example #3
def _t(M, Rho, nu):
    """Generate ``M`` observations from a t copula with correlation matrix
    ``Rho`` and ``nu`` degrees of freedom; marginals are Uniform(0, 1)."""
    mean = np.zeros(Rho.shape[0])  # copula sampling uses a centred t
    samples = mvt.multivariate_t_rvs(mean, Rho, nu, M)
    # map each coordinate through the t CDF to obtain uniform marginals
    U = t.cdf(samples, nu)
    return U
Code example #4
def time_series_from_dist(corr, t_samples=1000, dist="normal", deg_free=3):
    """
    Generates a time series from a given correlation matrix.

    It uses multivariate sampling from distributions to create the time series. It supports
    normal and student-t distributions. This method relies and acts as a wrapper for the
    `np.random.multivariate_normal` and
    `statsmodels.sandbox.distributions.multivariate.multivariate_t_rvs` modules.
    `<https://numpy.org/doc/stable/reference/random/generated/numpy.random.multivariate_normal.html>`_
    `<https://www.statsmodels.org/stable/sandbox.html?highlight=sandbox#module-statsmodels.sandbox>`_

    It is reproduced with modifications from the following paper:
    `Marti, G., Andler, S., Nielsen, F. and Donnat, P., 2016.
    Clustering financial time series: How long is enough?. arXiv preprint arXiv:1603.04017.
    <https://www.ijcai.org/Proceedings/16/Papers/367.pdf>`_

    :param corr: (np.array) Correlation matrix.
    :param t_samples: (int) Number of samples in the time series.
    :param dist: (str) Type of distributions to use.
        Can take the values ["normal", "student"].
    :param deg_free: (int) Degrees of freedom. Only used for student-t distribution.
    :return: (pd.DataFrame) The resulting time series of shape (t_samples, len(corr)) —
        one row per sample, one column per variable.
    :raises ValueError: If ``dist`` is not one of the supported values.
    """
    # Initialize means.
    means = np.zeros(len(corr))

    # Generate time series based on distribution.
    if dist == "normal":
        series = np.random.multivariate_normal(means, corr, t_samples)
    elif dist == "student":
        # The scale matrix is shrunk by (deg_free - 2) / deg_free — presumably so
        # that the t distribution's covariance (scale * df / (df - 2)) equals
        # `corr` itself.  Requires deg_free > 2; TODO confirm against callers.
        series = multivariate_t_rvs(means, corr * ((deg_free - 2) / deg_free), df=deg_free, n=t_samples)
    else:
        raise ValueError("{} is not supported".format(dist))

    return pd.DataFrame(series)
Code example #5
def run_rg():
  """Fit a radial-gaussianization (RG) model to multivariate Student's t data.

  Draws training/testing samples from a general multivariate t distribution,
  fits the RG transform, sanity-checks it on the training set, applies it to
  held-out data, synthesizes new samples under the inverse model, and plots
  2D projections comparing all of these against a Gaussian reference.

  Depends on module-level helpers ``fit_rg``, ``apply_rg``, ``invert_rg``,
  ``compute_var_kurt``, ``format_string`` and the imports ``multivariate``,
  ``KernelDensity``, ``plt``, ``itertools``, ``np``.
  """
  # parameters for sampled data
  num_samples_train = 10000
  num_samples_test = 10000
  num_components = 10
  # This "sigma" matrix parameterizes the elliptically-symmetric
  # distribution. For a gaussian ESD it is equivalent to the covariance matrix
  temp = np.random.multivariate_normal(np.ones(num_components),
                                       np.eye(num_components),
                                       size=num_components)
  sigma = np.dot(temp, temp.T) + np.diag(1e-5*np.ones(num_components))
  #^ create random positive definite matrix

  ########################################
  # Create a dataset to train the model on
  ########################################
  # *** our convention is that first dimension indexes components, and ***
  # *** second dimension indexes samples ***
  # Draw samples from a general multivariate student's t distribution
  t_samps = multivariate.multivariate_t_rvs(
      np.zeros(num_components), sigma, num_components/2, num_samples_train).T
  # center the data
  t_samps = t_samps - np.mean(t_samps, axis=1)[:, None]

  # Draw samples from a multivariate normal to compare against
  g_samps = np.random.multivariate_normal(
      np.zeros(num_components), sigma, size=num_samples_train).T
  # center the data
  g_samps = g_samps - np.mean(g_samps, axis=1)[:, None]


  ######################################
  # fit RG model to the training dataset
  ######################################
  g_func_fit, p_whitening = fit_rg(t_samps)

  # Apply the model to the training set as a sanity check
  rg_t_samps, scalings = apply_rg(t_samps, g_func_fit, p_whitening)

  # Invert the transformation to get back our original samples.
  reconstructed_t_samps = invert_rg(rg_t_samps, scalings, p_whitening)


  ##############################################
  # Create some testing data and apply the model
  ##############################################
  # Draw samples from the same general multivariate student's t distribution
  t_samps_test = multivariate.multivariate_t_rvs(
      np.zeros(num_components), sigma, num_components/2, num_samples_test).T
  # center the data
  t_samps_test = t_samps_test - np.mean(t_samps_test, axis=1)[:, None]

  rg_t_samps_test, scalings_test = apply_rg(t_samps_test, g_func_fit,
                                            p_whitening)
  reconstructed_t_samps_test = invert_rg(rg_t_samps_test, scalings_test,
                                         p_whitening)
  # we can actually synthesize new data too
  synth_rg_samps = np.random.multivariate_normal(
      np.zeros(num_components), np.eye(num_components), size=num_samples_test).T
  # We'll choose the scalings as samples from an estimate of the
  # appropriate scalings for the distribution we want to map to,
  # in this case the general multivariate student's T distribution.
  synth_use_KDE = False
  #^ We can either compute a kernel density estimate of the distribution of
  #  scalings, or we can just resample the scalings in our training set.
  #  Kernel density estimation can be slow and inaccurate, so for speed choose
  #  the resampling
  if synth_use_KDE:
    #### We could try to determine the best bandwidth
    # params = {'bandwidth': np.logspace(-1, 1, 20)}
    # grid = GridSearchCV(KernelDensity(), params)
    # grid.fit(scalings.reshape(-1, 1))
    # print("KDE estimate for scaling - best bandwidth: ",
    #       grid.best_estimator_.bandwidth)
    # kde = grid.best_estimator_
    #### or just plug something in
    kde = KernelDensity(bandwidth=0.127)
    # BUGFIX: `scalings` is 1-D (np.random.choice below requires 1-D), but
    # KernelDensity.fit expects a 2-D (n_samples, n_features) array — reshape
    # to a single-feature column, as the commented-out GridSearchCV code does.
    kde.fit(scalings.reshape(-1, 1))
    sampled_scalings = np.squeeze(kde.sample(num_samples_test))
  else:
    # NOTE(review): replace=False requires len(scalings) >= num_samples_test
    # (holds here since train and test sizes are equal) and yields a
    # permutation rather than a true bootstrap resample — confirm intent.
    sampled_scalings = np.random.choice(scalings, num_samples_test,
                                        replace=False)

  synth_samps = invert_rg(synth_rg_samps, sampled_scalings, p_whitening)


  #######################
  # Plotting some results
  #######################
  print("Plotting...")
  # we will use a few randomly-chosen 2D projections of our multivariate data
  # to visualize the distributions of our data
  if num_components < 2:
    raise ValueError('Univariate!')
  if num_components == 2:
    plotted_component_pairs = [[0, 1]]
  else:
    # pick 3 random pairs of components (rejecting duplicates in either order)
    plotted_component_pairs = []
    inds = np.arange(num_components)
    for _ in range(3):
      first_c = np.random.choice(inds)
      second_c = np.random.choice(np.delete(inds, first_c))
      while ([first_c, second_c] in plotted_component_pairs or
             [second_c, first_c] in plotted_component_pairs):
        second_c = np.random.choice(np.delete(inds, first_c))
      plotted_component_pairs.append([first_c, second_c])

  # gather variance and kurtosis stats for these pairs
  stats_t_train = {} # stats for the training set
  stats_t_test = {} # stats for the testing set
  stats_g = {} # stats for the gaussian reference set
  stats_synth = {} # stats for the synthetic data set
  for pair_idx in range(3):
    c1 = plotted_component_pairs[pair_idx][0]
    c2 = plotted_component_pairs[pair_idx][1]
    stats_t_train[pair_idx] = {
        'input': compute_var_kurt(t_samps[[c1, c2]]),
        'rg': compute_var_kurt(rg_t_samps[[c1, c2]]),
        'rec_input': compute_var_kurt(reconstructed_t_samps[[c1, c2]])}
    stats_t_test[pair_idx] = {
        'input': compute_var_kurt(t_samps_test[[c1, c2]]),
        'rg': compute_var_kurt(rg_t_samps_test[[c1, c2]]),
        'rec_input': compute_var_kurt(reconstructed_t_samps_test[[c1, c2]])}
    stats_g[pair_idx] = compute_var_kurt(g_samps[[c1, c2]])
    stats_synth[pair_idx] = {
        'rg': compute_var_kurt(synth_rg_samps[[c1, c2]]),
        'rec_input': compute_var_kurt(synth_samps[[c1, c2]])}

  # compare our training data to data from a gaussian distribution
  plt.figure(figsize=(10, 15))
  for pair_idx in range(3):
    temp_idx = (pair_idx * 2) + 1
    c1 = plotted_component_pairs[pair_idx][0]
    c2 = plotted_component_pairs[pair_idx][1]
    # fit a KDE for these points
    kde = KernelDensity(bandwidth=0.5)
    kde.fit(np.hstack((t_samps[c1, :][:, None], t_samps[c2, :][:, None])))
    # some plotting params: axis limits taken from the per-component variances
    kde_lim_x = [-stats_t_train[pair_idx]['input']['var_c1'],
                 stats_t_train[pair_idx]['input']['var_c1']]
    kde_lim_y = [-stats_t_train[pair_idx]['input']['var_c2'],
                 stats_t_train[pair_idx]['input']['var_c2']]
    kde_sampling_density_x = 100
    kde_sampling_density_y = 100
    x = np.linspace(kde_lim_x[0], kde_lim_x[1], kde_sampling_density_x)
    y = np.linspace(kde_lim_y[0], kde_lim_y[1], kde_sampling_density_y)
    grid_points = np.array(list(itertools.product(x, y[::-1])))
    # generate plot
    ax = plt.subplot(3, 2, temp_idx)
    estimated_pdf = np.exp(kde.score_samples(grid_points))
    ax.imshow(estimated_pdf.reshape((kde_sampling_density_y,
                                     kde_sampling_density_x), order='F'),
              extent=[kde_lim_x[0], kde_lim_x[1], kde_lim_y[0], kde_lim_y[1]],
              aspect='auto')
    ax.text(kde_lim_x[0], kde_lim_y[0],
            format_string(stats_t_train[pair_idx], 'input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='white', fontsize=7)
    ax.set_ylabel('Components {} & {}'.format(c1, c2))
    if pair_idx == 0:
      ax.set_title('Multivariate T (training_set)')

    # do the same thing for multivariate gaussian
    kde = KernelDensity(bandwidth=0.5)
    kde.fit(np.hstack((g_samps[c1, :][:, None], g_samps[c2, :][:, None])))
    # some plotting params
    kde_lim_x = [-stats_g[pair_idx]['var_c1'], stats_g[pair_idx]['var_c1']]
    kde_lim_y = [-stats_g[pair_idx]['var_c2'], stats_g[pair_idx]['var_c2']]
    kde_sampling_density_x = 100
    kde_sampling_density_y = 100
    x = np.linspace(kde_lim_x[0], kde_lim_x[1], kde_sampling_density_x)
    y = np.linspace(kde_lim_y[0], kde_lim_y[1], kde_sampling_density_y)
    grid_points = np.array(list(itertools.product(x, y[::-1])))
    # generate plot
    ax = plt.subplot(3, 2, temp_idx + 1)
    estimated_pdf = np.exp(kde.score_samples(grid_points))
    ax.imshow(estimated_pdf.reshape((kde_sampling_density_y,
                                     kde_sampling_density_x), order='F'),
              extent=[kde_lim_x[0], kde_lim_x[1], kde_lim_y[0], kde_lim_y[1]],
              aspect='auto')
    ax.text(kde_lim_x[0], kde_lim_y[0], format_string(stats_g[pair_idx]),
            horizontalalignment='left', verticalalignment='bottom',
            color='white', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Multivariate Gaussian')
    plt.suptitle('2D Projections of multivariate samples from ' +
                 'two different distributions')


  # For the training data, plot some 2D projections of the data in original
  # input space, in the radially-gaussianized space, and then reconstructed
  # in the input space
  plt.figure(figsize=(15, 15))
  for pair_idx in range(3):
    temp_idx = (pair_idx * 3) + 1
    c1 = plotted_component_pairs[pair_idx][0]
    c2 = plotted_component_pairs[pair_idx][1]
    ax = plt.subplot(3, 3, temp_idx)
    ax.scatter(t_samps[c1, :], t_samps[c2, :], s=1)
    ax.set_ylabel('Components {} & {}'.format(c1, c2))
    ax.text(np.min(t_samps[c1]), np.min(t_samps[c2]),
            format_string(stats_t_train[pair_idx], 'input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Samples in the input space')

    ax = plt.subplot(3, 3, temp_idx + 1)
    ax.scatter(rg_t_samps[c1, :], rg_t_samps[c2, :], s=1)
    ax.text(np.min(rg_t_samps[c1]), np.min(rg_t_samps[c2]),
            format_string(stats_t_train[pair_idx], 'rg'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Samples in the radially-gaussianized space')

    ax = plt.subplot(3, 3, temp_idx + 2)
    ax.scatter(reconstructed_t_samps[c1, :], reconstructed_t_samps[c2, :], s=1)
    ax.text(np.min(reconstructed_t_samps[c1]),
            np.min(reconstructed_t_samps[c2]),
            format_string(stats_t_train[pair_idx], 'rec_input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Reconstructed samples after RG inversion')
  plt.suptitle('Samples from the general Student\'s T distributed training set')

  # Do the same thing for the testing dataset
  plt.figure(figsize=(15, 15))
  for pair_idx in range(3):
    temp_idx = (pair_idx * 3) + 1
    c1 = plotted_component_pairs[pair_idx][0]
    c2 = plotted_component_pairs[pair_idx][1]
    ax = plt.subplot(3, 3, temp_idx)
    ax.scatter(t_samps_test[c1, :], t_samps_test[c2, :], s=1)
    ax.set_ylabel('Components {} & {}'.format(c1, c2))
    ax.text(np.min(t_samps_test[c1]), np.min(t_samps_test[c2]),
            format_string(stats_t_test[pair_idx], 'input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Samples in the input space')

    ax = plt.subplot(3, 3, temp_idx + 1)
    ax.scatter(rg_t_samps_test[c1, :], rg_t_samps_test[c2, :], s=1)
    ax.text(np.min(rg_t_samps_test[c1]), np.min(rg_t_samps_test[c2]),
            format_string(stats_t_test[pair_idx], 'rg'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Samples in the radially-gaussianized space')

    ax = plt.subplot(3, 3, temp_idx + 2)
    ax.scatter(reconstructed_t_samps_test[c1, :],
               reconstructed_t_samps_test[c2, :], s=1)
    ax.text(np.min(reconstructed_t_samps_test[c1]),
            np.min(reconstructed_t_samps_test[c2]),
            format_string(stats_t_test[pair_idx], 'rec_input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Reconstructed samples after RG inversion')
  plt.suptitle('Samples from the general Student\'s T distributed testing set')

  # Finally, show the synthesized samples
  plt.figure(figsize=(15, 15))
  for pair_idx in range(3):
    temp_idx = (pair_idx * 3) + 1
    c1 = plotted_component_pairs[pair_idx][0]
    c2 = plotted_component_pairs[pair_idx][1]
    ax = plt.subplot(3, 3, temp_idx)
    ax.scatter(t_samps[c1, :], t_samps[c2, :], s=1)
    ax.set_ylabel('Components {} & {}'.format(c1, c2))
    ax.text(np.min(t_samps[c1]), np.min(t_samps[c2]),
            format_string(stats_t_train[pair_idx], 'input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Training set samples in the input space')

    ax = plt.subplot(3, 3, temp_idx + 1)
    ax.scatter(synth_rg_samps[c1, :], synth_rg_samps[c2, :], s=1)
    ax.text(np.min(synth_rg_samps[c1]), np.min(synth_rg_samps[c2]),
            format_string(stats_synth[pair_idx], 'rg'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('Synthetic normally-distributed samples')

    ax = plt.subplot(3, 3, temp_idx + 2)
    ax.scatter(synth_samps[c1, :], synth_samps[c2, :], s=1)
    ax.text(np.min(synth_samps[c1]), np.min(synth_samps[c2]),
            format_string(stats_synth[pair_idx], 'rec_input'),
            horizontalalignment='left', verticalalignment='bottom',
            color='black', fontsize=7)
    if pair_idx == 0:
      ax.set_title('New, synthetic samples under the model')
  plt.suptitle('Generating synthetic data under the generative (inverse) model')

  plt.show()
Code example #6
def multivariate_t_rvs_change_order(n, m, S, df):
    """Argument-order adapter for ``multivariate_t_rvs``.

    Accepts the sample count ``n`` first and forwards ``(m, S, df, n)`` to
    the wrapped sampler unchanged.
    """
    samples = multivariate_t_rvs(m, S, df, n)
    return samples