コード例 #1
0
def _frank_PKTE(X):
    """
    Estimate the Frank copula dependency parameter for the data set X.

    The empirical Kendall's tau of X is computed with
    multivariate_stats.kendalls_tau and then mapped to the Frank parameter
    through invcopulastat; that value is returned unchanged.
    """
    empirical_tau = multivariate_stats.kendalls_tau(X)
    return invcopulastat('Frank', 'kendall', empirical_tau)
コード例 #2
0
def cvolume(family, u1v1, u1v2, u2v1, u2v2, *args):
    """
    Computes the C-Volume of a specified copula family with dependency parameter
    defined in the args.
      family - the copula type (compared case-insensitively), must be:
        'Gaussian'
        'T'
        'Clayton'
        'Frank'
        'Gumbel'
      u1v1 - a N x 2 matrix of values between [0,1] that represents the bottom
             left coordinate of the grid for which the C-Volume is desired
      u1v2 - a N x 2 matrix of values between [0,1] that represent the top
             left coordinate of the grid for which the C-Volume is desired
      u2v1 - a N x 2 matrix of values between [0,1] that represent the bottom
             right coordinate of the grid for which the C-volume is desired
      u2v2 - a N x 2 matrix of values between [0,1] that represents the top
             right coordinate of the grid for which the C-Volume is desired
      args - must be at least of length 2, for which the first element in args
             is expected to be a string which describes the dependency value
             being provided, must be one of the following:
        'kendall' - means kendall's Tau is being provided
        'spearman' - means spearman's rho is being provided
        'native' - means that the dependency parameter of the copula family
                   itself is being provided directly
             the second argument must be the value of the dependency type
             provided. For kendall and spearman, a scalar value is expected.
             For native, if the family type is Frank, Gumbel, or Clayton, then
             a scalar value is expected, which represents the dependency
             parameter.  If the family type is Gaussian, then a 2 x 2 numpy array
             is expected, which represents the correlation matrix defining the
             Gaussian copula.  If the family is T, then the 2nd argument is the
             2x2 numpy array representing the correlation matrix, and the 3rd
             argument is the degrees of freedom

    Returns the value produced by the family-specific helper (_gaussian,
    _clayton, _frank or _gumbel) for the resolved dependency parameter.

    Raises ValueError when the family is not supported, when too few variable
    arguments are supplied, or when the T family is requested with 'kendall'
    or 'spearman'.
    """
    family_lc = family.lower()
    if(family_lc not in ('gaussian', 't', 'clayton', 'frank', 'gumbel')):
        # without this guard an unknown family fell through every branch and
        # failed at `return cvol` with an UnboundLocalError
        raise ValueError("Unsupported copula family: " + str(family))

    if(family_lc=='gaussian'):
        if(len(args)<2):
            raise ValueError("Gaussian Family expects 2 variable arguments, the dependency type and value")
        if(args[0]=='kendall' or args[0]=='spearman'):
            # translate the rank statistic into the copula's own parameter
            r = invcopulastat(family, args[0], args[1])
        else:
            r = args[1]

        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r)
    elif(family_lc=='t'):
        if(len(args)<2):
            raise ValueError("T Family expects atleast 2 variable arguments, the dependency type and value")

        if(args[0]=='kendall' or args[0]=='spearman'):
            raise ValueError('T Family does not accept Kendalls Tau or Spearmans Rho, only native parameters')
        if(len(args)<3):
            # the degrees of freedom are read from args[2]; report a missing
            # value as a ValueError like the other argument checks instead of
            # letting an IndexError escape
            raise ValueError("T Family expects 3 variable arguments, the dependency type, the correlation matrix and the degrees of freedom")

        r = args[1]
        nu = args[2]

        # NOTE(review): the T family is routed to _gaussian with an extra
        # degrees-of-freedom argument, while the Gaussian branch calls it
        # without one -- confirm this is the intended helper.
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r, nu)

    elif(family_lc=='clayton'):
        if(len(args)<2):
            raise ValueError("Clayton Family expects 2 variable arguments, the dependency type and value")

        if(args[0]=='kendall' or args[0]=='spearman'):
            # translate the rank statistic into the copula's own parameter
            alpha = invcopulastat(family, args[0], args[1])
        else:
            alpha = args[1]

        cvol = _clayton(u1v1, u1v2, u2v1, u2v2, alpha)

    elif(family_lc=='frank'):
        if(len(args)<2):
            raise ValueError("Frank Family expects 2 variable arguments, the dependency type and value")
        if(args[0]=='kendall' or args[0]=='spearman'):
            # translate the rank statistic into the copula's own parameter
            alpha = invcopulastat(family, args[0], args[1])
        else:
            alpha = args[1]

        cvol = _frank(u1v1, u1v2, u2v1, u2v2, alpha)

    else:   # gumbel -- the only family left after the membership check above
        if(len(args)<2):
            raise ValueError("Gumbel Family expects 2 variable arguments, the dependency type and value")
        if(args[0]=='kendall' or args[0]=='spearman'):
            # translate the rank statistic into the copula's own parameter
            alpha = invcopulastat(family, args[0], args[1])
        else:
            alpha = args[1]

        cvol = _gumbel(u1v1, u1v2, u2v1, u2v2, alpha)

    return cvol
コード例 #3
0
def visualizeMNSig():
    """
    Sweep Kendall's tau over a fixed grid and, for every tau, save a six-panel
    figure under figures/HELM_performance/:
      - the multinomial signatures (copulamnsig) of the four families,
      - one scatter plot of copularnd samples per family,
      - a stacked bar chart built from the testHELM_parametric results.
    A dependency parameter of -1 is used as the "could not be computed"
    marker; the matching scatter plot is then left empty.
    """
    K = 4
    M = 1000
    N = 3
    tauVec = np.arange(-0.9,0.95,0.05)
    # the families to test against and pick optimal copula
    families = ['Gaussian', 'Clayton', 'Gumbel', 'Frank']

    helmAccuracyResults = testHELM_parametric(K,M,N,tauVec,families)

    # signature of every family at every tau, keyed [family][tau]
    resultsAggregate = {
        fam: {t: copulamnsig(fam,K,'kendall',t) for t in tauVec}
        for fam in families
    }

    def dependency_or_sentinel(fam, t):
        # -1 marks a tau for which the family has no valid parameter
        try:
            return invcopulastat(fam, 'kendall', t)
        except ValueError:
            return -1

    signature_styles = (
        ('Gaussian', 'b.-', 'Gaussian Copula'),
        ('Clayton', 'g.-', 'Clayton Copula'),
        ('Gumbel', 'r.-', 'Gumbel Copula'),
        ('Frank', 'k.-', 'Frank Copula'),
    )
    bar_order = ('Clayton', 'Gaussian', 'Gumbel', 'Frank')

    for tau in tauVec:
        # dependency parameters, resolved in the order Gaussian, Clayton,
        # Gumbel, Frank
        r = dependency_or_sentinel('Gaussian', tau)
        Rho = np.empty((N,N))
        Rho.fill(r)
        np.fill_diagonal(Rho, 1)

        alpha_clayton = dependency_or_sentinel('Clayton', tau)
        alpha_gumbel = dependency_or_sentinel('Gumbel', tau)
        alpha_frank = dependency_or_sentinel('Frank', tau)

        # draw the samples (Gaussian, Clayton, Frank, Gumbel); a family whose
        # parameter is the sentinel is skipped entirely
        samples = {}
        if r != -1:
            try:
                samples['Gaussian'] = copularnd('Gaussian', M, Rho)
            except ValueError:
                samples['Gaussian'] = np.zeros((M,N))
        for fam, alpha in (('Clayton', alpha_clayton),
                           ('Frank', alpha_frank),
                           ('Gumbel', alpha_gumbel)):
            if alpha != -1:
                try:
                    samples[fam] = copularnd(fam, M, N, alpha)
                except ValueError:
                    samples[fam] = np.zeros((M,N))

        plt.figure(figsize=(30,20))

        # panel 1: multinomial signatures, only for families with a non-zero one
        plt.subplot(231)
        for fam, line_style, legend_label in signature_styles:
            signature = resultsAggregate[fam][tau]
            if np.sum(signature) > 0:
                plt.plot(np.arange(1,K*K+1), signature, line_style, label=legend_label)

        plt.title(r'Copula Multinomial Signature $\tau$=' + "{0:.2f}".format(tau) + ' K=' + str(K))
        plt.legend()
        plt.grid()

        # panels 2, 3, 5, 6: scatter of the first two sample columns
        scatter_panels = (
            (232, 'Gaussian', r, r'Gaussian Copula, $\rho$='),
            (233, 'Clayton', alpha_clayton, r'Clayton Copula, $\alpha$='),
            (235, 'Frank', alpha_frank, r'Frank Copula, $\alpha$='),
            (236, 'Gumbel', alpha_gumbel, r'Gumbel Copula, $\alpha$='),
        )
        for position, fam, param, heading in scatter_panels:
            plt.subplot(position)
            if param != -1:
                drawn = samples[fam]
                plt.scatter(drawn[:,0], drawn[:,1])
            plt.grid()
            plt.title(heading + "{0:.2f}".format(param) + r' $\tau$=' + "{0:.2f}".format(tau))

        # panel 4: stacked bars, one bar per generating family in bar_order,
        # one segment per selected family
        plt.subplot(234)
        cla, gau, gum, fra = [
            np.array([helmAccuracyResults[truth][tau][chosen] for truth in bar_order])
            for chosen in ('clayton', 'gaussian', 'gumbel', 'frank')
        ]
        ind = np.arange(4)
        width = 0.2
        p1 = plt.bar(ind,cla,width,color='b')
        p2 = plt.bar(ind,gau,width,color='g',bottom=cla)
        p3 = plt.bar(ind,gum,width,color='k',bottom=cla+gau)
        p4 = plt.bar(ind,fra,width,color='r',bottom=cla+gau+gum)
        plt.xticks(ind+width/2.,('Clayton', 'Gaussian', 'Gumbel', 'Frank'))
        plt.legend( (p1[0], p2[0], p3[0], p4[0]), ('Clayton', 'Gaussian', 'Gumbel', 'Frank') )

        plt.grid()
        # the blank before "_K_" is part of the existing file-name scheme
        figure_name = 'HELM_DIM_' + str(N) + '_tau_' + "{0:.2f}".format(tau) + ' _K_' + str(K) + '.png'
        plt.savefig(os.path.join('figures/HELM_performance/', figure_name))

        plt.close()
コード例 #4
0
def testHELM(tau, M, N, familyToTest, numMCSims, copulaFamiliesToTest):
    results = {}
    for fam in copulaFamiliesToTest:
        results[fam.lower()] = 0
    
    for ii in range(0,numMCSims):
        # generate samples of the requested copula with tau same as the
        # empirical signature we calculated above
        if(familyToTest.lower()=='gaussian'):
            r = invcopulastat(familyToTest, 'kendall', tau)
            
            Rho = np.empty((N,N))
            for jj in range(0,N):
                for kk in range(0,N):
                    if(jj==kk):
                        Rho[jj][kk] = 1
                    else:
                        Rho[jj][kk] = r
            try:
                U = copularnd(familyToTest, M, Rho)
            except ValueError:
                # copularnd will throw a ValueError if Rho is not a positive semidefinite matrix
                return results      # return 0, which will then be ignored by tests
                
        else:       # assume Clayton, Frank, or Gumbel
            try:
                alpha = invcopulastat(familyToTest, 'kendall', tau)
                U = copularnd(familyToTest, M, N, alpha)
            except ValueError:
                continue
            
        lst = []
        for jj in range(0,N):
            U_conditioned = U[:,jj]
            # if there are any 1's, condition it
            U_conditioned[U_conditioned==1] = 0.99
            if(jj%2==0):
                lst.append(norm.ppf(U_conditioned))
            else:
                lst.append(expon.ppf(U_conditioned))
        
        # combine X and Y into the joint distribution w/ the copula
        X = np.vstack(lst)
        X = X.T
                    
        ret = optimalCopulaFamily(X, family_search=copulaFamiliesToTest)
        ret_family = ret[0].lower()
        # aggregate results
        results[ret_family] = results[ret_family] + 1.0
        
        # display some progress
        sys.stdout.write("\rComputing " + str(familyToTest) + " Copula (DIM=%d) (tau=%f)-- %d%%" % (N,tau,ii+1))
        sys.stdout.flush()
    
    sys.stdout.write("\r")
    
    # convert results to percentage
    for fam in copulaFamiliesToTest:
        results[fam.lower()] = results[fam.lower()]/float(numMCSims) * 100
    
    return results
コード例 #5
0
def optimalCopulaFamily(X, K=4, family_search=['Gaussian', 'Clayton', 'Gumbel', 'Frank']):
    """
    This function, given a multivariate data set X, computes the best copula family which fits
    the data, using the procedure described in the paper "Highly Efficient Learning of Mixed
    Copula Networks," by Gal Elidan

      X - the multivariate dataset for which we desire the copula.  Must be a numpy array of
          dimension [M x N], where M is the number of data points, and N is the dimensionality
          of the dataset
      K - the square root of the number of grid points (for now, we assume square gridding of the
          unit cube)
      family_search - a list of all the copula families to search.  Currently, what is supported is
          Gaussian, Clayton, Gumbel, and Frank.  As more copula's are added, the default list will
          be expanded.  The list is only read, never modified.

    Returns a tuple (optimalFamily, depParams, tau) where optimalFamily is the entry of
    family_search with the smallest divergence, depParams is invcopulastat of that family, and
    tau is the Kendall's tau that was used for that family (the empirical value, clamped into
    [0, 1) when Clayton or Gumbel is selected).

    Raises ValueError when no family can be selected (empty family_search, or every family was
    excluded).
    """
    # compute the empirical Kendall's Tau
    tau_hat = multivariate_stats.kendalls_tau(X)

    # compute empirical multinomial signature
    empirical_mnsig = empirical_copulamnsig(X, K)
    empirical_mnsig = empirical_mnsig[0]['esig']
    # replace any 0 values w/ smallest possible float value
    empirical_mnsig[empirical_mnsig==0] = np.spacing(1)

    # compute the multinomial signature for each of the copula families specified
    # and simultaneously compute the kullback leibler divergence between the empirical
    # and the computed, and store that info
    distances = {}
    tau_used = {}
    for family in family_search:
        # Each family starts from the untouched empirical estimate.  Clamping used to overwrite
        # tau_hat itself, so a family listed after Clayton/Gumbel (and the returned value) saw
        # the clamped tau and the outcome depended on the order of family_search.
        tau_family = tau_hat

        if(family.lower() in ('clayton', 'gumbel')):
            # Clayton and Gumbel capture only positive concordance.  tau_hat is an estimate with
            # some variance, so a value slightly below 0 is treated as 0; anything lower rules
            # the family out by giving it an infinite divergence.
            if(tau_hat<-0.05):
                distances[family] = np.inf
                continue
            elif(tau_hat<0):
                tau_family = 0
            elif(tau_hat>=1):
                tau_family = 1 - np.spacing(1)     # as close to 1 as possible in our precision
        # any other copula families with restrictions can go here

        mnsig = copulamnsig(family,K,'kendall',tau_family)
        # replace any 0 values w/ smallest possible float value
        mnsig[mnsig==0] = np.spacing(1)

        # compute KL divergence, see
        # http://docs.scipy.org/doc/scipy-dev/reference/generated/scipy.stats.entropy.html
        distances[family] = entropy(mnsig, empirical_mnsig)
        tau_used[family] = tau_family

    # search for the minimum distance, that is the optimal copula family to use
    # (.items() works on both Python 2 and 3; .iteritems() is Python 2 only)
    optimalFamily = None
    minDistance = np.inf
    for family, distance in distances.items():
        if distance<minDistance:
            minDistance = distance
            optimalFamily = family

    if optimalFamily is None:
        # previously this surfaced as an UnboundLocalError on the next line
        raise ValueError("No copula family in family_search could be fitted to the data")

    tau_optimal = tau_used[optimalFamily]
    depParams = invcopulastat(optimalFamily, 'kendall', tau_optimal)

    return (optimalFamily, depParams, tau_optimal)
コード例 #6
0
def cvolume(family, u1v1, u1v2, u2v1, u2v2, *args):
    """
    Computes the C-Volume of a specified copula family with dependency parameter
    defined in the args.
      family - the copula type (compared case-insensitively), must be:
        'Gaussian'
        'T'
        'Clayton'
        'Frank'
        'Gumbel'
      u1v1 - a N x 2 matrix of values between [0,1] that represents the bottom
             left coordinate of the grid for which the C-Volume is desired
      u1v2 - a N x 2 matrix of values between [0,1] that represent the top
             left coordinate of the grid for which the C-Volume is desired
      u2v1 - a N x 2 matrix of values between [0,1] that represent the bottom
             right coordinate of the grid for which the C-volume is desired
      u2v2 - a N x 2 matrix of values between [0,1] that represents the top
             right coordinate of the grid for which the C-Volume is desired
      args - must be at least of length 2, for which the first element in args
             is expected to be a string which describes the dependency value
             being provided, must be one of the following:
        'kendall' - means kendall's Tau is being provided
        'spearman' - means spearman's rho is being provided
        'native' - means that the dependency parameter of the copula family
                   itself is being provided directly
             the second argument must be the value of the dependency type
             provided. For kendall and spearman, a scalar value is expected.
             For native, if the family type is Frank, Gumbel, or Clayton, then
             a scalar value is expected, which represents the dependency
             parameter.  If the family type is Gaussian, then a 2 x 2 numpy array
             is expected, which represents the correlation matrix defining the
             Gaussian copula.  If the family is T, then the 2nd argument is the
             2x2 numpy array representing the correlation matrix, and the 3rd
             argument is the degrees of freedom

    Returns the value produced by the family-specific helper (_gaussian,
    _clayton, _frank or _gumbel) for the resolved dependency parameter.

    Raises ValueError when the family is not supported, when too few variable
    arguments are supplied, or when the T family is requested with 'kendall'
    or 'spearman'.
    """
    family_lc = family.lower()
    if family_lc not in ('gaussian', 't', 'clayton', 'frank', 'gumbel'):
        # an unknown family used to fall through every branch and fail at
        # `return cvol` with an UnboundLocalError
        raise ValueError("Unsupported copula family: " + str(family))

    if (family_lc == 't'):
        if (len(args) < 2):
            raise ValueError(
                "T Family expects atleast 2 variable arguments, the dependency type and value"
            )
        if (args[0] == 'kendall' or args[0] == 'spearman'):
            raise ValueError(
                'T Family does not accept Kendalls Tau or Spearmans Rho, only native parameters'
            )
        if (len(args) < 3):
            # the degrees of freedom are read from args[2]; a missing value
            # is reported as a ValueError rather than an IndexError
            raise ValueError(
                "T Family expects 3 variable arguments, the dependency type, the correlation matrix and the degrees of freedom"
            )

        r = args[1]
        nu = args[2]

        # NOTE(review): the T family is routed to _gaussian with an extra
        # degrees-of-freedom argument, while the Gaussian branch calls it
        # without one -- confirm this is the intended helper.
        return _gaussian(u1v1, u1v2, u2v1, u2v2, r, nu)

    # The four remaining families share the same argument handling: a rank
    # statistic is converted with invcopulastat, anything else is taken as
    # the family's own parameter.
    if (len(args) < 2):
        # capitalize() yields 'Gaussian', 'Clayton', 'Frank' or 'Gumbel'
        raise ValueError(
            family_lc.capitalize() +
            " Family expects 2 variable arguments, the dependency type and value"
        )
    if (args[0] == 'kendall' or args[0] == 'spearman'):
        param = invcopulastat(family, args[0], args[1])
    else:
        param = args[1]

    if (family_lc == 'gaussian'):
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, param)
    elif (family_lc == 'clayton'):
        cvol = _clayton(u1v1, u1v2, u2v1, u2v2, param)
    elif (family_lc == 'frank'):
        cvol = _frank(u1v1, u1v2, u2v1, u2v2, param)
    else:  # gumbel -- the only family left after the membership check
        cvol = _gumbel(u1v1, u1v2, u2v1, u2v2, param)

    return cvol
コード例 #7
0
def visualizeMNSig():
    """
    Sweep Kendall's tau over a fixed grid and, for every tau, save a six-panel
    figure under figures/HELM_performance/:
      - the multinomial signatures (copulamnsig) of the four families,
      - one scatter plot of copularnd samples per family,
      - a stacked bar chart built from the testHELM_parametric results.
    A dependency parameter of -1 is used as the "could not be computed"
    marker; the matching scatter plot is then left empty.
    """
    K = 4
    M = 1000
    N = 3
    tauVec = np.arange(-0.9, 0.95, 0.05)
    # the families to test against and pick optimal copula
    families = ['Gaussian', 'Clayton', 'Gumbel', 'Frank']

    helmAccuracyResults = testHELM_parametric(K, M, N, tauVec, families)

    # signature of every family at every tau, keyed [family][tau]
    resultsAggregate = {
        fam: {t: copulamnsig(fam, K, 'kendall', t)
              for t in tauVec}
        for fam in families
    }

    def param_or_marker(fam, t):
        # -1 marks a tau for which the family has no valid parameter
        try:
            return invcopulastat(fam, 'kendall', t)
        except ValueError:
            return -1

    def draw_or_zeros(*copularnd_args):
        # fall back to an all-zero sample block when copularnd rejects the input
        try:
            return copularnd(*copularnd_args)
        except ValueError:
            return np.zeros((M, N))

    curve_specs = (
        ('Gaussian', 'b.-', 'Gaussian Copula'),
        ('Clayton', 'g.-', 'Clayton Copula'),
        ('Gumbel', 'r.-', 'Gumbel Copula'),
        ('Frank', 'k.-', 'Frank Copula'),
    )
    generating_order = ('Clayton', 'Gaussian', 'Gumbel', 'Frank')

    for tau in tauVec:
        # dependency parameters, resolved in the order Gaussian, Clayton,
        # Gumbel, Frank
        r = param_or_marker('Gaussian', tau)
        Rho = np.empty((N, N))
        Rho.fill(r)
        np.fill_diagonal(Rho, 1)

        alpha_clayton = param_or_marker('Clayton', tau)
        alpha_gumbel = param_or_marker('Gumbel', tau)
        alpha_frank = param_or_marker('Frank', tau)

        # draw the samples (Gaussian, Clayton, Frank, Gumbel); a family whose
        # parameter is the marker value is skipped entirely
        drawn = {}
        if r != -1:
            drawn['Gaussian'] = draw_or_zeros('Gaussian', M, Rho)
        if alpha_clayton != -1:
            drawn['Clayton'] = draw_or_zeros('Clayton', M, N, alpha_clayton)
        if alpha_frank != -1:
            drawn['Frank'] = draw_or_zeros('Frank', M, N, alpha_frank)
        if alpha_gumbel != -1:
            drawn['Gumbel'] = draw_or_zeros('Gumbel', M, N, alpha_gumbel)

        plt.figure(figsize=(30, 20))

        # panel 1: multinomial signatures, only for families with a non-zero one
        plt.subplot(231)
        for fam, line_style, legend_label in curve_specs:
            signature = resultsAggregate[fam][tau]
            if np.sum(signature) > 0:
                plt.plot(np.arange(1, K * K + 1),
                         signature,
                         line_style,
                         label=legend_label)

        tau_text = "{0:.2f}".format(tau)
        plt.title(r'Copula Multinomial Signature $\tau$=' + tau_text +
                  ' K=' + str(K))
        plt.legend()
        plt.grid()

        # panels 2, 3, 5, 6: scatter of the first two sample columns
        for slot, fam, param, heading in (
            (232, 'Gaussian', r, r'Gaussian Copula, $\rho$='),
            (233, 'Clayton', alpha_clayton, r'Clayton Copula, $\alpha$='),
            (235, 'Frank', alpha_frank, r'Frank Copula, $\alpha$='),
            (236, 'Gumbel', alpha_gumbel, r'Gumbel Copula, $\alpha$='),
        ):
            plt.subplot(slot)
            if param != -1:
                plt.scatter(drawn[fam][:, 0], drawn[fam][:, 1])
            plt.grid()
            plt.title(heading + "{0:.2f}".format(param) + r' $\tau$=' +
                      tau_text)

        # panel 4: stacked bars, one bar per generating family, one segment
        # per selected family
        plt.subplot(234)
        at_tau = [helmAccuracyResults[truth][tau] for truth in generating_order]
        cla = np.array([outcome['clayton'] for outcome in at_tau])
        gau = np.array([outcome['gaussian'] for outcome in at_tau])
        gum = np.array([outcome['gumbel'] for outcome in at_tau])
        fra = np.array([outcome['frank'] for outcome in at_tau])
        ind = np.arange(4)
        width = 0.2
        p1 = plt.bar(ind, cla, width, color='b')
        p2 = plt.bar(ind, gau, width, color='g', bottom=cla)
        p3 = plt.bar(ind, gum, width, color='k', bottom=cla + gau)
        p4 = plt.bar(ind, fra, width, color='r', bottom=cla + gau + gum)
        plt.xticks(ind + width / 2.,
                   ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))
        plt.legend((p1[0], p2[0], p3[0], p4[0]),
                   ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))

        plt.grid()
        # the blank before "_K_" is part of the existing file-name scheme
        figure_name = ('HELM_DIM_' + str(N) + '_tau_' + tau_text + ' _K_' +
                       str(K) + '.png')
        plt.savefig(os.path.join('figures/HELM_performance/', figure_name))

        plt.close()
コード例 #8
0
def testHELM(tau, M, N, familyToTest, numMCSims, copulaFamiliesToTest):
    results = {}
    for fam in copulaFamiliesToTest:
        results[fam.lower()] = 0

    for ii in range(0, numMCSims):
        # generate samples of the requested copula with tau same as the
        # empirical signature we calculated above
        if (familyToTest.lower() == 'gaussian'):
            r = invcopulastat(familyToTest, 'kendall', tau)

            Rho = np.empty((N, N))
            for jj in range(0, N):
                for kk in range(0, N):
                    if (jj == kk):
                        Rho[jj][kk] = 1
                    else:
                        Rho[jj][kk] = r
            try:
                U = copularnd(familyToTest, M, Rho)
            except ValueError:
                # copularnd will throw a ValueError if Rho is not a positive semidefinite matrix
                return results  # return 0, which will then be ignored by tests

        else:  # assume Clayton, Frank, or Gumbel
            try:
                alpha = invcopulastat(familyToTest, 'kendall', tau)
                U = copularnd(familyToTest, M, N, alpha)
            except ValueError:
                continue

        lst = []
        for jj in range(0, N):
            U_conditioned = U[:, jj]
            # if there are any 1's, condition it
            U_conditioned[U_conditioned == 1] = 0.99
            if (jj % 2 == 0):
                lst.append(norm.ppf(U_conditioned))
            else:
                lst.append(expon.ppf(U_conditioned))

        # combine X and Y into the joint distribution w/ the copula
        X = np.vstack(lst)
        X = X.T

        ret = optimalCopulaFamily(X, family_search=copulaFamiliesToTest)
        ret_family = ret[0].lower()
        # aggregate results
        results[ret_family] = results[ret_family] + 1.0

        # display some progress
        sys.stdout.write("\rComputing " + str(familyToTest) +
                         " Copula (DIM=%d) (tau=%f)-- %d%%" % (N, tau, ii + 1))
        sys.stdout.flush()

    sys.stdout.write("\r")

    # convert results to percentage
    for fam in copulaFamiliesToTest:
        results[fam.lower()] = results[fam.lower()] / float(numMCSims) * 100

    return results
コード例 #9
0
def optimalCopulaFamily(X,
                        K=4,
                        family_search=[
                            'Gaussian', 'Clayton', 'Gumbel', 'Frank'
                        ]):
    """
    This function, given a multivariate data set X, computes the best copula family which fits
    the data, using the procedure described in the paper "Highly Efficient Learning of Mixed
    Copula Networks," by Gal Elidan

      X - the multivariate dataset for which we desire the copula.  Must be a numpy array of
          dimension [M x N], where M is the number of data points, and N is the dimensionality
          of the dataset
      K - the square root of the number of grid points (for now, we assume square gridding of the
          unit cube)
      family_search - a list of all the copula families to search.  Currently, what is supported is
          Gaussian, Clayton, Gumbel, and Frank.  As more copula's are added, the default list will
          be expanded.  The list is only read, never modified.

    Returns a tuple (optimalFamily, depParams, tau) where optimalFamily is the entry of
    family_search with the smallest divergence, depParams is invcopulastat of that family, and
    tau is the Kendall's tau that was used for that family (the empirical value, clamped into
    [0, 1) when Clayton or Gumbel is selected).

    Raises ValueError when no family can be selected (empty family_search, or every family was
    excluded).
    """
    # compute the empirical Kendall's Tau
    tau_hat = multivariate_stats.kendalls_tau(X)

    # compute empirical multinomial signature
    empirical_mnsig = empirical_copulamnsig(X, K)
    empirical_mnsig = empirical_mnsig[0]['esig']
    # replace any 0 values w/ smallest possible float value
    empirical_mnsig[empirical_mnsig == 0] = np.spacing(1)

    # compute the multinomial signature for each of the copula families specified
    # and simultaneously compute the kullback leibler divergence between the empirical
    # and the computed, and store that info
    distances = {}
    tau_used = {}
    for family in family_search:
        # Each family starts from the untouched empirical estimate.  Clamping used to overwrite
        # tau_hat itself, so a family listed after Clayton/Gumbel (and the returned value) saw
        # the clamped tau and the outcome depended on the order of family_search.
        tau_family = tau_hat

        if (family.lower() in ('clayton', 'gumbel')):
            # Clayton and Gumbel capture only positive concordance.  tau_hat is an estimate with
            # some variance, so a value slightly below 0 is treated as 0; anything lower rules
            # the family out by giving it an infinite divergence.
            if (tau_hat < -0.05):
                distances[family] = np.inf
                continue
            elif (tau_hat < 0):
                tau_family = 0
            elif (tau_hat >= 1):
                # as close to 1 as possible in our precision
                tau_family = 1 - np.spacing(1)
        # any other copula families with restrictions can go here

        mnsig = copulamnsig(family, K, 'kendall', tau_family)
        # replace any 0 values w/ smallest possible float value
        mnsig[mnsig == 0] = np.spacing(1)

        # compute KL divergence, see
        # http://docs.scipy.org/doc/scipy-dev/reference/generated/scipy.stats.entropy.html
        distances[family] = entropy(mnsig, empirical_mnsig)
        tau_used[family] = tau_family

    # search for the minimum distance, that is the optimal copula family to use
    # (.items() works on both Python 2 and 3; .iteritems() is Python 2 only)
    optimalFamily = None
    minDistance = np.inf
    for family, distance in distances.items():
        if distance < minDistance:
            minDistance = distance
            optimalFamily = family

    if optimalFamily is None:
        # previously this surfaced as an UnboundLocalError on the next line
        raise ValueError(
            "No copula family in family_search could be fitted to the data")

    tau_optimal = tau_used[optimalFamily]
    depParams = invcopulastat(optimalFamily, 'kendall', tau_optimal)

    return (optimalFamily, depParams, tau_optimal)