Beispiel #1
0
    def plfit(self, nosmall=True, finite=False, quiet=False, silent=False,
            usefortran=False, usecy=False, xmin=None, verbose=False, 
            discrete=None, discrete_approx=True, discrete_n_alpha=1000):
        """
        A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m
        from http://www.santafe.edu/~aaronc/powerlaws/

        See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions
        in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062)
        http://arxiv.org/abs/0706.1062

        There are 3 implementations of xmin estimation.  The fortran version is fastest, the C (cython)
        version is ~10% slower, and the python version is ~3x slower than the fortran version.
        Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown
        reasons.

        There is also a discrete version implemented in python - it is different from the continous version!

        *discrete* [ bool | None ]
            If *discrete* is None, the code will try to determine whether the
            data set is discrete or continous based on the uniqueness of the
            data.  If *discrete* is True or False, the distcrete or continuous
            fitter will be used, respectively.

        *xmin* [ float / int ]
            If you specify xmin, the fitter will only determine alpha assuming
            the given xmin; the rest of the code (and most of the complexity)
            is determining an estimate for xmin and alpha.

        *nosmall* [ bool (True) ]
            When on, the code rejects low s/n points

        *finite* [ bool (False) ]
            There is a 'finite-size bias' to the estimator.  The "alpha" the code measures
            is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or α = (1 + ᾶ (n-1)) / n

        *quiet* [ bool (False) ]
            If False, delivers messages about what fitter is used and the fit results

        *verbose* [ bool (False) ] 
            Deliver descriptive messages about the fit parameters (only if *quiet*==False)

        *silent* [ bool (False) ] 
            If True, will print NO messages
        """
        x = self.data
        z = numpy.sort(x)
        t = time.time()
        xmins,argxmins = numpy.unique(z,return_index=True)#[:-1]
        self._nunique = len(xmins)
        
        if self._nunique == len(x) and discrete is None:
            if verbose: print "Using CONTINUOUS fitter"
            discrete = False
        elif self._nunique < len(x) and discrete is None:
            if verbose: print "Using DISCRETE fitter"
            discrete = True

        t = time.time()
        if xmin is None:
            if discrete:
                self.discrete_best_alpha( approximate=discrete_approx,
                        n_alpha=discrete_n_alpha, verbose=verbose, finite=finite)
                return self._xmin,self._alpha
            elif usefortran and fortranOK:
                dat,av = fplfit.plfit(z,int(nosmall))
                goodvals=dat>0
                sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins]
                dat = dat[goodvals]
                av = av[goodvals]
                if nosmall:
                    # data, av a;ready treated for this.  sigma, xmins not
                    nmax = argmin(sigma<0.1)
                    xmins = xmins[:nmax]
                    sigma = sigma[:nmax]
                if not quiet: print "FORTRAN plfit executed in %f seconds" % (time.time()-t)
            elif usecy and cyOK:
                dat,av = cplfit.plfit_loop(z,nosmall=nosmall,zunique=xmins,argunique=argxmins)
                goodvals=dat>0
                sigma = (av-1)/numpy.sqrt(len(z)-argxmins)
                dat = dat[goodvals]
                av = av[goodvals]
                if not quiet: print "CYTHON plfit executed in %f seconds" % (time.time()-t)
            else:
                av  = numpy.asarray( map(self.alpha_(z),xmins) ,dtype='float')
                dat = numpy.asarray( map(self.kstest_(z),xmins),dtype='float')
                sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1)
                if nosmall:
                    # test to make sure the number of data points is high enough
                    # to provide a reasonable s/n on the computed alpha
                    goodvals = sigma<0.1
                    nmax = argmin(goodvals)
                    if nmax > 0:
                        dat = dat[:nmax]
                        xmins = xmins[:nmax]
                        av = av[:nmax]
                        sigma = sigma[:nmax]
                    else:
                        if not silent: 
                            print "Not enough data left after flagging - using all positive data."
            if not quiet: 
                print "PYTHON plfit executed in %f seconds" % (time.time()-t)
                if usefortran: print "fortran fplfit did not load"
                if usecy: print "cython cplfit did not load"
            self._av = av
            self._xmin_kstest = dat
            self._sigma = sigma
            xmin  = xmins[argmin(dat)] 
        z     = z[z>=xmin]
        n     = len(z)
        alpha = 1 + n / sum( log(z/xmin) )
        if finite:
            alpha = alpha*(n-1.)/n+1./n
        if n < 50 and not finite and not silent:
            print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n
        ks = max(abs( numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) ))
        # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) = (alpha-1)/xmin.  Really is Eqn B3 in paper.
        L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin))
        #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum
        self._likelihood = L
        self._xmin = xmin
        self._xmins = xmins
        self._alpha= alpha
        self._alphaerr = (alpha-1)/numpy.sqrt(n)
        self._ks = ks  # this ks statistic may not have the same value as min(dat) because of unique()
        if scipyOK: self._ks_prob = scipy.stats.kstwobign.sf(ks*numpy.sqrt(n))
        self._ngtx = n
        if n == 1:
            if not silent:
                print "Failure: only 1 point kept.  Probably not a power-law distribution."
            self._alpha = alpha = 0
            self._alphaerr = 0
            self._likelihood = L = 0
            self._ks = 0
            self._ks_prob = 0
            self._xmin = xmin
            return xmin,0
        if numpy.isnan(L) or numpy.isnan(xmin) or numpy.isnan(alpha):
            raise ValueError("plfit failed; returned a nan")

        if not quiet:
            if verbose: print "The lowest value included in the power-law fit, ",
            print "xmin: %g" % xmin,
            if verbose: print "\nThe number of values above xmin, ",
            print "n(>xmin): %i" % n,
            if verbose: print "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ",
            print "alpha: %g +/- %g  " % (alpha,self._alphaerr), 
            if verbose: print "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ",
            print "Log-Likelihood: %g  " % L,
            if verbose: print "\nThe KS-test statistic between the best-fit power-law and the data, ",
            print "ks: %g" % (ks),
            if scipyOK:
                if verbose: print " occurs with probability  ",
                print "p(ks): %g" % (self._ks_prob)
            else:
                print

        return xmin,alpha
    def plfit(self,nosmall=True,finite=False,quiet=False,silent=False,usefortran=usefortran,usecy=False,
            xmin=None):
        """
        A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m
        from http://www.santafe.edu/~aaronc/powerlaws/

        See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions
        in empirical data" SIAM Review, to appear (2009). (arXiv:0706.1062)
        http://arxiv.org/abs/0706.1062

        nosmall is on by default; it rejects low s/n points
        can specify xmin to skip xmin estimation

        There are 3 implementations of xmin estimation.  The fortran version is fastest, the C (cython)
        version is ~10% slower, and the python version is ~3x slower than the fortran version.
        Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown
        reasons.
        """
        x = self.data
        z = numpy.sort(x)
        t = time.time()
        xmins,argxmins = numpy.unique(z,return_index=True)#[:-1]
        t = time.time()
        if xmin is None:
            if usefortran:
                dat,av = fplfit.plfit(z,int(nosmall))
                goodvals=dat>0
                sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins]
                dat = dat[goodvals]
                av = av[goodvals]
                if not quiet: print "FORTRAN plfit executed in %f seconds" % (time.time()-t)
            elif usecy and cyok:
                dat,av = cplfit.plfit_loop(z,nosmall=nosmall,zunique=xmins,argunique=argxmins)
                goodvals=dat>0
                sigma = (av-1)/numpy.sqrt(len(z)-argxmins)
                dat = dat[goodvals]
                av = av[goodvals]
                if not quiet: print "CYTHON plfit executed in %f seconds" % (time.time()-t)
            else:
                av  = numpy.asarray( map(self.alpha_(z),xmins) ,dtype='float')
                dat = numpy.asarray( map(self.kstest_(z),xmins),dtype='float')
                if nosmall:
                    # test to make sure the number of data points is high enough
                    # to provide a reasonable s/n on the computed alpha
                    sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1)
                    goodvals = sigma<0.1
                    nmax = argmin(goodvals)
                    dat = dat[:nmax]
                    av = av[:nmax]
                if not quiet: print "PYTHON plfit executed in %f seconds" % (time.time()-t)
            self._av = av
            self._xmin_kstest = dat
            self._sigma = sigma
            xmin  = xmins[argmin(dat)] 
        z     = z[z>=xmin]
        n     = len(z)
        alpha = 1 + n / sum( log(z/xmin) )
        if finite:
            alpha = alpha*(n-1.)/n+1./n
        if n < 50 and not finite and not silent:
            print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n
        ks = max(abs( numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) ))
        L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin))
        #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum
        self._likelihood = L
        self._xmin = xmin
        self._xmins = xmins
        self._alpha= alpha
        self._alphaerr = (alpha-1)/numpy.sqrt(n)
        self._ks = ks  # this ks statistic may not have the same value as min(dat) because of unique()
        self._ngtx = n

        if not quiet:
            print "xmin: %g  n(>xmin): %i  alpha: %g +/- %g  Likelihood: %g  ks: %g" % (xmin,n,alpha,self._alphaerr,L,ks)

        return xmin,alpha
Beispiel #3
0
    def plfit(self,
              nosmall=True,
              finite=False,
              quiet=False,
              silent=False,
              usefortran=False,
              usecy=False,
              xmin=None,
              verbose=False,
              discrete=None,
              discrete_approx=True,
              discrete_n_alpha=1000):
        """
        A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m
        from http://www.santafe.edu/~aaronc/powerlaws/

        See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions
        in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062)
        http://arxiv.org/abs/0706.1062

        There are 3 implementations of xmin estimation.  The fortran version is fastest, the C (cython)
        version is ~10% slower, and the python version is ~3x slower than the fortran version.
        Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown
        reasons.

        There is also a discrete version implemented in python - it is different from the continous version!
        *discrete* [ bool | None ]
            If *discrete* is None, the code will try to determine whether the
            data set is discrete or continous based on the uniqueness of the
            data.  If *discrete* is True or False, the distcrete or continuous
            fitter will be used, respectively.

        *xmin* [ float / int ]
            If you specify xmin, the fitter will only determine alpha assuming
            the given xmin; the rest of the code (and most of the complexity)
            is determining an estimate for xmin and alpha.

        *nosmall* [ bool (True) ]
            When on, the code rejects low s/n points

        *finite* [ bool (False) ]
            There is a 'finite-size bias' to the estimator.  The "alpha" the code measures
            is "alpha-hat" s.t.

        *quiet* [ bool (False) ]
            If False, delivers messages about what fitter is used and the fit results

        *verbose* [ bool (False) ] 
            Deliver descriptive messages about the fit parameters (only if *quiet*==False)

        *silent* [ bool (False) ] 
            If True, will print NO messages
        """
        x = self.data
        z = numpy.sort(x)
        t = time.time()
        xmins, argxmins = numpy.unique(z, return_index=True)  #[:-1]
        self._nunique = len(xmins)

        if self._nunique == len(x) and discrete is None:
            if verbose: print "Using CONTINUOUS fitter"
            discrete = False
        elif self._nunique < len(x) and discrete is None:
            if verbose: print "Using DISCRETE fitter"
            discrete = True

        t = time.time()
        if xmin is None:
            if discrete:
                self.discrete_best_alpha(approximate=discrete_approx,
                                         n_alpha=discrete_n_alpha,
                                         verbose=verbose,
                                         finite=finite)
                return self._xmin, self._alpha
            elif usefortran and fortranOK:
                dat, av = fplfit.plfit(z, int(nosmall))
                goodvals = dat > 0
                sigma = ((av - 1) /
                         numpy.sqrt(len(z) - numpy.arange(len(z))))[argxmins]
                dat = dat[goodvals]
                av = av[goodvals]
                if nosmall:
                    # data, av a;ready treated for this.  sigma, xmins not
                    nmax = argmin(sigma < 0.1)
                    xmins = xmins[:nmax]
                    sigma = sigma[:nmax]
                if not quiet:
                    print "FORTRAN plfit executed in %f seconds" % (
                        time.time() - t)
            elif usecy and cyOK:
                dat, av = cplfit.plfit_loop(z,
                                            nosmall=nosmall,
                                            zunique=xmins,
                                            argunique=argxmins)
                goodvals = dat > 0
                sigma = (av - 1) / numpy.sqrt(len(z) - argxmins)
                dat = dat[goodvals]
                av = av[goodvals]
                if not quiet:
                    print "CYTHON plfit executed in %f seconds" % (
                        time.time() - t)
            else:
                av = numpy.asarray(map(self.alpha_(z), xmins), dtype='float')
                dat = numpy.asarray(map(self.kstest_(z), xmins), dtype='float')
                sigma = (av - 1) / numpy.sqrt(len(z) - argxmins + 1)
                if nosmall:
                    # test to make sure the number of data points is high enough
                    # to provide a reasonable s/n on the computed alpha
                    goodvals = sigma < 0.1
                    nmax = argmin(goodvals)
                    if nmax > 0:
                        dat = dat[:nmax]
                        xmins = xmins[:nmax]
                        av = av[:nmax]
                        sigma = sigma[:nmax]
                    else:
                        if not silent:
                            print "Not enough data left after flagging - using all positive data."
            if not quiet:
                print "PYTHON plfit executed in %f seconds" % (time.time() - t)
                if usefortran: print "fortran fplfit did not load"
                if usecy: print "cython cplfit did not load"
            self._av = av
            self._xmin_kstest = dat
            self._sigma = sigma
            xmin = xmins[argmin(dat)]
        z = z[z >= xmin]
        n = len(z)
        alpha = 1 + n / sum(log(z / xmin))
        if finite:
            alpha = alpha * (n - 1.) / n + 1. / n
        if n < 50 and not finite and not silent:
            print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n
        ks = max(
            abs(numpy.arange(n) / float(n) - (1 - (xmin / z)**(alpha - 1))))
        # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) = (alpha-1)/xmin.  Really is Eqn B3 in paper.
        L = n * log((alpha - 1) / xmin) - alpha * sum(log(z / xmin))
        #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum
        self._likelihood = L
        self._xmin = xmin
        self._xmins = xmins
        self._alpha = alpha
        self._alphaerr = (alpha - 1) / numpy.sqrt(n)
        self._ks = ks  # this ks statistic may not have the same value as min(dat) because of unique()
        if scipyOK:
            self._ks_prob = scipy.stats.kstwobign.sf(ks * numpy.sqrt(n))
        self._ngtx = n
        if n == 1:
            if not silent:
                print "Failure: only 1 point kept.  Probably not a power-law distribution."
            self._alpha = alpha = 0
            self._alphaerr = 0
            self._likelihood = L = 0
            self._ks = 0
            self._ks_prob = 0
            self._xmin = xmin
            return xmin, 0
        if numpy.isnan(L) or numpy.isnan(xmin) or numpy.isnan(alpha):
            raise ValueError("plfit failed; returned a nan")

        if not quiet:
            if verbose:
                print "The lowest value included in the power-law fit, ",
            print "xmin: %g" % xmin,
            if verbose: print "\nThe number of values above xmin, ",
            print "n(>xmin): %i" % n,
            if verbose:
                print "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ",
            print "alpha: %g +/- %g  " % (alpha, self._alphaerr),
            if verbose:
                print "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ",
            print "Log-Likelihood: %g  " % L,
            if verbose:
                print "\nThe KS-test statistic between the best-fit power-law and the data, ",
            print "ks: %g" % (ks),
            if scipyOK:
                if verbose: print " occurs with probability  ",
                print "p(ks): %g" % (self._ks_prob)
            else:
                print

        return xmin, alpha
Beispiel #4
0
    def plfit(self,
              nosmall=True,
              finite=False,
              quiet=False,
              silent=False,
              usefortran=False,
              usecy=False,
              xmin=None,
              verbose=False,
              discrete=None,
              discrete_approx=True,
              discrete_n_alpha=1000):
        """
        Fit a power law to ``self.data`` and return ``(xmin, alpha)``.

        A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m
        from http://www.santafe.edu/~aaronc/powerlaws/

        See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions
        in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062)
        http://arxiv.org/abs/0706.1062

        There are 3 implementations of xmin estimation.  The fortran version is fastest, the C (cython)
        version is ~10% slower, and the python version is ~3x slower than the fortran version.
        Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown
        reasons.

        There is also a discrete version implemented in python - it is different from the continuous version!

        *discrete* [ bool | None ]
            If *discrete* is None, the code will try to determine whether the
            data set is discrete or continuous based on the uniqueness of the
            data; if your data set is continuous but you have any non-unique
            data points (e.g., flagged "bad" data), the "automatic"
            determination will fail.  If *discrete* is True or False, the
            discrete or continuous fitter will be used, respectively.

        *xmin* [ float / int ]
            If you specify xmin, the fitter will only determine alpha assuming
            the given xmin; the rest of the code (and most of the complexity)
            is determining an estimate for xmin and alpha.

        *nosmall* [ bool (True) ]
            When on, the code rejects low s/n points.  WARNING: This option,
            which is on by default, may result in different answers than the
            original Matlab code and the "powerlaw" python package

        *finite* [ bool (False) ]
            There is a 'finite-size bias' to the estimator.  The "alpha" the code measures
            is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or α = (1 + ᾶ (n-1)) / n

        *quiet* [ bool (False) ]
            If False, delivers messages about what fitter is used and the fit results

        *verbose* [ bool (False) ]
            Deliver descriptive messages about the fit parameters (only if *quiet*==False)

        *silent* [ bool (False) ]
            If True, suppresses the warning / failure messages printed by this
            method (timing and result messages are controlled by *quiet*)

        *usefortran*, *usecy* [ bool (False) ]
            Request the fortran / cython xmin search.  Each is only used when
            the module-level flag fortranOK / cyOK is also true; otherwise the
            pure-python search runs.

        *discrete_approx*, *discrete_n_alpha*
            Forwarded to ``self.discrete_best_alpha`` as ``approximate`` and
            ``n_alpha`` when the discrete fitter is used.

        Returns ``(xmin, alpha)``.  When the discrete fitter is used the
        values come from ``self.discrete_best_alpha``; when only one point is
        kept, ``(xmin, 0)`` is returned.

        Raises ValueError if the data contain negative values or if the
        likelihood, xmin or alpha is NaN; ImportError if *quiet* is False and
        a requested fortran / cython module did not load; AssertionError if
        the recomputed alpha disagrees with the searched value.

        The fit results are also stored on the instance (``_nunique``,
        ``_xmin``, ``_alpha``, ``_alphaerr``, ``_likelihood``, ``_ks``,
        ``_ngtx``, ``_xmins`` and, when xmin is searched for,
        ``_alpha_values``, ``_xmin_kstest``, ``_sigma``).
        """
        x = self.data
        if any(x < 0):
            raise ValueError("Power law distributions are only valid for "
                             "positive data.  Remove negative values before "
                             "fitting.")
        z = np.sort(x)

        # xmins = the unique values of x that can be used as the threshold for
        # the power law fit
        # argxmins = the index of each of these possible thresholds
        xmins, argxmins = np.unique(z, return_index=True)
        self._nunique = len(xmins)

        # Auto-detect the fitter: any repeated value selects the discrete one.
        if self._nunique == len(x) and discrete is None:
            if verbose:
                print("Using CONTINUOUS fitter because there are no repeated "
                      "values.")
            discrete = False
        elif self._nunique < len(x) and discrete is None:
            if verbose:
                print("Using DISCRETE fitter because there are repeated "
                      "values.")
            discrete = True

        t = time.time()
        if xmin is None:
            if discrete:
                # The discrete fitter stores its own results on self; nothing
                # below this point runs for that case.
                self.discrete_best_alpha(approximate=discrete_approx,
                                         n_alpha=discrete_n_alpha,
                                         verbose=verbose,
                                         finite=finite)
                return self._xmin, self._alpha
            elif usefortran and fortranOK:
                # The second argument (0) switches off the backend's own
                # nosmall handling; the cut is applied uniformly below.
                # (presumably the same flag as int(nosmall) - TODO confirm)
                kstest_values, alpha_values = fplfit.plfit(z, 0)
                if not quiet:
                    print(("FORTRAN plfit executed in %f seconds" %
                           (time.time() - t)))
            elif usecy and cyOK:
                kstest_values, alpha_values = cplfit.plfit_loop(
                    z, nosmall=False, zunique=xmins, argunique=argxmins)
                if not quiet:
                    print(("CYTHON plfit executed in %f seconds" %
                           (time.time() - t)))
            else:
                # python (numpy) version
                f_alpha = alpha_gen(z)
                f_kstest = kstest_gen(z)
                alpha_values = np.asarray(list(map(f_alpha, xmins)),
                                          dtype='float')
                kstest_values = np.asarray(list(map(f_kstest, xmins)),
                                           dtype='float')
                if not quiet:
                    print(("PYTHON plfit executed in %f seconds" %
                           (time.time() - t)))

            # NOTE(review): the ImportError is raised only when quiet is
            # False, and only after the python fallback above has already
            # run; with quiet=True a missing backend falls back silently.
            # Confirm this asymmetry is intended.
            if not quiet:
                if usefortran and not fortranOK:
                    raise ImportError("fortran fplfit did not load")
                if usecy and not cyOK:
                    raise ImportError("cython cplfit did not load")

            # For each alpha, the number of included data points is
            # total data length - first index of xmin
            # No +1 is needed: xmin is included.
            sigma = (alpha_values - 1) / np.sqrt(len(z) - argxmins)
            # I had changed it to this, but I think this is wrong.
            # sigma = (alpha_values-1)/np.sqrt(len(z)-np.arange(len(z)))

            if nosmall:
                # test to make sure the number of data points is high enough
                # to provide a reasonable s/n on the computed alpha
                goodvals = sigma < 0.1
                # Index of the first candidate failing the cut (assuming
                # argmin is numpy's: first False of a boolean array, or 0
                # when every entry is equal).
                nmax = argmin(goodvals)
                if nmax <= 0:
                    nmax = len(xmins) - 1
                    if not silent:
                        print("Not enough data left after flagging "
                              "low S/N points.  "
                              "Using all data.")
            else:
                # -1 to weed out the very last data point; it cannot be correct
                # (can't have a power law with 1 data point).
                nmax = len(xmins) - 1

            # Best xmin is the candidate, among the first nmax, with the
            # smallest KS statistic.
            best_ks_index = argmin(kstest_values[:nmax])
            xmin = xmins[best_ks_index]

            self._alpha_values = alpha_values
            self._xmin_kstest = kstest_values
            if scipyOK:
                # CHECK THIS
                # NOTE(review): the sample size passed to ksone.sf is
                # len(kstest_values) - ii, which equals the number of points
                # at or above the ii-th xmin only when every data value is
                # unique - confirm.
                self._ks_prob_all = np.array([
                    scipy.stats.ksone.sf(D_stat,
                                         len(kstest_values) - ii)
                    for ii, D_stat in enumerate(kstest_values)
                ])
            self._sigma = sigma

            # sanity check
            # Recompute alpha directly at the chosen xmin and require it to
            # match the value found during the search to 5 decimals.
            n = np.count_nonzero(z >= xmin)
            alpha = 1. + float(n) / sum(log(z[z >= xmin] / xmin))
            try:
                np.testing.assert_almost_equal(alpha,
                                               alpha_values[best_ks_index],
                                               decimal=5)
            except AssertionError:
                raise AssertionError("The alpha value computed was not self-"
                                     "consistent.  This should not happen.")

        # From here on the fit uses only the data at or above xmin (either
        # the searched value or the one supplied by the caller).
        z = z[z >= xmin]
        n = len(z)
        # Maximum-likelihood estimate of alpha.
        alpha = 1. + float(n) / sum(log(z / xmin))
        if finite:
            alpha = alpha * (n - 1.) / n + 1. / n
        if n < 50 and not finite and not silent:
            print(
                ('(PLFIT) Warning: finite-size bias may be present. n=%i' % n))

        ks = max(abs(np.arange(n) / float(n) - (1 - (xmin / z)**(alpha - 1))))
        # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) =
        # (alpha-1)/xmin.  Really is Eqn B3 in paper.
        L = n * log((alpha - 1) / xmin) - alpha * sum(log(z / xmin))
        #requires another map... Larr = arange(len(unique(x))) * log((alpha_values-1)/unique(x)) - alpha_values*sum
        self._likelihood = L
        self._xmin = xmin
        self._xmins = xmins
        self._alpha = alpha
        self._alphaerr = (alpha - 1) / np.sqrt(n)

        # this ks statistic may not have the same value as min(dat) because of unique()
        self._ks = ks

        if scipyOK:
            self._ks_prob = scipy.stats.ksone.sf(ks, n)

        self._ngtx = n
        if n == 1:
            # A single surviving point cannot constrain a power law: zero out
            # the stored results and report failure through alpha == 0.
            if not silent:
                print(
                    "Failure: only 1 point kept.  Probably not a power-law distribution."
                )
            self._alpha = alpha = 0
            self._alphaerr = 0
            self._likelihood = L = 0
            self._ks = 0
            self._ks_prob = 0
            self._xmin = xmin
            return xmin, 0
        if np.isnan(L) or np.isnan(xmin) or np.isnan(alpha):
            raise ValueError("plfit failed; returned a nan")

        if not quiet:
            # end=' ' keeps the summary on one line unless verbose adds
            # explicit newlines before each description.
            if verbose:
                print("The lowest value included in the power-law fit, ",
                      end=' ')
            print("xmin: %g" % xmin, end=' ')
            if verbose: print("\nThe number of values above xmin, ", end=' ')
            print("n(>xmin): %i" % n, end=' ')
            if verbose:
                print(
                    "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ",
                    end=' ')
            print("alpha: %g +/- %g  " % (alpha, self._alphaerr), end=' ')
            if verbose:
                print(
                    "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ",
                    end=' ')
            print("Log-Likelihood: %g  " % L, end=' ')
            if verbose:
                print(
                    "\nThe KS-test statistic between the best-fit power-law and the data, ",
                    end=' ')
            print("ks: %g" % (ks), end=' ')
            if scipyOK:
                if verbose: print(" occurs with probability  ", end=' ')
                print("p(ks): %g" % (self._ks_prob))
            else:
                print()

        return xmin, alpha
Beispiel #5
0
    def plfit(self, nosmall=True, finite=False, quiet=False, silent=False,
            usefortran=False, usecy=False, xmin=None, verbose=False, 
            discrete=None, discrete_approx=True, discrete_n_alpha=1000):
        """
        A Python implementation of the Matlab code http://www.santafe.edu/~aaronc/powerlaws/plfit.m
        from http://www.santafe.edu/~aaronc/powerlaws/

        See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law distributions
        in empirical data" SIAM Review, 51, 661-703 (2009). (arXiv:0706.1062)
        http://arxiv.org/abs/0706.1062

        There are 3 implementations of xmin estimation.  The fortran version is fastest, the C (cython)
        version is ~10% slower, and the python version is ~3x slower than the fortran version.
        Also, the cython code suffers ~2% numerical error relative to the fortran and python for unknown
        reasons.

        There is also a discrete version implemented in python - it is different from the continous version!

        *discrete* [ bool | None ]
            If *discrete* is None, the code will try to determine whether the
            data set is discrete or continous based on the uniqueness of the
            data; if your data set is continuous but you have any non-unique
            data points (e.g., flagged "bad" data), the "automatic"
            determination will fail.  If *discrete* is True or False, the
            distcrete or continuous fitter will be used, respectively.

        *xmin* [ float / int ]
            If you specify xmin, the fitter will only determine alpha assuming
            the given xmin; the rest of the code (and most of the complexity)
            is determining an estimate for xmin and alpha.

        *nosmall* [ bool (True) ]
            When on, the code rejects low s/n points.  WARNING: This option,
            which is on by default, may result in different answers than the
            original Matlab code and the "powerlaw" python package

        *finite* [ bool (False) ]
            There is a 'finite-size bias' to the estimator.  The "alpha" the code measures
            is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or α = (1 + ᾶ (n-1)) / n

        *quiet* [ bool (False) ]
            If False, delivers messages about what fitter is used and the fit results

        *verbose* [ bool (False) ] 
            Deliver descriptive messages about the fit parameters (only if *quiet*==False)

        *silent* [ bool (False) ] 
            If True, will print NO messages
        """
        x = self.data
        if any(x < 0):
            raise ValueError("Power law distributions are only valid for "
                             "positive data.  Remove negative values before "
                             "fitting.")
        z = np.sort(x)

        # xmins = the unique values of x that can be used as the threshold for
        # the power law fit
        # argxmins = the index of each of these possible thresholds
        xmins,argxmins = np.unique(z,return_index=True)
        self._nunique = len(xmins)
        
        if self._nunique == len(x) and discrete is None:
            if verbose:
                print("Using CONTINUOUS fitter because there are no repeated "
                      "values.")
            discrete = False
        elif self._nunique < len(x) and discrete is None:
            if verbose:
                print("Using DISCRETE fitter because there are repeated "
                      "values.")
            discrete = True

        t = time.time()
        if xmin is None:
            if discrete:
                self.discrete_best_alpha(approximate=discrete_approx,
                                         n_alpha=discrete_n_alpha,
                                         verbose=verbose,
                                         finite=finite)
                return self._xmin,self._alpha
            elif usefortran and fortranOK:
                kstest_values,alpha_values = fplfit.plfit(z, 0)
                if not quiet:
                    print("FORTRAN plfit executed in %f seconds" % (time.time()-t))
            elif usecy and cyOK:
                kstest_values,alpha_values = cplfit.plfit_loop(z,
                                                               nosmall=False,
                                                               zunique=xmins,
                                                               argunique=argxmins)
                if not quiet:
                    print("CYTHON plfit executed in %f seconds" % (time.time()-t))
            else:
                # python (numpy) version
                f_alpha = alpha_gen(z)
                f_kstest = kstest_gen(z)
                alpha_values = np.asarray(map(f_alpha,xmins),
                                             dtype='float')
                kstest_values = np.asarray(map(f_kstest,xmins),
                                              dtype='float')
                if not quiet:
                    print("PYTHON plfit executed in %f seconds" % (time.time()-t))

            if not quiet: 
                if usefortran and not fortranOK:
                    raise ImportError("fortran fplfit did not load")
                if usecy and not cyOK: 
                    raise ImportError("cython cplfit did not load")

            # For each alpha, the number of included data points is
            # total data length - first index of xmin
            # No +1 is needed: xmin is included.
            sigma = (alpha_values-1)/np.sqrt(len(z)-argxmins)

            if nosmall:
                # test to make sure the number of data points is high enough
                # to provide a reasonable s/n on the computed alpha
                goodvals = sigma<0.1
                nmax = argmin(goodvals)
                if nmax <= 0:
                    nmax = len(xmins) - 1
                    if not silent: 
                        print("Not enough data left after flagging "
                              "low S/N points.  "
                              "Using all data.")
            else:
                # -1 to weed out the very last data point; it cannot be correct
                # (can't have a power law with 1 data point).
                nmax = len(xmins)-1

            best_ks_index = argmin(kstest_values[:nmax])
            xmin  = xmins[best_ks_index] 

            self._alpha_values = alpha_values
            self._xmin_kstest = kstest_values
            self._sigma = sigma

            # sanity check
            n = np.count_nonzero(z>=xmin)
            alpha = 1. + float(n)/sum(log(z[z>=xmin]/xmin))
            try:
                np.testing.assert_almost_equal(alpha, alpha_values[best_ks_index],
                                               decimal=5)
            except AssertionError:
                raise AssertionError("The alpha value computed was not self-"
                                     "consistent.  This should not happen.")

        z     = z[z>=xmin]
        n     = len(z)
        alpha = 1. + float(n) / sum(log(z/xmin))
        if finite:
            alpha = alpha*(n-1.)/n+1./n
        if n < 50 and not finite and not silent:
            print('(PLFIT) Warning: finite-size bias may be present. n=%i' % n)

        ks = max(abs( np.arange(n)/float(n) - (1-(xmin/z)**(alpha-1)) ))
        # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) =
        # (alpha-1)/xmin.  Really is Eqn B3 in paper.
        L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin))
        #requires another map... Larr = arange(len(unique(x))) * log((alpha_values-1)/unique(x)) - alpha_values*sum
        self._likelihood = L
        self._xmin = xmin
        self._xmins = xmins
        self._alpha= alpha
        self._alphaerr = (alpha-1)/np.sqrt(n)

        # this ks statistic may not have the same value as min(dat) because of unique()
        self._ks = ks

        if scipyOK:
            self._ks_prob = scipy.stats.kstwobign.sf(ks*np.sqrt(n))

        self._ngtx = n
        if n == 1:
            if not silent:
                print "Failure: only 1 point kept.  Probably not a power-law distribution."
            self._alpha = alpha = 0
            self._alphaerr = 0
            self._likelihood = L = 0
            self._ks = 0
            self._ks_prob = 0
            self._xmin = xmin
            return xmin,0
        if np.isnan(L) or np.isnan(xmin) or np.isnan(alpha):
            raise ValueError("plfit failed; returned a nan")

        if not quiet:
            if verbose: print "The lowest value included in the power-law fit, ",
            print "xmin: %g" % xmin,
            if verbose: print "\nThe number of values above xmin, ",
            print "n(>xmin): %i" % n,
            if verbose: print "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ",
            print "alpha: %g +/- %g  " % (alpha,self._alphaerr), 
            if verbose: print "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ",
            print "Log-Likelihood: %g  " % L,
            if verbose: print "\nThe KS-test statistic between the best-fit power-law and the data, ",
            print "ks: %g" % (ks),
            if scipyOK:
                if verbose: print " occurs with probability  ",
                print "p(ks): %g" % (self._ks_prob)
            else:
                print

        return xmin,alpha