def plfit(self, nosmall=True, finite=False, quiet=False, silent=False,
          usefortran=False, usecy=False, xmin=None, verbose=False,
          discrete=None, discrete_approx=True, discrete_n_alpha=1000):
    """
    A Python implementation of the Matlab code
    http://www.santafe.edu/~aaronc/powerlaws/plfit.m
    from http://www.santafe.edu/~aaronc/powerlaws/

    See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law
    distributions in empirical data" SIAM Review, 51, 661-703 (2009).
    (arXiv:0706.1062) http://arxiv.org/abs/0706.1062

    There are 3 implementations of xmin estimation.  The fortran version
    is fastest, the C (cython) version is ~10% slower, and the python
    version is ~3x slower than the fortran version.  Also, the cython
    code suffers ~2% numerical error relative to the fortran and python
    for unknown reasons.

    There is also a discrete version implemented in python - it is
    different from the continuous version!

    *discrete* [ bool | None ]
        If *discrete* is None, the code will try to determine whether the
        data set is discrete or continuous based on the uniqueness of the
        data.  If *discrete* is True or False, the discrete or continuous
        fitter will be used, respectively.

    *xmin* [ float / int ]
        If you specify xmin, the fitter will only determine alpha assuming
        the given xmin; the rest of the code (and most of the complexity)
        is determining an estimate for xmin and alpha.

    *nosmall* [ bool (True) ]
        When on, the code rejects low s/n points.

    *finite* [ bool (False) ]
        There is a 'finite-size bias' to the estimator.  The "alpha" the
        code measures is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or
        α = (1 + ᾶ (n-1)) / n

    *quiet* [ bool (False) ]
        If False, delivers messages about what fitter is used and the fit
        results.

    *verbose* [ bool (False) ]
        Deliver descriptive messages about the fit parameters (only if
        *quiet*==False).

    *silent* [ bool (False) ]
        If True, will print NO messages.
    """
    x = self.data
    z = numpy.sort(x)
    t = time.time()
    xmins,argxmins = numpy.unique(z,return_index=True)#[:-1]
    self._nunique = len(xmins)

    if self._nunique == len(x) and discrete is None:
        if verbose:
            print "Using CONTINUOUS fitter"
        discrete = False
    elif self._nunique < len(x) and discrete is None:
        if verbose:
            print "Using DISCRETE fitter"
        discrete = True

    t = time.time()
    if xmin is None:
        if discrete:
            self.discrete_best_alpha(approximate=discrete_approx,
                                     n_alpha=discrete_n_alpha,
                                     verbose=verbose,
                                     finite=finite)
            return self._xmin,self._alpha
        elif usefortran and fortranOK:
            dat,av = fplfit.plfit(z,int(nosmall))
            goodvals = dat>0
            sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins]
            dat = dat[goodvals]
            av = av[goodvals]
            if nosmall:
                # dat, av already treated for this; sigma, xmins not
                nmax = argmin(sigma<0.1)
                xmins = xmins[:nmax]
                sigma = sigma[:nmax]
            if not quiet:
                print "FORTRAN plfit executed in %f seconds" % (time.time()-t)
        elif usecy and cyOK:
            dat,av = cplfit.plfit_loop(z, nosmall=nosmall, zunique=xmins,
                                       argunique=argxmins)
            goodvals = dat>0
            sigma = (av-1)/numpy.sqrt(len(z)-argxmins)
            dat = dat[goodvals]
            av = av[goodvals]
            if not quiet:
                print "CYTHON plfit executed in %f seconds" % (time.time()-t)
        else:
            av = numpy.asarray(map(self.alpha_(z),xmins), dtype='float')
            dat = numpy.asarray(map(self.kstest_(z),xmins), dtype='float')
            sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1)
            if nosmall:
                # test to make sure the number of data points is high enough
                # to provide a reasonable s/n on the computed alpha
                goodvals = sigma<0.1
                nmax = argmin(goodvals)
                if nmax > 0:
                    dat = dat[:nmax]
                    xmins = xmins[:nmax]
                    av = av[:nmax]
                    sigma = sigma[:nmax]
                else:
                    if not silent:
                        print "Not enough data left after flagging - using all positive data."
            if not quiet:
                print "PYTHON plfit executed in %f seconds" % (time.time()-t)
                if usefortran:
                    print "fortran fplfit did not load"
                if usecy:
                    print "cython cplfit did not load"
        self._av = av
        self._xmin_kstest = dat
        self._sigma = sigma
        xmin = xmins[argmin(dat)]

    z = z[z>=xmin]
    n = len(z)
    alpha = 1 + n / sum(log(z/xmin))
    if finite:
        alpha = alpha*(n-1.)/n + 1./n

    if n < 50 and not finite and not silent:
        print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n

    ks = max(abs(numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1))))
    # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) =
    # (alpha-1)/xmin.  Really is Eqn B3 in the paper.
    L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin))
    #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum
    self._likelihood = L
    self._xmin = xmin
    self._xmins = xmins
    self._alpha = alpha
    self._alphaerr = (alpha-1)/numpy.sqrt(n)
    # this ks statistic may not have the same value as min(dat) because of unique()
    self._ks = ks
    if scipyOK:
        self._ks_prob = scipy.stats.kstwobign.sf(ks*numpy.sqrt(n))
    self._ngtx = n

    if n == 1:
        if not silent:
            print "Failure: only 1 point kept.  Probably not a power-law distribution."
        self._alpha = alpha = 0
        self._alphaerr = 0
        self._likelihood = L = 0
        self._ks = 0
        self._ks_prob = 0
        self._xmin = xmin
        return xmin,0
    if numpy.isnan(L) or numpy.isnan(xmin) or numpy.isnan(alpha):
        raise ValueError("plfit failed; returned a nan")

    if not quiet:
        if verbose: print "The lowest value included in the power-law fit, ",
        print "xmin: %g" % xmin,
        if verbose: print "\nThe number of values above xmin, ",
        print "n(>xmin): %i" % n,
        if verbose: print "\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ",
        print "alpha: %g +/- %g " % (alpha,self._alphaerr),
        if verbose: print "\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ",
        print "Log-Likelihood: %g " % L,
        if verbose: print "\nThe KS-test statistic between the best-fit power-law and the data, ",
        print "ks: %g" % (ks),
        if scipyOK:
            if verbose: print " occurs with probability ",
            print "p(ks): %g" % (self._ks_prob)
        else:
            print

    return xmin,alpha
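# The core of every branch above is two closed-form steps: the continuous
# MLE, alpha = 1 + n/sum(log(z/xmin)), and the KS distance between the model
# CDF and the empirical CDF.  The sketch below isolates just those two steps.
# `demo_alpha_ks` is a hypothetical helper and the Pareto data are synthetic;
# neither is part of this module.
import numpy

def demo_alpha_ks(z, xmin):
    # Hypothetical helper, for illustration only.
    # MLE for the continuous power-law exponent at fixed xmin
    # (Clauset et al. 2009, Eqn 3.1), matching `1 + n / sum(log(z/xmin))` above.
    zcut = numpy.sort(z[z >= xmin])
    n = len(zcut)
    alpha = 1 + n / numpy.sum(numpy.log(zcut / xmin))
    # KS distance between the empirical CDF and the model CDF 1-(xmin/x)^(alpha-1)
    ks = max(abs(numpy.arange(n) / float(n) - (1 - (xmin / zcut)**(alpha - 1))))
    return alpha, ks

# numpy.random.pareto(a) + 1 has p(x) ~ x^-(a+1) for x >= 1, so alpha ~ 2.5 here
z = numpy.random.pareto(1.5, 10000) + 1
print(demo_alpha_ks(z, 1.0))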
def plfit(self, nosmall=True, finite=False, quiet=False, silent=False,
          usefortran=usefortran, usecy=False, xmin=None):
    """
    A Python implementation of the Matlab code
    http://www.santafe.edu/~aaronc/powerlaws/plfit.m
    from http://www.santafe.edu/~aaronc/powerlaws/

    See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law
    distributions in empirical data" SIAM Review, 51, 661-703 (2009).
    (arXiv:0706.1062) http://arxiv.org/abs/0706.1062

    nosmall is on by default; it rejects low s/n points.
    You can specify xmin to skip the xmin estimation.

    There are 3 implementations of xmin estimation.  The fortran version
    is fastest, the C (cython) version is ~10% slower, and the python
    version is ~3x slower than the fortran version.  Also, the cython
    code suffers ~2% numerical error relative to the fortran and python
    for unknown reasons.
    """
    x = self.data
    z = numpy.sort(x)
    t = time.time()
    xmins,argxmins = numpy.unique(z,return_index=True)#[:-1]

    t = time.time()
    if xmin is None:
        if usefortran:
            dat,av = fplfit.plfit(z,int(nosmall))
            goodvals = dat>0
            sigma = ((av-1)/numpy.sqrt(len(z)-numpy.arange(len(z))))[argxmins]
            dat = dat[goodvals]
            av = av[goodvals]
            if not quiet:
                print "FORTRAN plfit executed in %f seconds" % (time.time()-t)
        elif usecy and cyOK:
            dat,av = cplfit.plfit_loop(z, nosmall=nosmall, zunique=xmins,
                                       argunique=argxmins)
            goodvals = dat>0
            sigma = (av-1)/numpy.sqrt(len(z)-argxmins)
            dat = dat[goodvals]
            av = av[goodvals]
            if not quiet:
                print "CYTHON plfit executed in %f seconds" % (time.time()-t)
        else:
            av = numpy.asarray(map(self.alpha_(z),xmins), dtype='float')
            dat = numpy.asarray(map(self.kstest_(z),xmins), dtype='float')
            # compute sigma unconditionally so self._sigma is always defined
            sigma = (av-1)/numpy.sqrt(len(z)-argxmins+1)
            if nosmall:
                # test to make sure the number of data points is high enough
                # to provide a reasonable s/n on the computed alpha
                goodvals = sigma<0.1
                nmax = argmin(goodvals)
                dat = dat[:nmax]
                av = av[:nmax]
            if not quiet:
                print "PYTHON plfit executed in %f seconds" % (time.time()-t)
        self._av = av
        self._xmin_kstest = dat
        self._sigma = sigma
        xmin = xmins[argmin(dat)]

    z = z[z>=xmin]
    n = len(z)
    alpha = 1 + n / sum(log(z/xmin))
    if finite:
        alpha = alpha*(n-1.)/n + 1./n

    if n < 50 and not finite and not silent:
        print '(PLFIT) Warning: finite-size bias may be present. n=%i' % n

    ks = max(abs(numpy.arange(n)/float(n) - (1-(xmin/z)**(alpha-1))))
    L = n*log((alpha-1)/xmin) - alpha*sum(log(z/xmin))
    #requires another map... Larr = arange(len(unique(x))) * log((av-1)/unique(x)) - av*sum
    self._likelihood = L
    self._xmin = xmin
    self._xmins = xmins
    self._alpha = alpha
    self._alphaerr = (alpha-1)/numpy.sqrt(n)
    # this ks statistic may not have the same value as min(dat) because of unique()
    self._ks = ks
    self._ngtx = n

    if not quiet:
        print "xmin: %g  n(>xmin): %i  alpha: %g +/- %g  Likelihood: %g  ks: %g" % (xmin,n,alpha,self._alphaerr,L,ks)

    return xmin,alpha
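# For context, a typical call pattern for this method.  It assumes, as in the
# plfit package, that the enclosing class is constructed with the data array;
# the `plfit.plfit` module/class names are that assumption, not something
# shown in this section.
import numpy
import plfit  # assumption: this method lives on the plfit.plfit class

data = numpy.random.pareto(1.5, 5000) + 1  # synthetic power law, alpha ~ 2.5
p = plfit.plfit(data)
xmin, alpha = p.plfit(quiet=True)   # estimate xmin and alpha jointly
xmin1, alpha1 = p.plfit(xmin=1.0)   # skip xmin estimation; fit alpha at fixed xmin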
def plfit(self, nosmall=True, finite=False, quiet=False, silent=False,
          usefortran=False, usecy=False, xmin=None, verbose=False,
          discrete=None, discrete_approx=True, discrete_n_alpha=1000):
    """
    A Python implementation of the Matlab code
    http://www.santafe.edu/~aaronc/powerlaws/plfit.m
    from http://www.santafe.edu/~aaronc/powerlaws/

    See A. Clauset, C.R. Shalizi, and M.E.J. Newman, "Power-law
    distributions in empirical data" SIAM Review, 51, 661-703 (2009).
    (arXiv:0706.1062) http://arxiv.org/abs/0706.1062

    There are 3 implementations of xmin estimation.  The fortran version
    is fastest, the C (cython) version is ~10% slower, and the python
    version is ~3x slower than the fortran version.  Also, the cython
    code suffers ~2% numerical error relative to the fortran and python
    for unknown reasons.

    There is also a discrete version implemented in python - it is
    different from the continuous version!

    *discrete* [ bool | None ]
        If *discrete* is None, the code will try to determine whether the
        data set is discrete or continuous based on the uniqueness of the
        data; if your data set is continuous but you have any non-unique
        data points (e.g., flagged "bad" data), the "automatic"
        determination will fail.  If *discrete* is True or False, the
        discrete or continuous fitter will be used, respectively.

    *xmin* [ float / int ]
        If you specify xmin, the fitter will only determine alpha assuming
        the given xmin; the rest of the code (and most of the complexity)
        is determining an estimate for xmin and alpha.

    *nosmall* [ bool (True) ]
        When on, the code rejects low s/n points.  WARNING: This option,
        which is on by default, may result in different answers than the
        original Matlab code and the "powerlaw" python package.

    *finite* [ bool (False) ]
        There is a 'finite-size bias' to the estimator.  The "alpha" the
        code measures is "alpha-hat" s.t. ᾶ = (nα-1)/(n-1), or
        α = (1 + ᾶ (n-1)) / n

    *quiet* [ bool (False) ]
        If False, delivers messages about what fitter is used and the fit
        results.

    *verbose* [ bool (False) ]
        Deliver descriptive messages about the fit parameters (only if
        *quiet*==False).

    *silent* [ bool (False) ]
        If True, will print NO messages.
    """
    x = self.data
    if any(x < 0):
        raise ValueError("Power law distributions are only valid for "
                         "positive data.  Remove negative values before "
                         "fitting.")
    z = np.sort(x)

    # xmins = the unique values of x that can be used as the threshold for
    # the power law fit
    # argxmins = the index of each of these possible thresholds
    xmins, argxmins = np.unique(z, return_index=True)
    self._nunique = len(xmins)

    if self._nunique == len(x) and discrete is None:
        if verbose:
            print("Using CONTINUOUS fitter because there are no repeated "
                  "values.")
        discrete = False
    elif self._nunique < len(x) and discrete is None:
        if verbose:
            print("Using DISCRETE fitter because there are repeated "
                  "values.")
        discrete = True

    t = time.time()
    if xmin is None:
        if discrete:
            self.discrete_best_alpha(approximate=discrete_approx,
                                     n_alpha=discrete_n_alpha,
                                     verbose=verbose,
                                     finite=finite)
            return self._xmin, self._alpha
        elif usefortran and fortranOK:
            kstest_values, alpha_values = fplfit.plfit(z, 0)
            if not quiet:
                print("FORTRAN plfit executed in %f seconds" % (time.time() - t))
        elif usecy and cyOK:
            kstest_values, alpha_values = cplfit.plfit_loop(z,
                                                            nosmall=False,
                                                            zunique=xmins,
                                                            argunique=argxmins)
            if not quiet:
                print("CYTHON plfit executed in %f seconds" % (time.time() - t))
        else:
            # python (numpy) version
            f_alpha = alpha_gen(z)
            f_kstest = kstest_gen(z)
            alpha_values = np.asarray(list(map(f_alpha, xmins)),
                                      dtype='float')
            kstest_values = np.asarray(list(map(f_kstest, xmins)),
                                       dtype='float')
            if not quiet:
                print("PYTHON plfit executed in %f seconds" % (time.time() - t))

        if not quiet:
            if usefortran and not fortranOK:
                raise ImportError("fortran fplfit did not load")
            if usecy and not cyOK:
                raise ImportError("cython cplfit did not load")

        # For each alpha, the number of included data points is
        # total data length - first index of xmin.
        # No +1 is needed: xmin is included.
        sigma = (alpha_values - 1) / np.sqrt(len(z) - argxmins)
        # I had changed it to this, but I think this is wrong.
        # sigma = (alpha_values-1)/np.sqrt(len(z)-np.arange(len(z)))

        if nosmall:
            # test to make sure the number of data points is high enough
            # to provide a reasonable s/n on the computed alpha
            goodvals = sigma < 0.1
            nmax = argmin(goodvals)
            if nmax <= 0:
                nmax = len(xmins) - 1
                if not silent:
                    print("Not enough data left after flagging "
                          "low S/N points.  "
                          "Using all data.")
        else:
            # -1 to weed out the very last data point; it cannot be correct
            # (can't have a power law with 1 data point).
            nmax = len(xmins) - 1

        best_ks_index = argmin(kstest_values[:nmax])
        xmin = xmins[best_ks_index]

        self._alpha_values = alpha_values
        self._xmin_kstest = kstest_values
        if scipyOK:
            # CHECK THIS
            self._ks_prob_all = np.array([scipy.stats.ksone.sf(D_stat,
                                                               len(kstest_values) - ii)
                                          for ii, D_stat in
                                          enumerate(kstest_values)])
        self._sigma = sigma

        # sanity check
        n = np.count_nonzero(z >= xmin)
        alpha = 1. + float(n) / sum(log(z[z >= xmin] / xmin))
        try:
            np.testing.assert_almost_equal(alpha,
                                           alpha_values[best_ks_index],
                                           decimal=5)
        except AssertionError:
            raise AssertionError("The alpha value computed was not self-"
                                 "consistent.  This should not happen.")

    z = z[z >= xmin]
    n = len(z)
    alpha = 1. + float(n) / sum(log(z / xmin))
    if finite:
        alpha = alpha * (n - 1.) / n + 1. / n

    if n < 50 and not finite and not silent:
        print('(PLFIT) Warning: finite-size bias may be present. n=%i' % n)

    ks = max(abs(np.arange(n) / float(n) - (1 - (xmin / z)**(alpha - 1))))
    # Parallels Eqn 3.5 in Clauset et al 2009, but zeta(alpha, xmin) =
    # (alpha-1)/xmin.  Really is Eqn B3 in the paper.
    L = n * log((alpha - 1) / xmin) - alpha * sum(log(z / xmin))
    #requires another map... Larr = arange(len(unique(x))) * log((alpha_values-1)/unique(x)) - alpha_values*sum
    self._likelihood = L
    self._xmin = xmin
    self._xmins = xmins
    self._alpha = alpha
    self._alphaerr = (alpha - 1) / np.sqrt(n)
    # this ks statistic may not have the same value as min(kstest_values)
    # because of unique()
    self._ks = ks
    if scipyOK:
        self._ks_prob = scipy.stats.ksone.sf(ks, n)
    self._ngtx = n

    if n == 1:
        if not silent:
            print("Failure: only 1 point kept.  Probably not a power-law "
                  "distribution.")
        self._alpha = alpha = 0
        self._alphaerr = 0
        self._likelihood = L = 0
        self._ks = 0
        self._ks_prob = 0
        self._xmin = xmin
        return xmin, 0
    if np.isnan(L) or np.isnan(xmin) or np.isnan(alpha):
        raise ValueError("plfit failed; returned a nan")

    if not quiet:
        if verbose: print("The lowest value included in the power-law fit, ", end=' ')
        print("xmin: %g" % xmin, end=' ')
        if verbose: print("\nThe number of values above xmin, ", end=' ')
        print("n(>xmin): %i" % n, end=' ')
        if verbose: print("\nThe derived power-law alpha (p(x)~x^-alpha) with MLE-derived error, ", end=' ')
        print("alpha: %g +/- %g " % (alpha, self._alphaerr), end=' ')
        if verbose: print("\nThe log of the Likelihood (the maximized parameter; you minimized the negative log likelihood), ", end=' ')
        print("Log-Likelihood: %g " % L, end=' ')
        if verbose: print("\nThe KS-test statistic between the best-fit power-law and the data, ", end=' ')
        print("ks: %g" % (ks), end=' ')
        if scipyOK:
            if verbose: print(" occurs with probability ", end=' ')
            print("p(ks): %g" % (self._ks_prob))
        else:
            print()

    return xmin, alpha
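# Putting the pieces together: the fortran, cython, and python branches above
# all implement the same scan, fitting alpha at every unique candidate
# threshold and keeping the KS minimizer.  Below is a pure-numpy sketch of
# that scan; `demo_xmin_scan` is a hypothetical name, the data are synthetic,
# and the nosmall S/N cut is omitted for brevity.
import numpy as np

def demo_xmin_scan(x):
    # Hypothetical illustration of the xmin scan; not part of this module.
    z = np.sort(x)
    xmins, argxmins = np.unique(z, return_index=True)
    best = (np.inf, np.nan, np.nan)
    # The last unique value is excluded: a 1-point "fit" divides by zero.
    for xm, idx in zip(xmins[:-1], argxmins[:-1]):
        zcut = z[idx:]                                   # all data >= xm
        n = len(zcut)
        alpha = 1 + n / np.sum(np.log(zcut / xm))        # MLE at this threshold
        ks = max(abs(np.arange(n) / float(n) -
                     (1 - (xm / zcut)**(alpha - 1))))    # KS distance
        if ks < best[0]:
            best = (ks, xm, alpha)
    return best  # (ks, xmin, alpha)

data = np.random.pareto(1.5, 2000) + 1
print(demo_xmin_scan(data))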