def __init__(self, MetricTable):
    """Build the table of sorted performance ratios from ``MetricTable``.

    For each problem, every solver metric is divided by the smallest metric
    recorded for that problem. Each solver's column of ratios is then sorted,
    and masked entries are replaced by a value slightly above the largest
    ratio.

    Parameters
    ----------
    MetricTable : object
        Must expose ``nprobs``, ``nsolvs`` and ``prob_mets(prob)``.
        NOTE(review): ``prob_mets`` presumably returns a 1-D masked array
        with one entry per solver, masked where the solver failed -- confirm.

    Notes
    -----
    Reads ``opts.minlimit`` from the enclosing module. Sets ``self.ratios``
    (a plain array after filling) and ``self.maxrat``.
    """
    # Create empty ratio table. It has one more row than there are problems;
    # row 0 is never assigned below and therefore keeps its zeros.
    nprobs = MetricTable.nprobs
    nsolvs = MetricTable.nsolvs
    self.ratios = ma.masked_array(1.0 * ma.zeros((nprobs + 1, nsolvs)))

    # Compute best relative performance ratios across
    # solvers for each problem
    for prob in range(nprobs):
        metrics = MetricTable.prob_mets(prob)
        # ma.min / ma.max are the reductions. The single-argument call form
        # ma.minimum(x) / ma.maximum(x) used previously is deprecated.
        best_met = ma.min(metrics)
        if ma.count(metrics) == nsolvs and ma.max(metrics) <= opts.minlimit:
            # No entry is masked and all metrics are below the threshold:
            # treat every solver as equally good on this problem.
            self.ratios[prob + 1, :] = 1.0
        else:
            self.ratios[prob + 1, :] = metrics * (1.0 / best_met)

    # Sort each solvers performance ratios
    for solv in range(nsolvs):
        self.ratios[:, solv] = ma.sort(self.ratios[:, solv])

    # Compute largest ratio and use to replace failures entries
    self.maxrat = ma.max(self.ratios)
    self.ratios = ma.filled(self.ratios, 1.01 * self.maxrat)
def __init__(self, MetricTable):
    """Build the table of sorted performance ratios from ``MetricTable``.

    For each problem, every solver metric is divided by the smallest metric
    recorded for that problem. Each solver's column of ratios is then sorted,
    and masked entries are replaced by a value slightly above the largest
    ratio.

    Parameters
    ----------
    MetricTable : object
        Must expose ``nprobs``, ``nsolvs`` and ``prob_mets(prob)``.
        NOTE(review): ``prob_mets`` presumably returns a 1-D masked array
        with one entry per solver, masked where the solver failed -- confirm.

    Notes
    -----
    Reads ``opts.minlimit`` from the enclosing module. Sets ``self.ratios``
    (a plain array after filling) and ``self.maxrat``.
    """
    # Create empty ratio table. It has one more row than there are problems;
    # row 0 is never assigned below and therefore keeps its zeros.
    nprobs = MetricTable.nprobs
    nsolvs = MetricTable.nsolvs
    self.ratios = ma.masked_array(1.0 * ma.zeros((nprobs + 1, nsolvs)))

    # Compute best relative performance ratios across
    # solvers for each problem
    for prob in range(nprobs):
        metrics = MetricTable.prob_mets(prob)
        # Use the ma.min / ma.max reductions instead of the deprecated
        # single-argument call form of ma.minimum / ma.maximum.
        best_met = ma.min(metrics)
        all_valid = ma.count(metrics) == nsolvs
        if all_valid and ma.max(metrics) <= opts.minlimit:
            # Every solver succeeded and all metrics are below the
            # threshold: treat them as equally good on this problem.
            self.ratios[prob + 1, :] = 1.0
        else:
            self.ratios[prob + 1, :] = metrics * (1.0 / best_met)

    # Sort each solvers performance ratios
    for solv in range(nsolvs):
        self.ratios[:, solv] = ma.sort(self.ratios[:, solv])

    # Compute largest ratio and use to replace failures entries
    self.maxrat = ma.max(self.ratios)
    self.ratios = ma.filled(self.ratios, 1.01 * self.maxrat)
def __init__(self, MetricTable, opts):
    """Build the table of sorted performance ratios from ``MetricTable``.

    For each problem, every solver metric (optionally shifted by a small
    epsilon) is divided by the smallest metric for that problem. Each
    solver's column of ratios is then sorted, and masked entries are replaced
    by ten times the largest ratio.

    Parameters
    ----------
    MetricTable : object
        Must expose ``nprobs``, ``nsolvs`` and ``prob_mets(prob)``; the
        latter must return something that supports ``+ float`` and masked
        array reductions (e.g. a 1-D masked array with one entry per solver).
    opts : object
        Only ``opts.cpu`` is read. When truthy, 0.01 is added to every
        metric before the ratios are formed.

    Notes
    -----
    Sets ``self.ratios`` (a plain array after filling) and ``self.maxrat``.
    """
    epsilon = 0.01 if opts.cpu else 0.0

    # Create empty ratio table. The builtin ``float`` replaces the removed
    # ``numpy.float`` alias; both denote the same dtype.
    nprobs = MetricTable.nprobs
    nsolvs = MetricTable.nsolvs
    self.ratios = ma.zeros((nprobs, nsolvs), dtype=float)

    # Compute best relative performance ratios across
    # solvers for each problem
    for prob in range(nprobs):
        metrics = MetricTable.prob_mets(prob) + epsilon
        # ma.min is the reduction; the single-argument call form of
        # ma.minimum used previously is deprecated.
        best_met = ma.min(metrics)
        self.ratios[prob, :] = metrics * (1.0 / best_met)

    # Sort each solvers performance ratios
    for solv in range(nsolvs):
        self.ratios[:, solv] = ma.sort(self.ratios[:, solv])

    # Compute largest ratio and use to replace failure entries
    self.maxrat = ma.max(self.ratios)
    self.ratios = ma.filled(self.ratios, 10 * self.maxrat)
def predict(self, mu, sigma, Ys, model=None):
    """Return predictive mean, variance and (optionally) log density.

    Parameters
    ----------
    mu, sigma : array-like
        Latent predictive mean and variance. ``self.sigma`` is added to
        ``sigma`` before use.
    Ys : array-like or None
        Observed targets at which to evaluate the log predictive density.
        When ``None`` no density is computed.
    model : object
        Must expose ``model.Y``. Although the default is ``None``, the body
        dereferences ``model.Y`` unconditionally, so a model is required.

    Returns
    -------
    mean : array
        Weighted combination of ``self.warpinv`` evaluated at ten quadrature
        points, divided by ``sqrt(pi)``.
    var : array
        ``((q_0.9 - q_0.1) / 4) ** 2`` of the inverse-warped quantiles,
        returned as a column.
    lpd : array or None
        Log predictive density as a column, or ``None`` when ``Ys`` is
        ``None``. (Previously the ``None`` case crashed on indexing.)
    """
    # calculating var
    s = sigma + self.sigma
    alpha = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8000, 0.9000])
    # Gaussian quantiles at the probability levels in ``alpha``.
    q = np.outer(np.sqrt(2 * s), erfinv(2 * alpha - 1)) + mu

    z = self.warp(model.Y)[0]
    I = argsort(z, axis=0)
    sortz = sort(z, axis=0)
    sortt = model.Y[I]

    quant = self.warpinv(q, self._get_initial_points(q, sortz, sortt), 100)
    # Columns 8 and 0 are the 0.9 and 0.1 quantiles.
    var = np.square((quant[:, 8] - (quant[:, 0])) / 4)

    # calculating mu
    # NOTE(review): H / quard look like 10-point Gauss-Hermite weights and
    # nodes -- confirm against the reference the values were taken from.
    H = np.array([7.6e-07, 0.0013436, 0.0338744, 0.2401386, 0.6108626,
                  0.6108626, 0.2401386, 0.0338744, 0.0013436, 7.6e-07])
    quard = np.array([-3.4361591, -2.5327317, -1.7566836, -1.0366108,
                      -0.3429013, 0.3429013, 1.0366108, 1.7566836,
                      2.5327317, 3.4361591])
    mu_quad = np.outer(np.sqrt(2 * s), quard) + mu
    mean = self.warpinv(mu_quad,
                        self._get_initial_points(mu_quad, sortz, sortt), 100)
    mean = mdot(mean, H[:, np.newaxis]) / np.sqrt(math.pi)

    lpd = None
    if Ys is not None:
        ts, w = self.warp(Ys)
        lpd = (-0.5 * np.log(2 * math.pi * s)
               - 0.5 * np.square(ts - mu) / s
               + np.log(w))
        # Keep the first column only and return it as a column vector.
        lpd = lpd[:, 0][:, np.newaxis]
    return mean, var[:, np.newaxis], lpd
def test_sort(self):
    """``ma.sort`` must agree with indexing the series by its ``argsort``."""
    series = self.series
    # Configure the series before sorting it.
    for attribute, value in (("thresholds", (-0.5, +0.5)),
                             ("minimum_size", 5)):
        setattr(series, attribute, value)

    original_indices = series.indices
    order = series.argsort()
    sorted_series = ma.sort(series)

    # Both the values and the attached indices must follow the same order.
    assert_equal(sorted_series, series[order])
    assert_equal(sorted_series.indices, original_indices[order])
def _lhsmu(N, samples=None, corr=None, random_state=None, M=5):
    """Generate a Latin-hypercube design after thinning a random candidate set.

    ``M * samples`` uniform candidate points are drawn in ``N`` dimensions.
    The candidate whose two nearest neighbours are closest on average is
    removed repeatedly until ``samples`` points remain. The survivors are
    then either correlated through ``corr`` or re-drawn inside equal-width
    strata so that every column holds exactly one point per stratum.

    Parameters
    ----------
    N : int
        Number of dimensions (columns).
    samples : int, optional
        Number of points to return; defaults to ``N``.
    corr : ndarray of shape (N, N), optional
        Matrix passed to ``linalg.cholesky``; when given, the stratified
        re-draw is skipped.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness.
    M : int, optional
        Oversampling factor for the candidate set.

    Returns
    -------
    ndarray of shape (samples, N)
    """
    if random_state is None:
        random_state = np.random.RandomState()
    elif not isinstance(random_state, np.random.RandomState):
        random_state = np.random.RandomState(random_state)

    if samples is None:
        samples = N

    n_candidates = M * samples
    n_discard = n_candidates - samples
    rdpoints = random_state.uniform(size=(n_candidates, N))

    # Pairwise distances, with the zero self-distances masked out.
    pairwise = spatial.distance.cdist(rdpoints, rdpoints, metric='euclidean')
    D_ij = ma.masked_array(pairwise, mask=np.identity(n_candidates))

    index_rm = np.zeros(n_discard, dtype=int)
    for step in range(n_discard):
        # Mean distance to the two nearest remaining neighbours of each point.
        two_nearest = ma.sort(D_ij, axis=1)[:, 0:2]
        crowded = ma.argmin(ma.mean(two_nearest, axis=1))
        # Remove the most crowded point from all further comparisons.
        D_ij[crowded, :] = ma.masked
        D_ij[:, crowded] = ma.masked
        index_rm[step] = crowded

    rdpoints = np.delete(rdpoints, index_rm, axis=0)

    if corr is None:
        H = np.zeros_like(rdpoints, dtype=float)
        rank = np.argsort(rdpoints, axis=0)
        for stratum in range(samples):
            lower = float(stratum) / samples
            upper = float(stratum + 1) / samples
            H[rank == stratum] = random_state.uniform(lower, upper, size=N)
        return H

    # check if covariance matrix is valid
    assert type(corr) == np.ndarray
    assert corr.ndim == 2
    assert corr.shape[0] == corr.shape[1]
    assert corr.shape[0] == N
    gaussian = stats.norm().ppf(rdpoints)
    factor = linalg.cholesky(corr, lower=True)
    gaussian = np.matmul(gaussian, factor)
    return stats.norm().cdf(gaussian)
def test_sort(self):
    """``ma.sort`` must agree with indexing the series by its ``argsort``."""
    series = self.series
    # Configure the series before sorting it.
    for attribute, value in (("thresholds", (-0.5, +0.5)),
                             ("minimum_size", 5)):
        setattr(series, attribute, value)

    original_indices = series.indices
    order = series.argsort()
    sorted_series = ma.sort(series)

    # Both the values and the attached indices must follow the same order.
    assert_equal(sorted_series, series[order])
    assert_equal(sorted_series.indices, original_indices[order])
def test_testCI(self):
    """Exercise string conversion, indexing and slice assignment of masked arrays.

    The statements below mutate ``x1`` .. ``x4`` in sequence, so their order
    matters: each assertion checks the state left by the preceding
    assignments.
    """
    # Test of conversions and indexing
    x1 = np.array([1, 2, 4, 3])
    x2 = array(x1, mask=[1, 0, 0, 0])
    x3 = array(x1, mask=[0, 1, 0, 1])
    x4 = array(x1)
    # test conversion to strings
    str(x2)  # raises?
    repr(x2)  # raises?
    assert_(eq(np.sort(x1), sort(x2, fill_value=0)))
    # tests of indexing
    assert_(type(x2[1]) is type(x1[1]))
    assert_(x1[1] == x2[1])
    assert_(x2[0] is masked)
    assert_(eq(x1[2], x2[2]))
    assert_(eq(x1[2:5], x2[2:5]))
    assert_(eq(x1[:], x2[:]))
    assert_(eq(x1[1:], x3[1:]))
    # Item and slice assignment of plain values on both arrays.
    x1[2] = 9
    x2[2] = 9
    assert_(eq(x1, x2))
    x1[1:3] = 99
    x2[1:3] = 99
    assert_(eq(x1, x2))
    # Assigning ``masked`` to an item or slice of the masked array only.
    x2[1] = masked
    assert_(eq(x1, x2))
    x2[1:3] = masked
    assert_(eq(x1, x2))
    # Whole-array assignment, then check the resulting masks.
    x2[:] = x1
    x2[1] = masked
    assert_(allequal(getmask(x2), array([0, 1, 0, 0])))
    x3[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
    assert_(allequal(getmask(x3), array([0, 1, 1, 0])))
    x4[:] = masked_array([1, 2, 3, 4], [0, 1, 1, 0])
    assert_(allequal(getmask(x4), array([0, 1, 1, 0])))
    assert_(allequal(x4, array([1, 2, 3, 4])))
    # masked_values: mask, dtype of the mask and fill value.
    x1 = np.arange(5) * 1.0
    x2 = masked_values(x1, 3.0)
    assert_(eq(x1, x2))
    assert_(allequal(array([0, 0, 0, 1, 0], MaskType), x2.mask))
    assert_(eq(3.0, x2.fill_value))
    # Object arrays: indexing must hand back the stored Python object.
    x1 = array([1, 'hello', 2, 3], object)
    x2 = np.array([1, 'hello', 2, 3], object)
    s1 = x1[1]
    s2 = x2[1]
    assert_equal(type(s2), str)
    assert_equal(type(s1), str)
    assert_equal(s1, s2)
    # An empty slice keeps a one-dimensional, zero-length shape.
    assert_(x1[1:1].shape == (0,))
def get_pairwise():
    """Compute and save pairwise cosine distances between topic words.

    Reads the hard-coded ``Nokia6610.twords`` (tab-separated words, first
    line is a header) and ``Nokia6610.twdist`` (numeric matrix) files, builds
    one weight vector of length ``ntopic`` per distinct word longer than one
    character, and writes the square cosine-distance matrix to
    ``../Data/pairwise.txt``.

    Returns
    -------
    keys : list of str
        The words, sorted.
    pairwise : ndarray
        Square cosine-distance matrix; row/column ``i`` corresponds to
        ``keys[i]``.
    """
    ntopic = 100
    # Alternative corpus (hai2012):
    # E:\python_workplace\hai2012\corpus\corpus_NP\corpus_NP.twords
    # E:\python_workplace\hai2012\corpus\corpus_NP\corpus_NP.twdist
    wdict = {}
    # The context manager closes the words file even if parsing fails.
    with open(
            r'E:\python_workplace\Opinion_Mining\Data\Nokia 6610\Nokia6610.twords',
            encoding='utf-8') as f:
        tword_array = loadtxt(
            r'E:\python_workplace\Opinion_Mining\Data\Nokia 6610\Nokia6610.twdist')
        # Sort every row in descending order, keep the first 100 columns and
        # transpose.
        tword_array = -sort(-tword_array, axis=1)
        tword_array = tword_array[:, 0:100].transpose()
        for num, line in enumerate(f):
            if num == 0:
                continue  # skip the header line
            words = re.split("\t", line.strip())
            # Record (data line index, column index) for each word; words of
            # length <= 1 are only recorded if already known (never, in
            # practice, since they are never inserted).
            for dcount, w in enumerate(words):
                if w in wdict:
                    wdict[w].append((num - 1, dcount))
                elif len(w) > 1:
                    wdict[w] = [(num - 1, dcount)]
    print(wdict)
    keys = sorted(wdict)
    print(keys)
    # Start from a tiny constant instead of zeros (an all-zero row has no
    # defined cosine distance).
    w_t = numpy.ones([len(keys), ntopic]) * 0.000001
    for i, k in enumerate(keys):
        for d in wdict[k]:
            w_t[i, d[1]] = tword_array[d[0]][d[1]]
    print(w_t)
    print(w_t.size)
    pairwise = spatial.distance.squareform(
        spatial.distance.pdist(w_t, metric="cosine"))
    # Alternative metric:
    # spatial.distance.pdist(w_t, lambda i, j: KL_Measure(i, j))
    pairwise_filename = r'../Data/pairwise.txt'
    savetxt(pairwise_filename, pairwise, fmt='%.8f')
    print(pairwise)
    print(pairwise.size)
    return keys, pairwise
def idealfourths(data, axis=None):
    """Return an estimate of the lower and upper quartiles of the data.

    The quartiles are computed with the ideal fourths along the given axis.

    Parameters
    ----------
    data : array_like
        Input array. Masked entries are ignored.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the array is
        flattened first.

    Returns
    -------
    list of floats or masked array
        ``[qlo, qup]`` when ``axis`` is None, otherwise the result of applying
        the estimate along ``axis``. Both values are NaN when fewer than three
        unmasked values are available.
    """
    def _idf(data):
        x = data.compressed()
        n = len(x)
        if n < 3:
            return [np.nan, np.nan]
        (j, h) = divmod(n / 4. + 5 / 12., 1)
        # divmod on floats returns a float quotient; array indices must be
        # integers.
        j = int(j)
        qlo = (1 - h) * x[j - 1] + h * x[j]
        k = n - j
        qup = (1 - h) * x[k] + h * x[k - 1]
        return [qlo, qup]

    data = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is None:
        return _idf(data)
    return ma.apply_along_axis(_idf, axis, data)
def get_pairwise():
    """Compute and save pairwise cosine distances between topic words.

    Reads the hard-coded ``Nokia6610.twords`` (tab-separated words, first
    line is a header) and ``Nokia6610.twdist`` (numeric matrix) files, builds
    one weight vector of length ``ntopic`` per distinct word longer than one
    character, and writes the square cosine-distance matrix to
    ``../Data/pairwise.txt``.

    Returns
    -------
    keys : list of str
        The words, sorted.
    pairwise : ndarray
        Square cosine-distance matrix; row/column ``i`` corresponds to
        ``keys[i]``.
    """
    ntopic = 100
    # Alternative corpus (hai2012):
    # E:\python_workplace\hai2012\corpus\corpus_NP\corpus_NP.twords
    # E:\python_workplace\hai2012\corpus\corpus_NP\corpus_NP.twdist
    wdict = {}
    # The context manager closes the words file even if parsing fails.
    with open(
            r'E:\python_workplace\Opinion_Mining\Data\Nokia 6610\Nokia6610.twords',
            encoding='utf-8') as f:
        tword_array = loadtxt(
            r'E:\python_workplace\Opinion_Mining\Data\Nokia 6610\Nokia6610.twdist')
        # Sort every row in descending order, keep the first 100 columns and
        # transpose.
        tword_array = -sort(-tword_array, axis=1)
        tword_array = tword_array[:, 0:100].transpose()
        for num, line in enumerate(f):
            if num == 0:
                continue  # skip the header line
            words = re.split("\t", line.strip())
            # Record (data line index, column index) for each word; only
            # words longer than one character are ever inserted.
            for dcount, w in enumerate(words):
                if w in wdict:
                    wdict[w].append((num - 1, dcount))
                elif len(w) > 1:
                    wdict[w] = [(num - 1, dcount)]
    print(wdict)
    keys = sorted(wdict)
    print(keys)
    # Start from a tiny constant instead of zeros (an all-zero row has no
    # defined cosine distance).
    w_t = numpy.ones([len(keys), ntopic]) * 0.000001
    for i, k in enumerate(keys):
        for d in wdict[k]:
            w_t[i, d[1]] = tword_array[d[0]][d[1]]
    print(w_t)
    print(w_t.size)
    pairwise = spatial.distance.squareform(
        spatial.distance.pdist(w_t, metric="cosine"))
    # Alternative metric:
    # spatial.distance.pdist(w_t, lambda i, j: KL_Measure(i, j))
    pairwise_filename = r'../Data/pairwise.txt'
    savetxt(pairwise_filename, pairwise, fmt='%.8f')
    print(pairwise)
    print(pairwise.size)
    return keys, pairwise
def equi_n_discretization(array, intervals=5, dim=1):
    """Discretize a masked array into bins holding (nearly) equal counts.

    Along axis ``dim`` the unmasked values of every line are split into
    ``intervals`` groups of equal size (the first ``count % intervals``
    groups get one extra element). Each value is then replaced by the number
    of group upper bounds it exceeds.

    Parameters
    ----------
    array : 2-D masked array
        Must expose ``.mask``.
    intervals : int, optional
        Number of groups.
    dim : {0, 1}, optional
        Axis along which each line is discretized.

    Returns
    -------
    masked array of floats with the mask of ``array``; masked positions keep
    the underlying value -1.
    """
    valid = ma.array(ma.ones(array.shape, dtype=int), mask=array.mask)
    count = ma.sum(valid, dim)
    n_lines = len(count)
    ordered = ma.sort(array, dim)
    remainder = count % intervals

    # Shape that lets one cut point per line broadcast against ``array``.
    cut_shape = list(array.shape)
    cut_shape[dim] = 1
    line_index = list(range(n_lines))

    points = []
    cut = ma.zeros(n_lines, dtype=int)
    for i in range(intervals):
        # Advance the cut by the base group size, plus one while the
        # remainder has not been used up.
        cut = cut + count // intervals + numpy.ones(n_lines) * (remainder > i)
        last = numpy.array(cut, dtype=int) - 1
        if dim == 1:
            bound = ordered[line_index, last]
        else:
            bound = ordered[last, line_index]
        points.append(bound.reshape(cut_shape))

    darray = ma.array(ma.zeros(array.shape) - 1, mask=array.mask)
    darray[ma.nonzero(array <= points[0])] = 0
    for label, bound in enumerate(points, 1):
        darray[ma.nonzero(array > bound)] = label
    return darray
def idealfourths(data, axis=None):
    """
    Estimate the lower and upper quartiles with the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array. Masked entries are ignored.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the array is
        flattened first.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        The two internal values that divide `data` into four parts, computed
        either on the flattened array (if `axis` is None) or along `axis`.
        Both values are NaN when fewer than three unmasked values exist.

    """
    def _idf(data):
        values = data.compressed()
        size = len(values)
        if size < 3:
            return [np.nan, np.nan]
        # Integer position and interpolation weight of the lower fourth.
        whole, frac = divmod(size / 4. + 5 / 12., 1)
        low = int(whole)
        high = size - low
        lower = (1 - frac) * values[low - 1] + frac * values[low]
        upper = (1 - frac) * values[high] + frac * values[high - 1]
        return [lower, upper]

    ordered = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is not None:
        return ma.apply_along_axis(_idf, axis, ordered)
    return _idf(ordered)
def predict(self, mu, sigma, Ys, model=None):
    """Return predictive mean, variance and (optionally) log density.

    Parameters
    ----------
    mu, sigma : array-like
        Latent predictive mean and variance. ``self.sigma`` is added to
        ``sigma`` before use.
    Ys : array-like or None
        Observed targets at which to evaluate the log predictive density.
        When ``None`` no density is computed.
    model : object
        Must expose ``model.Y``. Although the default is ``None``, the body
        dereferences ``model.Y`` unconditionally, so a model is required.

    Returns
    -------
    mean : array
        Weighted combination of ``self.warpinv`` evaluated at ten quadrature
        points, divided by ``sqrt(pi)``.
    var : array
        ``((q_0.9 - q_0.1) / 4) ** 2`` of the inverse-warped quantiles,
        returned as a column.
    lpd : array or None
        Log predictive density as a column, or ``None`` when ``Ys`` is
        ``None``. (Previously the ``None`` case crashed on indexing.)
    """
    # calculating var
    s = sigma + self.sigma
    alpha = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8000, 0.9000])
    # Gaussian quantiles at the probability levels in ``alpha``.
    q = np.outer(np.sqrt(2 * s), erfinv(2 * alpha - 1)) + mu

    z = self.warp(model.Y)[0]
    I = argsort(z, axis=0)
    sortz = sort(z, axis=0)
    sortt = model.Y[I]

    quant = self.warpinv(q, self._get_initial_points(q, sortz, sortt), 100)
    # Columns 8 and 0 are the 0.9 and 0.1 quantiles.
    var = np.square((quant[:, 8] - (quant[:, 0])) / 4)

    # calculating mu
    # NOTE(review): H / quard look like 10-point Gauss-Hermite weights and
    # nodes -- confirm against the reference the values were taken from.
    H = np.array(
        [7.6e-07, 0.0013436, 0.0338744, 0.2401386, 0.6108626,
         0.6108626, 0.2401386, 0.0338744, 0.0013436, 7.6e-07]
    )
    quard = np.array(
        [
            -3.4361591,
            -2.5327317,
            -1.7566836,
            -1.0366108,
            -0.3429013,
            0.3429013,
            1.0366108,
            1.7566836,
            2.5327317,
            3.4361591,
        ]
    )
    mu_quad = np.outer(np.sqrt(2 * s), quard) + mu
    mean = self.warpinv(mu_quad, self._get_initial_points(mu_quad, sortz, sortt), 100)
    mean = mdot(mean, H[:, np.newaxis]) / np.sqrt(math.pi)

    lpd = None
    if Ys is not None:
        ts, w = self.warp(Ys)
        lpd = -0.5 * np.log(2 * math.pi * s) - 0.5 * np.square(ts - mu) / s + np.log(w)
        # Keep the first column only and return it as a column vector.
        lpd = lpd[:, 0][:, np.newaxis]
    return mean, var[:, np.newaxis], lpd
def idealfourths(data, axis=None):
    """
    Estimate the lower and upper quartiles with the ideal fourths algorithm.

    Parameters
    ----------
    data : array_like
        Input array. Masked entries are ignored.
    axis : int, optional
        Axis along which the quartiles are estimated. If None, the array is
        flattened first.

    Returns
    -------
    idealfourths : {list of floats, masked array}
        The two internal values that divide `data` into four parts, computed
        either on the flattened array (if `axis` is None) or along `axis`.
        Both values are NaN when fewer than three unmasked values exist.

    """
    def _idf(data):
        values = data.compressed()
        size = len(values)
        if size < 3:
            return [np.nan, np.nan]
        # Integer position and interpolation weight of the lower fourth.
        whole, frac = divmod(size / 4.0 + 5 / 12.0, 1)
        low = int(whole)
        high = size - low
        lower = (1 - frac) * values[low - 1] + frac * values[low]
        upper = (1 - frac) * values[high] + frac * values[high - 1]
        return [lower, upper]

    ordered = ma.sort(data, axis=axis).view(MaskedArray)
    if axis is not None:
        return ma.apply_along_axis(_idf, axis, ordered)
    return _idf(ordered)
def equi_n_discretization(array, intervals=5, dim=1):
    """Discretize a masked array into bins holding (nearly) equal counts.

    Along axis ``dim`` the unmasked values of every line are split into
    ``intervals`` groups of equal size (the first ``count % intervals``
    groups get one extra element). Each value is then replaced by the number
    of group upper bounds it exceeds.

    Parameters
    ----------
    array : 2-D masked array
        Must expose ``.mask``.
    intervals : int, optional
        Number of groups.
    dim : {0, 1}, optional
        Axis along which each line is discretized.

    Returns
    -------
    masked array of floats with the mask of ``array``; masked positions keep
    the underlying value -1.
    """
    valid = ma.array(ma.ones(array.shape, dtype=int), mask=array.mask)
    count = ma.sum(valid, dim)
    n_lines = len(count)
    ordered = ma.sort(array, dim)
    remainder = count % intervals

    # Shape that lets one cut point per line broadcast against ``array``.
    cut_shape = list(array.shape)
    cut_shape[dim] = 1
    line_index = list(range(n_lines))

    points = []
    cut = ma.zeros(n_lines, dtype=int)
    for i in range(intervals):
        # Advance the cut by the base group size, plus one while the
        # remainder has not been used up.
        cut = cut + count // intervals + numpy.ones(n_lines) * (remainder > i)
        last = numpy.array(cut, dtype=int) - 1
        if dim == 1:
            bound = ordered[line_index, last]
        else:
            bound = ordered[last, line_index]
        points.append(bound.reshape(cut_shape))

    darray = ma.array(ma.zeros(array.shape) - 1, mask=array.mask)
    darray[ma.nonzero(array <= points[0])] = 0
    for label, bound in enumerate(points, 1):
        darray[ma.nonzero(array > bound)] = label
    return darray
def fringe(self, exposure, fringe):
    """Fringe subtraction

    Measures the median signal in randomly placed boxes on both the science
    and fringe frames, fits the science measurements against the fringe
    measurements with iterative rejection, and subtracts the fringe frame
    scaled by the fitted slope from the science image (in place).

    @param exposure  Exposure to process
    @param fringe  Fringe frame to apply
    """
    assert exposure, "No exposure provided"
    assert fringe, "No fringe provided"

    fringe = self._checkDimensions("fringe", exposure, fringe)

    # XXX This is a first cut at fringe subtraction.  It should be fairly simple to generalise to allow
    # multiple fringe frames (generated from, e.g., Principal Component Analysis) and solve for the linear
    # combination that best reproduces the fringes on the science frame.
    # Optimisations:
    # * Push the whole thing into C++
    # * Persist the fringe measurements along with the fringe frame

    science = exposure.getMaskedImage()
    fringe = fringe.getMaskedImage()

    # XXX Fringe can have mask bits set, because afwMath.statisticsStack propagates them
    fringe.getMask().set(0)

    width, height = exposure.getWidth(), exposure.getHeight()

    policy = self.config['fringe']
    num = policy['num']
    size = policy['size']
    iterations = policy['iterations']
    clip = policy['clip']
    discard = policy['discard']

    # Box origins drawn from [1, dimension - size] inclusive.  randint with
    # an exclusive upper bound of (high + 1) is the documented replacement
    # for the deprecated random_integers(high).
    xList = numpy.random.randint(1, width - size + 1, size=num)
    yList = numpy.random.randint(1, height - size + 1, size=num)

    bgStats = afwMath.makeStatistics(science, afwMath.MEDIAN | afwMath.STDEVCLIP)
    bgScience = bgStats.getValue(afwMath.MEDIAN)
    sdScience = bgStats.getValue(afwMath.STDEVCLIP)
    bgFringe = afwMath.makeStatistics(fringe, afwMath.MEDIAN).getValue()

    # Background-subtracted median of each box on both frames.
    measScience = ma.zeros(num)
    measFringe = ma.zeros(num)
    for i in range(num):
        x, y = int(xList[i]), int(yList[i])
        bbox = afwGeom.Box2I(afwGeom.Point2I(x, y), afwGeom.Point2I(x + size - 1, y + size - 1))

        subScience = science.Factory(science, bbox, afwImage.LOCAL)
        subFringe = fringe.Factory(fringe, bbox, afwImage.LOCAL)

        measScience[i] = afwMath.makeStatistics(subScience, afwMath.MEDIAN).getValue() - bgScience
        measFringe[i] = afwMath.makeStatistics(subFringe, afwMath.MEDIAN).getValue() - bgFringe

    # Immediately discard measurements that aren't in the background 'noise' (which includes the fringe
    # modulation).  These have been corrupted by objects.
    limit = discard * sdScience
    masked = ma.masked_outside(measScience, -limit, limit)
    measScience.mask = masked.mask
    measFringe.mask = masked.mask

    self.log.log(self.log.DEBUG, "Fringe discard: %f %d" % (limit, measScience.count()))

    def regression(x, y, n):
        """Least-squares slope of y against x, using n as the sample count."""
        return ((x * y).sum() - x.sum() * y.sum() / n) / ((x**2).sum() - x.sum()**2 / n)

    # Solve for the fringe amplitude, with rejection of bad points
    # NOTE(review): the sample count passed to regression() is always
    # 2.0 * num, not the number of unmasked points -- confirm this is intended.
    lastNum = num
    for i in range(iterations):
        slope = regression(measFringe, measScience, 2.0 * num)
        intercept = measScience.mean() - slope * measFringe.mean()

        fit = measFringe * slope + intercept
        resid = measScience - fit
        sortedResid = ma.sort(resid.copy())
        # NOTE(review): on the first pass lastNum is num even if points were
        # already discarded above, so these indices may land on masked
        # entries at the end of the sorted residuals -- confirm.
        rms = 0.74 * (sortedResid[int(0.75 * lastNum)] - sortedResid[int(0.25 * lastNum)])
        limit = clip * rms

        resid = ma.masked_outside(resid, -limit, limit)
        measScience.mask = resid.mask
        measFringe.mask = resid.mask

        newNum = resid.count()
        self.log.log(self.log.DEBUG, "Fringe iter %d: %f %f %f %d" % (i, slope, intercept, rms, newNum))
        if newNum == lastNum:
            # Iterating isn't buying us anything
            break
        lastNum = newNum

    slope = regression(measFringe, measScience, 2.0 * num)
    self.log.log(self.log.INFO, "Fringe amplitude scaling: %f" % slope)
    science.scaledMinus(slope, fringe)
def plot_fdc(series, multimode=True, plot_enso=False,
             starting_month=None, lag=6,
             scale='log', xmin=0.0005, xmax=0.9995, ax=None, **kwargs):
    """
    Plots one or several flow duration curves (FDCs) for the series.

    The input series should be 1D or 2D.
    By default, if the series is 1D, one curve only will be plotted, whereas
    if the series is 2D, a curve will be plotted for each line of the series.

    A 1D series can also be converted into an annual series
    with the :keyword:`starting_month` parameter.
    In that case, ``starting_month`` should be an integer between 1 and 12
    specifying the month at which the 12-month period should start.
    For example, to plot the FDCs for each water year (usually from April to
    the following March), use ``starting_month=4``.

    When ``plot_enso=True``, ENSO phases are plotted with different colors.
    When the series is 2D or if it has been converted to an annual frequency,
    the ENSO indices are defined with the ``full_year=True`` option,
    where an ENSO episode lasts at least 12 consecutive months.

    Parameters
    ----------
    series : TimeSeries
        Flow data.
    ax : {None, :class:`matplotlib.axes.Axes`}, optional
        Subplot where to plot the flow duration curves.
        If None, use the current plot.
    multimode : {True, False}, optional
        Whether to interpret a 2D input series as several series or a single
        one.
    starting_month : {None, integer}, optional
        First month of each year.
        If None, plots the global flow duration curve.
        Otherwise, ``starting_month`` must be an integer between 1 and 12,
        corresponding to the first month of the water year
        (usually, 4 for April).
    plot_enso : {True, False}, optional
        Whether to plot each ENSO phase with a different color.
    lag : {integer}, optional
        Number of months of lag for the definition of ENSO indices. For
        example, if lag=6, the ENSO phase starting in Oct. 2001 is applied
        starting on Apr. 2002.
        If None, use a lag computed as the time difference between
        ``starting_month`` and the first month of the reference season of the
        ENSO indicator (or October if undefined).
    scale : {'log','lin'}, optional
        String indicating whether the x-axis is in log (``'log'``) or linear
        (``'lin'``) scale.
        If ``'log'``, each plotting position is expressed as a Gaussian pdf.
    xmin, xmax : float, optional
        Limits forwarded to ``set_normal_limits``.
    other parameters :
        The parameters recognized by the :func:`matplotlib.pyplot.plot`
        function are also recognized.

    Returns
    -------
    The axes the curves were added to.

    Raises
    ------
    InvalidENSOError
        If ``plot_enso=True`` but no ENSO indicator is attached to the series.
    ValueError
        * If the series has more than two dimensions.
        * If ``starting_month`` is not between 1 and 12.
        * If ``starting_month`` is defined but the initial series is not 1D.
        * If ``scale`` is neither ``'lin'`` nor ``'log'``.

    """
    if ax is None:
        ax = gca()
    # Make sure we have at most a 2D series ...............
    if series.ndim > 2:
        raise ValueError("The input series should be 2D at most!")
    # Get the ENSO indicator associated w/ the series (if any)
    ensoindicator = getattr(series, 'ensoindicator', None)
    # Check the starting month ............................
    if starting_month is not None:
        # Make sure we have an integer between 1 and 12
        starting_month = int(starting_month)
        if (starting_month < 1) or (starting_month > 12):
            errmsg = "The starting month should be between 1 (Jan.) and "\
                     "12 (Dec.)! (got %s instead)" % starting_month
            raise ValueError(errmsg)
    # Check whether we need to plot the ENSO information ..
    if plot_enso is True:
        # Make sure we have some ENSO information .........
        if ensoindicator is None:
            errmsg = "No ENSO information is associated with the input series."
            raise InvalidENSOError(errmsg)
        # Reset the indices if we have a starting_month ...
        if starting_month is not None:
            if lag is None:
                refmonth = (ensoindicator.reference_season or [10, ])[0]
                lag = (starting_month + 12 - refmonth) % 12
            series.set_ensoindices(full_year=True, lag=lag)
        else:
            # Make sure that the indices are already set
            series.set_ensoindices()
        # Load the default marker colors ..................
        from scikits.hydroclimpy.plotlib.ensotools import ENSOlines, \
                                                          ENSOmarkers, \
                                                          ENSOlabels
    # No ENSO information to plot : get basic lines & markers
    else:
        ENSOlines = {'G':'#cccccc'}
        ENSOmarkers = {'G':'#cccccc'}
    # Check whether we are in multimode or not ............
    ## 1D input
    if series.ndim == 1:
        # Convert to annual if needed
        if starting_month:
            multimode = True
            series = series.convert(FR_ANNSTART[starting_month - 1], func=None)
        else:
            multimode = False
        _series = series.view(ma.MaskedArray)
    ## 2D input
    else:
        #  w/ starting month
        if starting_month is not None:
            # starting_month only makes sense for a 1D series.
            errmsg = "The input series should be 1D! (got %s instead)"
            raise ValueError(errmsg % str(series.shape))
        # Always define the masked view: it was previously left undefined
        # for 2D input in multimode, which raised a NameError just below.
        _series = series.view(ma.MaskedArray)
        # w/o multimode : treat the whole array as a single series
        if not multimode:
            _series = _series.ravel()
    # Get the number of valid data per year (ie, per row)
    n = _series.count(axis= -1)
    # Get the xdata .........
    scale = scale[:3].lower()
    if scale == 'lin':
        if multimode:
            xdata = [np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx)
                     for nx in n]
        else:
            xdata = np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n)
    elif scale == 'log':
        if multimode:
            xdata = [norm.ppf(np.linspace(1. / (nx + 1), 1 - 1. / (nx + 1), nx))
                     for nx in n]
        else:
            xdata = norm.ppf(np.linspace(1. / (n + 1), 1 - 1. / (n + 1), n))
    else:
        raise ValueError("Unrecognized option '%s' for scale: "\
                         "should be in ['lin','log'])" % scale)
    # Get some defaults .....
    if multimode:
        lwdefault = 0.8
        zorderdefault = 3
        colordefault = ENSOlines['G']
    else:
        lwdefault = 2
        zorderdefault = 10
        colordefault = 'k'
    marker = kwargs.pop('marker', 'o')
    markersize = kwargs.get('markersize', kwargs.get('ms', 3))
    lw = kwargs.pop('linewidth', kwargs.pop('lw', lwdefault))
    zorder = kwargs.pop('zorder', zorderdefault)
    color = kwargs.pop('color', kwargs.pop('c', colordefault))
    # Multi-mode : one line per year ......................
    if multimode:
        if plot_enso:
            ensoindices = series.ensoindices
            # Take the first column if it's 2D
            if ensoindices.ndim > 1:
                ensoindices = ensoindices[:, 0]
            # ENSO mode : different colors for different phases
            for (i, attr) in zip((-1, 0, 1), ('cold', 'neutral', 'warm')):
                key = attr[0].upper()
                label = ENSOlabels[key]
                ydata = series[ensoindices == i]
                ydata = [np.sort(_).compressed()[::-1] for _ in ydata]
                # list() so that each line is a real sequence of (x, y)
                # pairs on Python 3 as well.
                # NOTE(review): xdata holds one entry per row of the full
                # series while ydata only holds the rows of this phase, so
                # the pairing is positional -- confirm this is intended.
                points = [list(zip(x, y)) for (x, y) in zip(xdata, ydata)]
                collec = LineCollection(points,
                                        label=label,
                                        color=ENSOlines[key],
                                        zorder=zorder, linewidth=lw)
                ax.add_collection(collec, autolim=True)
        else:
            ydata = [np.sort(y.compressed())[::-1] for y in _series]
            points = [list(zip(x, y)) for (x, y) in zip(xdata, ydata)]
            label = kwargs.pop('label', None)
            collec = LineCollection(points, label=label, linewidth=lw,
                                    colors=ENSOlines['G'])
            ax.add_collection(collec, autolim=True)
    # One line for the whole dataset ......................
    else:
        ydata = ma.sort(series.compressed(), endwith=False)[::-1]
        points = [list(zip(xdata, ydata._series))]
        label = kwargs.pop('label', 'none')
        collec = LineCollection(points, label=label, linewidth=lw,
                                colors=color, zorder=zorder)
        ax.add_collection(collec, autolim=True)
        # If we need to add some colors
        if plot_enso and marker:
            for attr in ('cold', 'neutral', 'warm'):
                key = attr[0].upper()
                label = ENSOlabels[key]
                color = ENSOmarkers[key]
                current = getattr(ydata, attr)._series
                ax.plot(xdata, current, ls='', lw=0,
                        marker=marker, ms=markersize,
                        mfc=color, mec=color,
                        label=label, zorder=zorder)
    #........................
    set_normal_limits(ax, xmin=xmin, xmax=xmax, scale=scale)
    ax.set_ylim(_series.min(), _series.max())
    return ax
def quantile(x, probs=DEF_PROBS, typ=DEF_TYPE, method=DEF_METHOD,
             limit=DEF_LIMIT, na_rm=DEF_NARM, is_sorted=False):
    """Compute the sample quantiles of any vector distribution.

        >>> quantile(x, probs=DEF_PROBS, typ=DEF_TYPE, method=DEF_METHOD,
        ...          limit=DEF_LIMIT, na_rm=DEF_NARM, is_sorted=False)

    Parameters
    ----------
    x : array-like, pandas.Series or pandas.DataFrame
        Input data, 2D at most. For 2D input the quantiles are computed
        independently for every row.
    probs : list, tuple, ndarray, pandas.Series or pandas.DataFrame
        Probabilities in [0, 1]; values within ``100 * eps`` outside that
        range are moved to the nearest endpoint.
    typ : int
        Quantile definition; must be a member of ``TYPES``.
    method : str
        Must be a member of ``METHODS``; ``'DIRECT'`` selects the canonical
        implementation and ``'INHERIT'`` the mquantiles-like one.
    limit : list, tuple or ndarray of length 2
        Validated but currently not applied (see the review note in the body).
    na_rm : bool
        When True, NaN values are masked and ignored; when False, NaN values
        raise ``ValueError``.
    is_sorted : bool
        When True, the data are used as given instead of being compressed
        and sorted first.

    Returns
    -------
    The quantiles for 1D input; for 2D input, one row of quantiles per input
    row.

    Raises
    ------
    TypeError
        On arguments of the wrong type.
    ValueError
        On more than two dimensions, probabilities outside [0, 1], unknown
        ``typ`` / ``method``, a ``limit`` whose length is not 2, or NaN
        values when ``na_rm`` is False.
    """
    ## various parameter checkings

    # check the data
    if isinstance(x, (pd.DataFrame, pd.Series)):
        try:
            x = x.values
        except Exception:
            raise TypeError("conversion type error for input dataset")
    elif not isinstance(x, np.ndarray):
        try:
            x = np.asarray(x)
        except Exception:
            raise TypeError("wrong type for input dataset")
    ndim = x.ndim
    if ndim > 2:
        raise ValueError("array should be 2D at most !")

    # check the probs
    if isinstance(probs, (pd.DataFrame, pd.Series)):
        try:
            probs = probs.values
        except Exception:
            raise TypeError("conversion type error for input probabilities")
    elif isinstance(probs, (list, tuple)):
        try:
            # No ``copy=False`` here: building an array from a list always
            # needs a copy, and NumPy 2 raises when a copy is forbidden.
            probs = np.array(probs, ndmin=1)
        except Exception:
            raise TypeError("conversion type for error input probabilities")
    elif not isinstance(probs, np.ndarray):
        raise TypeError("wrong type for input probabilities")
    # adjust the values: this is taken from R implementation, where values up
    # to 2e-14 outside that range are accepted and moved to the nearby endpoint
    eps = 100 * np.finfo(np.double).eps
    if (probs < -eps).any() or (probs > 1 + eps).any():
        raise ValueError("probs values outside [0,1]")
    probs = np.maximum(0, np.minimum(1, probs))

    # NOTE: weights are not supported; see the unused _wquantile1D helper.

    # check parameter typ value
    if typ not in TYPES:
        raise ValueError(
            "typ should be an integer in range [1,{}]!".format(TYPES))

    # check parameter method value
    if method not in METHODS:
        raise ValueError("method should be in {}!".format(METHODS))

    # check parameter is_sorted
    if not isinstance(is_sorted, bool):
        raise TypeError("wrong type for boolean flag is_sorted!")

    # check parameter na_rm
    if not isinstance(na_rm, bool):
        raise TypeError("wrong type for boolean flag na_rm!")

    # check parameter limit
    if not isinstance(limit, (list, tuple, np.ndarray)):
        raise TypeError("wrong type for boolean flag limit!")
    if len(limit) != 2:
        raise ValueError("the length of limit must be 2")

    ## algorithm implementation

    def gamma_indice(g, j, typ):
        """Interpolation weight of the upper order statistic for ``typ``."""
        gamma = np.zeros(len(j))
        if typ == 1:
            gamma[np.where(g > 0)] = 1
            # gamma stays 0 where g <= 0
        elif typ == 2:
            gamma[np.where(g > 0)] = 1
            gamma[np.where(g <= 0)] = 0.5
        elif typ == 3:
            gamma[np.where(np.logical_or(g != 0, j % 2 == 1))] = 1
        elif typ >= 4:
            gamma = g
        return gamma

    def _canonical_quantile1D(typ, sorted_x, probs):
        """Compute the quantile of a 1D numpy array using the canonical/direct
        approach derived from the original algorithms from Hyndman & Fan,
        Cunane and Filliben.
        """
        # inspired by the _quantiles1D function of mquantiles
        N = len(sorted_x)  # sorted_x.count()
        m_indice = lambda p, i: {1: 0, 2: 0, 3: -0.5, 4: 0, 5: 0.5,
                                 6: p, 7: 1 - p, 8: (p + 1) / 3, 9: (2 * p + 3) / 8,
                                 10: .4 + .2 * p, 11: .3175 + .365 * p}[i]
        j_indice = lambda p, n, m: np.int_(np.floor(n * p + m))
        g_indice = lambda p, n, m, j: p * n + m - j
        m = m_indice(probs, typ)
        j = j_indice(probs, N, m)
        j_1 = j - 1
        # adjust for the bounds
        j_1[j_1 < 0] = 0
        j[j > N - 1] = N - 1
        x1 = sorted_x[j_1]  # indexes start at 0...
        x2 = sorted_x[j]
        g = g_indice(probs, N, m, j)
        gamma = gamma_indice(g, j, typ)
        return (1 - gamma) * x1 + gamma * x2

    def _mquantile1D(typ, sorted_x, probs):
        """Compute the quantiles of a 1D numpy array following the
        implementation of the _quantiles1D function of mquantiles.
        source:
            https://github.com/scipy/scipy/blob/master/scipy/stats/mstats_basic.py
        """
        N = len(sorted_x)  # sorted_x.count()
        # though ndarray's have no 'count' attribute
        if N == 0:
            return np_ma.array(np.empty(len(probs), dtype=float), mask=True)
        elif N == 1:
            return np_ma.array(np.resize(sorted_x, probs.shape),
                               mask=np_ma.nomask)
        # note that, wrt to the original implementation (see source code
        # mentioned above), we also added the definitions of (alphap,betap)
        # for typ in [1,2,3]
        abp_indice = lambda typ: {1: (0, 1), 2: (0, 1), 3: (-.5, -1.5), 4: (0, 1),
                                  5: (.5, .5), 6: (0, 0), 7: (1, 1), 8: (1 / 3, 1 / 3),
                                  9: (3 / 8, 3 / 8), 10: (.4, .4), 11: (.3175, .3175)}[typ]
        alphap, betap = abp_indice(typ)
        m = alphap + probs * (1. - alphap - betap)
        aleph = (probs * N + m)
        j = np.floor(aleph.clip(1, N - 1)).astype(int)
        g = (aleph - j).clip(0, 1)
        gamma = gamma_indice(g, j, typ)
        return (1. - gamma) * sorted_x[(j - 1).tolist()] \
            + gamma * sorted_x[j.tolist()]

    def _wquantile1D(typ, x, probs, weights):  # not used
        """Compute the weighted quantile of a 1D numpy array.
        """
        # Check the data
        ind_sorted = np.argsort(x)
        sorted_x = x[ind_sorted]
        sorted_weights = weights[ind_sorted]
        # Compute the auxiliary arrays
        Sn = np.cumsum(sorted_weights)
        Pn = (Sn - 0.5 * sorted_weights) / np.sum(sorted_weights)
        # Get the value of the weighted median
        return np.interp(probs, Pn, sorted_x)

    ## actual calculation

    # select method
    # NOTE(review): if METHODS ever contains a value other than 'DIRECT' and
    # 'INHERIT', _quantile1D stays undefined and a NameError follows.
    if method == 'DIRECT':
        _quantile1D = _canonical_quantile1D
    elif method == 'INHERIT':
        _quantile1D = _mquantile1D

    # define input data
    if na_rm is True:
        data = np_ma.array(x, copy=True, mask=np.isnan(x))
    elif np.isnan(x).any():
        raise ValueError(
            "missing values and NaN's not allowed if 'na_rm' is FALSE")
    else:
        data = np_ma.array(x, copy=False)

    # filter the input data
    # NOTE(review): ``limit`` is validated above as a length-2 sequence, so
    # ``limit is True`` can never hold and this filter is never applied.
    # Left unchanged because enabling it would alter results for existing
    # callers -- confirm the intended behaviour.
    if limit is True:
        condition = (limit[0] < data) & (data < limit[1])
        data[~condition.filled(True)] = np_ma.masked

    def _row_quantile(row):
        """Quantiles of one 1D (masked) vector, sorting it first if needed."""
        values = row if is_sorted else np_ma.sort(row.compressed())
        return _quantile1D(typ, values, probs)

    # Computes quantiles along axis (or globally)
    if ndim == 1:
        return _row_quantile(data)
    # The data array must be the third argument of apply_along_axis; passing
    # ``typ`` there (as before) made every 2D call fail.
    return np_ma.apply_along_axis(_row_quantile, 1, data)
import numpy as np
from numpy.ma import sort

# Degrees of freedom of the F distribution being sampled.
dfnum = 1.0   # between group degrees of freedom
dfden = 48.0  # within groups degrees of freedom

# Draw 1000 samples from F(dfnum, dfden).
s = np.random.f(dfnum, dfden, size=1000)

# Tenth largest sample, i.e. the value exceeded by about 1% of the draws.
sort(s)[-10]