def mmglblshow(X, border=0.0):
    """
    - Purpose
        Apply a random color table to a gray-scale image.
    - Synopsis
        Y = mmglblshow(X, border=0.0)
    - Input
        X:      Gray-scale (uint8 or uint16) labeled image.
        border: Boolean. Default: 0.0.
    - Output
        Y:      Gray-scale (uint8 or uint16) or binary image.
    """
    from numpy import take, resize, shape
    from numpy.random import rand

    mmin = X.min()
    mmax = X.max()
    ncolors = mmax - mmin + 1
    # One random R, G, B value per label (to_int32 and concat are toolbox helpers).
    R = to_int32(rand(ncolors) * 255)
    G = to_int32(rand(ncolors) * 255)
    B = to_int32(rand(ncolors) * 255)
    if mmin == 0:
        R[0], G[0], B[0] = 0, 0, 0
    r = resize(take(R, X.ravel() - mmin), X.shape)
    g = resize(take(G, X.ravel() - mmin), X.shape)
    b = resize(take(B, X.ravel() - mmin), X.shape)
    Y = concat('d', r, g, b)
    return Y
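
# Hedged sketch (not part of the toolbox above): the same random-colormap idea
# with plain NumPy, stacking channels explicitly instead of using the toolbox's
# to_int32/concat helpers. The function name is illustrative only.
import numpy as np

def random_label_colors(labels):
    ncolors = labels.max() - labels.min() + 1
    lut = (np.random.rand(ncolors, 3) * 255).astype(np.uint8)
    if labels.min() == 0:
        lut[0] = 0                                        # keep the background black
    rgb = np.take(lut, labels - labels.min(), axis=0)     # one color per pixel
    return rgb                                            # shape labels.shape + (3,)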
def _infer_interval_breaks(coord, axis=0, check_monotonic=False):
    """
    >>> _infer_interval_breaks(np.arange(5))
    array([-0.5,  0.5,  1.5,  2.5,  3.5,  4.5])
    >>> _infer_interval_breaks([[0, 1], [3, 4]], axis=1)
    array([[-0.5,  0.5,  1.5],
           [ 2.5,  3.5,  4.5]])
    """
    coord = np.asarray(coord)

    if check_monotonic and not _is_monotonic(coord, axis=axis):
        raise ValueError("The input coordinate is not sorted in increasing "
                         "order along axis %d. This can lead to unexpected "
                         "results. Consider calling the `sortby` method on "
                         "the input DataArray. To plot data with categorical "
                         "axes, consider using the `heatmap` function from "
                         "the `seaborn` statistical plotting library." % axis)

    deltas = 0.5 * np.diff(coord, axis=axis)
    if deltas.size == 0:
        deltas = np.array(0.0)
    first = np.take(coord, [0], axis=axis) - np.take(deltas, [0], axis=axis)
    last = np.take(coord, [-1], axis=axis) + np.take(deltas, [-1], axis=axis)
    trim_last = tuple(slice(None, -1) if n == axis else slice(None)
                      for n in range(coord.ndim))
    return np.concatenate([first, coord[trim_last] + deltas, last], axis=axis)
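
# Illustrative call (assumes numpy imported as np, as in the doctests above):
# interval breaks sit halfway between coordinate values, and an unsorted
# coordinate is rejected when check_monotonic=True.
breaks = _infer_interval_breaks(np.array([0.0, 1.0, 3.0]))   # [-0.5, 0.5, 2.0, 4.0]
try:
    _infer_interval_breaks(np.array([3.0, 1.0, 0.0]), check_monotonic=True)
except ValueError:
    pass  # raised because the coordinate decreases along axis 0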
def onpick(event):
    ind = event.ind
    for i in ind:
        type = event.artist.get_label()
        msg = ''
        if type == 'Blobs':
            msg = 'Blob ' + str(i)
            for c in range(self.x.shape[1]):
                msg += '\n ' + self.cluster_vars[c][:self.cluster_vars[c].find('_mean')] + \
                       ': ' + str(round(np.take(self.x[:, c], i), 2))
            msg += '\n (All values are z-scores)'
            neighbors = np.where(self.est.labels_ == self.est.labels_[i])[0]
            if len(neighbors) > 1:
                msg += '\n Other blobs in cluster: ' + \
                       ', '.join([k for k in neighbors.astype('str') if not k == str(i)]) + \
                       '\n'
        elif type == 'Clusters':
            msg = 'Cluster ' + str(i)
            msg += '\n Center of cluster (all values in z-scores):'
            for c in range(self.est.cluster_centers_.shape[1]):
                msg += '\n ' + self.cluster_vars[c][:self.cluster_vars[c].find('_mean')] + \
                       ': ' + str(round(np.take(self.est.cluster_centers_[:, c], i), 2))
            inhabitants = np.where(self.est.labels_ == i)[0]
            msg += '\n Blobs in cluster: ' + \
                   ', '.join([k for k in inhabitants.astype('str') if not k == str(i)]) + '\n'
        print msg
def _set_reach_dist(self, point_index, processed, X, nbrs):
    P = X[point_index:point_index + 1]
    # Assume that radius_neighbors is faster without distances
    # and we don't need all distances, nevertheless, this means
    # we may be doing some work twice.
    indices = nbrs.radius_neighbors(P, radius=self.max_eps,
                                    return_distance=False)[0]

    # Getting indices of neighbors that have not been processed
    unproc = np.compress((~np.take(processed, indices)).ravel(),
                         indices, axis=0)
    # Neighbors of current point are already processed.
    if not unproc.size:
        return

    # Only compute distances to unprocessed neighbors:
    if self.metric == 'precomputed':
        dists = X[point_index, unproc]
    else:
        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
                                   self.metric, n_jobs=None).ravel()

    rdists = np.maximum(dists, self.core_distances_[point_index])
    improved = np.where(rdists < np.take(self.reachability_, unproc))
    self.reachability_[unproc[improved]] = rdists[improved]
    self.predecessor_[unproc[improved]] = point_index
def permute_2d(m, p):
    """Performs 2D permutation of matrix m according to p."""
    return m[p][:, p]
    # unused below
    m_t = np.transpose(m)
    r_t = np.take(m_t, p, axis=0)
    return np.take(np.transpose(r_t), p, axis=0)
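
# Quick usage sketch: both the row order and the column order follow p.
import numpy as np
m = np.arange(9).reshape(3, 3)
p = np.array([2, 0, 1])
permute_2d(m, p)
# array([[8, 6, 7],
#        [2, 0, 1],
#        [5, 3, 4]])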
def getAllCurves(self, just_legend=False):
    """
    Ensures that the x-range of the curves is strictly monotonically
    increasing. Conserves curves legend and info dictionary.
    """
    curves = Plugin1DBase.Plugin1DBase.getAllCurves(self)
    if just_legend:
        return curves

    processedCurves = []
    for curve in curves:
        x, y, legend, info = curve[0:4]
        xproc = x[:]
        yproc = y[:]
        # Sort
        idx = numpy.argsort(xproc, kind='mergesort')
        xproc = numpy.take(xproc, idx)
        yproc = numpy.take(yproc, idx)
        # Ravel, Increasing
        xproc = xproc.ravel()
        idx = numpy.nonzero((xproc[1:] > xproc[:-1]))[0]
        xproc = numpy.take(xproc, idx)
        yproc = numpy.take(yproc, idx)
        processedCurves += [(xproc, yproc, legend, info)]
    return processedCurves
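
# Hedged stand-alone illustration of the sort-then-keep-strictly-increasing
# idiom used above (pure NumPy, not part of the plugin):
import numpy as np
x = np.array([3.0, 1.0, 1.0, 2.0])
y = np.array([30.0, 10.0, 11.0, 20.0])
idx = np.argsort(x, kind='mergesort')
x, y = np.take(x, idx), np.take(y, idx)      # x is now [1, 1, 2, 3]
keep = np.nonzero(x[1:] > x[:-1])[0]         # positions where x strictly increases
x, y = np.take(x, keep), np.take(y, keep)    # duplicates (and the last point) are dropped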
def gt_topk(dat, axis, ret_typ, k, is_ascend):
    if ret_typ == "indices":
        if is_ascend:
            indices = np.arange(k)
        else:
            indices = np.arange(-1, -k-1, -1)
        ret = np.take(dat.argsort(axis=axis), axis=axis, indices=indices, mode='wrap')
    elif ret_typ == "value":
        if is_ascend:
            indices = np.arange(k)
        else:
            indices = np.arange(-1, -k-1, -1)
        ret = np.take(np.sort(dat, axis=axis), axis=axis, indices=indices, mode='wrap')
    else:
        assert dat.shape == (5, 5, 5, 5)
        assert axis is None or axis == 1
        ret = np.zeros(dat.shape)
        if is_ascend:
            indices = np.arange(k)
        else:
            indices = np.arange(-1, -k-1, -1)
        gt_argsort = np.take(dat.argsort(axis=axis), axis=axis, indices=indices, mode='wrap')
        if axis is None:
            ret.ravel()[gt_argsort] = 1
        else:
            for i in range(5):
                for j in range(5):
                    for k in range(5):
                        ret[i, gt_argsort[i, :, j, k], j, k] = 1
    return ret
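
# Usage sketch: per-row indices of the 2 largest values, obtained via argsort
# plus a wrap-mode np.take from the end of the sorted order.
import numpy as np
dat = np.array([[1, 3, 2],
                [9, 4, 7]])
gt_topk(dat, axis=1, ret_typ="indices", k=2, is_ascend=False)
# array([[1, 2],
#        [0, 2]])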
def createDescriptorList(self):
    coordsx = nu.array([[self.absx], [self.width], [1], [self.olw], [self.tx_width]])
    coordsy = nu.array([[self.absy], [self.height], [1], [self.olw], [self.tx_height]])
    length = len(self.thePointMatrix)
    pointsx = nu.dot(nu.take(self.thePointMatrix, (0,), 1), coordsx)
    pointsy = nu.dot(nu.take(self.thePointMatrix, (1,), 1), coordsy)
    points = nu.concatenate((nu.reshape(pointsx, (length, 1)),
                             nu.reshape(pointsy, (length, 1))), 1)
    for aShapeName in self.theDescriptorList.keys():
        aDescriptor = self.theDescriptorList[aShapeName]
        aSpecific = aDescriptor[SD_SPECIFIC]
        aType = aDescriptor[SD_TYPE]
        aSpecific[0] = []
        if aType in (CV_RECT, CV_LINE, CV_ELL, CV_TEXT, CV_IMG):
            for aPointCode in self.theCodeMap[aShapeName]:
                x = points[aPointCode][0]
                y = points[aPointCode][1]
                aSpecific[0].extend([x, y])
            if aType == CV_TEXT and aShapeName == "text":
                aSpecific[SPEC_LABEL] = self.theLabel
        elif aType == CV_BPATH:
            for anArtCode in self.theCodeMap[aShapeName]:
                decodedList = []
                decodedList.append(anArtCode[0])
                for aPointCode in anArtCode[1:]:
                    x = points[aPointCode][0]
                    y = points[aPointCode][1]
                    decodedList.extend([x, y])
                aSpecific[0].append(tuple(decodedList))
def gather(mask):
    import glob
    flist = glob.glob(mask)
    for f in flist:
        print f
        dset = da.read_nc(f, ["csat", "lat", "cloudpts"])
        csat = dset["csat"].values
        lat = dset["lat"].values
        altitude = dset["csat"].altitude
        idx = dset["cloudpts"].values > 0
        del dset
        cpts = dict()
        nprof = dict()
        for l in lats:
            idx1 = np.where((lat >= lats[l][0]) & (lat < lats[l][1]))[0]
            idx2 = np.where((lat >= -lats[l][1]) & (lat < -lats[l][0]))[0]
            idx = np.concatenate([idx1, idx2])
            if l in cpts:
                nprof[l] = nprof[l] + idx.shape[0]
                cpts[l] = cpts[l] + np.take(csat, idx, axis=0).sum(axis=0)
            else:
                nprof[l] = idx.shape[0]  # count of profiles in the band, not the sum of indices
                cpts[l] = np.take(csat, idx, axis=0).sum(axis=0)
    cprofl = dict()
    for l in lats:
        cprofl[l] = 100.0 * cpts[l] / nprof[l]
    return cprofl, altitude
def SNfunc(self, data, sig, significancefloor=0.5):
    D = data.ravel()
    S = sig.ravel()
    args = numpy.argsort(-D / S)
    D = numpy.take(D, args)
    S = numpy.take(S, args)
    Dsum = numpy.cumsum(D)
    Ssum = numpy.cumsum(S**2)**0.5
    SN = (Dsum / Ssum).max()

    # regional SN
    import scipy.ndimage as ndimage
    data[data / sig < significancefloor] = 0
    masks, multiplicity = ndimage.measurements.label(data)
    labels = numpy.arange(1, multiplicity + 1)
    SNs = numpy.zeros(multiplicity + 1)
    SNs[0] = SN
    for i in range(multiplicity):
        D = data[masks == i + 1].ravel()
        S = sig[masks == i + 1].ravel()
        args = numpy.argsort(-D / S)
        D = numpy.take(D, args)
        S = numpy.take(S, args)
        Dsum = numpy.cumsum(D)
        Ssum = numpy.cumsum(S**2)**0.5
        SNi = (Dsum / Ssum).max()
        SNs[i + 1] = SNi
    SNs = -numpy.sort(-SNs)
    return SNs
def convert_to_8_bit(self):
    """
    Convert 16-bit display data to 8-bit using a lookup table.
    """
    if self.intensity_scaling == 'autoscale':
        self.display_min = self.display_data_16.min()
        self.display_max = self.display_data_16.max()
        self._make_linear_lookup_table()
    elif self.intensity_scaling == 'median_filter_autoscale':
        filtered_image = ndimage.filters.median_filter(
            self.display_data_16, size=3, output=self.filtered_image)
        self.display_min = self.filtered_image.min()
        self.display_max = self.filtered_image.max()
        self._make_linear_lookup_table()
    if not hasattr(self, 'display_data_8'):
        self.display_data_8 = np.empty(
            self.buffer_shape[1:], dtype=np.uint8)
    np.take(self.lut, self.display_data_16, out=self.display_data_8)
    try:
        self.display_intensity_scaling_queue.get_nowait()
    except Queue.Empty:
        pass
    self.display_intensity_scaling_queue.put(
        (self.intensity_scaling, self.display_min, self.display_max))
    self.image = ArrayInterfaceImage(self.display_data_8, allow_copy=False)
    pyglet.gl.glTexParameteri(  # Reset to no interpolation
        pyglet.gl.GL_TEXTURE_2D,
        pyglet.gl.GL_TEXTURE_MAG_FILTER,
        pyglet.gl.GL_NEAREST)
    if hasattr(self, 'window'):
        if not self.window.visible:
            self.window.set_visible(True)
    return None
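
# Minimal sketch of the same LUT trick in isolation: an 8-bit lookup table
# indexed by every possible 16-bit value, applied with np.take(..., out=...)
# so no intermediate array is allocated. Scaling limits are assumed values.
import numpy as np
display_min, display_max = 100, 5000
lut = np.clip((np.arange(2**16) - display_min) * 255.0 /
              (display_max - display_min), 0, 255).astype(np.uint8)
frame16 = np.random.randint(0, 2**16, (480, 640), dtype=np.uint16)
frame8 = np.empty(frame16.shape, dtype=np.uint8)
np.take(lut, frame16, out=frame8)            # 16-bit -> 8-bit, written in place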
def test_take_output(self, level=rlevel):
    """Ensure that 'take' honours output parameter."""
    x = np.arange(12).reshape((3, 4))
    a = np.take(x, [0, 2], axis=1)
    b = np.zeros_like(a)
    np.take(x, [0, 2], axis=1, out=b)
    assert_array_equal(a, b)
def is_quadrant_red(quadrant):
    indices = xrange(0, len(quadrant), 3)
    red_quadrant = np.take(quadrant, indices)
    red_sum = np.sum(red_quadrant)
    red_avg = np.sum(red_quadrant) / len(red_quadrant)
    logging.debug("red avg: %s" % (red_avg))

    indices = xrange(1, len(quadrant), 3)
    green_quadrant = np.take(quadrant, indices)
    green_sum = np.sum(green_quadrant)
    green_avg = np.sum(green_quadrant) / len(green_quadrant)
    logging.debug("green avg: %s" % (green_avg))

    indices = xrange(2, len(quadrant), 3)
    blue_quadrant = np.take(quadrant, indices)
    blue_sum = np.sum(blue_quadrant)
    blue_avg = np.sum(blue_quadrant) / len(blue_quadrant)
    logging.debug("blue avg: %s" % (blue_avg))

    is_red = red_avg / (0.5 * (green_avg + blue_avg))
    logging.debug("redcalc: %s" % (is_red))
    if is_red > 2:
        return 1
    else:
        return 0
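
# The channel split above in isolation, for a flat interleaved RGB buffer
# (illustrative only; assumes an 8-bit RGBRGB... byte sequence):
import numpy as np
buf = np.array([200, 10, 20, 180, 30, 40], dtype=np.uint8)   # two RGB pixels
red = np.take(buf, np.arange(0, buf.size, 3))                # [200, 180]
green = np.take(buf, np.arange(1, buf.size, 3))              # [10, 30]
blue = np.take(buf, np.arange(2, buf.size, 3))               # [20, 40]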
def continuous_components(delta_X, delta_Y, delta_t, t, T, K):
    p = np.arange(K)
    delta_xp = np.take(delta_X, p)
    delta_yp = np.take(delta_Y, p)
    delta_tp = np.take(delta_t, p)
    tp = np.take(t, p)
    tp = np.hstack((np.array([0]), tp))

    first_term_xi = np.cumsum(delta_X[0:K-1])
    second_term_xi = (delta_X[1:K] / delta_t[1:K]) * np.cumsum(delta_t[0:K-1])
    xi = np.hstack((np.array([0]), first_term_xi - second_term_xi))

    first_term_delta = np.cumsum(delta_Y[0:K-1])
    second_term_delta = (delta_Y[1:K] / delta_t[1:K]) * np.cumsum(delta_t[0:K-1])
    delta = np.hstack((np.array([0]), first_term_delta - second_term_delta))

    A0 = (1/T) * np.sum((delta_xp / (2*delta_tp) * (np.square(tp[1:K+1]) - np.square(tp[0:K]))) +
                        xi * (tp[1:K+1] - tp[0:K]))
    C0 = (1/T) * np.sum((delta_yp / (2*delta_tp) * (np.square(tp[1:K+1]) - np.square(tp[0:K]))) +
                        delta * (tp[1:K+1] - tp[0:K]))
    return A0, C0
def reconstruct(efds, T, K):
    T = np.ceil(T)
    N = len(efds)
    reconstructed = np.zeros((int(T), 2))   # shape must be integer-valued

    n = np.arange(start=1, stop=N, step=1)
    t = np.arange(T)
    n_grid, t_grid = np.meshgrid(n, t)

    a_n_grid = np.take(efds[:, 0], n_grid)
    b_n_grid = np.take(efds[:, 1], n_grid)
    c_n_grid = np.take(efds[:, 2], n_grid)
    d_n_grid = np.take(efds[:, 3], n_grid)

    arg_grid = n_grid * t_grid / T
    cos_term = np.cos(2 * np.pi * arg_grid)
    sin_term = np.sin(2 * np.pi * arg_grid)

    reconstructed[:, 0] = efds[0, 0] + np.sum(a_n_grid * cos_term + b_n_grid * sin_term, axis=1)
    reconstructed[:, 1] = efds[0, 0] + np.sum(c_n_grid * cos_term + d_n_grid * sin_term, axis=1)
    return reconstructed
def transposed(self, new_column_name, select_as_header=None, **kwargs):
    """returns the transposed table.

    Arguments:
        - new_column_name: the existing header will become a column with
          this name
        - select_as_header: current column name containing data to be used
          as the header. Defaults to the first column.
    """
    select_as_header = select_as_header or self.Header[0]
    assert select_as_header in self.Header, \
        '"%s" not in table Header' % select_as_header

    raw_data = self.getRawData()
    raw_data.insert(0, self.Header)
    transposed = numpy.array(raw_data, dtype='O')
    transposed = transposed.transpose()

    # indices for the header and non header rows
    header_index = self.Header.index(select_as_header)
    data_indices = range(0, header_index) + range(header_index + 1,
                                                  len(transposed))

    header = list(numpy.take(transposed, [header_index], axis=0)[0])
    header = [new_column_name] + header[1:]  # [1:] slice excludes old name

    rows = numpy.take(transposed, data_indices, axis=0)
    return Table(header=header, rows=rows, **kwargs)
def get_MW(self, F, mode='F^-1'):
    if type(F) is dict:  # recursive case for many F's at once
        M, W = {}, {}
        for key in F:
            M[key], W[key] = self.get_MW(F[key], mode=mode)
        return M, W

    modes = ['F^-1', 'F^-1/2', 'I', 'L^-1']
    assert(mode in modes)
    if mode == 'F^-1':
        M = np.linalg.pinv(F, rcond=1e-12)
        #U,S,V = np.linalg.svd(F)
        #M = np.einsum('ij,j,jk', V.T, 1./S, U.T)
    elif mode == 'F^-1/2':
        U, S, V = np.linalg.svd(F)
        M = np.einsum('ij,j,jk', V.T, 1./np.sqrt(S), U.T)
    elif mode == 'I':
        M = np.identity(F.shape[0], dtype=F.dtype)
    else:
        # Cholesky decomposition to get M
        order = np.array([10, 11, 9, 12, 8, 20, 0, 13, 7, 14, 6, 15, 5,
                          16, 4, 17, 3, 18, 2, 19, 1])  # XXX needs generalizing
        iorder = np.argsort(order)
        F_o = np.take(np.take(F, order, axis=0), order, axis=1)
        L_o = np.linalg.cholesky(F_o)
        U, S, V = np.linalg.svd(L_o.conj())
        M_o = np.dot(np.transpose(V), np.dot(np.diag(1./S), np.transpose(U)))
        M = np.take(np.take(M_o, iorder, axis=0), iorder, axis=1)

    W = np.dot(M, F)
    norm = W.sum(axis=-1)
    norm.shape += (1,)
    M /= norm
    W = np.dot(M, F)
    return M, W
def getMonotonicCurves(self):
    """
    Convenience method that calls getAllCurves and makes sure that all of
    the X values are strictly increasing.

    :return: It returns a list of the form:
            [[xvalues0, yvalues0, legend0, dict0],
             [xvalues1, yvalues1, legend1, dict1],
             [...],
             [xvaluesn, yvaluesn, legendn, dictn]]
    """
    allCurves = self.getAllCurves() * 1
    for i in range(len(allCurves)):
        curve = allCurves[i]
        x, y, legend, info = curve[0:4]
        if self.isCurveHidden(legend):
            continue
        # Sort
        idx = argsort(x, kind='mergesort')
        xproc = take(x, idx)
        yproc = take(y, idx)
        # Ravel, Increase
        xproc = xproc.ravel()
        idx = nonzero((xproc[1:] > xproc[:-1]))[0]
        xproc = take(xproc, idx)
        yproc = take(yproc, idx)
        allCurves[i][0:2] = xproc, yproc
    return allCurves
def getColumns(self, columns, **kwargs):
    """Return a slice of columns"""
    # check whether we have integer columns
    if isinstance(columns, str):
        columns = [columns]
    is_int = min([isinstance(val, int) for val in columns])
    indexes = []
    if is_int:
        indexes = columns
    else:
        indexes = [self.Header.index(head) for head in columns]

    if self._row_ids:
        # we disallow reordering of identifiers, and ensure they are only
        # presented once
        for val in range(self._row_ids):
            try:
                indexes.remove(val)
            except ValueError:
                pass
        indexes = range(self._row_ids) + indexes

    columns = numpy.take(numpy.asarray(self.Header, dtype="O"), indexes)
    new = numpy.take(self.array, indexes, axis=1)
    kw = self._get_persistent_attrs()
    kw.update(kwargs)
    return Table(header=columns, rows=new, **kw)
def shoelace(vertices):
    """
    Calculate twice the area of a polygon using the Shoelace formula.

    Polygon is defined by vertices.

    Parameters
    ----------
    vertices : array_like
        Vertex coordinates in a 2-D space. Coordinates must be placed
        along the last axis, and data points are along the first axis.

    Returns
    -------
    area : float
        You can deduce the order of input vertices from the sign:
        area is positive if vertices are in counter-clockwise order,
        area is negative if vertices are in clockwise order,
        area is zero if all points are collinear.

    Notes
    -----
    This function can also be used to judge if all points in a data set
    are collinear. Collinear points as input for initializing a Polygon
    instance will raise a QhullError.

    Examples
    --------
    Vertices of a square:

    Clockwise:

    >>> from tadlib.calfea.polygon import shoelace
    >>> sq = [(0,0), (0,1), (1,1), (1,0)]
    >>> shoelace(sq)
    -2.0

    Counter-clockwise:

    >>> sq = [(0,0), (1,0), (1,1), (0,1)]
    >>> shoelace(sq)
    2.0

    """
    vertices = np.asfarray(vertices)
    # Rule for stacking multiple comma separated arrays
    rule = '0,' + str(len(vertices.shape))
    # Shift the array by one vertex along the first axis
    slip_v = np.r_[rule, vertices[-1], vertices[:-1]]
    # Extract coordinates
    x = np.take(vertices, [0], axis=-1).reshape(vertices.shape[:-1])
    y = np.take(vertices, [1], axis=-1).reshape(vertices.shape[:-1])
    slip_x = np.take(slip_v, [0], axis=-1).reshape(vertices.shape[:-1])
    slip_y = np.take(slip_v, [1], axis=-1).reshape(vertices.shape[:-1])
    # Shoelace formula
    area = np.sum(y * slip_x - x * slip_y, axis=0)
    return area
def plot_multi(signal, ax=None, **kwargs):
    default_min = float(kwargs.pop('minrange', 0.))
    default_max = float(kwargs.pop('maxrange', 1.))
    plot_range = set_range(signal, default_min, default_max)
    axis_name = kwargs.pop('multi', None)
    kwargs.pop('type', None)
    kwargs.pop('stack', '1,1')
    kwargs.pop('signals', None)
    axes = [getattr(signal, axis) for axis in signal.axes]
    axis_index = signal.axes.index(axis_name)
    multi_axis = axes.pop(axis_index)
    if ax is None:
        ax = plt.subplot(111)
    ax.grid()
    legend = kwargs.pop('legend', False)
    for index, label in enumerate(multi_axis):
        label = '{} = {:.3f} {}'.format(axis_name, label, multi_axis.units)
        data = np.take(signal, index, axis=axis_index)
        plot_axes = [np.take(axis, index, axis=axis.axes.index(axis_name))
                     if axis_name in axis.axes else axis for axis in axes]
        plot_methods[data.ndim](data, *plot_axes, label=label, **kwargs)
    if legend:
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.subplots_adjust(right=0.65)
    plt.show()
    ax.set_ylim(plot_range[0], plot_range[1])
    kwargs['multi'] = axis_name
def plot_eig(data, nchan):
    days = data.keys()
    for k in days:
        eig_order = []
        eigs = []
        eigs_cav = []
        for bl in data[k]:
            c_mat = cov(data[k][bl])
            cav = get_cav(c_mat, nchan, scaling=opts.auto)
            U, S, V = n.linalg.svd(c_mat.conj())
            U_cav, S_cav, V_cav = n.linalg.svd(cav.conj())
            eig_order.append(S[0])
            eigs.append(n.fft.fftshift(n.fft.fft(V.T.conj(), axis=0)))
            eigs_cav.append(n.fft.fftshift(n.fft.fft(V_cav.T.conj(), axis=0)))
        order = n.argsort(eig_order)
        eig_order = n.take(eig_order, order)
        eigs = n.take(eigs, order, axis=0)
        eigs_cav = n.take(eigs_cav, order, axis=0)
        embed()
        fig = p.figure(1)
        for cnt, eig in enumerate(eigs):
            p.plot(eig[0] + cnt * 5)
        p.title('Eigenvectors for day {0}'.format(k))
        p.show()
        p.savefig('eigenvectors_{0}.png'.format(k))
        p.clf()
        for cnt, eig in enumerate(eigs_cav):
            p.plot(eig[0] + cnt * 5)
        p.title('Eigenvectors of Cav for day {0}'.format(k))
        p.savefig('eigenvectors_cav_{0}.png'.format(k))
        p.clf()
        p.close()
def interpolate(self, points):
    if self.tri is None:
        xc = self.x_coords.flatten()
        yc = self.y_coords.flatten()
        self.no_nan_values = self.values.flatten()
        if np.isnan(xc).any() and np.isnan(yc).any():
            xc = xc[~np.isnan(xc)]
            yc = yc[~np.isnan(yc)]
            self.no_nan_values = self.no_nan_values[~np.isnan(self.no_nan_values)]
        # Default: Qbb Qc Qz
        self.tri = qhull.Delaunay(np.column_stack((xc, yc)), qhull_options='QbB')

    simplices = self.tri.find_simplex(points)
    indices = np.take(self.tri.simplices, simplices, axis=0)
    transforms = np.take(self.tri.transform, simplices, axis=0)

    delta = points - transforms[:, 2]
    bary = np.einsum('njk,nk->nj', transforms[:, :2, :], delta)
    temp = np.hstack((bary, 1 - bary.sum(axis=1, keepdims=True)))
    values = np.einsum('nj,nj->n', np.take(self.no_nan_values, indices), temp)

    #print values[np.any(temp<0, axis=1)]
    # This should put a NaN for points outside of any simplices
    # but is for some reason sometimes also true inside a simplex
    #values[np.any(temp < 0.0, axis=1)] = np.nan

    return values
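
# Hedged stand-alone sketch of the same barycentric-interpolation pattern
# (np.take over simplices/transforms plus einsum), assuming 2-D scattered data.
# The function name is illustrative, not part of the class above, and like the
# method above it does not handle query points outside the hull (find_simplex
# returns -1 there).
import numpy as np
from scipy.spatial import Delaunay

def interp_delaunay(xy, values, points):
    tri = Delaunay(xy)
    s = tri.find_simplex(points)
    verts = np.take(tri.simplices, s, axis=0)      # vertex indices per containing simplex
    trans = np.take(tri.transform, s, axis=0)      # affine transform per containing simplex
    delta = points - trans[:, 2]
    bary = np.einsum('njk,nk->nj', trans[:, :2, :], delta)
    weights = np.hstack((bary, 1 - bary.sum(axis=1, keepdims=True)))
    return np.einsum('nj,nj->n', np.take(values, verts), weights)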
def _set_reach_dist(core_distances_, reachability_, predecessor_,
                    point_index, processed, X, nbrs, metric, metric_params,
                    p, max_eps):
    P = X[point_index:point_index + 1]
    # Assume that radius_neighbors is faster without distances
    # and we don't need all distances, nevertheless, this means
    # we may be doing some work twice.
    indices = nbrs.radius_neighbors(P, radius=max_eps,
                                    return_distance=False)[0]

    # Getting indices of neighbors that have not been processed
    unproc = np.compress(~np.take(processed, indices), indices)
    # Neighbors of current point are already processed.
    if not unproc.size:
        return

    # Only compute distances to unprocessed neighbors:
    if metric == 'precomputed':
        dists = X[point_index, unproc]
    else:
        _params = dict() if metric_params is None else metric_params.copy()
        if metric == 'minkowski' and 'p' not in _params:
            # the same logic as neighbors, p is ignored if explicitly set
            # in the dict params
            _params['p'] = p
        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
                                   metric, n_jobs=None, **_params).ravel()

    rdists = np.maximum(dists, core_distances_[point_index])
    improved = np.where(rdists < np.take(reachability_, unproc))
    reachability_[unproc[improved]] = rdists[improved]
    predecessor_[unproc[improved]] = point_index
def _map(self, X):
    """ Maps from a scalar or an array to an RGBA value or array.

    The *X* parameter is either a scalar or an array (of any dimension).
    If it is scalar, the function returns a tuple of RGBA values; otherwise
    it returns an array with the new shape = oldshape+(4,).  Any values
    that are outside the 0,1 interval are clipped to that interval before
    generating RGB values.
    """
    if type(X) in [IntType, FloatType]:
        vtype = 'scalar'
        xa = array([X])
    else:
        vtype = 'array'
        xa = asarray(X)

    # assume the data is properly normalized
    #xa = where(xa>1.,1.,xa)
    #xa = where(xa<0.,0.,xa)

    nanmask = isnan(xa)
    xa = where(nanmask, 0, (xa * (self.steps - 1)).astype(int))
    rgba = zeros(xa.shape + (4,), float)
    rgba[..., 0] = where(nanmask, 0, take(self._red_lut, xa))
    rgba[..., 1] = where(nanmask, 0, take(self._green_lut, xa))
    rgba[..., 2] = where(nanmask, 0, take(self._blue_lut, xa))
    rgba[..., 3] = where(nanmask, 0, take(self._alpha_lut, xa))
    if vtype == 'scalar':
        rgba = tuple(rgba[0, :])
    return rgba
def _set_reach_dist(self, point_index, X, nbrs):
    P = np.array(X[point_index]).reshape(1, -1)
    indices = nbrs.radius_neighbors(P, radius=self.max_bound,
                                    return_distance=False)[0]

    # Getting indices of neighbors that have not been processed
    unproc = np.compress((~np.take(self._processed, indices)).ravel(),
                         indices, axis=0)

    # Keep n_jobs = 1 in the following lines...please
    if len(unproc) > 0:
        dists = pairwise_distances(P, np.take(X, unproc, axis=0),
                                   self.metric, n_jobs=1).ravel()
        rdists = np.maximum(dists, self.core_distances_[point_index])
        new_reach = np.minimum(np.take(self.reachability_, unproc), rdists)
        self.reachability_[unproc] = new_reach

    # Checks to see if everything is already processed;
    # if so, return control to main loop
    if unproc.size > 0:
        # Define return order based on reachability distance
        return (unproc[quick_scan(np.take(self.reachability_, unproc), dists)])
    else:
        return point_index
def test_np_ufuncs(self):
    z = self.create_array(shape=(100, 100), chunks=(10, 10))
    a = np.arange(10000).reshape(100, 100)
    z[:] = a

    eq(np.sum(a), np.sum(z))
    assert_array_equal(np.sum(a, axis=0), np.sum(z, axis=0))
    eq(np.mean(a), np.mean(z))
    assert_array_equal(np.mean(a, axis=1), np.mean(z, axis=1))
    condition = np.random.randint(0, 2, size=100, dtype=bool)
    assert_array_equal(np.compress(condition, a, axis=0),
                       np.compress(condition, z, axis=0))
    indices = np.random.choice(100, size=50, replace=True)
    assert_array_equal(np.take(a, indices, axis=1),
                       np.take(z, indices, axis=1))

    # use zarr array as indices or condition
    zc = self.create_array(shape=condition.shape, dtype=condition.dtype,
                           chunks=10, filters=None)
    zc[:] = condition
    assert_array_equal(np.compress(condition, a, axis=0),
                       np.compress(zc, a, axis=0))
    zi = self.create_array(shape=indices.shape, dtype=indices.dtype,
                           chunks=10, filters=None)
    zi[:] = indices
    # this triggers __array__() call with dtype argument
    assert_array_equal(np.take(a, indices, axis=1),
                       np.take(a, zi, axis=1))
def __init__(self, x, y, ival=0., sorted=False, side='left'):

    if side.lower() not in ['right', 'left']:
        msg = "side can take the values 'right' or 'left'"
        raise ValueError(msg)
    self.side = side

    _x = np.asarray(x)
    _y = np.asarray(y)

    if _x.shape != _y.shape:
        msg = "x and y do not have the same shape"
        raise ValueError(msg)
    if len(_x.shape) != 1:
        msg = 'x and y must be 1-dimensional'
        raise ValueError(msg)

    self.x = np.r_[-np.inf, _x]
    self.y = np.r_[ival, _y]

    if not sorted:
        asort = np.argsort(self.x)
        self.x = np.take(self.x, asort, 0)
        self.y = np.take(self.y, asort, 0)
    self.n = self.x.shape[0]
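
# The np.take / argsort pairing above in isolation: reorder two arrays with one
# index array so the (x, y) pairs stay aligned (illustrative, plain NumPy).
import numpy as np
x = np.array([0.3, 0.1, 0.2])
y = np.array([3.0, 1.0, 2.0])
asort = np.argsort(x)
x = np.take(x, asort, 0)   # [0.1, 0.2, 0.3]
y = np.take(y, asort, 0)   # [1.0, 2.0, 3.0]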
def get_left_channels(self, energy, nchan=1):
    self.initialize()
    g_s_ii = self.greenfunction.retarded(energy)
    lambda_l_ii = self.selfenergies[0].get_lambda(energy)
    lambda_r_ii = self.selfenergies[1].get_lambda(energy)

    if self.greenfunction.S is not None:
        s_mm = self.greenfunction.S
        s_s_i, s_s_ii = linalg.eig(s_mm)
        s_s_i = np.abs(s_s_i)
        s_s_sqrt_i = np.sqrt(s_s_i)  # sqrt of eigenvalues
        s_s_sqrt_ii = np.dot(s_s_ii * s_s_sqrt_i, dagger(s_s_ii))
        s_s_isqrt_ii = np.dot(s_s_ii / s_s_sqrt_i, dagger(s_s_ii))

    lambdab_r_ii = np.dot(np.dot(s_s_isqrt_ii, lambda_r_ii), s_s_isqrt_ii)
    a_l_ii = np.dot(np.dot(g_s_ii, lambda_l_ii), dagger(g_s_ii))
    ab_l_ii = np.dot(np.dot(s_s_sqrt_ii, a_l_ii), s_s_sqrt_ii)
    lambda_i, u_ii = linalg.eig(ab_l_ii)
    ut_ii = np.sqrt(lambda_i / (2.0 * np.pi)) * u_ii
    m_ii = 2 * np.pi * np.dot(np.dot(dagger(ut_ii), lambdab_r_ii), ut_ii)
    T_i, c_in = linalg.eig(m_ii)
    T_i = np.abs(T_i)

    channels = np.argsort(-T_i)[:nchan]
    c_in = np.take(c_in, channels, axis=1)
    T_n = np.take(T_i, channels)
    v_in = np.dot(np.dot(s_s_isqrt_ii, ut_ii), c_in)

    return T_n, v_in
def resample(self):
    "resample() randomly draws a set of points equal in size to the original set from the cached data for bootstrapping"
    assert hasattr(self, "saved_xarray"), \
        "resampling not set up yet. Call setup_resampling() first."
    ranlist = Numeric.floor(self.get_random_list(self.pointcount) * self.pointcount).astype(numeric_int)
    self.xarray = Numeric.take(self.saved_xarray, ranlist, -1)  # take columns since vectors lie this way
    self.yarray = Numeric.take(self.saved_yarray, ranlist)
    self.firstpass = 1
& (cham[1] > 4))
ne_southwesterly = np.where((neu[0] > 202.5) & (neu[0] < 247.5) & (neu[1] > 4))
wolf_southwesterly = np.where((wolf[0] > 202.5) & (wolf[0] < 247.5) & (wolf[1] > 4))

#westerly
ch_westerly = np.where((cham[0] > 247.5) & (cham[0] < 292.5) & (cham[1] > 4))
ne_westerly = np.where((neu[0] > 247.5) & (neu[0] < 292.5) & (neu[1] > 4))
wolf_westerly = np.where((wolf[0] > 247.5) & (wolf[0] < 292.5) & (wolf[1] > 4))

#northwesterly
ch_northwesterly = np.where((cham[0] > 292.5) & (cham[0] < 337.5) & (cham[1] > 4))
ne_northwesterly = np.where((neu[0] > 292.5) & (neu[0] < 337.5) & (neu[1] > 4))
wolf_northwesterly = np.where((wolf[0] > 292.5) & (wolf[0] < 337.5) & (wolf[1] > 4))

###############################################################################
chfc_n = np.take(cham[2], ch_northerly)
chwd_n = np.take(cham[0], ch_northerly)
nefc_n = np.take(neu[2], ne_northerly)
newd_n = np.take(neu[0], ne_northerly)
methfc_n = np.take(wolf[2], wolf_northerly)
methwd_n = np.take(wolf[0], wolf_northerly)
methc_n = np.take(wolf[3], wolf_northerly)

chfc_ne = np.take(cham[2], ch_northeasterly)
chwd_ne = np.take(cham[0], ch_northeasterly)
nefc_ne = np.take(neu[2], ne_northeasterly)
newd_ne = np.take(neu[0], ne_northeasterly)
methfc_ne = np.take(wolf[2], wolf_northeasterly)
methwd_ne = np.take(wolf[0], wolf_northeasterly)
methc_ne = np.take(wolf[3], wolf_northeasterly)
def load_feature(image_idx):
    selected_features = np.take(self.features, image_idx, axis=0)
    return selected_features
def plt_mfd(Run_name,mega_MFD, scenarios_names_list, ScL_complet_list, ScL_list, Model_list,BG_hyp_list, dimension_used_list,faults_name_list,sample_list,b_value_list,MFD_type_list,m_Mmax, mega_bining_in_mag,a_s_model,b_sample,sm_sample,Mt_sample,plot_mfd,plot_as_rep,plot_Mmax,xmin,xmax,ymin,ymax, catalog_cum_rate,plot_mfd_detailled,bining_in_mag): file_scenarios_MFD_name = str(Run_name) + '/analysis/txt_files/scenarios_MFD.txt' file_scenarios_MFD = open(file_scenarios_MFD_name,'w') if plot_mfd == True : for scenario in scenarios_names_list : mfds_scenario = [] for mfd_i in mega_MFD: if mfd_i[8] == scenario: mfds_scenario.append(mfd_i) mfd_scenario_cumulative = [] mfd_source_cummulative = [] for mfd in mfds_scenario: mfd_i = mfd[11::].astype(np.float) mfd_source_cummulative_i = [] for i in range(len(mfd_i)): #calculate the cumulative for each source mfd_source_cummulative_i.append(np.sum(np.array(mfd_i)[-(len(mfd_i)-i):])) mfd_source_cummulative.append(mfd_source_cummulative_i) for sample in sample_list: rows, cols = np.where(np.array(mfds_scenario) == sample) mfds_scenario_sample = np.take(mfd_source_cummulative,rows,axis= 0) mfd_scenario_cumulative_sample = np.sum(mfds_scenario_sample,axis=0) mfd_scenario_cumulative.append(mfd_scenario_cumulative_sample) file_scenarios_MFD.write(scenario + '\t' + str(mfd_scenario_cumulative_sample)+'\n') file_scenarios_MFD.close() #"#### plot for the whole tree file_branch_cumMFD_name = str(Run_name) + '/analysis/txt_files/branch_cumMFD.txt' file_branch_cumMFD = open(file_branch_cumMFD_name,'w') mega_mfd_cummulative = [] #will contain the cummulative MFD for each model of the logic tree total_list_BG_hyp = [] #wil contain the list of the M_trunc for each model of the logic tree total_list_complet_ScL = [] total_list_ScL = [] #wil contain the list of the ScL for each model of the logic tree total_list_dimension_used = [] #wil contain the list of the dimension used for each model of the logic tree total_list_b_value = [] total_list_MFD_type = [] total_list_scenario_name = [] total_list_model = [] total_list_sample = [] geologic_moment_rate = [] # list of the moment rate of each model geologic_moment_rate_no_as = [] # list of the moment rate of each modelif no aseismic slip is considered selected_ScL = 'Init0' Dimention_used = 'Init0' str_all_data = 'Init0' Model = 'Init0' BG_hyp = 'Init0' b_min = 'Init0' b_max = 'Init0' MFD_type = 'Init0' scenario_name = 'Init0' sample = 'Init0' mfd_i = np.zeros(len(mega_MFD[0][11::])) index = 0 for mega_mfd_i in mega_MFD : if (mega_mfd_i[0] == selected_ScL) and (mega_mfd_i[1] == Dimention_used) and (mega_mfd_i[2] == str_all_data) and (mega_mfd_i[3] == Model ) and (mega_mfd_i[4] == BG_hyp) and (mega_mfd_i[5] == b_min) and (mega_mfd_i[6] == b_max) and (mega_mfd_i[7] == MFD_type ) and (mega_mfd_i[8] == scenario_name) and (mega_mfd_i[9] == sample): #same model, we add sources #print 'ok' mfd_i += mega_mfd_i[11::].astype(np.float) else : #it means it a new model if sum(mfd_i) != 0. : #we calculate the cumulative MFD mfd_cummulative_i = [] geologic_moment_rate_i = 0. for i in range(len(mfd_i)): #calculate the cumulative for each source mfd_cummulative_i.append(np.sum(np.array(mfd_i)[-(len(mfd_i)-i):])) M0 = 10. ** (1.5 * mega_bining_in_mag[i] + 9.1) rate_M0 = M0 * mfd_i[i] geologic_moment_rate_i += rate_M0 geologic_moment_rate.append(geologic_moment_rate_i) geologic_moment_rate_no_as.append(geologic_moment_rate_i * 100. / (100. 
- float(a_s_model[index]))) mega_mfd_cummulative.append(mfd_cummulative_i) total_list_BG_hyp.append(BG_hyp) total_list_complet_ScL.append((str(selected_ScL) + '_' + str(Dimention_used) + '_' + str(str_all_data))) total_list_ScL.append(selected_ScL) total_list_dimension_used.append(Dimention_used) total_list_model.append(Model) total_list_b_value.append('bmin_'+str(b_min)+'_bmax_'+str(b_max)) total_list_MFD_type.append(MFD_type) total_list_scenario_name.append(scenario_name) total_list_sample.append(sample) file_branch_cumMFD.write(str(Model) + '\t' + str(MFD_type) + '\t' + str(BG_hyp) + '\t' + str(scenario_name) + '\t' + str((str(selected_ScL) + '_' + str(Dimention_used) + '_' + str(str_all_data))) + '\t' + 'bmin_'+str(b_min)+'_bmax_'+str(b_max) + '\t' + str(sample) + '\t' + '\t'.join(map(str,mfd_cummulative_i)) + '\n') index += 1 mfd_i = np.zeros(len(mega_mfd_i[11::])) selected_ScL = mega_mfd_i[0] Dimention_used = mega_mfd_i[1] str_all_data = mega_mfd_i[2] Model = mega_mfd_i[3] BG_hyp = mega_mfd_i[4] b_min = mega_mfd_i[5] b_max = mega_mfd_i[6] MFD_type = mega_mfd_i[7] #a_s = mega_mfd_i[8] scenario_name = mega_mfd_i[8] sample = mega_mfd_i[9] mfd_i += mega_mfd_i[11::].astype(np.float) #we write for the last model mfd_cummulative_i = [] geologic_moment_rate_i = 0. for i in range(len(mfd_i)): #calculate the cumulative for each source mfd_cummulative_i.append(np.sum(np.array(mfd_i)[-(len(mfd_i)-i):])) M0 = 10. ** (1.5 * mega_bining_in_mag[i] + 9.1) rate_M0 = M0 * mfd_i[i] geologic_moment_rate_i += rate_M0 geologic_moment_rate.append(geologic_moment_rate_i) geologic_moment_rate_no_as.append(geologic_moment_rate_i * 100. / (100. - float(a_s_model[index]))) geologic_moment_rate.append(geologic_moment_rate_i) geologic_moment_rate_no_as.append(geologic_moment_rate_i * 100. / (100. 
- float(a_s_model[index]))) mega_mfd_cummulative.append(mfd_cummulative_i) total_list_BG_hyp.append(BG_hyp) total_list_complet_ScL.append((str(selected_ScL) + '_' + str(Dimention_used) + '_' + str(str_all_data))) total_list_ScL.append(selected_ScL) total_list_dimension_used.append(Dimention_used) total_list_model.append(Model) total_list_b_value.append('bmin_'+str(b_min)+'_bmax_'+str(b_max)) total_list_MFD_type.append(MFD_type) total_list_scenario_name.append(scenario_name) total_list_sample.append(sample) file_branch_cumMFD.write(str(Model) + '\t' + str(MFD_type) + '\t' + str(BG_hyp) + '\t' + str(scenario_name) + '\t' + str((str(selected_ScL) + '_' + str(Dimention_used) + '_' + str(str_all_data))) + '\t' + 'bmin_'+str(b_min)+'_bmax_'+str(b_max) + '\t' + str(sample) + '\t' + '\t'.join(map(str,mfd_cummulative_i)) + '\n') file_branch_cumMFD.close() if len(mega_mfd_cummulative) < 4 : plot_mfd = False mfd_X = mega_mfd_cummulative for i in range(len(mfd_X)): plt.scatter(mega_bining_in_mag,mfd_X[i], c='darkcyan', s=50, edgecolor='',marker = '_',alpha = 0.5) axes = plt.gca() axes.set_xlim([xmin,xmax]) axes.set_ylim([ymin,ymax]) for index_mag in range(len(mega_bining_in_mag)): rate_plus = np.percentile(mfd_X,84,axis=0)[index_mag] rate_minus = np.percentile(mfd_X,16,axis=0)[index_mag] mag = mega_bining_in_mag[index_mag] mag_plus = mag+0.05 mag_minus = mag-0.05 verts = [(mag_minus, rate_minus ), (mag_minus, rate_plus), (mag_plus, rate_plus), (mag_plus, rate_minus), (mag_minus, rate_minus)] codes = [Path.MOVETO, Path.LINETO, Path.LINETO, Path.LINETO, Path.CLOSEPOLY] path_poly = Path(verts, codes) patch = patches.PathPatch(path_poly,facecolor = 'darkgreen', lw = 0., alpha = 0.15) axes.add_patch(patch) plt.scatter(mega_bining_in_mag,np.percentile(mfd_X,50,axis=0), c='darkgreen', s=25, edgecolor='',marker = 'o',alpha = 0.8) plt.scatter(mega_bining_in_mag,np.percentile(mfd_X,16,axis=0), c='darkgreen', s=60, edgecolor='',marker = '_',alpha = 0.8) plt.scatter(mega_bining_in_mag,np.percentile(mfd_X,84,axis=0), c='darkgreen', s=60, edgecolor='',marker = '_',alpha = 0.8) plt.plot(mega_bining_in_mag,np.array(mfd_X).mean(axis=0), color='darkgreen', linewidth = 2) plt.grid() #plot the MFDs of the wholle tree with mean and percentiles # for i in range(len(mega_mfd_cummulative)): # plt.scatter(mega_bining_in_mag,mega_mfd_cummulative[i], c='darkcyan', s=50, edgecolor='',marker = '_',alpha = 0.25) # # plt.scatter(mega_bining_in_mag,np.percentile(mega_mfd_cummulative,50,axis=0), # c='darkgreen', s=30, edgecolor='',marker = 'o',alpha = 0.8) # plt.scatter(mega_bining_in_mag,np.percentile(mega_mfd_cummulative,16,axis=0), # c='darkgreen', s=20, edgecolor='',marker = '+',alpha = 0.8) # plt.scatter(mega_bining_in_mag,np.percentile(mega_mfd_cummulative,84,axis=0), # c='darkgreen', s=20, edgecolor='',marker = '+',alpha = 0.8) # plt.scatter(mega_bining_in_mag,np.array(mega_mfd_cummulative).mean(axis=0), # c='darkslateblue', s=50, edgecolor='',marker = 's',alpha = 0.95) # # # axes = plt.gca() # axes.set_xlim([xmin,xmax]) # axes.set_ylim([ymin,ymax]) plt.grid() plt.yscale('log') plt.title('MFD of the whole tree ') plt.savefig(str(Run_name) + '/analysis/figures/mfd/mdf_whole_tree.png' , dpi = 180, transparent=True) #plt.show() plt.close() rate_in_catalog = catalog_cum_rate #bining_in_mag = np.linspace(5.,7.5,26) '''########################################## #plot mfd for each scenario of the logic tree ############################################''' if len(scenarios_names_list)>1: index_model = 0 for model in Model_list : 
rate_in_catalog = catalog_cum_rate[index_model] for scenario in scenarios_names_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/scenario_set/' + scenario): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/scenario_set/' + scenario) rows = np.where(np.array(total_list_scenario_name) == scenario)[0] mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = scenario path = str(Run_name) + '/analysis/figures/analyze_branches/scenario_set/' + scenario do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model += 1 '''########################################## #plot mfd for each model of the logic tree ############################################''' index_model = 0 for model in Model_list : # print catalog_cum_rate # print rate_in_catalog = catalog_cum_rate[index_model] if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model) rows = np.where(np.array(total_list_model) == model)[0] mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = model path = str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model +=1 '''########################################## #plot mfd for each Background hypothesis of the logic tree ############################################''' if len(BG_hyp_list) > 1: for BG_hyp in BG_hyp_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/BG/' + BG_hyp): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/BG/' + BG_hyp) rows = np.where(np.array(total_list_BG_hyp) == BG_hyp)[0] mfd_X = [] index_check = 0 for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) index_check += 1 #density plot if plot_mfd == True : hyp_name = BG_hyp path = str(Run_name) + '/analysis/figures/analyze_branches/BG/' + BG_hyp do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) '''########################################## #plot mfd for each MFD of the logic tree ############################################''' if len(MFD_type_list) > 1: for MFD_type in MFD_type_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/MFD_type/' + MFD_type): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/MFD_type/' + MFD_type) rows = np.where(np.array(total_list_MFD_type) == MFD_type)[0] mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = MFD_type path = str(Run_name) + '/analysis/figures/analyze_branches/MFD_type/' + MFD_type do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) '''########################################## #plot mfd for each bvalue of the logic tree ############################################''' if len(b_value_list) > 1: for b in b_value_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/b_value/' + b): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/b_value/' + b) 
rows = np.where(np.array(total_list_b_value) == b)[0] mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = b path = str(Run_name) + '/analysis/figures/analyze_branches/b_value/' + b do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) '''########################################## #plot mfd for scalling law of the logic tree ############################################''' if len(ScL_complet_list) > 1: for ScL in ScL_complet_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/ScL/' + ScL): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/ScL/' + ScL) rows = np.where(np.array(total_list_complet_ScL) == ScL)[0] mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = ScL path = str(Run_name) + '/analysis/figures/analyze_branches/ScL/' + ScL do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) # '''###################################### #plot Mmax for each ScL of the logic tree ######################################''' for ScL in ScL_complet_list : rows = np.where(np.array(total_list_complet_ScL) == ScL)[0] #mfd_ScL_cumulative = [] Mmax_m_ScL = [] for index in rows : mfd = mega_mfd_cummulative[index] #mfd_ScL_cumulative.append(mfd) Mmax_m_ScL.append(m_Mmax[index]) if not os.path.exists(str(Run_name) + '/analysis/figures/Mmax/for_each_ScL'): os.makedirs(str(Run_name) + '/analysis/figures/Mmax/for_each_ScL') if plot_Mmax == True : plt.hist(Mmax_m_ScL,int(round(max(m_Mmax) - min(m_Mmax),1) * 10. + 1.)) plt.title(ScL) plt.savefig(str(Run_name) + '/analysis/figures/Mmax/for_each_ScL/Hist_Mmax_' + ScL +'.png',dpi = 100) #plt.show() plt.close() # '''###################################### #plot Mmax for each scenario set of the logic tree ######################################''' for Sc_set in scenarios_names_list : rows = np.where(np.array(total_list_scenario_name) == Sc_set)[0] #mfd_Sc_set_cumulative = [] Mmax_m_Sc_set = [] for index in rows : mfd = mega_mfd_cummulative[index] #mfd_Sc_set_cumulative.append(mfd) Mmax_m_Sc_set.append(m_Mmax[index]) if not os.path.exists(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set'): os.makedirs(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set') if plot_Mmax == True : plt.hist(Mmax_m_Sc_set,int(round(max(m_Mmax) - min(m_Mmax),1) * 10. + 1.)) plt.title(Sc_set) plt.savefig(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set/Hist_Mmax_' + Sc_set +'.png',dpi = 100) #plt.show() plt.close() # ## # '''###################################### # # the magnitude of rupture in which each faults are involed, for each set of scenarios # work in kinda progress # ######################################''' # # for fault in faults_name_list: # for Sc_set in scenarios_names_list : # rows = np.where(np.array(mega_MFD) == Sc_set)[0] # #mfd_Sc_set_cumulative = [] # Mmax_m_Sc_set = [] # for index in rows : # mfd = mega_mfd_cummulative[index] # Mmax_m_Sc_set.append(m_Mmax[index]) # # # if not os.path.exists(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set'): # os.makedirs(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set') # # if plot_Mmax == True : # plt.hist(Mmax_m_Sc_set,int(round(max(m_Mmax) - min(m_Mmax),1) * 10. 
+ 1.)) # plt.title(Sc_set) # plt.savefig(str(Run_name) + '/analysis/figures/Mmax/for_each_scenario_set/Hist_Mmax_' + Sc_set +'.png',dpi = 100) # #plt.show() # plt.close() # '''###################################### ######################################### # detailled plot for combinaison of # hypothesis ######################################### ######################################''' '''########################################## # calculate the difference between the mean rate of the model and the mean rate of the catalog ############################################''' if plot_mfd == True and plot_mfd_detailled == True: file_branch_to_catalog_name = str(Run_name) + '/analysis/txt_files/branch_vs_catalog.txt' file_branch_to_catalog = open(file_branch_to_catalog_name,'w') index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] for MFD_type in MFD_type_list : for scenario in scenarios_names_list : for b_value in b_value_list : for BG_hyp in BG_hyp_list : for ScL in ScL_complet_list : rows_model = np.where(np.array(total_list_model) == model)[0] rows_mfd = np.where(np.array(total_list_MFD_type) == MFD_type)[0] rows_sc = np.where(np.array(total_list_scenario_name) == scenario)[0] rows_ScL = np.where(np.array(total_list_complet_ScL) == ScL)[0] rows_b = np.where(np.array(total_list_b_value) == b_value)[0] rows_bg = np.where(np.array(total_list_BG_hyp) == BG_hyp)[0] rows = list(set(rows_model).intersection(rows_mfd)) rows = list(set(rows).intersection(rows_sc)) rows = list(set(rows).intersection(rows_ScL)) rows = list(set(rows).intersection(rows_b)) rows = list(set(rows).intersection(rows_bg)) if len(rows) > 0: file_branch_to_catalog.write(str(model)+'\t') file_branch_to_catalog.write(str(MFD_type)+'\t') file_branch_to_catalog.write(str(scenario)+'\t') file_branch_to_catalog.write(str(b_value)+'\t') file_branch_to_catalog.write(str(BG_hyp)+'\t') file_branch_to_catalog.write(str(ScL)+'\t') mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) mean_rate_model = np.array(mfd_X).mean(axis=0) mean_rate_catalog = np.array(rate_in_catalog)#.mean(axis=0) for i in range(len(mean_rate_catalog)): file_branch_to_catalog.write(str(mean_rate_model[i]/mean_rate_catalog[i]-1.)+'\t') file_branch_to_catalog.write('\n') index_model +=1 file_branch_to_catalog.close() '''########################################## # calculate the difference between the mean rate of the model and the mean rate of the catalog ############################################''' if plot_mfd == True and plot_mfd_detailled == True: file_branch_to_catalog_name = str(Run_name) + '/analysis/txt_files/branch_vs_catalog.txt' file_branch_to_catalog = open(file_branch_to_catalog_name,'w') index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] mean_rate_catalog = np.array(rate_in_catalog)#.mean(axis=0) for MFD_type in MFD_type_list : for scenario in scenarios_names_list : for b_value in b_value_list : for BG_hyp in BG_hyp_list : for ScL in ScL_complet_list : rows_model = np.where(np.array(total_list_model) == model)[0] rows_mfd = np.where(np.array(total_list_MFD_type) == MFD_type)[0] rows_sc = np.where(np.array(total_list_scenario_name) == scenario)[0] rows_ScL = np.where(np.array(total_list_complet_ScL) == ScL)[0] rows_b = np.where(np.array(total_list_b_value) == b_value)[0] rows_bg = np.where(np.array(total_list_BG_hyp) == BG_hyp)[0] rows = list(set(rows_model).intersection(rows_mfd)) rows = list(set(rows).intersection(rows_sc)) rows = 
list(set(rows).intersection(rows_ScL)) rows = list(set(rows).intersection(rows_b)) rows = list(set(rows).intersection(rows_bg)) if len(rows) > 0: file_branch_to_catalog.write(str(model)+'\t') file_branch_to_catalog.write(str(MFD_type)+'\t') file_branch_to_catalog.write(str(scenario)+'\t') file_branch_to_catalog.write(str(b_value)+'\t') file_branch_to_catalog.write(str(BG_hyp)+'\t') file_branch_to_catalog.write(str(ScL)+'\t') mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) mean_rate_model = np.array(mfd_X).mean(axis=0) for i in range(len(mean_rate_catalog)): file_branch_to_catalog.write(str(mean_rate_model[i]/mean_rate_catalog[i]-1.)+'\t') file_branch_to_catalog.write('\n') index_model +=1 file_branch_to_catalog.close() '''########################################## #plot mfd for each MFD shape hypothesis and scenario set ############################################''' if plot_mfd == True and plot_mfd_detailled == True: if len(MFD_type_list) > 1 and len(scenarios_names_list)>1: index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] for MFD_type in MFD_type_list : for scenario in scenarios_names_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model) if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + MFD_type): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + MFD_type) if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + MFD_type+ '/' +scenario): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + MFD_type+ '/' +scenario) rows_mfd = np.where(np.array(total_list_MFD_type) == MFD_type)[0] rows_sc = np.where(np.array(total_list_scenario_name) == scenario)[0] rows_i = list(set(rows_mfd).intersection(rows_sc)) rows_model = np.where(np.array(total_list_model) == model)[0] rows = list(set(rows_i).intersection(rows_model)) mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = model + ' ' + MFD_type + ' ' + scenario path = str(Run_name) +'/analysis/figures/analyze_branches/Model/' + model+ '/' + MFD_type+ '/' +scenario do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model +=1 '''########################################## #plot mfd for each background hypothesis and scenario set ############################################''' if plot_mfd == True and plot_mfd_detailled == True: if len(BG_hyp_list) > 1 and len(scenarios_names_list)>1: index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] for BG_hyp in BG_hyp_list : for scenario in scenarios_names_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + BG_hyp+ '/' +scenario): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + BG_hyp+ '/' +scenario) rows_mfd = np.where(np.array(total_list_BG_hyp) == BG_hyp)[0] rows_sc = np.where(np.array(total_list_scenario_name) == scenario)[0] rows = list(set(rows_mfd).intersection(rows_sc)) rows_model = np.where(np.array(total_list_model) == model)[0] rows = list(set(rows).intersection(rows_model)) mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) 
#density plot if plot_mfd == True : hyp_name = BG_hyp + ' ' + scenario path = str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + BG_hyp+ '/' +scenario #total_list_hyp = total_list_MFD_type do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model +=1 '''########################################## #plot mfd for each model hypothesis and MFD ############################################''' if plot_mfd == True and plot_mfd_detailled == True: if len(Model_list) > 1 and len(MFD_type_list)>1: index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] for MFD_type in MFD_type_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' +MFD_type): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' +MFD_type) rows_i = np.where(np.array(total_list_model) == model)[0] rows_j = np.where(np.array(total_list_MFD_type) == MFD_type)[0] rows = list(set(rows_i).intersection(rows_j)) mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = model + ' ' + MFD_type path = str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' +MFD_type #total_list_hyp = total_list_MFD_type do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model +=1 '''########################################## #plot mfd for each background hypothesis and mfd ############################################''' if plot_mfd == True and plot_mfd_detailled == True: if len(BG_hyp_list) > 1 and len(MFD_type_list)>1: index_model = 0 for model in Model_list : rate_in_catalog = catalog_cum_rate[index_model] for BG_hyp in BG_hyp_list : for MFD_type in MFD_type_list : if not os.path.exists(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + BG_hyp+ '/' +MFD_type): os.makedirs(str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/' + BG_hyp+ '/' +MFD_type) rows_i = np.where(np.array(total_list_BG_hyp) == BG_hyp)[0] rows_j = np.where(np.array(total_list_MFD_type) == MFD_type)[0] rows = list(set(rows_i).intersection(rows_j)) rows_model = np.where(np.array(total_list_model) == model)[0] rows = list(set(rows).intersection(rows_model)) mfd_X = [] for index in rows : mfd = mega_mfd_cummulative[index] mfd_X.append(mfd) #density plot if plot_mfd == True : hyp_name = BG_hyp + ' ' + MFD_type path = str(Run_name) + '/analysis/figures/analyze_branches/Model/' + model+ '/'+ BG_hyp+ '/' +MFD_type #total_list_hyp = total_list_MFD_type do_the_plots(hyp_name,mfd_X,mega_bining_in_mag,xmin,xmax,ymin,ymax,Run_name,rate_in_catalog,plot_as_rep,a_s_model,rows,path,bining_in_mag) index_model +=1 return (total_list_ScL,total_list_dimension_used,geologic_moment_rate, geologic_moment_rate_no_as,total_list_scenario_name,total_list_MFD_type, mega_mfd_cummulative,total_list_model,total_list_sample,total_list_BG_hyp)
tif_ds.SetProjection(src_ds.GetProjection())
tif_ds.SetGeoTransform(src_ds.GetGeoTransform())
if src_ds.GetGCPCount() > 0:
    tif_ds.SetGCPs(src_ds.GetGCPs(), src_ds.GetGCPProjection())

# ----------------------------------------------------------------------------
# Do the processing one scanline at a time.

progress(0.0)
for iY in range(src_ds.RasterYSize):
    src_data = src_band.ReadAsArray(0, iY, src_ds.RasterXSize, 1)

    for iBand in range(out_bands):
        band_lookup = lookup[iBand]

        dst_data = Numeric.take(band_lookup, src_data)
        tif_ds.GetRasterBand(iBand + 1).WriteArray(dst_data, 0, iY)

    progress((iY + 1.0) / src_ds.RasterYSize)

tif_ds = None

# ----------------------------------------------------------------------------
# Translate intermediate file to output format if desired format is not TIFF.

if tif_filename != dst_filename:
    tif_ds = gdal.Open(tif_filename)
    dst_driver.CreateCopy(dst_filename, tif_ds)
    tif_ds = None

    gtiff_driver.Delete(tif_filename)
def onp_take(x, indices):
    a = onp.take(x, indices)
    b = onp.take(x, indices, axis=-1)
    c = onp.take(x, indices, axis=0, mode='wrap')
    d = onp.take(x, indices, axis=1, mode='clip')
    return a, b, c, d
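
# Illustrative call (assumes onp is plain NumPy imported under that name):
# the same index array read flat, along the last axis, and along axes 0/1
# with 'wrap' and 'clip' handling of out-of-range entries.
import numpy as onp
x = onp.arange(6).reshape(2, 3)        # [[0, 1, 2], [3, 4, 5]]
a, b, c, d = onp_take(x, onp.array([0, 2]))
# a = [0, 2]            (flattened take)
# b = [[0, 2], [3, 5]]  (axis=-1)
# c (axis=0, mode='wrap'): row index 2 wraps around to row 0
# d (axis=1, mode='clip'): column index 2 is already in range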
def make_SED(m, par, model, DIG=False):
    # set up the SEDs and images
    if DIG:
        dig_str = "_DIG"
    else:
        dig_str = ""

    if cfg.par.SED_MONOCHROMATIC == True:
        # since all sources have the same spectrum just take the nu
        # from the input SED from the first source
        monochromatic_nu = m.sources[0].spectrum['nu'] * u.Hz
        monochromatic_lam = (constants.c / monochromatic_nu).to(u.micron).value[::-1]

        if cfg.par.FIX_SED_MONOCHROMATIC_WAVELENGTHS == True:
            # idx = np.round(np.linspace(np.min(np.where(monochromatic_lam > cfg.par.SED_MONOCHROMATIC_min_lam)[0]),\
            ## np.max(np.where(monochromatic_lam < cfg.par.SED_MONOCHROMATIC_max_lam)[0]),\
            # cfg.par.SED_MONOCHROMATIC_nlam))
            idx = np.where((monochromatic_lam > cfg.par.SED_MONOCHROMATIC_min_lam) &
                           (monochromatic_lam < cfg.par.SED_MONOCHROMATIC_max_lam))[0]
            monochromatic_lam = np.take(monochromatic_lam, list(idx))

        m.set_monochromatic(True, wavelengths=monochromatic_lam)
        m.set_raytracing(True)
        m.set_n_photons(initial=par.n_photons_initial,
                        imaging_sources=par.n_photons_imaging,
                        imaging_dust=par.n_photons_imaging,
                        raytracing_sources=par.n_photons_raytracing_sources,
                        raytracing_dust=par.n_photons_raytracing_dust)
        m.set_n_initial_iterations(3)
        m.set_convergence(True, percentile=99., absolute=1.01, relative=1.01)

        sed = m.add_peeled_images(sed=True, image=False)
        if cfg.par.MANUAL_ORIENTATION == True:
            sed.set_viewing_angles(np.array(cfg.par.THETA), np.array(cfg.par.PHI))
        else:
            sed.set_viewing_angles(np.linspace(0, 90, par.NTHETA).tolist() * par.NPHI,
                                   np.repeat(np.linspace(0, 90, par.NPHI), par.NPHI))
        sed.set_track_origin('basic')

        if cfg.par.SKIP_RT == False:
            m.write(model.inputfile + '.sed', overwrite=True)
            m.run(model.outputfile + str(dig_str) + '.sed', mpi=True,
                  n_processes=par.n_MPI_processes, overwrite=True)

        print('[pd_front_end]: Beginning RT Stage: Calculating SED using a monochromatic spectrum equal to the input SED')

    else:
        m.set_raytracing(True)
        m.set_n_photons(initial=par.n_photons_initial,
                        imaging=par.n_photons_imaging,
                        raytracing_sources=par.n_photons_raytracing_sources,
                        raytracing_dust=par.n_photons_raytracing_dust)
        m.set_n_initial_iterations(7)
        m.set_convergence(True, percentile=99., absolute=1.01, relative=1.01)

        sed = m.add_peeled_images(sed=True, image=False)
        sed.set_wavelength_range(2500, 0.001, 1000.)
        if cfg.par.MANUAL_ORIENTATION == True:
            sed.set_viewing_angles(np.array(cfg.par.THETA), np.array(cfg.par.PHI))
        else:
            sed.set_viewing_angles(np.linspace(0, 90, par.NTHETA).tolist() * par.NPHI,
                                   np.repeat(np.linspace(0, 90, par.NPHI), par.NPHI))
        sed.set_track_origin('basic')

        print('[pd_front_end]: Beginning RT Stage: Calculating SED using a binned spectrum')

        # Run the Model
        if cfg.par.SKIP_RT == False:
            m.write(model.inputfile + '.sed', overwrite=True)
            m.run(model.outputfile + '.sed', mpi=True,
                  n_processes=par.n_MPI_processes, overwrite=True)
def apply(self, inputs_q, inputs_kv, num_heads, dtype=jnp.float32, qkv_features=None, out_features=None, attention_axis=None, causal_mask=False, padding_mask=None, key_padding_mask=None, segmentation=None, key_segmentation=None, cache=None, broadcast_dropout=True, dropout_rng=None, dropout_rate=0., deterministic=False, precision=None, kernel_init=default_kernel_init, bias_init=zeros, bias=True ): """Applies multi-head dot product attention on the input data. Projects the inputs into multi-headed query, key, and value vectors, applies dot-product attention and projects the results to an output vector. This can be used for encoder-decoder attention by specifying both `inputs_q` and `inputs_kv` or for self-attention by only specifying `inputs_q` and setting `inputs_kv` to None. Args: inputs_q: input queries of shape `[bs, dim1, dim2, ..., dimN, features]`. inputs_kv: key/values of shape `[bs, dim1, dim2, ..., dimN, features]` or None for self-attention, in which case key/values will be derived from inputs_q. num_heads: number of attention heads. Features (i.e. inputs_q.shape[-1]) should be divisible by the number of heads. dtype: the dtype of the computation (default: float32) qkv_features: dimension of the key, query, and value. out_features: dimension of the last projection attention_axis: axes over which the attention is applied ('None' means attention over all axes, but batch, heads, and features). causal_mask: boolean specifying whether to apply a causal mask on the attention weights. If True, the output at timestep `t` will not depend on inputs at timesteps strictly greater than `t`. padding_mask: boolean specifying query tokens that are pad tokens w/ False. key_padding_mask: boolean specifying key-value tokens that are pad tokens w/ False. segmentation: segment indices for packed inputs_q data. key_segmentation: segment indices for packed inputs_kv data. cache: an instance of `flax.nn.attention.Cache` used for efficient autoregressive decoding. broadcast_dropout: bool: use a broadcasted dropout along batch dims. dropout_rng: JAX PRNGKey: to be used for dropout dropout_rate: dropout rate deterministic: bool, deterministic or not (to apply dropout) precision: numerical precision of the computation, see `jax.lax.Precision` for details. kernel_init: initializer for the kernel of the Dense layers. bias_init: initializer for the bias of the Dense layers. bias: bool: whether pointwise QKVO dense transforms use bias. attention_fn: dot_product_attention or a compatible function. Accepts query, key, value, and returns output of shape `[bs, dim1, dim2, ..., dimN, num_heads, value_channels]` Returns: output of shape `[bs, dim1, dim2, ..., dimN, features]`. 
""" assert causal_mask or not cache, ( 'Caching is only support for causal attention.') if inputs_kv is None: inputs_kv = inputs_q is_self_attention = inputs_kv is inputs_q if attention_axis is None: attention_axis = tuple(range(1, inputs_q.ndim - 1)) features = out_features or inputs_q.shape[-1] qkv_features = qkv_features or inputs_q.shape[-1] assert qkv_features % num_heads == 0, ( 'Memory dimension must be divisible by number of heads.') head_dim = qkv_features // num_heads dense = DenseGeneral.partial( axis=-1, features=(num_heads, head_dim), kernel_init=kernel_init, bias_init=bias_init, bias=bias, precision=precision) # project inputs_q to multi-headed q/k/v # dimensions are then [bs, dims..., n_heads, n_features_per_head] query, key, value = (dense(inputs_q, dtype=dtype, name='query'), dense(inputs_kv, dtype=dtype, name='key'), dense(inputs_kv, dtype=dtype, name='value')) if cache: assert isinstance(cache, Cache), 'cache must be an instance of Cache' if self.is_initializing(): cache.store(np.array((key.ndim,) + key.shape[-2:], dtype=np.int32)) else: cache_entry = cache.retrieve(None) expected_shape = list(cache_entry.key.shape[:-2]) for attn_dim in attention_axis: expected_shape[attn_dim] = 1 expected_shape = tuple(expected_shape) + inputs_q.shape[-1:] if expected_shape != inputs_q.shape: raise ValueError('Invalid shape provided, ' 'expected shape %s instead got %s.' % (expected_shape, inputs_q.shape)) if not isinstance(cache_entry, _CacheEntry): raise ValueError('Cache is not initialized.') cshape = cache_entry.key.shape indices = [0] * len(cshape) i = cache_entry.i attn_size = np.prod(np.take(cshape, attention_axis)) for attn_dim in attention_axis: attn_size //= cshape[attn_dim] indices[attn_dim] = i // attn_size i = i % attn_size key = lax.dynamic_update_slice(cache_entry.key, key, indices) value = lax.dynamic_update_slice(cache_entry.value, value, indices) one = jnp.array(1, jnp.uint32) cache_entry = cache_entry.replace(i=cache_entry.i + one, key=key, value=value) cache.store(cache_entry) # create attention masks mask_components = [] if causal_mask: if cache and not self.is_initializing(): bias_pre_shape = (1,) * (key.ndim - 1) attn_shape = tuple(np.take(key.shape, attention_axis)) attn_size = np.prod(attn_shape) ii = jnp.arange(attn_size, dtype=jnp.uint32) mask = ii < cache_entry.i mask_components.append(mask.reshape(bias_pre_shape + attn_shape)) else: mask_components.append(_make_causal_mask(key, attention_axis)) if (padding_mask is not None or key_padding_mask is not None) and not cache: if key_padding_mask is None: if is_self_attention: key_padding_mask = padding_mask else: key_padding_shape = [inputs_kv.shape[dim] for dim in attention_axis] key_padding_mask = jnp.full(key_padding_shape, True) if padding_mask is None: if is_self_attention: padding_mask = key_padding_mask else: padding_shape = [inputs_q.shape[dim] for dim in attention_axis] padding_mask = jnp.full(padding_shape, True) padding_mask = make_padding_mask( padding_mask_query=padding_mask, padding_mask_key=key_padding_mask, query_shape=query.shape, key_shape=key.shape, attention_axis=attention_axis) mask_components.append(padding_mask) if segmentation is not None: if key_segmentation is None: assert is_self_attention key_segmentation = segmentation segmentation_mask = make_padding_mask( padding_mask_query=segmentation, padding_mask_key=key_segmentation, query_shape=query.shape, key_shape=key.shape, attention_axis=attention_axis, segmentation_mask=True) mask_components.append(segmentation_mask) if mask_components: 
attention_mask = mask_components[0] for component in mask_components[1:]: attention_mask = jnp.logical_and(attention_mask, component) # attention mask in the form of attention bias attention_bias = lax.select( attention_mask > 0, jnp.full(attention_mask.shape, 0.).astype(dtype), jnp.full(attention_mask.shape, -1e10).astype(dtype)) else: attention_bias = None # apply attention x = self.fast_unstruct_rfm_dot_product_attention.dot_product_attention( query, key, value, dtype=dtype, axis=attention_axis, bias=attention_bias, precision=precision, dropout_rng=dropout_rng, dropout_rate=dropout_rate, broadcast_dropout=broadcast_dropout, deterministic=deterministic) # back to the original inputs dimensions out = DenseGeneral( x, features=features, axis=(-2, -1), kernel_init=kernel_init, bias_init=bias_init, bias=bias, dtype=dtype, precision=precision, name='out') return out
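# A hedged NumPy-only sketch (not the Flax code above) of the mask-to-bias step: boolean mask
# components are combined with logical AND and then converted into an additive bias, so that
# masked positions receive a large negative value before the softmax. Shapes are illustrative.
import numpy as np

causal = np.tril(np.ones((4, 4), dtype=bool))        # query position i may attend to keys <= i
key_padding = np.array([True, True, True, False])    # last key position is padding
mask = np.logical_and(causal, key_padding[None, :])  # combine the mask components
bias = np.where(mask, 0.0, -1e10).astype(np.float32) # added to the attention logits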
for_training=False, grad_req='null', shared_module=net) begin = time.time() for epoch in range(100): avg_cost = 0 total_batch = int(math.ceil(dataX.shape[0] / batch_size)) shuffle_ind = np.random.permutation(np.arange(dataX.shape[0])) dataX = dataX[shuffle_ind, :] dataY = dataY[shuffle_ind] for i in range(total_batch): # Slice the data batch and target batch. # Note that we use np.take to ensure that the batch will be padded correctly. data_npy = np.take(dataX, indices=np.arange(i * batch_size, (i+1) * batch_size), axis=0, mode="clip") target_npy = np.take(dataY, indices=np.arange(i * batch_size, (i + 1) * batch_size), axis=0, mode="clip") net.forward_backward(data_batch=mx.io.DataBatch(data=[nd.array(data_npy)], label=[nd.array(target_npy)])) loss = net.get_outputs()[0].asscalar() avg_cost += loss / total_batch net.update() print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost)) print('Learning Finished!') end = time.time() print("Total Time Spent: %gs" %(end - begin))
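# The loop above relies on np.take with mode="clip" so that the last, partial mini-batch is
# padded by repeating the final sample instead of raising an IndexError. A tiny sketch with
# made-up sizes (10 samples, batch_size 4, third and last batch):
import numpy as np

dataX = np.arange(20).reshape(10, 2)
batch_size, i = 4, 2
idx = np.arange(i * batch_size, (i + 1) * batch_size)   # [8, 9, 10, 11]; 10 and 11 are out of range
batch = np.take(dataX, idx, axis=0, mode="clip")        # rows 8, 9, 9, 9: the last row is repeated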
def load_and_augment_data(dataset_name, model_params): """ From datasets.CIFAR10: dataset.data: the images as a numpy array, shape: (50000, 32, 32, 3) dataset.targets: labels of the images as a list, len: 50000 :return: augmented_labeled_X: the tensor of augmented labeled images (K=1), size: (n_labeled_per_class * n_classes , 32, 32, 3) augmented_unlabeled_X: the tensor of augmented unlabeled images (K=2), size: ((N/10 - n_labeled_per_class - n_validation) * n_classes * K , 32, 32, 3) train_labeled_targets: the tensor of labeled targets, size = n_labeled_per_class * n_classes train_unlabeled_targets: the tensor of unlabeled targets, size = (N/10 - n_labeled_per_class - n_validation) * n_classes """ # Step 1: Set the model's hyperparameters n_classes = model_params["n_classes"] n_labeled_per_class = model_params["n_labeled_per_class"] n_validation = model_params["n_validation"] K = model_params["K"] # Step 2: Load the dataset if dataset_name == 'CIFAR10': dataset = datasets.CIFAR10(root="./datasets", train=True, download=True) elif dataset_name == 'STL10': dataset = datasets.STL10(root="./datasets", download=True) else: raise ValueError("Invalid dataset name") # Step 3: Split the indexes train_labeled_indexes, train_unlabeled_indexes, validation_indexes = \ split_indexes(n_classes, n_labeled_per_class, n_validation, dataset.targets) # Step 4: Extract the images for training and validation train_labeled_images = np.take(dataset.data, train_labeled_indexes, axis=0) train_unlabeled_images = np.take(dataset.data, train_unlabeled_indexes, axis=0) target_array = np.asarray(dataset.targets) train_labeled_targets = np.take(target_array, train_labeled_indexes, axis=0) train_unlabeled_targets = np.take(target_array, train_unlabeled_indexes, axis=0) validation_images = np.take(dataset.data, validation_indexes, axis=0) validation_targets = np.take(target_array, validation_indexes, axis=0) # Step 5: Normalise the datasets train_labeled_images = normalise(train_labeled_images) train_unlabeled_images = normalise(train_unlabeled_images) # Step 6: Augment training images augmented_labeled_X = augment(train_labeled_images, K=1) augmented_unlabeled_X = augment(train_unlabeled_images, K=K) # Take a look at some of the augmented images # displayImages(train_labeled_images[:10], title1="Original-Labeled", title2="Augmented-Labeled", # augmented_images=augmented_labeled_X[:10], labels=train_labeled_targets[:10]) # n_unlabeled = train_unlabeled_images.shape[0] # displayImages(train_unlabeled_images[:10], title1="Original-Unlabeled", title2="Augmented-Unlabeled", # augmented_images=augmented_unlabeled_X[:10], labels=train_unlabeled_targets[:10]) # displayImages(augmented_unlabeled_X[:10], title1="Augmented-Unlabeled1", title2="Augmented-Unlabeled2", # augmented_images=augmented_unlabeled_X[n_unlabeled:10+n_unlabeled], # labels=train_unlabeled_targets[:10]) # Step 7: Change the dimension of np.array in order for it to work with torch augmented_labeled_X = to_tensor_dim(augmented_labeled_X) augmented_unlabeled_X = to_tensor_dim(augmented_unlabeled_X) validation_images = to_tensor_dim(validation_images) return torch.from_numpy(augmented_labeled_X), torch.from_numpy(augmented_unlabeled_X), \ torch.from_numpy(train_labeled_targets), torch.from_numpy(train_unlabeled_targets), \ torch.from_numpy(validation_images), torch.from_numpy(validation_targets)
def __next__(self, batch_size=None): """Generate each mini-batch. Args: batch_size (int, optional): the size of mini-batch Returns: A tuple of `(inputs, labels, inputs_seq_len, labels_seq_len, input_names)` inputs: list of input data of size `[B, T, input_dim]` labels_main: list of target labels in the main task, of size `[B, T]` labels_sub: list of target labels in the sub task, of size `[B, T]` inputs_seq_len: list of length of inputs of size `[B]` input_names: list of file name of input data of size `[B]` is_new_epoch (bool): If true, one epoch is finished """ if self.max_epoch is not None and self.epoch >= self.max_epoch: raise StopIteration # NOTE: max_epoch = None means infinite loop if batch_size is None: batch_size = self.batch_size # reset if self.is_new_epoch: self.is_new_epoch = False if self.sort_utt: # Sort all uttrances by length if len(self.rest) > batch_size: data_indices = sorted(list(self.rest))[:batch_size] self.rest -= set(data_indices) # NOTE: rest is uttrance length order else: # Last mini-batch data_indices = list(self.rest) self.reset() self.is_new_epoch = True self.epoch += 1 if self.epoch == self.sort_stop_epoch: self.sort_utt = False # Shuffle data in the mini-batch random.shuffle(data_indices) elif self.shuffle: # Randomly sample uttrances if len(self.rest) > batch_size: data_indices = random.sample(list(self.rest), batch_size) self.rest -= set(data_indices) else: # Last mini-batch data_indices = list(self.rest) self.reset() self.is_new_epoch = True self.epoch += 1 # Shuffle selected mini-batch random.shuffle(data_indices) else: if len(self.rest) > batch_size: data_indices = sorted(list(self.rest))[:batch_size] self.rest -= set(data_indices) # NOTE: rest is in name order else: # Last mini-batch data_indices = list(self.rest) self.reset() self.is_new_epoch = True self.epoch += 1 # Compute max frame num in mini-batch max_frame_num = max( map(lambda x: x.shape[0], self.input_list[data_indices])) # Compute max target label length in mini-batch max_seq_len_main = max(map(len, self.label_main_list[data_indices])) max_seq_len_sub = max(map(len, self.label_sub_list[data_indices])) # Initialization inputs = np.zeros((len(data_indices), max_frame_num, self.input_list[0].shape[-1] * self.splice), dtype=np.float32) labels_main = np.array([[self.padded_value] * max_seq_len_main] * len(data_indices), dtype=np.int32) labels_sub = np.array([[self.padded_value] * max_seq_len_sub] * len(data_indices), dtype=np.int32) inputs_seq_len = np.zeros((len(data_indices), ), dtype=np.int32) input_names = np.array( list( map(lambda path: basename(path).split('.')[0], np.take(self.input_paths, data_indices, axis=0)))) # Set values of each data in mini-batch for i_batch, x in enumerate(data_indices): data_i = self.input_list[x] frame_num, input_size = data_i.shape # Splicing data_i = data_i.reshape(1, frame_num, input_size) data_i = do_splice(data_i, splice=self.splice, batch_size=1).reshape(frame_num, -1) inputs[i_batch, :frame_num, :] = data_i labels_main[i_batch, :len(self.label_main_list[x] )] = self.label_main_list[x] labels_sub[ i_batch, :len(self.label_sub_list[x])] = self.label_sub_list[x] inputs_seq_len[i_batch] = frame_num self.iteration += len(data_indices) return (inputs, labels_main, labels_sub, inputs_seq_len, input_names), self.is_new_epoch
def main(random_state=1, test_size=0.2, n_instances=1000000, out_dir='continuous'): # create logger logger = get_logger('log.txt') # columns to use cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] # data dtypes for each column dtypes = {c: np.float32 for c in cols} dtypes[0] = np.uint8 # retrieve dataset start = time.time() df = pd.read_csv('day_0', sep='\t', header=None, usecols=cols, dtype=dtypes, nrows=n_instances) logger.info('reading in dataset...{:.3f}s'.format(time.time() - start)) logger.info('{}'.format(df)) logger.info('Memory usage: {:,} bytes'.format( df.memory_usage(deep=True).sum())) # get numpy array X = df.values df = None # impute missing values with the mean logger.info('imputing missing values with the mean...') assert np.isnan(X[:, 0]).sum() == 0 col_mean = np.nanmean(X, axis=0) nan_indices = np.where(np.isnan(X)) X[nan_indices] = np.take(col_mean, nan_indices[1]) # move the label column in X to the last column logger.info('moving label column to the last column...') y = X[:, 0].copy().reshape(-1, 1) X = np.delete(X, 0, 1) X = np.hstack([X, y]) # split into train and test logger.info('splitting into train and test sets...') indices = np.arange(X.shape[0]) n_train_samples = int(len(indices) * (1 - test_size)) np.random.seed(random_state) train_indices = np.random.choice(indices, size=n_train_samples, replace=False) test_indices = np.setdiff1d(indices, train_indices) train = X[train_indices] test = X[test_indices] logger.info('train.shape: {}, label sum: {}'.format( train.shape, train[:, -1].sum())) logger.info('test.shape: {}, label sum: {}'.format(test.shape, test[:, -1].sum())) # save to numpy format logger.info('saving...') os.makedirs(out_dir, exist_ok=True) np.save(os.path.join(out_dir, 'train.npy'), train) np.save(os.path.join(out_dir, 'test.npy'), test)
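# The NaN-imputation idiom used above, in isolation: np.where locates the missing entries and
# np.take broadcasts each column's mean onto them. The small matrix below is made up.
import numpy as np

X = np.array([[1.0, np.nan],
              [3.0, 4.0],
              [np.nan, 8.0]])
col_mean = np.nanmean(X, axis=0)                      # [2., 6.]
nan_rows, nan_cols = np.where(np.isnan(X))
X[nan_rows, nan_cols] = np.take(col_mean, nan_cols)   # each NaN replaced by its column mean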
def make_plots(clust_CI, clust_MI, clust_MI_r, clust_params): ''' Plot the CI vs MI diagram for each MI. ''' # Color is associated with the dist; size with the initial mass and # the marker with the age. mrk = {7.: ('o', '$\log(age)=7.$'), 8.: ('s', '$\log(age)=8.$'), 9.: ('D', '$\log(age)=9.$')} # Make plot. plt.figure(figsize=(14, 25)) # create the top-level container gs = gridspec.GridSpec(4, 3, width_ratios=[1, 1, 0.05]) xy_font_s = 21 ax0 = plt.subplot(gs[0]) ax0.set_title('Decontamination algorithm', fontsize=xy_font_s) plt.ylabel('$MI_1$', fontsize=xy_font_s) plt.xlim(0., 0.97) plt.ylim(-0.01, 0.99) # make these tick labels invisible plt.setp(ax0.get_xticklabels(), visible=False) # Set steps in axis. ax0.yaxis.set_major_locator(MultipleLocator(0.2)) # Plot grid plt.grid(b=True, which='major', color='gray', linestyle='--', zorder=1) # Add text box with MI equation. text = r'$MI_1 = n_m/N_{cl}$' '\n' r' $(MP >\,0.9)$' x_align, y_align = 0.57, 0.85 plt.text(x_align, y_align, text, transform=ax0.transAxes, bbox=dict(facecolor='white', alpha=0.6), fontsize=(xy_font_s + 2)) # Define color map. cm = plt.cm.get_cmap('RdYlBu_r') # Order. mass, age, dist = clust_params order = np.argsort(-np.array(mass)) z1 = np.take((np.array(mass) / 5.), order) z2 = np.take(age, order) z3 = np.take(dist, order) # Order before plotting. x = np.take(clust_CI, order) y = np.take(clust_MI[0], order) for key, value in sorted(mrk.items()): s1 = (z2 == key) plt.scatter(x[s1], y[s1], marker=value[0], label=value[1], s=z1[s1], c=z3[s1], cmap=cm, lw=0.2) # Plot regression line. m, b = np.polyfit(clust_CI, clust_MI[0], 1) range_CI = np.linspace(0., 1., 10) plt.plot(range_CI, m * range_CI + b, c='k', ls='--') # # Random MI. ax1 = plt.subplot(gs[1]) ax1.set_title('Random probability', fontsize=xy_font_s) plt.xlim(0., 0.97) plt.ylim(-0.01, 0.99) # make these tick labels invisible plt.setp(ax1.get_yticklabels(), visible=False) plt.setp(ax1.get_xticklabels(), visible=False) ax1.yaxis.set_major_locator(MultipleLocator(0.2)) # Plot grid plt.grid(b=True, which='major', color='gray', linestyle='--', zorder=1) # Define color map. cm = plt.cm.get_cmap('RdYlBu_r') # Order. mass, age, dist = clust_params order = np.argsort(-np.array(mass)) z1 = np.take((np.array(mass) / 5.), order) z2 = np.take(age, order) z3 = np.take(dist, order) # Order before plotting. x = np.take(clust_CI, order) y = np.take(clust_MI_r[0], order) for key, value in sorted(mrk.items()): s1 = (z2 == key) SC = plt.scatter(x[s1], y[s1], marker=value[0], label=value[1], s=z1[s1], c=z3[s1], cmap=cm, lw=0.2) # Plot regression line. m, b = np.polyfit(clust_CI, clust_MI_r[0], 1) range_CI = np.linspace(0., 1., 10) plt.plot(range_CI, m * range_CI + b, c='k', ls='--') # Plot legend. legend = plt.legend(loc="upper right", markerscale=0.7, scatterpoints=1, fontsize=17) for i in range(len(mrk)): legend.legendHandles[i].set_color('k') # Colorbar axp2 = plt.subplot(gs[2]) cbar = plt.colorbar(SC, cax=axp2) cbar.set_ticks([0.5, 1., 3., 5.]) cbar.set_ticklabels([0.5, 1., 3., 5.]) cbar.set_label('$dist\,(kpc)$', fontsize=xy_font_s, labelpad=-15, y=0.35) # # Second MI. ax3 = plt.subplot(gs[3]) plt.xlabel('$CI$', fontsize=xy_font_s) plt.ylabel('$MI_2$', fontsize=xy_font_s) plt.xlim(0., 0.97) plt.ylim(max(min(clust_MI[1]) - 0.1, -2.5), 0.99) ax3.yaxis.set_major_locator(MultipleLocator(0.4)) # Plot grid plt.grid(b=True, which='major', color='gray', linestyle='--', zorder=1) # Add text box with MI equation. 
text = (r'$MI_2 = \frac{\left(\sum^{n_m}{p_m} - ' + r' \sum^{n_f}{p_f}\right)}{N_{cl}}$') x_align, y_align = 0.52, 0.86 plt.text(x_align, y_align, text, transform=ax3.transAxes, bbox=dict(facecolor='white', alpha=0.6), fontsize=(xy_font_s + 2)) plt.axhline(y=0., linestyle='--', color='r', zorder=3) # Define color map. cm = plt.cm.get_cmap('RdYlBu_r') # Order. mass, age, dist = clust_params order = np.argsort(-np.array(mass)) z1 = np.take((np.array(mass) / 5.), order) z2 = np.take(age, order) z3 = np.take(dist, order) # Order before plotting. x = np.take(clust_CI, order) y = np.take(clust_MI[1], order) for key, value in sorted(mrk.items()): s1 = (z2 == key) plt.scatter(x[s1], y[s1], marker=value[0], label=value[1], s=z1[s1], c=z3[s1], cmap=cm, lw=0.2) # Plot regression line. m, b = np.polyfit(clust_CI, clust_MI[1], 1) range_CI = np.linspace(0., 1., 10) plt.plot(range_CI, m * range_CI + b, c='k', ls='--') plt.axhline(y=0., linestyle='--', color='r', zorder=3) # # Second random MI. ax4 = plt.subplot(gs[4]) plt.xlabel('$CI$', fontsize=xy_font_s) plt.xlim(0., 0.97) plt.ylim(max(min(clust_MI[1]) - 0.1, -2.5), 0.99) # make these tick labels invisible plt.setp(ax4.get_yticklabels(), visible=False) ax4.yaxis.set_major_locator(MultipleLocator(0.4)) # Plot grid plt.grid(b=True, which='major', color='gray', linestyle='--', zorder=1) plt.axhline(y=0., linestyle='--', color='r', zorder=3) # Define color map. cm = plt.cm.get_cmap('RdYlBu_r') # Order. mass, age, dist = clust_params order = np.argsort(-np.array(mass)) z1 = np.take((np.array(mass) / 5.), order) z2 = np.take(age, order) z3 = np.take(dist, order) # Order before plotting. x = np.take(clust_CI, order) y = np.take(clust_MI_r[1], order) for key, value in sorted(mrk.items()): s1 = (z2 == key) plt.scatter(x[s1], y[s1], marker=value[0], label=value[1], s=z1[s1], c=z3[s1], cmap=cm, lw=0.2) # Plot regression line. m, b = np.polyfit(clust_CI, clust_MI_r[1], 1) range_CI = np.linspace(0., 1., 10) plt.plot(range_CI, m * range_CI + b, c='k', ls='--') # Colorbar #axp4 = plt.subplot(gs[5]) #cbar = plt.colorbar(SC2, cax=axp4) #cbar.set_ticks([0.5, 1., 3., 5.]) #cbar.set_ticklabels([0.5, 1., 3., 5.]) #cbar.set_label('$dist\,(kpc)$', fontsize=xy_font_s, labelpad=-15, y=0.35) # Save to output png file. plt.tight_layout() out_png = dir_memb_files + 'MI_analisys.png' plt.savefig(out_png, dpi=150) print 'Plot done.'
def load_dataset(data_dir, test_size, val_size): """ Args: data_dir: path to folder test_size: test set percentage val_size: val set percentage Returns: the list of class folders and a dict mapping each class to its training, validation and testing set """ tot = test_size + val_size train_size = 100 - tot assert test_size >= 1, 'Test percent must be at least 1' assert val_size >= 1, 'Valid percent must be at least 1' assert test_size <= 25, 'Keep test percent below 25' assert val_size <= 25, 'Keep valid percent below 25' assert tot <= 40, 'Train on at least 60%. Current training percent {}'.format( train_size) if os.path.exists(data_dir): dataset = {} print('/{} exists'.format(data_dir)) folders = [ folder for folder in os.listdir(data_dir) if not folder == '.DS_Store' ] print(folders) for folder in folders: files = [] files = [ file for file in os.listdir(data_dir + '/' + folder) if not file == '.DS_Store' ] num_files = len(files) shuffled = np.random.permutation(num_files) n_val, n_test = int((val_size / 100) * num_files), int( (test_size / 100) * num_files) valid_idx, test_idx, train_idx = shuffled[:n_val], shuffled[ n_val:n_val + n_test], shuffled[n_val + n_test:] print('{} has {} images'.format(folder, num_files)) train_set, test_set, valid_set = [], [], [] train_set = list(np.squeeze(list(np.take(files, train_idx)))) test_set = list(np.squeeze(list(np.take(files, test_idx)))) valid_set = list(np.squeeze(list(np.take(files, valid_idx)))) dataset[folder] = { 'train': train_set, 'valid': valid_set, 'test': test_set } return folders, dataset print('Path does not exist!!') return None
def get_multiplier(output_tensor, new_shape): class_binary = [[0], [1]] class_binary = np.asarray(class_binary, dtype=np.uint8) output = output_tensor.reshape(new_shape) output_colors = np.take(class_binary, output, axis=0) return output_colors
def merge_percentiles(finalq, qs, vals, interpolation="lower", Ns=None): """Combine several percentile calculations of different data. Parameters ---------- finalq : numpy.array Percentiles to compute (must use same scale as ``qs``). qs : sequence of :class:`numpy.array`s Percentiles calculated on different sets of data. vals : sequence of :class:`numpy.array`s Resulting values associated with percentiles ``qs``. Ns : sequence of integers The number of data elements associated with each data set. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'} Specify the type of interpolation to use to calculate final percentiles. For more information, see :func:`numpy.percentile`. Examples -------- >>> finalq = [10, 20, 30, 40, 50, 60, 70, 80] >>> qs = [[20, 40, 60, 80], [20, 40, 60, 80]] >>> vals = [np.array([1, 2, 3, 4]), np.array([10, 11, 12, 13])] >>> Ns = [100, 100] # Both original arrays had 100 elements >>> merge_percentiles(finalq, qs, vals, Ns=Ns) array([ 1, 2, 3, 4, 10, 11, 12, 13]) """ from .utils import array_safe, empty_like_safe if isinstance(finalq, Iterator): finalq = list(finalq) finalq = array_safe(finalq, like=finalq) qs = list(map(list, qs)) vals = list(vals) if Ns is None: vals, Ns = zip(*vals) Ns = list(Ns) L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) if not L: raise ValueError("No non-trivial arrays found") qs, vals, Ns = L # TODO: Perform this check above in percentile once dtype checking is easy # Here we silently change meaning if vals[0].dtype.name == "category": result = merge_percentiles(finalq, qs, [v.codes for v in vals], interpolation, Ns) import pandas as pd return pd.Categorical.from_codes(result, vals[0].categories, vals[0].ordered) if not np.issubdtype(vals[0].dtype, np.number): interpolation = "nearest" if len(vals) != len(qs) or len(Ns) != len(qs): raise ValueError("qs, vals, and Ns parameters must be the same length") # transform qs and Ns into number of observations between percentiles counts = [] for q, N in zip(qs, Ns): count = empty_like_safe(finalq, shape=len(q)) count[1:] = np.diff(array_safe(q, like=q[0])) count[0] = q[0] count *= N counts.append(count) # Sort by calculated percentile values, then number of observations. combined_vals = np.concatenate(vals) combined_counts = array_safe(np.concatenate(counts), like=combined_vals) sort_order = np.argsort(combined_vals) combined_vals = np.take(combined_vals, sort_order) combined_counts = np.take(combined_counts, sort_order) # percentile-like, but scaled by total number of observations combined_q = np.cumsum(combined_counts) # rescale finalq percentiles to match combined_q finalq = array_safe(finalq, like=combined_vals) desired_q = finalq * sum(Ns) # the behavior of different interpolation methods should be # investigated further. 
if interpolation == "linear": rv = np.interp(desired_q, combined_q, combined_vals) else: left = np.searchsorted(combined_q, desired_q, side="left") right = np.searchsorted(combined_q, desired_q, side="right") - 1 np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index lower = np.minimum(left, right) upper = np.maximum(left, right) if interpolation == "lower": rv = combined_vals[lower] elif interpolation == "higher": rv = combined_vals[upper] elif interpolation == "midpoint": rv = 0.5 * (combined_vals[lower] + combined_vals[upper]) elif interpolation == "nearest": lower_residual = np.abs(combined_q[lower] - desired_q) upper_residual = np.abs(combined_q[upper] - desired_q) mask = lower_residual > upper_residual index = lower # alias; we no longer need lower index[mask] = upper[mask] rv = combined_vals[index] else: raise ValueError("interpolation can only be 'linear', 'lower', " "'higher', 'midpoint', or 'nearest'") return rv
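# A small worked sketch (arbitrary values) of the searchsorted-based index selection used above
# for the non-linear interpolation modes: 'lower' and 'higher' pick the bracketing values, and
# the in-place np.minimum keeps 'left' from running past the last index.
import numpy as np

combined_q = np.array([10., 30., 60., 100.])    # cumulative observation counts
combined_vals = np.array([1., 2., 3., 4.])
desired_q = np.array([25., 60., 120.])

left = np.searchsorted(combined_q, desired_q, side="left")
right = np.searchsorted(combined_q, desired_q, side="right") - 1
np.minimum(left, len(combined_vals) - 1, left)  # clamp in place; 120 would otherwise index past the end
lower, upper = np.minimum(left, right), np.maximum(left, right)
combined_vals[lower], combined_vals[upper]      # 'lower' -> [1., 3., 4.], 'higher' -> [2., 3., 4.]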
def fetch_openml(name=None, version='active', data_id=None, data_home=None, target_column='default-target', cache=True, return_X_y=False): """Fetch dataset from openml by name or dataset id. Datasets are uniquely identified by either an integer ID or by a combination of name and version (i.e. there might be multiple versions of the 'iris' dataset). Please give either name or data_id (not both). In case a name is given, a version can also be provided. Read more in the :ref:`User Guide <openml>`. .. note:: EXPERIMENTAL The API is experimental in version 0.20 (particularly the return value structure), and might have small backward-incompatible changes in future releases. Parameters ---------- name : str or None String identifier of the dataset. Note that OpenML can have multiple datasets with the same name. version : integer or 'active', default='active' Version of the dataset. Can only be provided if also ``name`` is given. If 'active' the oldest version that's still active is used. Since there may be more than one active version of a dataset, and those versions may fundamentally be different from one another, setting an exact version is highly recommended. data_id : int or None OpenML ID of the dataset. The most specific way of retrieving a dataset. If data_id is not given, name (and potential version) are used to obtain a dataset. data_home : string or None, default None Specify another download and cache folder for the data sets. By default all scikit-learn data is stored in '~/scikit_learn_data' subfolders. target_column : string, list or None, default 'default-target' Specify the column name in the data to use as target. If 'default-target', the standard target column a stored on the server is used. If ``None``, all columns are returned as data and the target is ``None``. If list (of strings), all columns with these names are returned as multi-target (Note: not all scikit-learn classifiers can handle all types of multi-output combinations) cache : boolean, default=True Whether to cache downloaded datasets using joblib. return_X_y : boolean, default=False. If True, returns ``(data, target)`` instead of a Bunch object. See below for more information about the `data` and `target` objects. Returns ------- data : Bunch Dictionary-like object, with attributes: data : np.array or scipy.sparse.csr_matrix of floats The feature matrix. Categorical features are encoded as ordinals. target : np.array The regression target or classification labels, if applicable. Dtype is float if numeric, and object if categorical. DESCR : str The full description of the dataset feature_names : list The names of the dataset columns categories : dict Maps each categorical feature name to a list of values, such that the value encoded as i is ith in the list. details : dict More metadata from OpenML (data, target) : tuple if ``return_X_y`` is True .. note:: EXPERIMENTAL This interface is **experimental** as at version 0.20 and subsequent releases may change attributes without notice (although there should only be minor changes to ``data`` and ``target``). Missing values in the 'data' are represented as NaN's. Missing values in 'target' are represented as NaN's (numerical target) or None (categorical target) """ data_home = get_data_home(data_home=data_home) data_home = join(data_home, 'openml') if cache is False: # no caching will be applied data_home = None # check valid function arguments. 
data_id XOR (name, version) should be # provided if name is not None: # OpenML is case-insensitive, but the caching mechanism is not # convert all data names (str) to lower case name = name.lower() if data_id is not None: raise ValueError( "Dataset data_id={} and name={} passed, but you can only " "specify a numeric data_id or a name, not " "both.".format(data_id, name)) data_info = _get_data_info_by_name(name, version, data_home) data_id = data_info['did'] elif data_id is not None: # from the previous if statement, it is given that name is None if version is not "active": raise ValueError( "Dataset data_id={} and version={} passed, but you can only " "specify a numeric data_id or a version, not " "both.".format(data_id, name)) else: raise ValueError( "Neither name nor data_id are provided. Please provide name or " "data_id.") data_description = _get_data_description_by_id(data_id, data_home) if data_description['status'] != "active": warn("Version {} of dataset {} is inactive, meaning that issues have " "been found in the dataset. Try using a newer version from " "this URL: {}".format( data_description['version'], data_description['name'], data_description['url'])) if 'error' in data_description: warn("OpenML registered a problem with the dataset. It might be " "unusable. Error: {}".format(data_description['error'])) if 'warning' in data_description: warn("OpenML raised a warning on the dataset. It might be " "unusable. Warning: {}".format(data_description['warning'])) # download data features, meta-info about column types features_list = _get_data_features(data_id, data_home) for feature in features_list: if 'true' in (feature['is_ignore'], feature['is_row_identifier']): continue if feature['data_type'] == 'string': raise ValueError('STRING attributes are not yet supported') if target_column == "default-target": # determines the default target based on the data feature results # (which is currently more reliable than the data description; # see issue: https://github.com/openml/OpenML/issues/768) target_column = [feature['name'] for feature in features_list if feature['is_target'] == 'true'] elif isinstance(target_column, string_types): # for code-simplicity, make target_column by default a list target_column = [target_column] elif target_column is None: target_column = [] elif not isinstance(target_column, list): raise TypeError("Did not recognize type of target_column" "Should be six.string_type, list or None. Got: " "{}".format(type(target_column))) data_columns = _valid_data_column_names(features_list, target_column) # prepare which columns and data types should be returned for the X and y features_dict = {feature['name']: feature for feature in features_list} # XXX: col_slice_y should be all nominal or all numeric _verify_target_data_type(features_dict, target_column) col_slice_y = [int(features_dict[col_name]['index']) for col_name in target_column] col_slice_x = [int(features_dict[col_name]['index']) for col_name in data_columns] for col_idx in col_slice_y: feat = features_list[col_idx] nr_missing = int(feat['number_of_missing_values']) if nr_missing > 0: raise ValueError('Target column {} has {} missing values. ' 'Missing values are not supported for target ' 'columns. 
'.format(feat['name'], nr_missing)) # determine arff encoding to return return_sparse = False if data_description['format'].lower() == 'sparse_arff': return_sparse = True # obtain the data arff = _download_data_arff(data_description['file_id'], return_sparse, data_home) arff_data = arff['data'] # nominal attributes is a dict mapping from the attribute name to the # possible values. Includes also the target column (which will be popped # off below, before it will be packed in the Bunch object) nominal_attributes = {k: v for k, v in arff['attributes'] if isinstance(v, list) and k in data_columns + target_column} X, y = _convert_arff_data(arff_data, col_slice_x, col_slice_y) is_classification = {col_name in nominal_attributes for col_name in target_column} if not is_classification: # No target pass elif all(is_classification): y = np.hstack([np.take(np.asarray(nominal_attributes.pop(col_name), dtype='O'), y[:, i:i+1].astype(int)) for i, col_name in enumerate(target_column)]) elif any(is_classification): raise ValueError('Mix of nominal and non-nominal targets is not ' 'currently supported') description = u"{}\n\nDownloaded from openml.org.".format( data_description.pop('description')) # reshape y back to 1-D array, if there is only 1 target column; back # to None if there are not target columns if y.shape[1] == 1: y = y.reshape((-1,)) elif y.shape[1] == 0: y = None if return_X_y: return X, y bunch = Bunch( data=X, target=y, feature_names=data_columns, DESCR=description, details=data_description, categories=nominal_attributes, url="https://www.openml.org/d/{}".format(data_id)) return bunch
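# A hedged usage sketch for the fetch_openml API documented above. The calls are left commented
# out because they require network access, and the dataset name, version and id below are only
# examples (on OpenML the 'iris' table is commonly reachable both by name and by a numeric id).
# bunch = fetch_openml(name='iris', version=1)
# X, y = bunch.data, bunch.target
# X, y = fetch_openml(data_id=61, return_X_y=True)   # address a dataset directly by its OpenML id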
def __call__(self, image, boxes, classes, crop_h_=None, crop_w_=None): if len(boxes) == 0: return image, boxes, classes h, w, _ = np.shape(image) gt_bbox = boxes # NOTE Original method attempts to generate one candidate for each # threshold then randomly sample one from the resulting list. # Here a short circuit approach is taken, i.e., randomly choose a # threshold and attempt to find a valid crop, and simply return the # first one found. # The probability is not exactly the same, kinda resembling the # "Monty Hall" problem. Actually carrying out the attempts will affect # observability (just like opening doors in the "Monty Hall" game). thresholds = list(self.thresholds) if self.allow_no_crop: thresholds.append('no_crop') np.random.shuffle(thresholds) for thresh in thresholds: if thresh == 'no_crop': return image, boxes, classes found = False for i in range(self.num_attempts): scale = np.random.uniform(*self.scaling) min_ar, max_ar = self.aspect_ratio aspect_ratio = np.random.uniform(max(min_ar, scale**2), min(max_ar, scale**-2)) if crop_h_ is not None: crop_h = min(crop_h_, h) else: crop_h = int(h * scale / np.sqrt(aspect_ratio)) if crop_w_ is not None: crop_w = min(crop_w_, w) else: crop_w = int(w * scale * np.sqrt(aspect_ratio)) if h > crop_h: crop_y = np.random.randint(0, h - crop_h) else: crop_y = 0 if w > crop_w: crop_x = np.random.randint(0, w - crop_w) else: crop_x = 0 crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] iou = self._iou_matrix(gt_bbox, np.array([crop_box], dtype=np.float32)) if iou.max() < thresh: continue if self.cover_all_box and iou.min() < thresh: continue cropped_box, valid_ids = self._crop_box_with_center_constraint( gt_bbox, np.array(crop_box, dtype=np.float32)) if valid_ids.size > 0: found = True break if found: image = self._crop_image(image, crop_box) boxes = np.take(cropped_box, valid_ids, axis=0) classes = np.take(classes, valid_ids, axis=0) #sample['w'] = crop_box[2] - crop_box[0] #sample['h'] = crop_box[3] - crop_box[1] return image, boxes, classes return image, boxes, classes
def generate_knntriplets(self, X, k_genuine, k_impostor): """ Generates triplets from labeled data. For every point (X_a) the triplets (X_a, X_b, X_c) are constructed from all the combinations of taking one of its `k_genuine`-nearest neighbors of the same class (X_b) and taking one of its `k_impostor`-nearest neighbors of other classes (X_c). In the case a class doesn't have enough points in the same class (other classes) to yield `k_genuine` (`k_impostor`) neighbors a warning will be raised and the maximum value of genuine (impostor) neighbors will be used for that class. Parameters ---------- X : (n x d) matrix Input data, where each row corresponds to a single instance. k_genuine : int Number of neighbors of the same class to be taken into account. k_impostor : int Number of neighbors of different classes to be taken into account. Returns ------- triplets : array-like, shape=(n_constraints, 3) 2D array of triplets of indicators. """ # Ignore unlabeled samples known_labels_mask = self.partial_labels >= 0 known_labels = self.partial_labels[known_labels_mask] X = X[known_labels_mask] labels, labels_count = np.unique(known_labels, return_counts=True) len_input = known_labels.shape[0] # Handle the case where there are too few elements to yield k_genuine or # k_impostor neighbors for every class. k_genuine_vec = np.full_like(labels, k_genuine) k_impostor_vec = np.full_like(labels, k_impostor) for i, count in enumerate(labels_count): if k_genuine + 1 > count: k_genuine_vec[i] = count - 1 warnings.warn( "The class {} has {} elements, which is not sufficient " "to generate {} genuine neighbors as specified by " "k_genuine. Will generate {} genuine neighbors instead." "\n".format(labels[i], count, k_genuine + 1, k_genuine_vec[i])) if k_impostor > len_input - count: k_impostor_vec[i] = len_input - count warnings.warn( "The class {} has {} elements of other classes, which is" " not sufficient to generate {} impostor neighbors as " "specified by k_impostor. 
Will generate {} impostor " "neighbors instead.\n".format(labels[i], k_impostor_vec[i], k_impostor, k_impostor_vec[i])) # The total number of possible triplets combinations per label comes from # taking one of the k_genuine_vec[i] genuine neighbors and one of the # k_impostor_vec[i] impostor neighbors for the labels_count[i] elements comb_per_label = labels_count * k_genuine_vec * k_impostor_vec # Get start and finish for later triplet assigning # append zero at the begining for start and get cumulative sum start_finish_indices = np.hstack((0, comb_per_label)).cumsum() # Total number of triplets is the sum of all possible combinations per # label num_triplets = start_finish_indices[-1] triplets = np.empty((num_triplets, 3), dtype=np.intp) neigh = NearestNeighbors() for i, label in enumerate(labels): # generate mask for current label gen_mask = known_labels == label gen_indx = np.where(gen_mask) # get k_genuine genuine neighbors neigh.fit(X=X[gen_indx]) # Take elements of gen_indx according to the yielded k-neighbors gen_relative_indx = neigh.kneighbors(n_neighbors=k_genuine_vec[i], return_distance=False) gen_neigh = np.take(gen_indx, gen_relative_indx) # generate mask for impostors of current label imp_indx = np.where(~gen_mask) # get k_impostor impostor neighbors neigh.fit(X=X[imp_indx]) # Take elements of imp_indx according to the yielded k-neighbors imp_relative_indx = neigh.kneighbors(n_neighbors=k_impostor_vec[i], X=X[gen_mask], return_distance=False) imp_neigh = np.take(imp_indx, imp_relative_indx) # length = len_label*k_genuine*k_impostor start, finish = start_finish_indices[i:i + 2] triplets[start:finish, :] = comb(gen_indx, gen_neigh, imp_neigh, k_genuine_vec[i], k_impostor_vec[i]) return triplets
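# The np.take(gen_indx, gen_relative_indx) step above maps neighbor indices that are relative to
# the per-class subset back to row indices of the full dataset. A minimal sketch with made-up
# indices (three points of the current class sitting at rows 2, 5 and 7 of the full data):
import numpy as np

gen_indx = np.array([2, 5, 7])                    # rows of the full data that belong to the class
gen_relative_indx = np.array([[1], [0], [0]])     # kneighbors() output: indices *into the subset*
gen_neigh = np.take(gen_indx, gen_relative_indx)  # -> [[5], [2], [2]], indices into the full data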
def generate(s): ms_set = np.take(self.elements, s, axis=0) return self.model_class(ms_set)
fig, ax = plt.subplots() ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.8) val_labels_list = [] val_predicted_prob_list = [] for i, (train_indices, val_indices) in enumerate(kfold_cv.split(train_val_text)): print("Fold {} of outer crossvalidation".format(i)) # Split the train and validation set train_text = np.take(train_val_text, train_indices) train_labels = np.take(train_val_labels, train_indices) val_text = np.take(train_val_text, val_indices) val_labels = np.take(train_val_labels, val_indices) # A distilled model of BERT is used with less parameters as we do not have # a lot of data. Preprocessing from text to numeric data is done in the # code below in a way designed for the BERT algorithm. print("Preprocessing data") tf.autograph.set_verbosity(0) bert_model = 'distilbert-base-uncased' t = ktrain_text.Transformer(bert_model, maxlen=500, class_names=[0, 1]) train_preprocessed = t.preprocess_train(train_text, train_labels) val_preprocessed = t.preprocess_test(val_text, val_labels)
def group_adjust(in_val, in_groups, in_weights): """ Calculate a group adjustment (demean). Parameters ---------- in_val : List of floats/ints The original values to adjust in_groups : List of Lists A list of groupings. Each grouping is a list with one group label per value. in_weights : List of floats A list of weights, one per grouping. Returns ------- A list-like demeaned version of the input values """ vals = np.asarray(in_val, dtype=float) groups = np.asarray(in_groups) weights = np.asarray(in_weights) # check that the number of groupings equals the number of weights if (len(groups) != len(weights)): raise ValueError("Exception Not Same Size of groups and weights") # check that each grouping has one entry per value in vals for i in range(len(groups)): if (len(groups[i]) != len(vals)): raise ValueError( "Exception Not Same # of elements in vals and groups") group_index = 0 # initialize with the initial values in vals demeaned = np.asarray(in_val, dtype=float) # iterate over groupings for each_group in groups: # get the count of each unique item in the grouping unique, counts = np.unique(each_group, return_counts=True) # dictionary: key = group item, value = frequency uni_dict = dict(zip(unique, counts)) for key in uni_dict: # get the list of positions for each key pos_list = np.where(each_group == key)[0] # extract the values at the matching positions value_from_pos = np.take(vals, pos_list) freq = uni_dict[key] # Check for None/np.NaN nan_pos = np.argwhere(np.isnan(value_from_pos)) if (len(nan_pos) > 0): freq = freq - len(nan_pos) # sum total = np.nansum(value_from_pos) # mean means = np.true_divide(float(total), float(freq)) weighted_means = np.multiply(float(means), float(weights[group_index])) demeaned[pos_list] -= float(weighted_means) group_index += 1 return demeaned
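# A hedged usage sketch of group_adjust as defined above, assuming it behaves as its docstring
# describes: a single grouping with weight 1.0 simply subtracts each value's group mean.
out = group_adjust([1, 2, 3, 8, 5], [[1, 1, 2, 2, 2]], [1.0])
# group 1 mean = 1.5, group 2 mean = 16/3, so out is roughly [-0.5, 0.5, -2.33, 2.67, -0.33]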
def histogram_enhancement(im, etype='linear2', target=None, maxCount=255, showHistogram=False, userInputs=False): import numpy import matplotlib.pyplot as plot # Extra arguments showHistogram == True: program will display histogram/CDF from original and modified image # userInputs == True: allows you to input user-specified values for certain modification types, such as the # cutoff range for the linear2 histogram modification, and whether to do rolled color channels versus individual # color channels for histogram matching. shape_im = im.shape shape_target = target.shape histogramFlag = 'rolled' if len( shape_im) == 2: # determines if the given image is a 2D greyscale array or 3D array. If 2D, converts to 3D greyscale array shape_im_3D = (shape_im[0], shape_im[1], 3) # for ease of calculation. im3D = numpy.zeros(shape_im_3D) for n in range(0, 3): im3D[:, :, n] = im n = n + 1 im = im3D else: n = 0 if len( shape_target) == 2: # determines if the given image is a 2D greyscale array or 3D array. If 2D, converts to 3D greyscale array shape_target_3D = (shape_target[0], shape_target[1], 3) # for ease of calculation. target3D = numpy.zeros(shape_target_3D) for n in range(0, 3): target3D[:, :, n] = target n = n + 1 target = target3D else: n = 0 im = im.astype(int) # compute original image histograms num_bins = maxCount + 1 counts, bin_edges = numpy.histogram(im, bins=num_bins, range=(0, maxCount), density=False) im_pdf = counts / im.size im_cdf = numpy.cumsum(counts) / im.size if etype == 'linear1': # compute rise/run to find slope, where rise = desired range (0,255) and run is current range (DCmin,DCmax). # then use slope to find y intercept and come up with a linear LUT rise = maxCount run = int(numpy.max(im)) - int(numpy.min(im)) slope = rise / run b = 0 - (slope * int(numpy.min(im))) LUT = numpy.linspace(0, maxCount, maxCount + 1) * ( slope) + b # building the base LUT in the form of LUT = slope(0:255)+b LUT = LUT.astype(int) n = 0 for n in range(0, maxCount + 1): # clipping function for the LUT if LUT[n] >= maxCount: LUT[n] = maxCount elif LUT[n] <= 0: LUT[n] = 0 n = n + 1 output = numpy.take(LUT, im) # numpy.take is a very fast LUT applicator. output = output.astype( numpy.uint8) # outputting to UINT8 for display. This would need to be changed if we were outputting to a different bit depth. elif etype == 'linear2': # We need to find the cutoff values. Best way to do this is to subtract the cutoff percentiles (I will do 5 and 95) # from the CDF LUT, take the absolute value, and then take the minimum. if userInputs == True: # userInputs flag allows one to specify the boundary if desired. trimamount = float(input("Specify the percentage for boundary cutoff (for example, 5% --> 0.05): ")) else: trimamount = 0.02 # The amount being trimmed off either histogram. 
input_lo = trimamount input_hi = 1 - trimamount CDF_locut = abs(im_cdf - input_lo) CDF_hicut = abs(im_cdf - input_hi) CDF_locut = numpy.ndarray.tolist(CDF_locut) CDF_hicut = numpy.ndarray.tolist(CDF_hicut) pos_locut = CDF_locut.index(min(CDF_locut)) pos_hicut = CDF_hicut.index(min(CDF_hicut)) rise = maxCount run = pos_hicut - pos_locut # Run is the index of the lowcut and highcut CVs # from here on out, same as linear1 slope = rise / run b = 0 - (slope * int(numpy.min(im))) LUT = numpy.linspace(0, maxCount, maxCount + 1) * (slope) + b LUT = LUT.astype(int) n = 0 for n in range(0, maxCount + 1): if LUT[n] >= maxCount: LUT[n] = maxCount elif LUT[n] <= 0: LUT[n] = 0 n = n + 1 output = numpy.take(LUT, im) output = output.astype(numpy.uint8) elif etype == 'equalize': # Start by dividing out maximum bit depth to scale between 0 and 1 LUT = (numpy.linspace(0, 1, maxCount + 1) * im_cdf) * maxCount # scale by the CDF, then return to 0-255 scale LUT = LUT.astype(numpy.uint8) output = numpy.take(LUT, im) output = output.astype(numpy.uint8) elif etype == 'match': # create finding function for array indexing import numpy as np def find_nearest(array, value): # I found this function on stackoverflow, linking here for transparency: array = np.asarray(array) # https://stackoverflow.com/questions/2566412/find-nearest-value-in-numpy-array idx = (np.abs( array - value)).argmin() # The function takes an array, and searches for the closest index to the value given. return array[idx] if userInputs == True: # This specifies whether you can do independent channel matching or "rolled together" matching with one histogram. # Defaults to rolled together. histogramFlag = input( "Specify 'rolled' for rolled histogram matching, or 'independent' for channel-independent histogram matching:") if histogramFlag == 'rolled': # Check flag first if len(target.shape) >= 1: # This conditional is passing through an image to match histograms if the number of dimensions is greater than 1. # Match probability from source CDF to target CDF, take the index value there. This becomes the lookup CV at the source CV at matching probability. source_cdf = im_cdf # Create the original source PDF and CDF from image num_bins = maxCount + 1 target_counts, bin_edges = numpy.histogram(target, bins=num_bins, range=(0, maxCount), density=False) bin_edges = numpy.linspace(0, maxCount, maxCount + 1) target_pdf = target_counts / target.size target_cdf = numpy.cumsum(target_pdf) LUT = numpy.zeros(maxCount + 1) nearest_target = numpy.zeros(maxCount + 1) n = 0 # Use the probability desired to match to in the find_nearest function, then store this in # the "nearest_targets" array to find indices. Indices form the LUT. for n in range(0, maxCount + 1): matching_probability = source_cdf[n] nearest_target[n] = find_nearest(target_cdf, matching_probability) n = n + 1 target_cdf = numpy.ndarray.tolist(target_cdf) for n in range(0, maxCount + 1): LUT[n] = target_cdf.index(nearest_target[n]) n = n + 1 else: source_cdf = im_cdf # Same procedure as before, but program detects that the matching image is a pre-built target_pdf = target # LUT by determining the size of the array beforehand. 
target_cdf = numpy.cumsum(target_pdf) LUT = numpy.zeros(maxCount + 1) nearest_target = numpy.zeros(maxCount + 1) n = 0 for n in range(0, maxCount + 1): matching_probability = source_cdf[n] nearest_target[n] = find_nearest(target_cdf, matching_probability) n = n + 1 target_cdf = numpy.ndarray.tolist(target_cdf) # Converting to list such that I can index from the list for n in range(0, maxCount + 1): LUT[n] = target_cdf.index(nearest_target[n]) n = n + 1 elif histogramFlag == 'independent': # Independent channel matching is an experiment more for myself than anything. if len(target.shape) >= 1: # It has the same procedure as rolled matching, but simply computes for three num_bins = maxCount + 1 # independent color bands, so additional loops were nested. n = 0 counts = numpy.zeros((3, maxCount + 1)) target_counts = numpy.zeros((3, maxCount + 1)) im_pdf = numpy.zeros((3, maxCount + 1)) target_pdf = numpy.zeros((3, maxCount + 1)) source_cdf = numpy.zeros((3, maxCount + 1)) target_cdf = numpy.zeros((3, maxCount + 1)) LUT = numpy.zeros((3, maxCount + 1)) nearest_target = numpy.zeros((3, maxCount + 1)) for n in range(0, 3): counts[n], bin_edges = numpy.histogram(im[:, :, n], bins=num_bins, range=(0, maxCount), density=False) im_pdf[n] = counts[n] / (im.size / 3) source_cdf[n] = numpy.cumsum(counts[n]) / (im.size / 3) n = n + 1 for n in range(0, 3): target_counts[n], bin_edges = numpy.histogram(target[:, :, n], bins=num_bins, range=(0, maxCount), density=False) target_pdf[n] = target_counts[n] / (target.size / 3) target_cdf[n] = numpy.cumsum(target_pdf[n]) n = n + 1 for n in range(0, 3): cdf_list = target_cdf for m in range(0, maxCount + 1): matching_probability = source_cdf[n, m] nearest_target[n, m] = find_nearest(target_cdf[n, :], matching_probability) m = m + 1 cdf_list = numpy.ndarray.tolist(cdf_list[n, :]) for m in range(0, maxCount + 1): LUT[n, m] = cdf_list.index(nearest_target[n, m]) m = m + 1 n = n + 1 else: print("Error: Must pass in image for per-channel matching to work") exit() LUT = LUT.astype(numpy.uint8) output = numpy.take(LUT, im) output = output.astype(numpy.uint8) if showHistogram == True: # this is an optional conditional that will permit you to view the histograms before and if histogramFlag == 'independent': # after enhancement if desired. Defaults to off. print("Still working on independent channel histograms!") return output # create output histograms for reference num_bins = maxCount + 1 output_counts, bin_edges = numpy.histogram(output, bins=num_bins, range=(0, maxCount), density=False) bin_edges = numpy.linspace(0, maxCount, maxCount + 1) output_pdf = output_counts / output.size output_cdf = numpy.cumsum(output_counts) / output.size fig, axs = plot.subplots(4, 1, sharex=True) plot.suptitle('Distributions Before and After Enhancement', horizontalalignment='center', verticalalignment='top') fig.subplots_adjust(hspace=0.25) axs[0].plot(bin_edges[0:], im_pdf, '-b', label='Plot of Original PDF') axs[0].set_xlim(0, 255) axs[1].plot(bin_edges[0:], im_cdf, '-r', label='Plot of Original CDF') axs[1].set_xlim(0, 255) axs[1].set_ylim(0, 1) axs[2].plot(bin_edges[0:], output_pdf, '-y', label='PDF enhanced via: {etype}') axs[2].set_xlim(0, 255) axs[3].plot(bin_edges[0:], output_cdf, '-g', label='CDF enhanced via: {etype}') axs[3].set_xlim(0, 255) axs[3].set_ylim(0, 1) axs[3].set_xlabel('Digital Count') fig.legend(loc='center right', fontsize='x-small') plot.show() return output
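# A standalone sketch of the LUT idiom used throughout the function above: build a 256-entry
# lookup table once, then remap every pixel with numpy.take. The image and the simple linear
# stretch below are synthetic and stand in for the enhancement types implemented above.
import numpy as np

im = np.random.randint(40, 200, size=(4, 4), dtype=np.uint8)    # fake 8-bit image
lo, hi = int(im.min()), int(im.max())
LUT = np.clip((np.arange(256) - lo) * 255.0 / max(hi - lo, 1), 0, 255).astype(np.uint8)
out = np.take(LUT, im)    # same shape as im, pixel values stretched to the full 0..255 range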
def check_fun_data(self, testfunc, targfunc, testarval, targarval, targarnanval, check_dtype=True, empty_targfunc=None, **kwargs): for axis in list(range(targarval.ndim)) + [None]: for skipna in [False, True]: targartempval = targarval if skipna else targarnanval if skipna and empty_targfunc and isna(targartempval).all(): targ = empty_targfunc(targartempval, axis=axis, **kwargs) else: targ = targfunc(targartempval, axis=axis, **kwargs) try: res = testfunc(testarval, axis=axis, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna: res = testfunc(testarval, axis=axis, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if axis is None: res = testfunc(testarval, skipna=skipna, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) if skipna and axis is None: res = testfunc(testarval, **kwargs) self.check_results(targ, res, axis, check_dtype=check_dtype) except BaseException as exc: exc.args += ('axis: %s of %s' % (axis, testarval.ndim - 1), 'skipna: %s' % skipna, 'kwargs: %s' % kwargs) raise if testarval.ndim <= 1: return try: testarval2 = np.take(testarval, 0, axis=-1) targarval2 = np.take(targarval, 0, axis=-1) targarnanval2 = np.take(targarnanval, 0, axis=-1) except ValueError: return self.check_fun_data(testfunc, targfunc, testarval2, targarval2, targarnanval2, check_dtype=check_dtype, empty_targfunc=empty_targfunc, **kwargs)
def SortEigenDecomposition(e, v): if v.ndim < 2: return e, v else: perm = np.argsort(e, -1) return np.take(e, perm, -1), np.take(v, perm, -1)
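# The argsort/np.take pairing above, shown for a single symmetric matrix: sort the eigenvalues
# and reorder the eigenvector columns consistently. The matrix is arbitrary.
import numpy as np

A = np.array([[2., 0., 0.],
              [0., 5., 0.],
              [0., 0., 3.]])
e, v = np.linalg.eig(A)            # eigenvalues are returned in no particular order
perm = np.argsort(e, -1)
e_sorted = np.take(e, perm, -1)    # ascending eigenvalues
v_sorted = np.take(v, perm, -1)    # columns permuted to stay aligned with their eigenvalues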
def samp_entropy(a, m, r, tau=1, relative_r=True): """ Compute the sample entropy [RIC00]_ of a signal with embedding dimension `m` and delay `tau` [PYEEG]_. Vectorised version of the eponymous PyEEG function. In addition, this function can also be used to vary tau and therefore compute Multi-Scale Entropy (MSE) [COS05]_ by coarse graining the time series (see example below). By default, r is expressed relative to the standard deviation of the signal. :param a: a one dimensional floating-point array representing a time series. :type a: :class:`~numpy.ndarray` or :class:`~pyrem.time_series.Signal` :param m: the embedding dimension :type m: int :param r: The tolerance :type r: float :param tau: The scale for coarse graining. :type tau: int :param relative_r: whether the argument r is relative to the standard deviation. If false, an absolute value should be given for r. :type relative_r: bool :return: the sample entropy, a scalar :rtype: float Example: """ if len(a) <= 2: return np.nan coarse_a = _coarse_grainning(a, tau) if relative_r: coarse_a /= np.std(coarse_a) embsp = _embed_seq(coarse_a, 1, m + 1) embsp_last = embsp[:, -1] embs_mini = embsp[:, :-1] # Buffers are preallocated chunks of memory storing temporary results. # see the `out` argument in numpy *ufunc* documentation dist_buffer = np.zeros(embsp.shape[0] - 1, dtype=np.float32) subtract_buffer = np.zeros((dist_buffer.size, m), dtype=np.float32) in_range_buffer = np.zeros_like(dist_buffer, dtype=bool) sum_cm, sum_cmp = 0.0, 0.0 # we iterate through all templates (rows), except the last one. for i, template in enumerate(embs_mini[:-1]): # these are just views into the buffer arrays, used to store intermediary results dist_b_view = dist_buffer[i:] sub_b_view = subtract_buffer[i:] range_b_view = in_range_buffer[i:] embsp_view = embsp_last[i+1:] # subtract the template from each subsequent row of the embedded matrix np.subtract(embs_mini[i+1:], template, out=sub_b_view) # Absolute distance np.abs(sub_b_view, out=sub_b_view) # Maximal absolute difference between a row and the template is the distance np.max(sub_b_view, axis=1, out=dist_b_view) # we compare this distance to a tolerance r np.less_equal(dist_b_view, r, out=range_b_view) # score one for this template for each match in_range_sum = np.sum(range_b_view) sum_cm += in_range_sum # reuse the front of the buffers for the last column dist_b_view = dist_buffer[:in_range_sum] where = np.flatnonzero(range_b_view) dist_b_view = np.take(embsp_view, where, out=dist_b_view) range_b_view = in_range_buffer[:in_range_sum] # score one to TODO for each match of the last element dist_b_view -= embsp_last[i] np.abs(dist_b_view, out=dist_b_view) np.less_equal(dist_b_view, r, out=range_b_view) sum_cmp += np.sum(range_b_view) if sum_cm == 0 or sum_cmp == 0: return np.nan return np.log(sum_cm/sum_cmp)
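# A small sketch of the buffer-reuse idiom inside samp_entropy above: np.take accepts an `out`
# array, so the selected values can be written into a slice of a preallocated buffer instead of
# allocating a new array on every iteration. Values below are arbitrary.
import numpy as np

src = np.array([5., 6., 7., 8.], dtype=np.float32)
buf = np.zeros(4, dtype=np.float32)           # preallocated once, reused across iterations
idx = np.flatnonzero(src > 6)                 # [2, 3]
view = np.take(src, idx, out=buf[:idx.size])  # fills the front of the buffer and returns that view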
def leastsq(func, x0, args=(), Dfun=None, full_output=0, col_deriv=0,
            ftol=1.49012e-8, xtol=1.49012e-8, gtol=0.0, maxfev=0,
            epsfcn=None, factor=100, diag=None):
    """
    Minimize the sum of squares of a set of equations.

    ::

        x = arg min(sum(func(y)**2,axis=0))
                 y

    Parameters
    ----------
    func : callable
        Should take at least one (possibly length N vector) argument and
        return M floating point numbers. It must not return NaNs or
        fitting might fail.
    x0 : ndarray
        The starting estimate for the minimization.
    args : tuple, optional
        Any extra arguments to func are placed in this tuple.
    Dfun : callable, optional
        A function or method to compute the Jacobian of func with
        derivatives across the rows. If this is None, the Jacobian will be
        estimated.
    full_output : bool, optional
        non-zero to return all optional outputs.
    col_deriv : bool, optional
        non-zero to specify that the Jacobian function computes derivatives
        down the columns (faster, because there is no transpose operation).
    ftol : float, optional
        Relative error desired in the sum of squares.
    xtol : float, optional
        Relative error desired in the approximate solution.
    gtol : float, optional
        Orthogonality desired between the function vector and the columns
        of the Jacobian.
    maxfev : int, optional
        The maximum number of calls to the function. If `Dfun` is provided
        then the default `maxfev` is 100*(N+1) where N is the number of
        elements in x0, otherwise the default `maxfev` is 200*(N+1).
    epsfcn : float, optional
        A variable used in determining a suitable step length for the
        forward-difference approximation of the Jacobian (for Dfun=None).
        Normally the actual step length will be sqrt(epsfcn)*x.
        If epsfcn is less than the machine precision, it is assumed that
        the relative errors are of the order of the machine precision.
    factor : float, optional
        A parameter determining the initial step bound
        (``factor * || diag * x||``). Should be in the interval ``(0.1, 100)``.
    diag : sequence, optional
        N positive entries that serve as scale factors for the variables.

    Returns
    -------
    x : ndarray
        The solution (or the result of the last iteration for an
        unsuccessful call).
    cov_x : ndarray
        Uses the fjac and ipvt optional outputs to construct an estimate of
        the Jacobian around the solution. None if a singular matrix is
        encountered (indicates very flat curvature in some direction). This
        matrix must be multiplied by the residual variance to get the
        covariance of the parameter estimates -- see curve_fit.
    infodict : dict
        a dictionary of optional outputs with the keys:

        ``nfev``
            The number of function calls
        ``fvec``
            The function evaluated at the output
        ``fjac``
            A permutation of the R matrix of a QR factorization of the
            final approximate Jacobian matrix, stored column wise.
            Together with ipvt, the covariance of the estimate can be
            approximated.
        ``ipvt``
            An integer array of length N which defines a permutation
            matrix, p, such that fjac*p = q*r, where r is upper triangular
            with diagonal elements of nonincreasing magnitude. Column j of
            p is column ipvt(j) of the identity matrix.
        ``qtf``
            The vector (transpose(q) * fvec).
    mesg : str
        A string message giving information about the cause of failure.
    ier : int
        An integer flag. If it is equal to 1, 2, 3 or 4, the solution was
        found. Otherwise, the solution was not found. In either case, the
        optional output variable 'mesg' gives more information.

    Notes
    -----
    "leastsq" is a wrapper around MINPACK's lmdif and lmder algorithms.

    cov_x is a Jacobian approximation to the Hessian of the least squares
    objective function.
    This approximation assumes that the objective function is based on the
    difference between some observed target data (ydata) and a (non-linear)
    function of the parameters `f(xdata, params)` ::

        func(params) = ydata - f(xdata, params)

    so that the objective function is ::

          min   sum((ydata - f(xdata, params))**2, axis=0)
        params

    """
    x0 = asarray(x0).flatten()
    n = len(x0)
    if not isinstance(args, tuple):
        args = (args, )
    shape, dtype = minpack._check_func('leastsq', 'func', func, x0, args, n)
    m = shape[0]
    # if n > m:
    #     raise TypeError('Improper input: N=%s must not exceed M=%s' % (n, m))
    if epsfcn is None:
        epsfcn = finfo(dtype).eps
    if Dfun is None:
        if maxfev == 0:
            maxfev = 200 * (n + 1)
        retval = minpack._minpack._lmdif(func, x0, args, full_output, ftol,
                                         xtol, gtol, maxfev, epsfcn, factor,
                                         diag)
    else:
        if col_deriv:
            minpack._check_func('leastsq', 'Dfun', Dfun, x0, args, n, (n, m))
        else:
            minpack._check_func('leastsq', 'Dfun', Dfun, x0, args, n, (m, n))
        if maxfev == 0:
            maxfev = 100 * (n + 1)
        retval = minpack._minpack._lmder(func, Dfun, x0, args, full_output,
                                         col_deriv, ftol, xtol, gtol, maxfev,
                                         factor, diag)

    errors = {
        0: ["Improper input parameters.", TypeError],
        1: ["Both actual and predicted relative reductions "
            "in the sum of squares\n  are at most %f" % ftol, None],
        2: ["The relative error between two consecutive "
            "iterates is at most %f" % xtol, None],
        3: ["Both actual and predicted relative reductions in "
            "the sum of squares\n  are at most %f and the "
            "relative error between two consecutive "
            "iterates is at \n  most %f" % (ftol, xtol), None],
        4: ["The cosine of the angle between func(x) and any "
            "column of the\n  Jacobian is at most %f in "
            "absolute value" % gtol, None],
        5: ["Number of calls to function has reached "
            "maxfev = %d." % maxfev, ValueError],
        6: ["ftol=%f is too small, no further reduction "
            "in the sum of squares\n  is possible." % ftol, ValueError],
        7: ["xtol=%f is too small, no further improvement in "
            "the approximate\n  solution is possible." % xtol, ValueError],
        8: ["gtol=%f is too small, func(x) is orthogonal to the "
            "columns of\n  the Jacobian to machine "
            "precision." % gtol, ValueError],
        'unknown': ["Unknown error.", TypeError]
    }

    info = retval[-1]    # The FORTRAN return value

    if info not in [1, 2, 3, 4] and not full_output:
        if info in [5, 6, 7, 8]:
            minpack.warnings.warn(errors[info][0], RuntimeWarning)
        else:
            try:
                raise errors[info][1](errors[info][0])
            except KeyError:
                raise errors['unknown'][1](errors['unknown'][0])

    mesg = errors[info][0]
    if full_output:
        cov_x = None
        if info in [1, 2, 3, 4]:
            # numpy.dual is deprecated/removed in recent NumPy; use numpy.linalg directly.
            from numpy.linalg import inv, LinAlgError
            perm = take(eye(n), retval[1]['ipvt'] - 1, 0)
            r = triu(transpose(retval[1]['fjac'])[:n, :])
            R = dot(r, perm)
            try:
                cov_x = inv(dot(transpose(R), R))
            except (LinAlgError, ValueError):
                pass
        return (retval[0], cov_x) + retval[1:-1] + (mesg, info)
    else:
        return (retval[0], info)
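# Usage sketch: fit y = a*x + b by passing a residual function to the wrapper
# above. The calling convention matches scipy.optimize.leastsq, which this
# function wraps via MINPACK's lmdif (the surrounding module is assumed to
# import the numpy names and scipy.optimize.minpack as `minpack`).
import numpy as np

def residuals(params, x, y):
    a, b = params
    return y - (a * x + b)

rng = np.random.RandomState(0)
x = np.linspace(0.0, 1.0, 50)
y = 3.0 * x + 1.0 + 0.01 * rng.normal(size=x.size)

popt, cov_x, infodict, mesg, ier = leastsq(residuals, x0=[1.0, 0.0],
                                           args=(x, y), full_output=1)
assert ier in (1, 2, 3, 4)        # the convergence flags documented above
print(popt)                       # approximately [3.0, 1.0]
print(infodict['nfev'])           # number of residual evaluations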
def fdrcorrection_twostage(pvals, alpha=0.05, method='bky', iter=False,
                           is_sorted=False):
    '''(iterated) two stage linear step-up procedure with estimation of number
    of true hypotheses

    Benjamini, Krieger and Yekutieli, procedure in Definition 6

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'bky', 'bh'}
        see Notes for details

        'bky' : implements the procedure in Definition 6 of Benjamini, Krieger
            and Yekutieli 2006
        'bh' : implements the two stage method of Benjamini and Hochberg

    iter : bool
        If True, iterate the second stage until the number of rejections no
        longer changes. If False (default), use the simple two-stage procedure.

    Returns
    -------
    rejected : array, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : array
        pvalues adjusted for multiple hypotheses testing to limit FDR
    m0 : int
        ntest - rej, estimated number of true hypotheses
    alpha_stages : list of floats
        A list of alphas that have been used at each stage

    Notes
    -----
    The returned corrected p-values are specific to the given alpha, they
    cannot be used for a different alpha.

    The returned corrected p-values are from the last stage of the fdr_bh
    linear step-up procedure (fdrcorrection0 with method='indep') corrected
    for the estimated fraction of true hypotheses.
    This means that the rejection decision can be obtained with
    ``pval_corrected <= alpha``, where ``alpha`` is the original significance
    level.
    (Note: This has changed from earlier versions (<0.5.0) of statsmodels.)

    BKY described several other multi-stage methods, which would be easy to
    implement. However, in their simulation the simple two-stage method (with
    iter=False) was the most robust to the presence of positive correlation.

    TODO: What should be returned?

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals = np.take(pvals, pvals_sortind)

    ntests = len(pvals)
    if method == 'bky':
        fact = (1. + alpha)
        alpha_prime = alpha / fact
    elif method == 'bh':
        fact = 1.
        alpha_prime = alpha
    else:
        raise ValueError("only 'bky' and 'bh' are available as method")

    alpha_stages = [alpha_prime]
    rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_prime, method='indep',
                                   is_sorted=True)
    r1 = rej.sum()
    if (r1 == 0) or (r1 == ntests):
        return rej, pvalscorr * fact, ntests - r1, alpha_stages
    ri_old = r1

    while True:
        ntests0 = 1.0 * ntests - ri_old
        alpha_star = alpha_prime * ntests / ntests0
        alpha_stages.append(alpha_star)
        rej, pvalscorr = fdrcorrection(pvals, alpha=alpha_star, method='indep',
                                       is_sorted=True)
        ri = rej.sum()
        if (not iter) or ri == ri_old:
            break
        elif ri < ri_old:
            # prevent cycles and endless loops
            raise RuntimeError("the number of rejections should not decrease "
                               "between stages")
        ri_old = ri

    # make adjustment to pvalscorr to reflect estimated number of Non-Null cases
    # decision is then pvalscorr < alpha (or <=)
    pvalscorr *= ntests0 * 1.0 / ntests
    if method == 'bky':
        pvalscorr *= (1. + alpha)

    if not is_sorted:
        pvalscorr_ = np.empty_like(pvalscorr)
        pvalscorr_[pvals_sortind] = pvalscorr
        del pvalscorr
        reject = np.empty_like(rej)
        reject[pvals_sortind] = rej
        return reject, pvalscorr_, ntests - ri, alpha_stages
    else:
        return rej, pvalscorr, ntests - ri, alpha_stages
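# Usage sketch: two-stage BKY correction on a small set of p-values (assumes
# fdrcorrection, defined later in this module, and its _ecdf helper are in scope).
import numpy as np

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.060, 0.074, 0.205])
reject, pvals_adj, m0, alpha_stages = fdrcorrection_twostage(pvals, alpha=0.05,
                                                             method='bky')
print(reject)          # boolean rejection decisions at FDR level 0.05
print(m0)              # estimated number of true hypotheses
print(alpha_stages)    # the alpha used at each stage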
def extract_particles(self, segmentation):
    """
    Saves particle centers into an output .star file, after dismissing regions
    that are too big to contain a particle.

    Args:
        segmentation: Segmentation of the micrograph into noise and particle projections.
    """
    segmentation = segmentation[self.query_size // 2 - 1:-self.query_size // 2,
                                self.query_size // 2 - 1:-self.query_size // 2]
    labeled_segments, _ = ndimage.label(segmentation, np.ones((3, 3)))
    values, repeats = np.unique(labeled_segments, return_counts=True)

    values_to_remove = np.where(repeats > self.max_size ** 2)
    values = np.take(values, values_to_remove)
    values = np.reshape(values, (1, 1, np.prod(values.shape)), 'F')

    labeled_segments = np.reshape(labeled_segments, (labeled_segments.shape[0],
                                                     labeled_segments.shape[1], 1), 'F')
    matrix1 = np.repeat(labeled_segments, values.shape[2], 2)
    matrix2 = np.repeat(values, matrix1.shape[0], 0)
    matrix2 = np.repeat(matrix2, matrix1.shape[1], 1)
    matrix3 = np.equal(matrix1, matrix2)
    matrix4 = np.sum(matrix3, 2)
    segmentation[np.where(matrix4 == 1)] = 0
    labeled_segments, _ = ndimage.label(segmentation, np.ones((3, 3)))

    max_val = np.amax(np.reshape(labeled_segments, (np.prod(labeled_segments.shape))))
    center = center_of_mass(segmentation, labeled_segments, np.arange(1, max_val))
    center = np.rint(center)

    img = np.zeros((segmentation.shape[0], segmentation.shape[1]))
    img[center[:, 0].astype(int), center[:, 1].astype(int)] = 1
    y, x = np.ogrid[-self.moa:self.moa + 1, -self.moa:self.moa + 1]
    element = x * x + y * y <= self.moa * self.moa
    img = binary_dilation(img, structure=element)
    labeled_img, _ = ndimage.label(img, np.ones((3, 3)))
    values, repeats = np.unique(labeled_img, return_counts=True)
    y = np.where(repeats == np.count_nonzero(element))
    y = np.array(y)
    y = y.astype(int)
    y = np.reshape(y, (np.prod(y.shape)), 'F')
    y -= 1
    center = center[y, :]

    center = center + (self.query_size // 2 - 1) * np.ones(center.shape)
    center = center + (self.query_size // 2 - 1) * np.ones(center.shape)
    center = center + np.ones(center.shape)
    center = 2 * center
    center = center + 99 * np.ones(center.shape)

    # swap columns to align with Relion
    col_2 = center[:, 1].copy()
    center[:, 1] = center[:, 0]
    center[:, 0] = col_2[:]

    basename = os.path.basename(self.filenames)
    name_str, ext = os.path.splitext(basename)

    applepick_path = os.path.join(self.output_directory,
                                  "{}_applepick.star".format(name_str))
    with open(applepick_path, "w") as f:
        np.savetxt(f, ["data_root\n\nloop_\n_rlnCoordinateX #1\n_rlnCoordinateY #2"],
                   fmt='%s')
        np.savetxt(f, center, fmt='%d %d')

    return center
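# Sketch of the size-filtering step above in isolation: zero out connected
# components whose pixel count exceeds a maximum area. On a toy segmentation,
# np.isin achieves the same filtering as the repeat/equal/sum construction.
import numpy as np
from scipy import ndimage

segmentation = np.zeros((8, 8), dtype=float)
segmentation[1:3, 1:3] = 1          # small blob (kept)
segmentation[4:8, 4:8] = 1          # large blob (removed when max_area < 16)
max_area = 9

labeled, _ = ndimage.label(segmentation, np.ones((3, 3)))
values, repeats = np.unique(labeled, return_counts=True)
too_big = values[(values != 0) & (repeats > max_area)]
segmentation[np.isin(labeled, too_big)] = 0
print(segmentation.sum())           # only the small blob's 4 pixels remain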
def fdrcorrection(pvals, alpha=0.05, method='indep', is_sorted=False):
    '''pvalue correction for false discovery rate

    This covers Benjamini/Hochberg for independent or positively correlated and
    Benjamini/Yekutieli for general or negatively correlated tests. Both are
    available in the function multipletests, as method=`fdr_bh`, resp.
    `fdr_by`.

    Parameters
    ----------
    pvals : array_like
        set of p-values of the individual tests.
    alpha : float
        error rate
    method : {'indep', 'negcorr'}

    Returns
    -------
    rejected : array, bool
        True if a hypothesis is rejected, False if not
    pvalue-corrected : array
        pvalues adjusted for multiple hypothesis testing to limit FDR

    Notes
    -----
    If there is prior information on the fraction of true hypotheses, then
    alpha should be set to alpha * m/m_0 where m is the number of tests, given
    by the p-values, and m_0 is an estimate of the number of true hypotheses.
    (see Benjamini, Krieger and Yekutieli)

    The two-step method of Benjamini, Krieger and Yekutieli that estimates the
    number of false hypotheses is available as ``fdrcorrection_twostage``.

    Method names can be abbreviated to first letter, 'i' or 'p' for fdr_bh and
    'n' for fdr_by.

    '''
    pvals = np.asarray(pvals)

    if not is_sorted:
        pvals_sortind = np.argsort(pvals)
        pvals_sorted = np.take(pvals, pvals_sortind)
    else:
        pvals_sorted = pvals  # alias

    if method in ['i', 'indep', 'p', 'poscorr']:
        ecdffactor = _ecdf(pvals_sorted)
    elif method in ['n', 'negcorr']:
        cm = np.sum(1. / np.arange(1, len(pvals_sorted) + 1))  # corrected this
        ecdffactor = _ecdf(pvals_sorted) / cm
    else:
        raise ValueError('only indep and negcorr implemented')

    reject = pvals_sorted <= ecdffactor * alpha
    if reject.any():
        rejectmax = max(np.nonzero(reject)[0])
        reject[:rejectmax] = True
    pvals_corrected_raw = pvals_sorted / ecdffactor
    pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1]
    del pvals_corrected_raw
    pvals_corrected[pvals_corrected > 1] = 1
    if not is_sorted:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[pvals_sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[pvals_sortind] = reject
        return reject_, pvals_corrected_
    else:
        return reject, pvals_corrected
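# Usage sketch: Benjamini/Hochberg adjustment of a handful of p-values.
# (Assumes the module helper _ecdf, i.e. np.arange(1, n + 1) / n as in
# statsmodels, is in scope; the expected outputs below rely on that.)
import numpy as np

pvals = np.array([0.01, 0.02, 0.03, 0.50])
reject, pvals_adj = fdrcorrection(pvals, alpha=0.05, method='indep')
print(reject)       # [ True  True  True False]
print(pvals_adj)    # [0.04 0.04 0.04 0.5 ] -- monotone BH-adjusted p-values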
def multipletests(pvals, alpha=0.05, method='hs', is_sorted=False, returnsorted=False): '''test results and p-value correction for multiple tests Parameters ---------- pvals : array_like uncorrected p-values alpha : float FWER, family-wise error rate, e.g. 0.1 method : string Method used for testing and adjustment of pvalues. Can be either the full name or initial letters. Available methods are :: `bonferroni` : one-step correction `sidak` : one-step correction `holm-sidak` : step down method using Sidak adjustments `holm` : step-down method using Bonferroni adjustments `simes-hochberg` : step-up method (independent) `hommel` : closed method based on Simes tests (non-negative) `fdr_bh` : Benjamini/Hochberg (non-negative) `fdr_by` : Benjamini/Yekutieli (negative) `fdr_tsbh` : two stage fdr correction (non-negative) `fdr_tsbky` : two stage fdr correction (non-negative) is_sorted : bool If False (default), the p_values will be sorted, but the corrected pvalues are in the original order. If True, then it assumed that the pvalues are already sorted in ascending order. returnsorted : bool not tested, return sorted p-values instead of original sequence Returns ------- reject : array, boolean true for hypothesis that can be rejected for given alpha pvals_corrected : array p-values corrected for multiple tests alphacSidak: float corrected alpha for Sidak method alphacBonf: float corrected alpha for Bonferroni method Notes ----- There may be API changes for this function in the future. Except for 'fdr_twostage', the p-value correction is independent of the alpha specified as argument. In these cases the corrected p-values can also be compared with a different alpha. In the case of 'fdr_twostage', the corrected p-values are specific to the given alpha, see ``fdrcorrection_twostage``. The 'fdr_gbs' procedure is not verified against another package, p-values are derived from scratch and are not derived in the reference. In Monte Carlo experiments the method worked correctly and maintained the false discovery rate. All procedures that are included, control FWER or FDR in the independent case, and most are robust in the positively correlated case. `fdr_gbs`: high power, fdr control for independent case and only small violation in positively correlated case **Timing**: Most of the time with large arrays is spent in `argsort`. When we want to calculate the p-value for several methods, then it is more efficient to presort the pvalues, and put the results back into the original order outside of the function. Method='hommel' is very slow for large arrays, since it requires the evaluation of n partitions, where n is the number of p-values. ''' import gc pvals = np.asarray(pvals) alphaf = alpha # Notation ? if not is_sorted: sortind = np.argsort(pvals) pvals = np.take(pvals, sortind) ntests = len(pvals) alphacSidak = 1 - np.power((1. - alphaf), 1. / ntests) alphacBonf = alphaf / float(ntests) if method.lower() in ['b', 'bonf', 'bonferroni']: reject = pvals <= alphacBonf pvals_corrected = pvals * float(ntests) elif method.lower() in ['s', 'sidak']: reject = pvals <= alphacSidak pvals_corrected = 1 - np.power((1. - pvals), ntests) elif method.lower() in ['hs', 'holm-sidak']: alphacSidak_all = 1 - np.power( (1. - alphaf), 1. 
                                           / np.arange(ntests, 0, -1))
        notreject = pvals > alphacSidak_all
        del alphacSidak_all

        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        del notreject

        pvals_corrected_raw = 1 - np.power((1. - pvals),
                                           np.arange(ntests, 0, -1))
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw

    elif method.lower() in ['h', 'holm']:
        notreject = pvals > alphaf / np.arange(ntests, 0, -1)
        nr_index = np.nonzero(notreject)[0]
        if nr_index.size == 0:
            # nonreject is empty, all rejected
            notrejectmin = len(pvals)
        else:
            notrejectmin = np.min(nr_index)
        notreject[notrejectmin:] = True
        reject = ~notreject
        pvals_corrected_raw = pvals * np.arange(ntests, 0, -1)
        pvals_corrected = np.maximum.accumulate(pvals_corrected_raw)
        del pvals_corrected_raw
        gc.collect()

    elif method.lower() in ['sh', 'simes-hochberg']:
        alphash = alphaf / np.arange(ntests, 0, -1)
        reject = pvals <= alphash
        rejind = np.nonzero(reject)
        if rejind[0].size > 0:
            rejectmax = np.max(np.nonzero(reject))
            reject[:rejectmax] = True
        pvals_corrected_raw = np.arange(ntests, 0, -1) * pvals
        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw

    elif method.lower() in ['ho', 'hommel']:
        # we need a copy because we overwrite it in a loop
        a = pvals.copy()
        for m in range(ntests, 1, -1):
            cim = np.min(m * pvals[-m:] / np.arange(1, m + 1.))
            a[-m:] = np.maximum(a[-m:], cim)
            a[:-m] = np.maximum(a[:-m], np.minimum(m * pvals[:-m], cim))
        pvals_corrected = a
        reject = a <= alphaf

    elif method.lower() in ['fdr_bh', 'fdr_i', 'fdr_p', 'fdri', 'fdrp']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='indep',
                                                is_sorted=True)
    elif method.lower() in ['fdr_by', 'fdr_n', 'fdr_c', 'fdrn', 'fdrcorr']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection(pvals, alpha=alpha,
                                                method='n',
                                                is_sorted=True)
    elif method.lower() in ['fdr_tsbky', 'fdr_2sbky', 'fdr_twostage']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bky',
                                                         is_sorted=True)[:2]
    elif method.lower() in ['fdr_tsbh', 'fdr_2sbh']:
        # delegate, call with sorted pvals
        reject, pvals_corrected = fdrcorrection_twostage(pvals, alpha=alpha,
                                                         method='bh',
                                                         is_sorted=True)[:2]

    elif method.lower() in ['fdr_gbs']:
        # adaptive step-down in Gavrilov, Benjamini, Sarkar, Annals of Statistics 2009
        ii = np.arange(1, ntests + 1)
        q = (ntests + 1. - ii) / ii * pvals / (1. - pvals)
        pvals_corrected_raw = np.maximum.accumulate(q)  # step-up requirement
        pvals_corrected = np.minimum.accumulate(
            pvals_corrected_raw[::-1])[::-1]
        del pvals_corrected_raw
        reject = pvals_corrected <= alpha

    else:
        raise ValueError('method not recognized')

    if pvals_corrected is not None:  # not necessary anymore
        pvals_corrected[pvals_corrected > 1] = 1
    if is_sorted or returnsorted:
        return reject, pvals_corrected, alphacSidak, alphacBonf
    else:
        pvals_corrected_ = np.empty_like(pvals_corrected)
        pvals_corrected_[sortind] = pvals_corrected
        del pvals_corrected
        reject_ = np.empty_like(reject)
        reject_[sortind] = reject
        return reject_, pvals_corrected_, alphacSidak, alphacBonf
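# Usage sketch: compare a few of the correction methods listed above on the
# same p-values (the delegating methods such as 'fdr_bh' require fdrcorrection
# from this module to be in scope).
import numpy as np

pvals = np.array([0.001, 0.008, 0.039, 0.041, 0.042, 0.060, 0.074, 0.205])
for method in ('bonferroni', 'holm', 'fdr_bh'):
    reject, pvals_adj, alphac_sidak, alphac_bonf = multipletests(
        pvals, alpha=0.05, method=method)
    print('%-10s %d rejections' % (method, reject.sum()))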