def has_non_float(expr, variables):
    '''
    Whether the given expression has an integer or boolean variable in it.

    Parameters
    ----------
    expr : str
        The expression to check
    variables : dict-like
        `Variable` and `Function` object for all the identifiers used in `expr`

    Returns
    -------
    has_non_float : bool
        Whether `expr` has an integer or boolean in it
    '''
    identifiers = get_identifiers_recursively([expr], variables,
                                              include_numbers=True)

    # Identifiers that are not known variables may be numeric literals; one
    # that parses as an int makes the expression non-float.
    for name in identifiers:
        if name in variables:
            continue
        try:
            int(name)
        except (TypeError, ValueError):
            continue  # not an integer literal
        return True

    # The type test must look at the object stored under `name`: the
    # identifier itself is a string and is never a `Variable`.
    for name in identifiers:
        if name not in variables:
            continue
        var = variables[name]
        if isinstance(var, Variable) and (
                np.issubdtype(var.dtype, np.integer) or
                np.issubdtype(var.dtype, np.bool_)):
            return True
    return False
def autotyped(cls, data, units=None):
    """
    Build a Component or a CategoricalComponent, depending on the dtype
    of the input data.

    :param data: The data to pack into a Component
    :type data: Array-like

    :param units: Optional units
    :type units: str

    :returns: A Component (or subclass)
    """
    data = np.asarray(data)

    # Object arrays are always categorical.
    if np.issubdtype(data.dtype, np.object_):
        return CategoricalComponent(data, units=units)

    numeric = coerce_numeric(data)
    finite_fraction_limit = 0.5
    try:
        # Text data is categorical unless more than half of it converts to
        # finite numbers.
        is_text = np.issubdtype(data.dtype, np.character)
        use_categorical = (is_text and
                           np.isfinite(numeric).mean() <= finite_fraction_limit)
    except TypeError:
        # isfinite not supported: non-numeric dtype
        use_categorical = True

    if use_categorical:
        return CategoricalComponent(data, units=units)
    return Component(numeric, units=units)
def _smoketest(self, spxlu, check, dtype):
    """Factorize ``self.A`` with `spxlu` and verify solves for several
    right-hand-side shapes.

    `spxlu` is called with the matrix cast to `dtype` and must return an
    object with a ``solve(b[, trans])`` method.  For every right-hand side
    `check` is called as ``check(matrix, b, x, msg)`` for the plain,
    transposed ('T') and conjugate-transposed ('H') systems.
    """
    is_complex = np.issubdtype(dtype, np.complexfloating)

    A = self.A + 1j*self.A.T if is_complex else self.A
    A = A.astype(dtype)
    lu = spxlu(A)

    rng = random.RandomState(1234)

    # (extra positional args for lu.solve, matrix of the solved system)
    systems = [((), A), (('T',), A.T), (('H',), A.T.conj())]

    # Input shapes: a vector, then matrices with k columns
    for k in [None, 1, 2, self.n, self.n+2]:
        msg = "k=%r" % (k,)

        rhs_shape = (self.n,) if k is None else (self.n, k)
        b = rng.rand(*rhs_shape)
        if is_complex:
            b = b + 1j*rng.rand(*b.shape)
        b = b.astype(dtype)

        for solve_args, matrix in systems:
            x = lu.solve(b, *solve_args)
            check(matrix, b, x, msg)
def entrymean(self, m, axis=None):
    """Compute the mean of the stored entries of a sparse matrix together
    with per-column and per-row offsets from that mean.

    Parameters
    ----------
    m : sparse matrix
        Must provide ``astype``, ``sum(axis)`` and ``getnnz(axis=...)``
        (assumed to be a scipy.sparse matrix -- TODO confirm with callers).
    axis : ignored
        Accepted but not used by this function.

    Returns
    -------
    mu : scalar
        Sum of all entries divided by the number of stored entries.
    b_i : ndarray, 1-D
        For each column: (column sum / stored entries in column) - mu.
    b_u : ndarray, 1-D
        For each row: (row sum / stored entries in row) - mu.
        Columns/rows without stored entries get an offset of 0.
    """
    # Mimic numpy's casting for means: float64, every integer type and bool
    # are averaged in float64; complex128 stays complex128; anything else
    # (e.g. float32, complex64) keeps its own dtype.
    dtype = m.dtype
    if (np.issubdtype(dtype, np.float64) or
            np.issubdtype(dtype, np.integer) or
            np.issubdtype(dtype, np.bool_)):
        res_dtype = np.float64
    elif np.issubdtype(dtype, np.complex128):
        res_dtype = np.complex128
    else:
        res_dtype = dtype

    m = m.astype(res_dtype)
    mu = m.sum(None) / m.getnnz()

    # if user or item has no ratings (stripped from training data), set to 0
    b_i = m.sum(0)
    b_u = m.sum(1)
    with np.errstate(invalid='ignore'):
        # 0/0 for empty columns/rows yields NaN, which is zeroed below
        b_i = (b_i / m.getnnz(axis=0)) - mu
        b_u = (b_u.T / m.getnnz(axis=1)) - mu

    b_i[np.isnan(b_i)] = 0
    b_u[np.isnan(b_u)] = 0

    return mu, np.array(b_i)[0], np.array(b_u)[0]
def assert_image_equal(actual, expected):
    """Assert that two images match.

    Integer `actual` images must equal `expected` exactly.  Otherwise the
    comparison is approximate (absolute tolerance 1/256); an integer
    `expected` image is first divided by the maximum of its integer type.
    """
    if np.issubdtype(actual.dtype, np.integer):
        assert_equal(actual, expected)
        return

    if np.issubdtype(expected.dtype, np.integer):
        full_scale = float(np.iinfo(expected.dtype).max)
        expected = expected / full_scale
    assert_allclose(actual, expected, atol=1/256.)
def sum(self, axis=None):
    """Sum the matrix over the given axis.  If the axis is None, sum
    over both rows and columns, returning a scalar.

    Raises ValueError if `axis` is not one of None, -2, -1, 0, 1.
    """
    # We use multiplication by an array of ones to achieve this.
    # For some sparse matrix formats more efficient methods are
    # possible -- these should override this function.
    m, n = self.shape

    # Mimic numpy's casting.  The concrete scalar types are used because the
    # aliases np.float_ / np.complex_ no longer exist in NumPy 2.
    if np.issubdtype(self.dtype, np.float64):
        res_dtype = np.float64
    elif (np.issubdtype(self.dtype, np.int_) or
          np.issubdtype(self.dtype, np.bool_)):
        res_dtype = np.int_
    elif np.issubdtype(self.dtype, np.complex128):
        res_dtype = np.complex128
    else:
        res_dtype = self.dtype

    if axis is None:
        # sum over rows and columns
        return (self * np.asmatrix(np.ones((n, 1), dtype=res_dtype))).sum()

    if axis < 0:
        axis += 2
    if axis == 0:
        # sum over columns
        return np.asmatrix(np.ones((1, m), dtype=res_dtype)) * self
    elif axis == 1:
        # sum over rows
        return self * np.asmatrix(np.ones((n, 1), dtype=res_dtype))
    else:
        raise ValueError("axis out of bounds")
def mean(self, axis=None):
    """Average the matrix over the given axis.  If the axis is None,
    average over both rows and columns, returning a scalar.

    Raises ValueError if `axis` is not one of None, -2, -1, 0, 1.
    """
    # Mimic numpy's casting: float64, all integer types and bool are
    # averaged in float64, complex128 in complex128, anything else in its
    # own dtype.  (np.float_ / np.complex_ no longer exist in NumPy 2.)
    if (np.issubdtype(self.dtype, np.float64) or
            np.issubdtype(self.dtype, np.integer) or
            np.issubdtype(self.dtype, np.bool_)):
        res_dtype = np.float64
    elif np.issubdtype(self.dtype, np.complex128):
        res_dtype = np.complex128
    else:
        res_dtype = self.dtype

    if axis is None:
        return self.sum(None) * 1.0 / (self.shape[0]*self.shape[1])

    if axis < 0:
        axis += 2
    if axis == 0:
        result = self.astype(res_dtype).sum(0)
        result *= 1.0 / self.shape[0]
        return result
    elif axis == 1:
        result = self.astype(res_dtype).sum(1)
        result *= 1.0 / self.shape[1]
        return result
    else:
        raise ValueError("axis out of bounds")
def RATWriteArray(rat, array, field, start=0):
    """
    Pure Python implementation of writing a chunk of the RAT
    from a numpy array. Type of array is coerced to one of the types
    (int, double, string) supported.
    Called from RasterAttributeTable.WriteArray

    Raises ValueError if `array` is None, is not one-dimensional, does not
    fit into the table from `start`, or has an unsupported dtype.
    Returns whatever RATValuesIONumPyWrite returns.
    """
    if array is None:
        raise ValueError("Expected array of dim 1")

    # if not the array type convert it to handle lists etc
    if not isinstance(array, numpy.ndarray):
        array = numpy.array(array)

    if array.ndim != 1:
        raise ValueError("Expected array of dim 1")

    if (start + array.size) > rat.GetRowCount():
        raise ValueError("Array too big to fit into RAT from start position")

    if numpy.issubdtype(array.dtype, numpy.integer):
        # is some type of integer - coerce to standard int
        # TODO: must check this is fine on all platforms
        # confusingly numpy.int 64 bit even if native type 32 bit
        array = array.astype(numpy.int32)
    elif numpy.issubdtype(array.dtype, numpy.floating):
        # is some type of floating point - coerce to double
        array = array.astype(numpy.double)
    elif numpy.issubdtype(array.dtype, numpy.character):
        # cast away any kind of Unicode etc.  `bytes` is used as the target
        # because converting the abstract numpy.character type to a dtype
        # is deprecated.
        array = array.astype(bytes)
    else:
        raise ValueError("Array not of a supported type (integer, double or string)")

    return RATValuesIONumPyWrite(rat, field, start, array)
def _set_dtype(self, dtype, union=False):
    """Set ``self.dtype`` to complex128 or float64.

    The result is complex128 if either `dtype` or the current
    ``self.dtype`` is a complex floating type, and float64 otherwise.

    `union` is accepted for interface compatibility; it does not influence
    the result (in the non-complex case the current dtype can never be
    complex, so float64 is always selected).
    """
    # np.complex128 / np.float64 are the concrete types behind the former
    # aliases np.complex_ / np.float_, which no longer exist in NumPy 2.
    if (np.issubdtype(dtype, np.complexfloating) or
            np.issubdtype(self.dtype, np.complexfloating)):
        self.dtype = np.complex128
    else:
        self.dtype = np.float64
def fftconvolve(in1, in2, in3=None, mode="full"):
    """Apply an FFT-based inverse filter to two N-dimensional arrays.

    See convolve.

    copied from scipy, but here used to try out inverse filter
    doesn't work or I can't get it to work

    The spectrum of `in1` is divided elementwise by the spectrum of `in2`
    (both zero-padded to a power-of-two shape).  `in3` is unused.
    `mode` is "full", "same" or "valid"; any other value returns None.
    The result is real unless one of the inputs is complex.
    """
    s1 = np.array(in1.shape)
    s2 = np.array(in2.shape)
    complex_result = (np.issubdtype(in1.dtype, np.complexfloating) or
                      np.issubdtype(in2.dtype, np.complexfloating))
    size = s1 + s2 - 1

    # Always use 2**n-sized FFT; the shape must consist of integers
    fsize = tuple(int(2 ** np.ceil(np.log2(n))) for n in size)
    IN1 = fftn(in1, fsize)
    # use inverse filter
    # note the inverse is elementwise not matrix inverse
    # is this correct, NO doesn't seem to work
    IN1 /= fftn(in2, fsize)
    fslice = tuple(slice(0, int(sz)) for sz in size)
    ret = ifftn(IN1)[fslice].copy()
    del IN1
    if not complex_result:
        ret = ret.real

    if mode == "full":
        return ret
    elif mode == "same":
        # centre the output on the larger of the two inputs
        if np.prod(s1, axis=0) > np.prod(s2, axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret, osize)
    elif mode == "valid":
        return _centered(ret, abs(s2 - s1) + 1)
def transform_python_types(self, obj):
    """Handle special scalars, default to default json encoder.

    Numbers and booleans (including NumPy scalars) are converted to the
    plain Python types; date/time objects are converted to milliseconds;
    `relativedelta` becomes a dict; `Decimal` becomes a float.  Everything
    else is passed to the parent encoder's ``default``.
    """
    # Pandas Timestamp
    if is_pandas and isinstance(obj, pd.Timestamp):
        return obj.value / 10**6.0  # nanosecond to millisecond
    # The abstract NumPy types match every float / integer width; the old
    # aliases np.float / np.int no longer exist.
    elif np.issubdtype(type(obj), np.floating):
        return float(obj)
    elif np.issubdtype(type(obj), np.integer):
        return int(obj)
    elif np.issubdtype(type(obj), np.bool_):
        return bool(obj)
    # Datetime
    # datetime is a subclass of date.
    elif isinstance(obj, dt.datetime):
        return calendar.timegm(obj.timetuple()) * 1000. + obj.microsecond / 1000.
    # Date
    elif isinstance(obj, dt.date):
        return calendar.timegm(obj.timetuple()) * 1000.
    # Numpy datetime64
    elif isinstance(obj, np.datetime64):
        epoch_delta = obj - np.datetime64('1970-01-01T00:00:00Z')
        return (epoch_delta / np.timedelta64(1, 'ms'))
    # Time
    elif isinstance(obj, dt.time):
        return (obj.hour * 3600 + obj.minute * 60 + obj.second) * 1000 + obj.microsecond / 1000.
    elif is_dateutil and isinstance(obj, relativedelta):
        return dict(years=obj.years, months=obj.months, days=obj.days, hours=obj.hours,
                    minutes=obj.minutes, seconds=obj.seconds, microseconds=obj.microseconds)
    # Decimal
    elif isinstance(obj, decimal.Decimal):
        return float(obj)
    else:
        return super(BokehJSONEncoder, self).default(obj)
def from_numpy_dtype(self, dt):
    """
    From Numpy dtype.

    >>> from datashape import CType
    >>> from numpy import dtype
    >>> CType.from_numpy_dtype(dtype('int32'))
    ctype("int32")
    >>> CType.from_numpy_dtype(dtype('i8'))
    ctype("int64")
    >>> CType.from_numpy_dtype(dtype('M8'))
    DateTime(None)
    >>> CType.from_numpy_dtype(dtype('U30'))
    ctype("string[30, 'U32']")

    Raises NotImplementedError for dtypes that are neither registered by
    name nor datetime, timedelta or string types.
    """
    try:
        return Type.lookup_type(dt.name)
    except KeyError:
        pass
    if np.issubdtype(dt, np.datetime64):
        unit, _ = np.datetime_data(dt)
        # calendar units map to a date, finer units to a datetime
        defaults = {'D': date_, 'Y': date_, 'M': date_, 'W': date_}
        return defaults.get(unit, datetime_)
    elif np.issubdtype(dt, np.timedelta64):
        unit, _ = np.datetime_data(dt)
        return TimeDelta(unit=unit)

    # Distinguish the string flavours by dtype kind instead of by scalar
    # aliases (np.unicode_ no longer exists in NumPy 2).
    kind = np.dtype(dt).kind
    if kind == 'U':
        # unicode strings use 4 bytes per character
        return String(dt.itemsize // 4, 'U32')
    elif kind == 'S':
        return String(dt.itemsize, 'ascii')
    raise NotImplementedError("NumPy datatype %s not supported" % dt)
def to_json(o, level=0):
    ''' format JSON with no line break between list items

    Supports dicts, strings, lists, bools, ints, floats (7 significant
    digits) and integer / floating point numpy arrays (flattened).
    `level` is the current nesting depth, used to indent dict entries.
    Raises TypeError for any other type.
    '''
    import json

    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    INDENT = 3
    SPACE = " "
    NEWLINE = "\n"

    def quote(text):
        # Escape quotes, backslashes and control characters so that the
        # output stays valid JSON.
        return json.dumps(text, ensure_ascii=False)

    ret = ""
    if isinstance(o, dict):
        ret += "{" + NEWLINE
        comma = ""
        for k, v in o.items():
            ret += comma
            comma = ",\n"
            ret += SPACE * INDENT * (level+1)
            ret += quote(str(k)) + ':' + SPACE
            ret += Utils.to_json(v, level + 1)
        ret += NEWLINE + SPACE * INDENT * level + "}"
    elif isinstance(o, string_types):
        ret += quote(o)
    elif isinstance(o, list):
        ret += "[" + ",".join([Utils.to_json(e, level+1) for e in o]) + "]"
    elif isinstance(o, bool):
        # must be tested before int: bool is a subclass of int
        ret += "true" if o else "false"
    elif isinstance(o, (int, numpy.integer)):
        ret += str(o)
    elif isinstance(o, (float, numpy.floating)):
        ret += '%.7g' % o
    elif isinstance(o, numpy.ndarray) and numpy.issubdtype(o.dtype, numpy.integer):
        ret += "[" + ','.join(map(str, o.flatten().tolist())) + "]"
    elif isinstance(o, numpy.ndarray) and numpy.issubdtype(o.dtype, numpy.inexact):
        ret += "[" + ','.join(map(lambda x: '%.7g' % x, o.flatten().tolist())) + "]"
    else:
        raise TypeError("Unknown type '%s' for json serialization" % str(type(o)))
    return ret
def into(a, b, **kwargs):
    """Read the CSV described by `b` with ``pandas.read_csv``.

    `b` must provide ``dialect``, ``schema``, ``open``, ``path``,
    ``header`` and ``columns`` (presumably a CSV data descriptor -- confirm
    against callers).  `a` is not used in the body.  Extra keyword
    arguments are forwarded to ``read_csv`` when it accepts them.
    """
    dialect = b.dialect.copy()
    del dialect['lineterminator']

    schema = b.schema
    if '?' in str(schema):
        # strip the '?' markers from the schema before deriving dtypes
        schema = dshape(str(schema).replace('?', ''))

    all_dtypes = valmap(to_numpy_dtype, schema[0].dict)

    # Datetime columns are handed to read_csv through `parse_dates`; every
    # other column gets an explicit dtype.
    datenames = []
    dtypes = {}
    for name, dtype in all_dtypes.items():
        if np.issubdtype(dtype, np.datetime64):
            datenames.append(name)
        else:
            dtypes[name] = dtype

    dialect.pop('strict', None)

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.merge(dialect, kwargs)
    options = toolz.keyfilter(lambda k: k in kws, options)

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    return pd.read_csv(b.path,
                       skiprows=1 if b.header else 0,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=b.columns,
                       **options)
def find_linear_scale(data):
    """Look for a numeric ("linear") scale in the columns of `data`.

    For MultiIndex columns the last integer level is used, or the first
    floating point level if there is one (floats are preferred).  For
    plain columns every label must be convertible to float.

    Returns
    -------
    scale : list
        The scale value of each column (may be partial or empty when no
        linear scale was found).
    linear_scale : bool
        Whether a linear scale was found.
    scale_name : str or list
        Name of the MultiIndex level used; an empty list otherwise.
    """
    scale = []
    scale_name = []
    linear_scale = False

    if isinstance(data.columns, pd.MultiIndex):
        for n, level in enumerate(data.columns.levels):
            dtype = level.dtype
            # Skip object (e.g. str) levels and pandas extension dtypes,
            # which np.issubdtype cannot interpret.
            if not isinstance(dtype, np.dtype) or dtype == np.dtype('O'):
                continue

            is_float = np.issubdtype(dtype, np.floating)
            if is_float or np.issubdtype(dtype, np.integer):
                linear_scale = True
                scale = [v[n] for v in data.columns.values]
                scale_name = data.columns.names[n]

                if is_float:
                    # Prefer float scales, assume more accurate
                    break
    else:
        linear_scale = True
        for x in data.columns.values:
            try:
                scale.append(float(x))
            except (TypeError, ValueError, OverflowError):
                linear_scale = False
                break

    return scale, linear_scale, scale_name
def __init__(self, shape, size, dtype=np.uint8, saturation=None,
             hard_radius=None, signal=None, noise=0,
             feat_func=feat_gauss, **feat_kwargs):
    """Set up an empty (all-zero) image of the given shape and dtype.

    `size` is validated against the number of dimensions.  When
    `saturation` is None it defaults to the maximum of an integer `dtype`
    or to 1 for a floating point `dtype`; for any other dtype it stays
    None.  When `signal` is None it defaults to the saturation.
    """
    self.ndim = len(shape)
    self.shape = shape
    self.dtype = dtype
    self.image = Frame(np.zeros(shape, dtype=dtype))
    self.size = validate_tuple(size, self.ndim)
    # isotropic: the size is the same along every dimension
    self.isotropic = np.all([self.size[1:] == self.size[:-1]])
    self.feat_func = feat_func
    self.feat_kwargs = feat_kwargs
    self.noise = noise

    if saturation is None and np.issubdtype(dtype, np.integer):
        self.saturation = np.iinfo(dtype).max
    elif saturation is None and np.issubdtype(dtype, np.floating):
        # np.floating matches every float width (np.float no longer exists)
        self.saturation = 1
    else:
        self.saturation = saturation

    if signal is None:
        self.signal = self.saturation
    else:
        self.signal = signal

    self.center = tuple([s // 2 for s in shape])
    self.hard_radius = hard_radius
    self._coords = []
    self.pos_columns = ['z', 'y', 'x'][-self.ndim:]
    if self.isotropic:
        self.size_columns = ['size']
    else:
        self.size_columns = ['size_z', 'size_y', 'size_x'][-self.ndim:]
def _diff(self):
    """Compare the arrays ``self.a`` and ``self.b`` and record the result.

    Sets ``self.diff_dimensions`` (and nothing else) when the shapes
    differ.  Otherwise sets ``self.diff_total`` and, if there are
    differences, ``self.diff_pixels`` (at most ``self.numdiffs`` entries;
    all of them if ``numdiffs`` is negative) and ``self.diff_ratio``.
    """
    if self.a.shape != self.b.shape:
        self.diff_dimensions = (self.a.shape, self.b.shape)
        # Don't do any further comparison if the dimensions differ
        # TODO: Perhaps we could, however, diff just the intersection
        # between the two images
        return

    # Find the indices where the values are not equal
    # If neither a nor b are floating point, ignore self.tolerance.
    # np.inexact covers every float and complex width; testing against the
    # Python types `float`/`complex` only matches float64/complex128.
    if (np.issubdtype(self.a.dtype, np.inexact) or
            np.issubdtype(self.b.dtype, np.inexact)):
        tolerance = self.tolerance
    else:
        tolerance = 0

    diffs = where_not_allclose(self.a, self.b, atol=0.0, rtol=tolerance)

    self.diff_total = len(diffs[0])

    if self.diff_total == 0:
        # Then we're done
        return

    if self.numdiffs < 0:
        numdiffs = self.diff_total
    else:
        numdiffs = self.numdiffs

    self.diff_pixels = [(idx, (self.a[idx], self.b[idx]))
                        for idx in islice(zip(*diffs), 0, numdiffs)]
    self.diff_ratio = float(self.diff_total) / float(len(self.a.flat))
def Execute(self):
    """Convert ``self.ArrayDict`` into a vtkImageData stored in ``self.Image``.

    Dimensions, origin and spacing are taken from the dictionary.  Each
    entry of ``ArrayDict['PointData']`` is flattened in Fortran order and
    set as point scalars; floating point arrays become VTK_FLOAT, integer
    arrays VTK_INT, and arrays of any other dtype are skipped.
    """
    self.PrintLog('Converting Numpy Array to vtkImageData')

    self.Image = vtk.vtkImageData()
    dimensions = self.ArrayDict['Dimensions']
    self.Image.SetDimensions(dimensions)
    self.Image.SetOrigin(self.ArrayDict['Origin'])
    self.Image.SetSpacing(self.ArrayDict['Spacing'])
    self.Image.SetExtent((0, dimensions[0] - 1,
                          0, dimensions[1] - 1,
                          0, dimensions[2] - 1,))

    self.PrintLog('converting point data')

    for pointKey in self.ArrayDict['PointData'].keys():
        pointArray = self.ArrayDict['PointData'][pointKey]

        if np.issubdtype(pointArray.dtype, np.floating):
            pointDataArrayType = vtk.VTK_FLOAT
        elif np.issubdtype(pointArray.dtype, np.integer):
            # np.integer matches every signed and unsigned width; testing
            # against the Python `int` only matches the default int type.
            pointDataArrayType = vtk.VTK_INT
        else:
            continue

        flatArray = pointArray.ravel(order='F')

        pointDataArray = dsa.numpyTovtkDataArray(flatArray, name=pointKey,
                                                 array_type=pointDataArrayType)

        self.Image.GetPointData().SetActiveScalars(pointKey)
        self.Image.GetPointData().SetScalars(pointDataArray)
def fftconvolve(in1, in2, mode="full"):
    """Convolve two N-dimensional arrays using FFT. See convolve.

    `mode` is "full", "same" or "valid"; any other value returns None.
    The result is real unless one of the inputs is complex.
    """
    s1 = np.array(in1.shape)
    s2 = np.array(in2.shape)
    # np.complexfloating matches every complex width (np.complex no longer
    # exists in current NumPy).
    complex_result = (np.issubdtype(in1.dtype, np.complexfloating) or
                      np.issubdtype(in2.dtype, np.complexfloating))
    size = s1 + s2 - 1
    fshape = tuple(int(n) for n in size)

    IN1 = fftn(in1, fshape)
    IN1 *= fftn(in2, fshape)
    ret = ifftn(IN1)
    del IN1
    if not complex_result:
        ret = ret.real

    if mode == "full":
        return ret
    elif mode == "same":
        # centre the output on the larger of the two inputs
        if np.prod(s1, axis=0) > np.prod(s2, axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret, osize)
    elif mode == "valid":
        return _centered(ret, abs(s2 - s1) + 1)
def isEqual(left, right, eps=None, masked_equal=True):
    ''' This function checks if two numpy arrays or scalars are equal within machine precision, and returns a scalar logical.

    Arrays: `right` is cast to the dtype of `left`; inexact dtypes are
    compared with ``ma.allclose`` (`eps` is used as absolute tolerance if
    given), integer and boolean dtypes exactly.  Arrays of any other dtype
    are not compared and None is returned.
    Floating point scalars are compared with absolute tolerance `eps`
    (default ``100*floateps``), at the precision of the smaller type.
    Integer and boolean scalars are compared exactly.
    Raises TypeError if the arguments are not of the same class or if
    `left` is of an unsupported type.
    '''
    diff_type = "Both arguments to function 'isEqual' must be of the same class!"
    if isinstance(left, np.ndarray):
        # ndarray
        if not isinstance(right, np.ndarray):
            raise TypeError(diff_type)
        if not left.dtype == right.dtype:
            right = right.astype(left.dtype)  # casting='same_kind' doesn't work...
        if np.issubdtype(left.dtype, np.inexact):  # also catch float32 etc
            if eps is None:
                return ma.allclose(left, right, masked_equal=masked_equal)
            else:
                return ma.allclose(left, right, masked_equal=masked_equal, atol=eps)
        elif np.issubdtype(left.dtype, np.integer) or np.issubdtype(left.dtype, np.bool_):
            return np.all(left == right)  # need to use numpy's all()
    elif isinstance(left, (float, np.inexact)):
        # numbers
        if not isinstance(right, (float, np.inexact)):
            raise TypeError(diff_type)
        if eps is None:
            eps = 100.*floateps  # default
        # Python floats have no dtype, so either operand being one must
        # short-circuit the itemsize comparison.
        if (isinstance(left, float) or isinstance(right, float)
                or left.dtype.itemsize == right.dtype.itemsize):
            return np.absolute(left-right) <= eps
        else:
            # compare at the precision of the smaller type
            if left.dtype.itemsize < right.dtype.itemsize:
                right = left.dtype.type(right)
            else:
                left = right.dtype.type(left)
            return np.absolute(left-right) <= eps
    elif isinstance(left, (int, bool, np.integer, np.bool_)):
        # logicals
        if not isinstance(right, (int, bool, np.integer, np.bool_)):
            raise TypeError(diff_type)
        return left == right
    else:
        raise TypeError(left)
def test_random_like(self):
    """
    Check that random_like returns arrays of the requested shape and
    dtype, and mostly complex values for complex dtypes
    """

    def verify(result, shape, dtype):
        # Shape and type must be as requested
        self.assertTrue(result.shape == shape)
        self.assertTrue(result.dtype == dtype)

        # Complex dtypes must actually produce complex data
        if np.issubdtype(dtype, np.complexfloating):
            proportion_cplx = np.sum(np.iscomplex(result)) / result.size
            self.assertTrue(proportion_cplx > 0.9)

    # Try for floats and complex data
    for dtype in [np.float32, np.float64, np.complex64, np.complex128]:
        # Random array with the shape and type of an existing array
        shape = (np.random.randint(1, 50), np.random.randint(1, 50))
        ary = np.empty(shape=shape, dtype=dtype)
        verify(mbu.random_like(ary), ary.shape, dtype)

        # Random array from an explicit shape and type
        shape = (np.random.randint(1, 50), np.random.randint(1, 50))
        verify(mbu.random_like(shape=shape, dtype=dtype), shape, dtype)
def half_fft_convolve(in1, in2, size, mode='full', return_type='real'):
    """
    Rewrite of fftconvolve from scipy.signal ((c) Travis Oliphant 1999-2002)
    to deal with fft convolution where one signal is not fft transformed
    and the other one is.  Application is, for example, in a loop where
    convolution happens repeatedly with different kernels over the same
    signal.  First input is not transformed, second input is.

    `size` is the shape of the full convolution result (one entry per
    dimension); `in2` must already have the power-of-two FFT shape derived
    from it.  `mode` is 'full', 'same' or 'valid'; any other value returns
    None.  With ``return_type='real'`` only the real part is returned.
    """
    s1 = np.array(in1.shape)
    s2 = size - s1 + 1
    # np.complexfloating matches every complex width (np.complex no longer
    # exists in current NumPy).
    complex_result = (np.issubdtype(in1.dtype, np.complexfloating) or
                      np.issubdtype(in2.dtype, np.complexfloating))

    # Always use 2**n-sized FFT; the shape must consist of integers
    fsize = tuple(int(2 ** np.ceil(np.log2(n))) for n in size)
    IN1 = fftn(in1, fsize)
    IN1 *= in2
    fslice = tuple(slice(0, int(sz)) for sz in size)
    ret = ifftn(IN1)[fslice].copy()
    del IN1
    if not complex_result:
        ret = ret.real
    if return_type == 'real':
        ret = ret.real
    if mode == 'full':
        return ret
    elif mode == 'same':
        # centre the output on the larger of the two signals
        if np.prod(s1, axis=0) > np.prod(s2, axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret, osize)
    elif mode == 'valid':
        return _centered(ret, abs(s2 - s1) + 1)
def DtypeToNumberConverter(self, dtype):
    """Returns a function turning values of `dtype` into plain numbers.

    The returned function takes a sequence of objects of the given dtype and
    returns a numpy array with the number backing each object (the
    ``value`` of the corresponding pandas Timestamp / Timedelta), so that
    the data can be analyzed statistically.

    Args:
      dtype: The numpy dtype to make a converter for.

    Returns:
      The converter function, or None if the dtype needs no conversion.
    """
    if np.issubdtype(dtype, np.datetime64):

        def datetimes_to_numbers(values):
            return np.array([pd.Timestamp(value).value for value in values])

        return datetimes_to_numbers

    if np.issubdtype(dtype, np.timedelta64):

        def timedeltas_to_numbers(values):
            return np.array([pd.Timedelta(value).value for value in values])

        return timedeltas_to_numbers

    return None
def load_image(image_file):
    """
    Loads an analyze/nifti image, generally for 3D images.
    Casts input as 32-bit float (if float) or 32-bit uint (if int).

    Negative values in floating point images are reported and set to
    zero.  Integer images are cast to uint32 without a range check.

    Parameters
    ----------
    image_file: str
        path to data

    Returns
    -------
    dat: nparray
        Image as numpy array (i.e., 3D array)

    Raises
    ------
    Exception
        If the image data is neither floating point nor integer.
    """
    img = nib.load(image_file)
    # Same array that img.get_data() returned; that accessor is deprecated
    # and absent from newer nibabel releases.
    dat = np.asanyarray(img.dataobj)

    # Ensure that data is cast as at least 32-bit.  The abstract NumPy
    # types match every width (float32, int16, ...); the Python types
    # `float`/`int` only match float64 and the default int.
    if np.issubdtype(dat.dtype, np.floating):
        dat = dat.astype("float32")
        # Check for negative values
        if (dat < 0).any():
            print("found negative values, setting to zero (see file: %s)" % image_file)
            dat[dat < 0] = 0
    elif np.issubdtype(dat.dtype, np.integer):
        dat = dat.astype("uint32")
    else:
        msg = "Error: Unknown datatype %s" % dat.dtype
        print(msg)
        raise Exception(msg)

    return dat
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data (a string is taken as the ``rootdir`` of a ctable)
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    # NOTE(review): if `chunksize` is None and the ctable chunk length is
    # <= 10000, `chunksize` stays None and the arithmetic below fails --
    # confirm the intended default.
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            col_dtype = x.dtype[name]
            # np.character covers both byte and unicode strings (the
            # aliases np.string_ / np.unicode_ no longer exist in NumPy 2).
            if (np.issubdtype(col_dtype, np.character) or
                    np.issubdtype(col_dtype, np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)

    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    # one task per block of `chunksize` rows
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
def has_inf_or_nan(datum, tensor):
    """A predicate for whether a tensor consists of any bad numerical values.

    This predicate is common enough to merit definition in this module.
    Bad numerical values include nans and infs.
    The signature of this function follows the requirement of DebugDumpDir's
    find() method.

    Args:
      datum: (DebugTensorDatum) Datum metadata.
      tensor: (numpy.ndarray or None) Value of the tensor. None represents
        an uninitialized tensor.

    Returns:
      (bool) True if and only if tensor consists of any nan or inf values.
    """

    _ = datum  # Datum metadata is unused in this predicate.

    if tensor is None:
        # Uninitialized tensor doesn't have bad numerical values.
        return False
    elif (np.issubdtype(tensor.dtype, np.floating) or
          np.issubdtype(tensor.dtype, np.complexfloating) or
          np.issubdtype(tensor.dtype, np.integer)):
        # The abstract types match every width; the aliases np.float and
        # np.complex no longer exist in current NumPy.
        return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
    else:
        # Non-numeric tensors (strings, booleans, ...) cannot hold nan/inf.
        return False
def load_onto_vtk(self, vtk_data):
    """ Load the stored information onto a vtk data container.

    Parameters
    ----------
    vtk_data : vtkPointData or vtkCellData
        The vtk container to load the value onto.

    Data are loaded onto the vtk container based on their data
    type. The name of the added array is the name of the CUBA key
    (i.e. :samp:`{CUBA}.name`). Currently only scalars and three
    dimensional vectors are supported; for any other kind of property a
    warning is issued and the property is skipped.

    """
    # Stand-in for missing (None) vector entries: three NaNs
    nan_vector = numpy.array([None, None, None], dtype=float)

    def replacer(data):
        return nan_vector if data is None else data

    for cuba in self.keys:
        default = dummy_cuba_value(cuba)
        # The abstract types match every float / integer width; the aliases
        # numpy.float and numpy.int no longer exist in current NumPy.
        if (numpy.issubdtype(type(default), numpy.floating) or
                numpy.issubdtype(type(default), numpy.integer)):
            # scalar property: missing values become NaN
            data = numpy.array(self._data[cuba], dtype=float)
            index = vtk_data.add_array(data)
            vtk_data.get_array(index).name = cuba.name
        elif isinstance(default, numpy.ndarray) and default.size == 3:
            # three dimensional vector property
            data = numpy.array(
                tuple(replacer(data) for data in self._data[cuba]),
                dtype=float)
            index = vtk_data.add_array(data)
            vtk_data.get_array(index).name = cuba.name
        else:
            message = 'property {!r} is currently ignored'
            warnings.warn(message.format(cuba))
def test_apply_loop_invariant_optimisation_integer():
    # One vector variable, one constant, three integer scalars and three
    # floating point scalars
    variables = {'v': Variable('v', scalar=False),
                 'N': Constant('N', 10)}
    for int_name in ('b', 'c', 'd'):
        variables[int_name] = Variable(int_name, scalar=True, dtype=int)
    for float_name in ('y', 'z', 'w'):
        variables[float_name] = Variable(float_name, scalar=True, dtype=float)

    statements = [
        Statement('v', '=', 'v % (2*3*N)', '', np.float32),
        # integer version doesn't get rewritten but float version does
        Statement('a', ':=', 'b//(c//d)', '', int),
        Statement('x', ':=', 'y/(z/w)', '', float),
    ]
    scalar, vector = optimise_statements([], statements, variables)
    assert len(scalar) == 3

    # (abstract dtype, name of the introduced variable, accepted expressions)
    expectations = [
        (np.signedinteger, '_lio_1', ('6*N', 'N*6')),
        (np.signedinteger, '_lio_2', ('b//(c//d)',)),
        (np.floating, '_lio_3', ('(y*w)/z', '(w*y)/z')),
    ]
    for stmt, (abstract_dtype, var_name, accepted) in zip(scalar, expectations):
        assert np.issubdtype(stmt.dtype, abstract_dtype)
        assert stmt.var == var_name
        expr = stmt.expr.replace(' ', '')
        assert expr in accepted
def mean(self, axis=None):
    """Average the matrix over the given axis.  If the axis is None,
    average over both rows and columns, returning a scalar.

    Raises ValueError if `axis` is not one of None, -2, -1, 0, 1.
    """
    # Mimic numpy's casting: float64, all integer types and bool are
    # averaged in float64, complex128 in complex128, anything else in its
    # own dtype.  (np.float_ / np.complex_ no longer exist in NumPy 2.)
    if (np.issubdtype(self.dtype, np.float64) or
            np.issubdtype(self.dtype, np.integer) or
            np.issubdtype(self.dtype, np.bool_)):
        res_dtype = np.float64
    elif np.issubdtype(self.dtype, np.complex128):
        res_dtype = np.complex128
    else:
        res_dtype = self.dtype

    if axis is None:
        return self.sum(None) * 1.0 / (self.shape[0]*self.shape[1])

    if axis < 0:
        axis += 2
    if axis == 0:
        result = self.astype(res_dtype).sum(0)
        result *= 1.0 / self.shape[0]
        return result
    elif axis == 1:
        result = self.astype(res_dtype).sum(1)
        result *= 1.0 / self.shape[1]
        return result
    else:
        raise ValueError("axis out of bounds")
def _argsortData(self, data, order):
    """Return the indices that sort `data` in the given Qt sort order.

    One-dimensional data is sorted with a stable sort; in descending order
    NaNs of numeric data are moved to the end.  Two-dimensional data is
    sorted lexicographically by its columns and must be numeric, with an
    all-zero column at index -2 that is overwritten here.  Note that the
    two-dimensional case modifies `data` in place.
    """
    if data.ndim == 1:
        indices = np.argsort(data, kind='mergesort')
        if order == Qt.DescendingOrder:
            indices = indices[::-1]
            # Always sort NaNs last: after the reversal they lead the
            # array, so rotate them to the end
            if np.issubdtype(data.dtype, np.number):
                nan_count = np.isnan(data).sum()
                indices = np.roll(indices, -nan_count)
        return indices

    assert np.issubdtype(data.dtype, np.number), \
        'We do not deal with non numeric values in sorting by ' \
        'multiple values'
    if order == Qt.DescendingOrder:
        data[:, -1] = -data[:, -1]

    # In order to make sure NaNs always appear at the end, insert a
    # indicator whether NaN or not. Note that the data array must
    # contain an empty column of zeros at index -2 since inserting an
    # extra column after the fact can result in a MemoryError for data
    # with a large amount of variables
    assert np.all(data[:, -2] == 0), \
        'Add an empty column of zeros at index -2 to accomodate NaNs'
    np.isnan(data[:, -1], out=data[:, -2])

    sort_keys = np.flip(data.T, axis=0)
    return np.lexsort(sort_keys)
def solve_bvp(fun, bc, x, y, p=None, S=None, fun_jac=None, bc_jac=None,
              tol=1e-3, max_nodes=1000, verbose=0, args=()):
    """Solve a boundary-value problem for a system of ODEs.

    This function numerically solves a first order system of ODEs subject to
    two-point boundary conditions::

        dy / dx = f(x, y, p) + S * y / (x - a), a <= x <= b
        bc(y(a), y(b), p) = 0

    Here x is a 1-dimensional independent variable, y(x) is a n-dimensional
    vector-valued function and p is a k-dimensional vector of unknown
    parameters which is to be found along with y(x). For the problem to be
    determined there must be n + k boundary conditions, i.e. bc must be
    (n + k)-dimensional function.

    The last singular term in the right-hand side of the system is optional.
    It is defined by an n-by-n matrix S, such that the solution must satisfy
    S y(a) = 0. This condition will be forced during iterations, so it must
    not contradict boundary conditions. See [2]_ for the explanation how this
    term is handled when solving BVPs numerically.

    Problems in a complex domain can be solved as well. In this case y and p
    are considered to be complex, and f and bc are assumed to be
    complex-valued functions, but x stays real. Note that f and bc must be
    complex differentiable (satisfy Cauchy-Riemann equations [4]_), otherwise
    you should rewrite your problem for real and imaginary parts separately.
    To solve a problem in a complex domain, pass an initial guess for y with
    a complex data type (see below).

    Parameters
    ----------
    fun : callable
        Right-hand side of the system. The calling signature is ``fun(x, y)``,
        or ``fun(x, y, p)`` if parameters are present. All arguments are
        ndarray: ``x`` with shape (m,), ``y`` with shape (n, m), meaning that
        ``y[:, i]`` corresponds to ``x[i]``, and ``p`` with shape (k,). The
        return value must be an array with shape (n, m) and with the same
        layout as ``y``.
    bc : callable
        Function evaluating residuals of the boundary conditions. The calling
        signature is ``bc(ya, yb)``, or ``bc(ya, yb, p)`` if parameters are
        present. All arguments are ndarray: ``ya`` and ``yb`` with shape (n,),
        and ``p`` with shape (k,). The return value must be an array with
        shape (n + k,).
    x : array_like, shape (m,)
        Initial mesh. Must be a strictly increasing sequence of real numbers
        with ``x[0]=a`` and ``x[-1]=b``.
    y : array_like, shape (n, m)
        Initial guess for the function values at the mesh nodes, i-th column
        corresponds to ``x[i]``. For problems in a complex domain pass `y`
        with a complex data type (even if the initial guess is purely real).
    p : array_like with shape (k,) or None, optional
        Initial guess for the unknown parameters. If None (default), it is
        assumed that the problem doesn't depend on any parameters.
    S : array_like with shape (n, n) or None
        Matrix defining the singular term. If None (default), the problem is
        solved without the singular term.
    fun_jac : callable or None, optional
        Function computing derivatives of f with respect to y and p. The
        calling signature is ``fun_jac(x, y)``, or ``fun_jac(x, y, p)`` if
        parameters are present. The return must contain 1 or 2 elements in the
        following order:

            * df_dy : array_like with shape (n, n, m) where an element
              (i, j, q) equals to d f_i(x_q, y_q, p) / d (y_q)_j.
            * df_dp : array_like with shape (n, k, m) where an element
              (i, j, q) equals to d f_i(x_q, y_q, p) / d p_j.

        Here q numbers nodes at which x and y are defined, whereas i and j
        number vector components. If the problem is solved without unknown
        parameters df_dp should not be returned.

        If `fun_jac` is None (default), the derivatives will be estimated
        by the forward finite differences.
    bc_jac : callable or None, optional
        Function computing derivatives of bc with respect to ya, yb and p.
        The calling signature is ``bc_jac(ya, yb)``, or ``bc_jac(ya, yb, p)``
        if parameters are present. The return must contain 2 or 3 elements in
        the following order:

            * dbc_dya : array_like with shape (n, n) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d ya_j.
            * dbc_dyb : array_like with shape (n, n) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d yb_j.
            * dbc_dp : array_like with shape (n, k) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d p_j.

        If the problem is solved without unknown parameters dbc_dp should not
        be returned.

        If `bc_jac` is None (default), the derivatives will be estimated by
        the forward finite differences.
    tol : float, optional
        Desired tolerance of the solution. If we define ``r = y' - f(x, y)``
        where y is the found solution, then the solver tries to achieve on
        each mesh interval ``norm(r / (1 + abs(f))) < tol``, where ``norm``
        is estimated in a root mean squared sense (using a numerical
        quadrature formula). Default is 1e-3. Values below ``100 * EPS`` are
        raised to ``100 * EPS`` with a warning.
    max_nodes : int, optional
        Maximum allowed number of the mesh nodes. If exceeded, the algorithm
        terminates. Default is 1000.
    verbose : {0, 1, 2}, optional
        Level of algorithm's verbosity:

            * 0 (default) : work silently.
            * 1 : display a termination report.
            * 2 : display progress during iterations.
    args : tuple, optional
        Passed unchanged as the trailing argument to the wrapped `fun` and
        `bc` callables and to the internal solver routines. Default is
        ``()``.
        NOTE(review): presumably extra arguments forwarded to the user
        callbacks -- confirm against ``wrap_functions``.

    Returns
    -------
    Bunch object with the following fields defined:
    sol : PPoly
        Found solution for y as `scipy.interpolate.PPoly` instance, a C1
        continuous cubic spline.
    p : ndarray or None, shape (k,)
        Found parameters. None, if the parameters were not present in the
        problem.
    x : ndarray, shape (m,)
        Nodes of the final mesh.
    y : ndarray, shape (n, m)
        Solution values at the mesh nodes.
    yp : ndarray, shape (n, m)
        Solution derivatives at the mesh nodes.
    rms_residuals : ndarray, shape (m - 1,)
        RMS values of the relative residuals over each mesh interval (see the
        description of `tol` parameter).
    niter : int
        Number of completed iterations.
    status : int
        Reason for algorithm termination:

            * 0: The algorithm converged to the desired accuracy.
            * 1: The maximum number of mesh nodes is exceeded.
            * 2: A singular Jacobian encountered when solving the collocation
              system.

    message : string
        Verbal description of the termination reason.
    success : bool
        True if the algorithm converged to the desired accuracy (``status=0``).

    Raises
    ------
    ValueError
        If `x` is not 1-dimensional or not strictly increasing, `y` is not
        2-dimensional or its column count differs from ``len(x)``, `p` is
        not 1-dimensional, `verbose` is not 0, 1 or 2, `S` does not have
        shape (n, n), or `fun` / `bc` return arrays of unexpected shape.

    Notes
    -----
    This function implements a 4-th order collocation algorithm with the
    control of residuals similar to [1]_. A collocation system is solved
    by a damped Newton method with an affine-invariant criterion function as
    described in [3]_.

    Note that in [1]_ integral residuals are defined without normalization
    by interval lengths. So their definition is different by a multiplier of
    h**0.5 (h is an interval length) from the definition used here.

    .. versionadded:: 0.18.0

    References
    ----------
    .. [1] J. Kierzenka, L. F. Shampine, "A BVP Solver Based on Residual
           Control and the Matlab PSE", ACM Trans. Math. Softw., Vol. 27,
           Number 3, pp. 299-316, 2001.
    .. [2] L.F. Shampine, P. H. Muir and H. Xu, "A User-Friendly Fortran BVP
           Solver".
    .. [3] U. Ascher, R. Mattheij and R. Russell "Numerical Solution of
           Boundary Value Problems for Ordinary Differential Equations".
    .. [4] `Cauchy-Riemann equations
            <https://en.wikipedia.org/wiki/Cauchy-Riemann_equations>`_ on
            Wikipedia.

    Examples
    --------
    In the first example we solve Bratu's problem::

        y'' + k * exp(y) = 0
        y(0) = y(1) = 0

    for k = 1.

    We rewrite the equation as a first order system and implement its
    right-hand side evaluation::

        y1' = y2
        y2' = -exp(y1)

    >>> def fun(x, y):
    ...     return num.vstack((y[1], -num.exp(y[0])))

    Implement evaluation of the boundary condition residuals:

    >>> def bc(ya, yb):
    ...     return num.array([ya[0], yb[0]])

    Define the initial mesh with 5 nodes:

    >>> x = num.linspace(0, 1, 5)

    This problem is known to have two solutions. To obtain both of them we
    use two different initial guesses for y. We denote them by subscripts
    a and b.

    >>> y_a = num.zeros((2, x.size))
    >>> y_b = num.zeros((2, x.size))
    >>> y_b[0] = 3

    Now we are ready to run the solver.

    >>> from scipy.integrate import solve_bvp
    >>> res_a = solve_bvp(fun, bc, x, y_a)
    >>> res_b = solve_bvp(fun, bc, x, y_b)

    Let's plot the two found solutions. We take an advantage of having the
    solution in a spline form to produce a smooth plot.

    >>> x_plot = num.linspace(0, 1, 100)
    >>> y_plot_a = res_a.sol(x_plot)[0]
    >>> y_plot_b = res_b.sol(x_plot)[0]
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(x_plot, y_plot_a, label='y_a')
    >>> plt.plot(x_plot, y_plot_b, label='y_b')
    >>> plt.legend()
    >>> plt.xlabel("x")
    >>> plt.ylabel("y")
    >>> plt.show()

    We see that the two solutions have similar shape, but differ in scale
    significantly.

    In the second example we solve a simple Sturm-Liouville problem::

        y'' + k**2 * y = 0
        y(0) = y(1) = 0

    It is known that a non-trivial solution y = A * sin(k * x) is possible for
    k = pi * n, where n is an integer. To establish the normalization constant
    A = 1 we add a boundary condition::

        y'(0) = k

    Again we rewrite our equation as a first order system and implement its
    right-hand side evaluation::

        y1' = y2
        y2' = -k**2 * y1

    >>> def fun(x, y, p):
    ...     k = p[0]
    ...     return num.vstack((y[1], -k**2 * y[0]))

    Note that parameters p are passed as a vector (with one element in our
    case).

    Implement the boundary conditions:

    >>> def bc(ya, yb, p):
    ...     k = p[0]
    ...     return num.array([ya[0], yb[0], ya[1] - k])

    Setup the initial mesh and guess for y. We aim to find the solution for
    k = 2 * pi, to achieve that we set values of y to approximately follow
    sin(2 * pi * x):

    >>> x = num.linspace(0, 1, 5)
    >>> y = num.zeros((2, x.size))
    >>> y[0, 1] = 1
    >>> y[0, 3] = -1

    Run the solver with 6 as an initial guess for k.

    >>> sol = solve_bvp(fun, bc, x, y, p=[6])

    We see that the found k is approximately correct:

    >>> sol.p[0]
    6.28329460046

    And finally plot the solution to see the anticipated sinusoid:

    >>> x_plot = num.linspace(0, 1, 100)
    >>> y_plot = sol.sol(x_plot)[0]
    >>> plt.plot(x_plot, y_plot)
    >>> plt.xlabel("x")
    >>> plt.ylabel("y")
    >>> plt.show()
    """
    # --- Validate and normalise the mesh. ---
    x = num.asarray(x, dtype=float)
    if x.ndim != 1:
        raise ValueError("`x` must be 1 dimensional.")
    h = num.diff(x)
    if num.any(h <= 0):
        raise ValueError("`x` must be strictly increasing.")
    a = x[0]

    # --- Validate the initial guess; its dtype selects real vs complex. ---
    y = num.asarray(y)
    if num.issubdtype(y.dtype, num.complexfloating):
        dtype = complex
    else:
        dtype = float
    y = y.astype(dtype, copy=False)

    if y.ndim != 2:
        raise ValueError("`y` must be 2 dimensional.")
    if y.shape[1] != x.shape[0]:
        raise ValueError("`y` is expected to have {} columns, but actually "
                         "has {}.".format(x.shape[0], y.shape[1]))

    # An empty parameter vector stands for "no unknown parameters".
    if p is None:
        p = num.array([])
    else:
        p = num.asarray(p, dtype=dtype)
    if p.ndim != 1:
        raise ValueError("`p` must be 1 dimensional.")

    if tol < 100 * EPS:
        warn("`tol` is too low, setting to {:.2e}".format(100 * EPS))
        tol = 100 * EPS

    if verbose not in [0, 1, 2]:
        raise ValueError("`verbose` must be in [0, 1, 2].")

    n = y.shape[0]
    k = p.shape[0]

    if S is not None:
        S = num.asarray(S, dtype=dtype)
        if S.shape != (n, n):
            raise ValueError("`S` is expected to have shape {}, "
                             "but actually has {}".format((n, n), S.shape))

        # Compute I - S^+ S to impose necessary boundary conditions.
        B = num.identity(n) - num.dot(pinv(S), S)

        # Note: this writes into `y`, which may share memory with the
        # caller's array because of ``astype(..., copy=False)`` above.
        y[:, 0] = num.dot(B, y[:, 0])

        # Compute (I - S)^+ to correct derivatives at x=a.
        D = pinv(num.identity(n) - S)
    else:
        B = None
        D = None

    fun_wrapped, bc_wrapped, fun_jac_wrapped, bc_jac_wrapped = wrap_functions(
        fun, bc, fun_jac, bc_jac, k, a, S, D, dtype)

    # Evaluate the user callbacks once up front to check their output shapes.
    f = fun_wrapped(x, y, p, args)
    if f.shape != y.shape:
        raise ValueError("`fun` return is expected to have shape {}, "
                         "but actually has {}.".format(y.shape, f.shape))

    bc_res = bc_wrapped(y[:, 0], y[:, -1], p, args)
    if bc_res.shape != (n + k, ):
        raise ValueError("`bc` return is expected to have shape {}, "
                         "but actually has {}.".format((n + k, ),
                                                       bc_res.shape))

    status = 0
    iteration = 0
    if verbose == 2:
        print_iteration_header()

    # Each pass solves the collocation system on the current mesh, estimates
    # the residuals and refines the mesh until the tolerance is met or a
    # termination condition triggers.
    while True:
        m = x.shape[0]

        col_fun, jac_sys = prepare_sys(n, m, k, fun_wrapped, bc_wrapped,
                                       fun_jac_wrapped, bc_jac_wrapped, x, h,
                                       args)
        y, p, singular = solve_newton(n, m, h, col_fun, bc_wrapped, jac_sys,
                                      y, p, B, tol, args)
        iteration += 1

        col_res, y_middle, f, f_middle = collocation_fun(
            fun_wrapped, y, p, x, h, args)
        # This relation is not trivial, but can be verified.
        r_middle = 1.5 * col_res / h
        sol = create_spline(y, f, x, h)
        rms_res = estimate_rms_residuals(fun_wrapped, sol, x, h, p,
                                         r_middle, f_middle, args)
        max_rms_res = num.max(rms_res)

        # The residuals are computed before this check so that the result
        # and the report below are populated even on a singular Jacobian.
        if singular:
            status = 2
            break

        # Intervals moderately above tolerance get one new node, those at
        # or above 100 * tol count for two.
        insert_1, = num.nonzero((rms_res > tol) & (rms_res < 100 * tol))
        insert_2, = num.nonzero(rms_res >= 100 * tol)
        nodes_added = insert_1.shape[0] + 2 * insert_2.shape[0]

        if m + nodes_added > max_nodes:
            status = 1
            if verbose == 2:
                # Parenthesised to show the nodes were not actually added.
                nodes_added = "({})".format(nodes_added)
                print_iteration_progress(iteration, max_rms_res, m,
                                         nodes_added)
            break

        if verbose == 2:
            print_iteration_progress(iteration, max_rms_res, m, nodes_added)

        if nodes_added > 0:
            x = modify_mesh(x, insert_1, insert_2)
            h = num.diff(x)
            # Use the current spline as the initial guess on the new mesh.
            y = sol(x)
        else:
            status = 0
            break

    if verbose > 0:
        if status == 0:
            print("Solved in {} iterations, number of nodes {}, "
                  "maximum relative residual {:.2e}.".format(
                      iteration, x.shape[0], max_rms_res))
        elif status == 1:
            print("Number of nodes is exceeded after iteration {}, "
                  "maximum relative residual {:.2e}.".format(
                      iteration, max_rms_res))
        elif status == 2:
            print("Singular Jacobian encountered when solving the collocation "
                  "system on iteration {}, maximum relative residual {:.2e}.".
                  format(iteration, max_rms_res))

    # Report "no parameters" as None rather than as an empty array.
    if p.size == 0:
        p = None

    return BVPResult(sol=sol, p=p, x=x, y=y, yp=f, rms_residuals=rms_res,
                     niter=iteration, status=status,
                     message=TERMINATION_MESSAGES[status], success=status == 0)
def test_same(self):
    """A scalar type is a sub-dtype of itself however each side is wrapped."""
    for scalar_type in (np.float32, np.int32):
        # Every ordered pair of wrappers, including a wrapper with itself.
        for wrap_first, wrap_second in itertools.product(self.wrappers,
                                                         repeat=2):
            first = wrap_first(scalar_type)
            second = wrap_second(scalar_type)
            assert_(np.issubdtype(first, second))
def test_both_abstract(self):
    """``issubdtype`` between two abstract types follows the hierarchy."""
    narrower, broader = np.floating, np.inexact
    is_sub = np.issubdtype(narrower, broader)
    assert_(is_sub)
    is_reverse_sub = np.issubdtype(broader, narrower)
    assert_(not is_reverse_sub)
def array2vtk(num_array, vtk_array=None):
    """Converts a real numpy Array (or a Python list) to a VTK array
    object.

    This function only works for real arrays.  Complex arrays are NOT
    handled.  It also works for multi-component arrays.  However, only
    1, and 2 dimensional arrays are supported.  This function is very
    efficient, so large arrays should not be a problem.

    Even in cases when no copy of the numpy array data is performed,
    a reference to the array is cached.  The passed array can
    therefore be deleted safely in all circumstances.

    Parameters
    ----------

    - num_array : numpy array or Python list/tuple

      The input array must be 1 or 2D.  A copy of the numeric array
      data passed is made in the following circumstances:

       1. A Python list/tuple was passed.
       2. A non-contiguous numpy array was passed.
       3. A `vtkBitArray` instance was passed as the second argument.
       4. The types of the `vtk_array` and the `num_array` are not
          equivalent to each other.  For example if one is an integer
          array and the other a float.

    - vtk_array : `vtkDataArray` (default: `None`)

      If an optional `vtkDataArray` instance, is passed as an argument
      then a new array is not created and returned.  The passed array
      is itself returned.

    Returns
    -------

    The VTK array holding the data: `vtk_array` itself when one was
    passed, otherwise a newly created array.

    Raises
    ------

    AssertionError if the input has more than 2 dimensions or a complex
    dtype (note that these checks are `assert` statements and are
    therefore skipped when Python runs with ``-O``).
    """

    z = numpy.asarray(num_array)

    shape = z.shape
    assert len(shape) < 3, \
        "Only arrays of dimensionality 2 or lower are allowed!"
    assert not numpy.issubdtype(z.dtype, numpy.complexfloating), \
        "Complex numpy arrays cannot be converted to vtk arrays."\
        "Use real() or imag() to get a component of the array before"\
        " passing it to vtk."

    # First create an array of the right type by using the typecode.
    # Bit arrays need special casing.
    bit_array = False
    if vtk_array is None:
        vtk_typecode = get_vtk_array_type(z.dtype)
        result_array = create_vtk_array(vtk_typecode)
    elif vtk_array.GetDataType() == vtkConstants.VTK_BIT:
        # Stage the data in a temporary char array; it is copied into the
        # caller's bit array component by component further below.
        vtk_typecode = vtkConstants.VTK_CHAR
        result_array = create_vtk_array(vtkConstants.VTK_CHAR)
        bit_array = True
    else:
        vtk_typecode = vtk_array.GetDataType()
        result_array = vtk_array

    # Find the shape and set number of components.
    if len(shape) == 1:
        result_array.SetNumberOfComponents(1)
    else:
        result_array.SetNumberOfComponents(shape[1])

    result_array.SetNumberOfTuples(shape[0])

    # Ravel the array appropriately.  The data is only converted when its
    # dtype is not already a sub-dtype of what the VTK array expects.
    arr_dtype = get_numeric_array_type(vtk_typecode)
    if numpy.issubdtype(z.dtype, arr_dtype):
        z_flat = numpy.ravel(z)
    else:
        z_flat = numpy.ravel(z).astype(arr_dtype)

    # Point the VTK array to the numpy data.  The last argument (1)
    # tells the array not to deallocate.
    result_array.SetVoidArray(getbuffer(z_flat), len(z_flat), 1)

    if bit_array:
        # Handle bit arrays -- they have to be copied.  Note that bit
        # arrays are used ONLY when the user has passed one as an
        # argument to this function.
        vtk_array.SetNumberOfTuples(result_array.GetNumberOfTuples())
        vtk_array.SetNumberOfComponents(result_array.GetNumberOfComponents())
        for i in range(result_array.GetNumberOfComponents()):
            vtk_array.CopyComponent(i, result_array, i)
        result_array = vtk_array
    else:
        # Save a reference to the flatted array in the array cache.
        # This prevents the user from deleting or resizing the array
        # and getting into serious trouble.  This is only done for
        # non-bit array cases where the data is not copied.
        global _array_cache
        _array_cache.add(result_array, z_flat)

    return result_array
def assign_policies_to_panel(cases_df, policies, cases_level, aggregate_vars=None, get_latlons=True, errors='raise'):
    """Assign all policy variables from `policies` to `cases_df`

    Args:
        cases_df (pandas.DataFrame): table to assign policy variables to,
            typically with case data already assigned
        policies (pandas.DataFrame): table of policies, listed by date and regions affected
        cases_level (int): Administrative unit level used for analysis of policy effects,
            typically the lowest level which pop-weights have been applied to
        aggregate_vars (list of str, optional): list of policy variables where optional
            version should be treated independently of mandatory version. For each
            listed policy, its optional rows are renamed to ``<policy>_opt`` and
            handled like a mandatory policy. Defaults to no such policies.
        get_latlons (bool): forwarded unchanged to `cpop.assign_all_populations`
        errors (str): ``'raise'`` asserts that the `optional` column holds at most two
            distinct values, ``'warn'`` prints a message instead; any other value skips
            the check. Also forwarded to `cpop.assign_all_populations`.

    Returns:
        pandas.DataFrame: a version of `cases_df` with all policies from `policies`
            assigned as new columns
    """
    # A fresh list per call instead of a shared mutable default argument
    if aggregate_vars is None:
        aggregate_vars = []

    # Make sure policies input doesn't change unexpectedly
    policies = policies.copy()

    # Convert 'optional' to indicator variable
    if not np.issubdtype(policies['optional'].dtype, np.number):
        policies['optional'] = policies['optional'].replace({"Y":1, "N":0})
        # fill any nans with 0
        policies['optional'] = policies['optional'].fillna(0).astype(int)

    policies['optional'] = policies['optional'].fillna(0)
    if errors == 'raise':
        assert len(policies['optional'].unique()) <= 2
    elif errors == 'warn':
        if len(policies['optional'].unique()) > 2:
            print('there were more than two values for optional: {0}'.format(policies['optional'].unique()))

    # Open-ended policies are treated as lasting until a far-future date
    policies['date_end'] = policies['date_end'].fillna(pd.to_datetime('2099-12-31'))

    # Assign population columns to `policies` and `cases_df`
    policies, cases_df = cpop.assign_all_populations(policies, cases_df, cases_level, get_latlons=get_latlons, errors=errors)

    # Assign policy_level to distinguish policies specified at different admin-unit levels
    policies['policy_level'] = policies.apply(get_policy_level, axis=1)

    # Treat policies in `aggregate_vars` as independent policies (just like mandatory policies)
    # Set optional to 0 to avoid applying normal optional logic in `get_policy_vals()`
    # Only the optional rows of the policy being iterated are relabelled; the
    # previous mask ignored `policy` and relabelled every optional policy as
    # soon as `aggregate_vars` was non-empty.
    for policy in aggregate_vars:
        optional_rows = (policies['policy'] == policy) & (policies['optional'] == 1)
        policies.loc[optional_rows, 'policy'] = policy + '_opt'
        policies.loc[optional_rows, 'optional'] = 0

    policy_list = list(policies['policy'].unique())
    policy_popwts = [p + '_popwt' for p in policy_list if p not in exclude_from_popweights]

    date_min = cases_df['date'].min()
    date_max = cases_df['date'].max()

    region_col = f'adm{cases_level}_name'

    # Initalize panel with same structure as `cases_df`
    policy_panel = pd.DataFrame(
        index=pd.MultiIndex.from_product([
            pd.date_range(date_min, date_max),
            sorted(cases_df[region_col].unique())
        ]),
        columns=policy_list + policy_popwts).reset_index().rename(
            columns={'level_0':'date', 'level_1':region_col}
        ).fillna(0)

    # Assign each policy one-by-one to the panel
    for policy in policy_list:
        policy_pickle_dict = dict()

        # Get Series of 4-tuples for mandatory pop-weighted, mandatory indicator,
        # optional pop-weighted, optional indicator
        tmp = policy_panel.apply(
            lambda row: get_policy_vals(policies, policy, row['date'], row[region_col],
                                        cases_level, policy_pickle_dict),
            axis=1)

        # Assign regular policy indicator
        policy_panel[policy] = tmp.apply(lambda x: x[1])

        # Assign opt-column if there's anything there
        opt_col = tmp.apply(lambda x: x[3])
        use_opt_col = opt_col.sum() > 0
        if use_opt_col:
            policy_panel[policy + '_opt'] = opt_col

        # Assign pop-weighted column if it's not excluded from pop-weighting, and opt-pop-weighted if
        # Optional and pop-weighted are both used
        if policy not in exclude_from_popweights:
            policy_panel[policy + '_popwt'] = tmp.apply(lambda x: x[0])
            if use_opt_col:
                policy_panel[policy + '_opt_popwt'] = tmp.apply(lambda x: x[2])

    policy_panel = count_policies_enacted(policy_panel, policy_list)

    # Merge panel with `cases_df`
    merged = pd.merge(cases_df, policy_panel, on=['date', region_col])

    return merged
def vq(obs, code_book, check_finite=True):
    """
    Quantize observations against a code book.

    Every row of the 'M' by 'N' array `obs` is matched with the closest
    centroid (row) of `code_book` and receives that centroid's index.

    The features in `obs` should have unit variance, which can be obtained
    by passing them through the whiten function.  The code book itself can
    come from the k-means algorithm or from any other encoding algorithm.

    Parameters
    ----------
    obs : ndarray
        'M' x 'N' array with one observation per row and one feature per
        column.  The features must be whitened first using the whiten
        function or something equivalent.
    code_book : ndarray
        Usually generated using the k-means algorithm.  Each row is one
        code and the columns are the features of that code.

        >>> #              f0    f1    f2    f3
        >>> code_book = [
        ...             [  1.,   2.,   3.,   4.],  #c0
        ...             [  1.,   2.,   3.,   4.],  #c1
        ...             [  1.,   2.,   3.,   4.]]  #c2

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite
        numbers.  Disabling may give a performance gain, but may result in
        problems (crashes, non-termination) if the inputs do contain
        infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        Length M array with the code book index of each observation.
    dist : ndarray
        The distortion (distance) between each observation and its nearest
        code.

    Examples
    --------
    >>> from numpy import array
    >>> from scipy.cluster.vq import vq
    >>> code_book = array([[1.,1.,1.],
    ...                    [2.,2.,2.]])
    >>> features  = array([[  1.9,2.3,1.7],
    ...                    [  1.5,2.5,2.2],
    ...                    [  0.8,0.6,1.7]])
    >>> vq(features,code_book)
    (array([1, 1, 0],'i'), array([ 0.43588989,  0.73484692,  0.83066239]))

    """
    obs = _asarray_validated(obs, check_finite=check_finite)
    code_book = _asarray_validated(code_book, check_finite=check_finite)

    # Smallest inexact type able to represent both inputs.
    common = np.common_type(obs, code_book)
    compiled_ok = any(np.issubdtype(common, real_type)
                      for real_type in (np.float64, np.float32))
    if not compiled_ok:
        # Fall back to the pure-Python implementation; the inputs have
        # already been validated above.
        return py_vq(obs, code_book, check_finite=False)

    return _vq.vq(obs.astype(common, copy=False),
                  code_book.astype(common, copy=False))
def diff_array(array1, array2, showdiffs=10, raiseondiff=False):
    """Print a field-by-field comparison of two structured arrays.

    Parameters
    ----------
    array1, array2 : numpy structured arrays
        Arrays to compare ("file 1" and "file 2" in the messages).  Both
        must have an ``'id'`` field.
    showdiffs : int, optional
        Maximum number of differing rows displayed per field.
    raiseondiff : bool, optional
        If True, raise as soon as a field differs.

    Raises
    ------
    Exception
        With message ``'different'`` when `raiseondiff` is True and a field
        common to both arrays differs.
    """
    if len(array1) != len(array2):
        print("length is different: %d vs %d" % (len(array1), len(array2)))
        ids1 = array1['id']
        ids2 = array2['id']
        all_ids = np.union1d(ids1, ids2)
        # ids missing from a file are those of the union which that file
        # lacks (the reverse difference is empty by construction)
        notin1 = np.setdiff1d(all_ids, ids1)
        notin2 = np.setdiff1d(all_ids, ids2)
        # explicit length tests: the truth value of an array is ambiguous
        if len(notin1):
            print("the following ids are not present in file 1:", notin1)
        if len(notin2):
            print("the following ids are not present in file 2:", notin2)
        if not len(notin1) and not len(notin2):
            # some ids must be duplicated
            if len(ids1) > len(all_ids):
                print("file 1 contain duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids1)
                print(dupes)
                array1 = array1[uniques]
            if len(ids2) > len(all_ids):
                print("file 2 contain duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids2)
                print(dupes)
                array2 = array2[uniques]

    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    fnames1 = set(array1.dtype.names)
    fnames2 = set(array2.dtype.names)
    # use merge_items instead of fnames1 | fnames2 to preserve ordering
    for fname, _ in merge_items(fields1, fields2):
        print(" - %s:" % fname, end=' ')
        if fname not in fnames1:
            print("missing in file 1")
            continue
        elif fname not in fnames2:
            print("missing in file 2")
            continue
        col1, col2 = array1[fname], array2[fname]
        # element-wise "differs" mask, when it was already computed
        differs = None
        if np.issubdtype(col1.dtype, np.inexact):
            if len(col1) == len(col2):
                # two NaNs at the same position count as equal
                both_nan = np.isnan(col1) & np.isnan(col2)
                differs = ~(both_nan | (col1 == col2))
                eq = not differs.any()
            else:
                eq = False
        else:
            eq = np.array_equal(col1, col2)

        if eq:
            print("ok")
        else:
            print("different", end=' ')
            if len(col1) != len(col2):
                print("(length)")
            else:
                if differs is None:
                    differs = col1 != col2
                # reuse the NaN-aware mask so that matching NaNs are not
                # reported as differences
                diff = differs.nonzero()[0]
                print("(%d differences)" % len(diff))
                ids = array1['id']
                if len(diff) > showdiffs:
                    diff = diff[:showdiffs]
                print(
                    PrettyTable(
                        [['id', fname + ' (file1)', fname + ' (file2)']] +
                        [[ids[idx], col1[idx], col2[idx]] for idx in diff]))
            if raiseondiff:
                raise Exception('different')
def is_complex(dtype):
    """Returns whether this is a complex floating point type.

    `dtype` is first converted with `tf.as_dtype`; if the result exposes an
    `is_complex` attribute, that attribute is returned, otherwise the answer
    comes from NumPy's dtype hierarchy.
    """
    dtype = tf.as_dtype(dtype)
    if hasattr(dtype, 'is_complex'):
        return dtype.is_complex
    # `np.complex` was a deprecated alias of the builtin `complex` and was
    # removed in NumPy 1.24; `np.complexfloating` is the abstract parent of
    # every complex dtype (complex64, complex128, ...).
    return np.issubdtype(np.dtype(dtype), np.complexfloating)
def is_floating(dtype):
    """Returns whether this is a (non-quantized, real) floating point type.

    `dtype` is first converted with `tf.as_dtype`; if the result exposes an
    `is_floating` attribute, that attribute is returned, otherwise the answer
    comes from NumPy's dtype hierarchy.
    """
    dtype = tf.as_dtype(dtype)
    if hasattr(dtype, 'is_floating'):
        return dtype.is_floating
    # `np.float` was a deprecated alias of the builtin `float` and was
    # removed in NumPy 1.24; `np.floating` is the abstract parent of every
    # real floating point dtype (float16, float32, float64, ...).
    return np.issubdtype(np.dtype(dtype), np.floating)
def upload(
    meta, cache, image, offset, mip,
    compress=None, cdn_cache=None, parallel=1, progress=False,
    delete_black_uploads=False, non_aligned_writes=False,
    location=None, location_bbox=None, location_order='F',
    use_shared_memory=False, use_file=False, green=False
):
    """Upload img to vol with offset. This is the primary entry point for uploads.

    Grid-aligned uploads are handed to `upload_aligned` directly.  Otherwise
    the chunk-aligned core of the image (if there is one) is uploaded the
    same way, and the partially covered chunks around it are downloaded,
    painted with the new data and uploaded again.

    Raises:
      ValueError: if the dtype of `image` is not a sub-dtype of the volume's
        dtype (`meta.dtype`).
    """
    volume_scalar_type = np.dtype(meta.dtype).type
    if not np.issubdtype(image.dtype, volume_scalar_type):
        raise ValueError("""
      The uploaded image data type must match the volume data type.

      Volume: {}
      Image: {}
      """.format(meta.dtype, image.dtype))

    shape = Vec(*image.shape)[:3]
    offset = Vec(*offset)[:3]
    bounds = Bbox(offset, shape + offset)

    is_aligned = check_grid_aligned(
        meta, image, bounds, mip,
        throw_error=(non_aligned_writes == False)
    )

    # Keyword arguments shared by both aligned upload calls below.
    aligned_kwargs = dict(
        compress=compress,
        cdn_cache=cdn_cache,
        parallel=parallel,
        progress=progress,
        location=location,
        location_bbox=location_bbox,
        location_order=location_order,
        use_shared_memory=use_shared_memory,
        use_file=use_file,
        delete_black_uploads=delete_black_uploads,
        green=green,
    )

    if is_aligned:
        upload_aligned(meta, cache, image, offset, mip, **aligned_kwargs)
        return

    # Upload the aligned core
    expanded = bounds.expand_to_chunk_size(meta.chunk_size(mip), meta.voxel_offset(mip))
    retracted = bounds.shrink_to_chunk_size(meta.chunk_size(mip), meta.voxel_offset(mip))
    core_bbox = retracted.clone() - bounds.minpt

    if not core_bbox.subvoxel():
        core_img = image[core_bbox.to_slices()]
        upload_aligned(meta, cache, core_img, retracted.minpt, mip, **aligned_kwargs)

    # Download the shell, paint, and upload
    all_chunks = set(
        chunknames(expanded, meta.bounds(mip), meta.key(mip), meta.chunk_size(mip))
    )
    core_chunks = set(
        chunknames(retracted, meta.bounds(mip), meta.key(mip), meta.chunk_size(mip))
    )
    shell_chunks = all_chunks.difference(core_chunks)

    def shade_and_upload(img3d, bbox):
        # decode is returning non-writable chunk
        # we're throwing them away so safe to write
        img3d.setflags(write=1)
        shade(img3d, bbox, image, bounds)
        whole_chunk = (Vec(0, 0, 0), Vec(*img3d.shape[:3]), bbox.minpt, bbox.maxpt)
        threaded_upload_chunks(
            meta, cache,
            img3d, mip,
            (whole_chunk,),
            compress=compress, cdn_cache=cdn_cache,
            progress=progress, n_threads=0,
            delete_black_uploads=delete_black_uploads,
            green=green,
        )

    compress_cache = should_compress(meta.encoding(mip), compress, cache, iscache=True)

    download_chunks_threaded(
        meta, cache, mip, shell_chunks, fn=shade_and_upload,
        fill_missing=False, progress=progress,
        compress_cache=compress_cache,
        green=green
    )
def validate_cuba_keyword(value, key):
    ''' Check `value` against what is declared for the CUBA `key`

    When `key` names a class of the meta data API, `value` must be an
    instance of that class.  Otherwise, when `key` is a CUBA keyword, the
    dtype of `value` is checked and then (except for string keywords) its
    shape.  An unknown `key` only triggers a warning.

    Parameters
    ----------
    value : object
        any value

    key : str
        CUBA key, can be stripped of 'CUBA.'

    Returns
    -------
    None

    Raises
    ------
    TypeError
        - if key is a CUBA keyword with shape and the value's shape
          or type does not match
        - if key corresponds to a class defined by the meta data and
          the value is not an instance of that class
    '''
    from . import api
    from simphony.core.keywords import KEYWORDS

    # Sanitising, although generated code already did
    key = key.replace('CUBA.', '')

    # Meta data class for the key, e.g. cuds_item -> CUDSItem (None if the
    # api module has no such class)
    api_class = getattr(api, to_camel_case(key), None)

    # Keyword name in KEYWORDS
    keyword_name = key.upper()

    # Case 1: the key maps to a meta data class
    if api_class:
        if isinstance(value, api_class):
            return
        raise TypeError(
            '{0!r} is not an instance of {1}'.format(value, api_class))

    # Case 2: the key is unknown altogether
    if keyword_name not in KEYWORDS:
        warnings.warn(
            '{} is not defined in CUBA keyword or meta data'.format(
                key.upper()))
        return

    # Case 3: the key is a plain CUBA keyword, check type then shape
    keyword = KEYWORDS[keyword_name]
    value_arr = numpy.asarray(value)

    if not numpy.issubdtype(value_arr.dtype, keyword.dtype):
        template = ('value has dtype {dtype1} while {key} '
                    'needs to be a {dtype2}')
        raise TypeError(template.format(dtype1=value_arr.dtype,
                                        key=key,
                                        dtype2=keyword.dtype))

    # FIXME: STRING
    # cuba.yml gives a fix length for the shape of string
    # It actually means the maximum length of the string
    # We will skip checking validating it for now
    if keyword.dtype is str and value_arr.dtype.char[0] in ('S', 'U'):
        warnings.warn('Value is a string, its shape is not validated. '
                      'Please fix the cuba.yml shape syntax.')
        return

    check_cuba_shape(value, keyword_name)
def is_integer(dtype):
    """Returns whether this is a (non-quantized) integer type."""
    dtype = tf.as_dtype(dtype)
    # Fall back to NumPy's dtype hierarchy when the converted dtype has no
    # `is_integer` attribute, or when that attribute is a callable rather
    # than a plain value.
    if not hasattr(dtype, 'is_integer') or callable(dtype.is_integer):
        return np.issubdtype(np.dtype(dtype), np.integer)
    return dtype.is_integer
# Bare calls into NumPy's scalar-type introspection helpers.  Every result
# is discarded, so these lines only check that each call is accepted with
# the given argument kinds.
# NOTE(review): presumably a static-typing / smoke fixture -- confirm.  Most
# of these helpers no longer exist in NumPy 2.0, so the block targets an
# older NumPy.

# Is the argument a scalar type?
np.issctype(object)
np.issctype("S8")

# Scalar type corresponding to an object, with and without a fallback.
np.obj2sctype(list)
np.obj2sctype(list, default=None)
np.obj2sctype(list, default=np.string_)

# Subclass / sub-dtype relationships.
np.issubclass_(np.int32, int)
np.issubclass_(np.float64, float)
np.issubclass_(np.float64, (int, float))

np.issubsctype("int64", int)
np.issubsctype(np.array([1]), np.array([1]))

np.issubdtype("S1", np.string_)
np.issubdtype(np.float64, np.float32)

# Type-character lookup.
np.sctype2char("S1")
np.sctype2char(list)

# Common type of array types (first argument) and scalar types (second).
np.find_common_type([], [np.int64, np.float32, complex])
np.find_common_type((), (np.int64, np.float32, complex))
np.find_common_type([np.int64, np.float32], [])
np.find_common_type([np.float32], [np.int64, np.float64])

# Dictionary-style lookups keyed by type object or type string.
np.cast[int]
np.cast["i8"]
np.cast[np.int64]

np.nbytes[int]
def _reduce(tf_fn, a, axis=None, dtype=None, keepdims=None,
            promote_int=_TO_INT64, tf_bool_fn=None, preserve_bool=False):
    """Shared implementation of the reduction functions.

    Args:
      tf_fn: the TF reduction function; called with the keyword arguments
        `input_tensor`, `axis` and `keepdims`.
      a: the array to be reduced.
      axis: (optional) the axis along which to do the reduction. If None,
        all dimensions are reduced.
      dtype: (optional) the dtype of the result.
      keepdims: (optional) whether to keep the reduced dimension(s). None is
        treated as False.
      promote_int: how to promote integer and bool inputs when `dtype` is
        not given. There are three choices: (1) _TO_INT64: promote them to
        int64 or uint64; (2) _TO_FLOAT: promote them to
        dtypes.default_float_type(); (3) None: don't promote.
      tf_bool_fn: (optional) the TF reduction function for bool inputs. It
        is only used if `dtype` equals `np.bool_`, or if `a`'s dtype is
        `np.bool_` and `preserve_bool` is True.
      preserve_bool: whether to use `tf_bool_fn` when `a`'s dtype is
        `np.bool_` (some reductions such as np.sum convert bools to
        integers, while others such as np.max preserve bools).

    Returns:
      An ndarray.
    """
    # NOTE(review): this is a truthiness test, not `is not None`; a falsy
    # dtype object would skip the canonicalisation -- confirm intended.
    if dtype:
        dtype = utils.result_type(dtype)
    keepdims = False if keepdims is None else keepdims
    a = array_creation.asarray(a, dtype=dtype)

    # Boolean reductions are delegated to the dedicated bool function.
    wants_bool = dtype == np.bool_ or (preserve_bool and a.dtype == np.bool_)
    if wants_bool and tf_bool_fn is not None:
        bool_result = tf_bool_fn(
            input_tensor=a.data, axis=axis, keepdims=keepdims)
        return utils.tensor_to_ndarray(bool_result)

    # Integer/bool promotion only applies when no dtype was requested.
    if dtype is None:
        dtype = a.dtype
        promotable = np.issubdtype(dtype, np.integer) or dtype == np.bool_
        if promotable and promote_int == _TO_INT64:
            # If a is an integer/bool type whose bit width is less than 64,
            # numpy up-casts it to 64-bit.
            if dtype == np.bool_:
                signed = True
                bits = 8  # Any number less than 64 works here.
            else:
                signed = np.issubdtype(dtype, np.signedinteger)
                bits = np.iinfo(dtype).bits
            if bits < 64:
                dtype = np.int64 if signed else np.uint64
                a = a.astype(dtype)
        elif promotable and promote_int == _TO_FLOAT:
            a = a.astype(dtypes.default_float_type())

    reduced = tf_fn(input_tensor=a.data, axis=axis, keepdims=keepdims)
    return utils.tensor_to_ndarray(reduced)
def iscomplexobj(x):
    # Convert the input to an array, then test whether the NumPy equivalent
    # of its dtype is a complex floating-point type.
    numpy_dtype = np_array_ops.array(x).dtype.as_numpy_dtype
    return np.issubdtype(numpy_dtype, np.complexfloating)
def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn):
    """Run `ml.ops.voxel_pooling` on five points and check its output.

    With a voxel size of 1 the first three points share one voxel and the
    last two share another.  The expected pooled positions and features are
    computed with NumPy according to `position_fn` / `feature_fn` and
    compared with the op's output; the two output voxels may come back in
    either order.
    """
    # yapf: disable
    points = np.array([
        # 3 points in voxel
        [0.5, 0.5, 0.5],
        [0.7, 0.2, 0.3],
        [0.7, 0.5, 0.9],
        # 2 points in another voxel
        [1.4, 1.5, 1.4],
        [1.7, 1.2, 1.3],
    ], dtype=pos_dtype)

    features = np.array([
        # 3 points in voxel
        [1, 1], [2, 1], [3, 1],
        # 2 points in another voxel
        [4, 1], [5, 1],
    ], dtype=feat_dtype)
    # yapf: enable

    first_points, second_points = points[:3], points[3:]
    first_feats, second_feats = features[:3], features[3:]

    voxel_size = 1
    ans = mltest.run_op(ml, ml.device, True, ml.ops.voxel_pooling, points,
                        features, voxel_size, position_fn, feature_fn)

    # Reference positions for the selected position function.
    if position_fn == 'average':
        expected_positions = np.stack(
            [first_points.mean(axis=0), second_points.mean(axis=0)])
    elif position_fn == 'center':
        expected_positions = np.array([[0.5, 0.5, 0.5], [1.5, 1.5, 1.5]],
                                      dtype=pos_dtype)
    elif position_fn == 'nearest_neighbor':
        expected_positions = np.array([points[0], points[3]],
                                      dtype=pos_dtype)

    assert len(ans.pooled_positions) == 2

    # Work out which expected voxel the first returned voxel corresponds to,
    # since the output order is not fixed.
    dist_to_first = np.linalg.norm(ans.pooled_positions[0] -
                                   expected_positions[0])
    dist_to_second = np.linalg.norm(ans.pooled_positions[0] -
                                    expected_positions[1])
    index = [0, 1] if dist_to_first < dist_to_second else [1, 0]

    np.testing.assert_allclose(ans.pooled_positions,
                               expected_positions[index])

    # Reference features for the selected feature function.
    if feature_fn == 'average':
        if np.issubdtype(feat_dtype, np.integer):
            # Integer features are averaged with floor division.
            expected_features = np.stack([
                first_feats.sum(axis=0) // 3,
                second_feats.sum(axis=0) // 2,
            ])
        else:
            expected_features = np.stack(
                [first_feats.mean(axis=0), second_feats.mean(axis=0)])
    elif feature_fn == 'max':
        expected_features = np.stack(
            [first_feats.max(axis=0), second_feats.max(axis=0)])
    elif feature_fn == 'nearest_neighbor':
        expected_features = np.array([features[0], features[3]])

    np.testing.assert_allclose(ans.pooled_features, expected_features[index])
def _daal_check_array(array, accept_sparse=False, *, accept_large_sparse=True,
                      dtype="numeric", order=None, copy=False,
                      force_all_finite=True, ensure_2d=True, allow_nd=False,
                      ensure_min_samples=1, ensure_min_features=1,
                      estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    array_converted : object
        The converted and validated array.

    Raises
    ------
    ValueError
        If ``force_all_finite`` has an unsupported value, if the data is
        complex, if the dimensionality is not acceptable, or if there are
        fewer samples / features than required.
    """
    if force_all_finite not in (True, False, 'allow-nan'):
        raise ValueError('force_all_finite should be a bool or "allow-nan"'
                         '. Got {!r} instead'.format(force_all_finite))

    if estimator is not None:
        if isinstance(estimator, str):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = " by %s" % estimator_name if estimator is not None else ""

    array_orig = array

    # a branch for heterogeneous pandas.DataFrame
    if is_DataFrame(array) and get_number_of_types(array) > 1:
        from pandas.api.types import is_sparse
        if hasattr(array, 'sparse') or \
                not array.dtypes.apply(is_sparse).any():
            return _pandas_check_array(array, array_orig, force_all_finite,
                                       ensure_min_samples,
                                       ensure_min_features, copy, context)

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    has_pd_integer_array = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas.api.types import is_sparse
            if not hasattr(array, 'sparse') and \
                    array.dtypes.apply(is_sparse).any():
                warnings.warn(
                    "pandas.DataFrame with sparse columns found."
                    "It will be converted to a dense numpy array.")

        dtypes_orig = list(array.dtypes)
        # pandas boolean dtype __array__ interface coerces bools to objects
        for i, dtype_iter in enumerate(dtypes_orig):
            if dtype_iter.kind == 'b':
                # Use the builtin `object`: the `np.object` alias was
                # deprecated in NumPy 1.20 and removed in NumPy 1.24.
                dtypes_orig[i] = np.dtype(object)
            elif dtype_iter.name.startswith(("Int", "UInt")):
                # name looks like an Integer Extension Array, now check for
                # the dtype
                with suppress(ImportError):
                    from pandas import (Int8Dtype, Int16Dtype,
                                        Int32Dtype, Int64Dtype,
                                        UInt8Dtype, UInt16Dtype,
                                        UInt32Dtype, UInt64Dtype)
                    if isinstance(dtype_iter, (Int8Dtype, Int16Dtype,
                                               Int32Dtype, Int64Dtype,
                                               UInt8Dtype, UInt16Dtype,
                                               UInt32Dtype, UInt64Dtype)):
                        has_pd_integer_array = True

        if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if has_pd_integer_array:
        # If there are any pandas integer extension arrays,
        array = array.astype(dtype)

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, 'sparse') and array.ndim > 1:
        # DataFrame.sparse only supports `to_coo`
        array = array.sparse.to_coo()

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
                                      dtype=dtype, copy=copy,
                                      force_all_finite=force_all_finite,
                                      accept_large_sparse=accept_large_sparse)
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter('error', ComplexWarning)
                if dtype is not None and np.dtype(dtype).kind in 'iu':
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = np.asarray(array, order=order)
                    if array.dtype.kind == 'f':
                        _daal_assert_all_finite(array, allow_nan=False,
                                                msg_dtype=dtype)
                    array = array.astype(dtype, casting="unsafe", copy=False)
                else:
                    array = np.asarray(array, order=order, dtype=dtype)
            except ComplexWarning:
                raise ValueError("Complex data not supported\n"
                                 "{}\n".format(array))

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)  # doing nothing for DataFrame

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))

        # in the future np.flexible dtypes will be handled like object dtypes
        if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
            warnings.warn(
                "Beginning in version 0.22, arrays of bytes/strings will be "
                "converted to decimal numbers if dtype='numeric'. "
                "It is recommended that you convert the array to "
                "a float dtype before using it in scikit-learn, "
                "for example by using "
                "your_array = your_array.astype(np.float64).",
                FutureWarning, stacklevel=2)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2."
                             % (array.ndim, estimator_name))

        if force_all_finite:
            _daal_assert_all_finite(array,
                                    allow_nan=force_all_finite == 'allow-nan')

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                "Found array with %d sample(s) (shape=%s) while a"
                " minimum of %d is required%s."
                % (n_samples, array.shape, ensure_min_samples, context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                "Found array with %d feature(s) (shape=%s) while"
                " a minimum of %d is required%s."
                % (n_features, array.shape, ensure_min_features, context))

    # Honour `copy=True` only when no earlier conversion already produced
    # fresh memory.
    if copy and np.may_share_memory(array, array_orig):
        array = np.array(array, dtype=dtype, order=order)

    return array
def assert_column_equal(
    left,
    right,
    check_dtype=True,
    check_column_type="equiv",
    check_less_precise=False,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    obj="ColumnBase",
):
    """
    Check that left and right columns are equal

    This function is intended to compare two columns and output
    any differences. Additional parameters allow varying the strictness
    of the equality checks performed.

    Parameters
    ----------
    left : Column
        left Column to compare
    right : Column
        right Column to compare
    check_dtype : bool, default True
        Whether to check the Column dtype is identical.
    check_column_type : bool or {'equiv'}, default 'equiv'
        Whether to check the columns class, dtype and
        inferred_type are identical. Currently it is idle,
        and similar to pandas.
    check_less_precise : bool or int, default False
        Not yet supported
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals
    obj : str, default 'ColumnBase'
        Specify object name being compared, internally used to
        show appropriate assertion message.

    Raises
    ------
    AssertionError
        Raised directly when ``check_datetimelike_compat`` is set and the
        datetime columns differ; all other mismatches are reported through
        ``raise_assert_detail``.
    """
    both_categorical = is_categorical_dtype(left) and is_categorical_dtype(
        right
    )

    if check_dtype is True:
        # Categorical pairs are exempt from the dtype check when the caller
        # asked not to compare categoricals.
        if not (both_categorical and not check_categorical):
            if type(left) != type(right) or left.dtype != right.dtype:
                msg1 = f"{left.dtype}"
                msg2 = f"{right.dtype}"
                raise_assert_detail(obj, "Dtypes are different", msg1, msg2)

    if check_datetimelike_compat:
        # Cast the non-datetime side to the datetime dtype of the other side.
        if np.issubdtype(left.dtype, np.datetime64):
            right = right.astype(left.dtype)
        elif np.issubdtype(right.dtype, np.datetime64):
            left = left.astype(right.dtype)

        if np.issubdtype(left.dtype, np.datetime64):
            if not left.equals(right):
                raise AssertionError(
                    f"[datetimelike_compat=True] {left.values} "
                    f"is not equal to {right.values}."
                )
            return

    if check_exact and check_categorical:
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left_cat = left.cat().categories
            right_cat = right.cat().categories

            if check_category_order:
                assert_index_equal(
                    left_cat,
                    right_cat,
                    exact=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                )
                assert_column_equal(
                    left.codes,
                    right.codes,
                    check_dtype=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                    check_category_order=False,
                )

            if left.ordered != right.ordered:
                msg1 = f"{left.ordered}"
                msg2 = f"{right.ordered}"
                # f-string so the object name is interpolated; previously the
                # literal text "{obj} category" was reported.
                raise_assert_detail(
                    f"{obj} category", "Orders are different", msg1, msg2
                )

    if (
        not check_dtype
        and is_categorical_dtype(left)
        and is_categorical_dtype(right)
    ):
        # Compare the underlying category values when dtypes may differ.
        left = left.astype(left.categories.dtype)
        right = right.astype(right.categories.dtype)

    columns_equal = False
    try:
        columns_equal = left.equals(right)
    except TypeError as e:
        # Only the "incomparable categoricals" error is treated as
        # "not equal"; anything else is a genuine failure.
        if str(e) != "Categoricals can only compare with the same type":
            raise
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left = left.astype(left.categories.dtype)
            right = right.astype(right.categories.dtype)

    if not columns_equal:
        msg1 = f"{left.to_array()}"
        msg2 = f"{right.to_array()}"
        try:
            diff = left.apply_boolean_mask(left != right).size
            diff = diff * 100.0 / left.size
        except Exception:
            # Best effort: if the percentage cannot be computed, report a
            # full mismatch.  KeyboardInterrupt/SystemExit are not swallowed.
            diff = 100.0
        raise_assert_detail(
            obj,
            f"values are different ({np.round(diff, 5)} %)",
            msg1,
            msg2,
        )
'age_smoke': 'int64', 'fagerstromtotal': 'int64', 'weekday': 'object', 'hour_of_day': 'int64', 'part_of_day_afternoon': 'object', 'part_of_day_evening': 'object', 'part_of_day_morning': 'object', 'part_of_day_night': 'object', 'episode_type': 'object' } X = X.astype(convert_dict) # standardise all features in X: scaler = StandardScaler() num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))] X[num_cols] = scaler.fit_transform(X[num_cols]) del X['episode_type'] y = final_df.loc[:, final_df.columns == 'episode_type'] y = y.astype('int') # create training and testing set of data split 70/30: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) columns = X_train.columns
def average(a, axis=None, weights=None, returned=False):  # pylint: disable=missing-docstring
    # Computes the (optionally weighted) average of `a` along `axis`.  When
    # `returned` is true the sum of weights is returned alongside the
    # average.
    axis_supported = axis is None or isinstance(axis, six.integer_types)
    if not axis_supported:
        # TODO(wangpeng): Support tuple of ints as `axis`
        raise ValueError('`axis` must be an integer. Tuple of ints is not '
                         'supported yet. Got type: %s' % type(axis))

    a = np_array_ops.array(a)
    input_is_inexact = np.issubdtype(a.dtype.as_numpy_dtype, np.inexact)

    if weights is None:  # Treat all weights as 1
        if not input_is_inexact:
            # Non-float inputs are averaged in the default float type.
            float_dtype = np_utils.result_type(
                a.dtype, np_dtypes.default_float_type())
            a = a.astype(float_dtype)
        avg = math_ops.reduce_mean(a, axis=axis)
        if returned:
            # With unit weights the weight sum is the number of elements
            # that were averaged.
            if axis is None:
                weights_sum = array_ops.size(a)
            else:
                weights_sum = array_ops.shape(a)[axis]
            weights_sum = math_ops.cast(weights_sum, a.dtype)
    else:
        if input_is_inexact:
            out_dtype = np_utils.result_type(a.dtype, weights)
        else:
            out_dtype = np_utils.result_type(
                a.dtype, weights, np_dtypes.default_float_type())
        a = np_array_ops.array(a, out_dtype)
        weights = np_array_ops.array(weights, out_dtype)

        def rank_equal_case():
            # `weights` has the same rank as `a`: shapes must match exactly.
            control_flow_ops.Assert(
                math_ops.reduce_all(
                    array_ops.shape(a) == array_ops.shape(weights)),
                [array_ops.shape(a), array_ops.shape(weights)])
            total_weight = math_ops.reduce_sum(weights, axis=axis)
            weighted_avg = (
                math_ops.reduce_sum(a * weights, axis=axis) / total_weight)
            return weighted_avg, total_weight

        def rank_not_equal_case():
            # Otherwise `weights` must be 1-D and is contracted against
            # `axis` of `a`.
            control_flow_ops.Assert(
                array_ops.rank(weights) == 1, [array_ops.rank(weights)])
            total_weight = math_ops.reduce_sum(weights)
            contraction_axes = ops.convert_to_tensor([[axis], [0]])
            weighted_avg = (
                math_ops.tensordot(a, weights, contraction_axes) /
                total_weight)
            return weighted_avg, total_weight

        if axis is None:
            avg, weights_sum = rank_equal_case()
        else:
            # We condition on rank rather than shape equality, because if we
            # do the latter, when the shapes are partially unknown but the
            # ranks are known and different, np_utils.cond will run shape
            # checking on the true branch, which will raise a shape-checking
            # error.
            ranks_match = math_ops.equal(
                array_ops.rank(a), array_ops.rank(weights))
            avg, weights_sum = np_utils.cond(
                ranks_match, rank_equal_case, rank_not_equal_case)

    avg = np_array_ops.array(avg)
    if not returned:
        return avg
    weights_sum = np_array_ops.broadcast_to(weights_sum,
                                            array_ops.shape(avg))
    return avg, weights_sum
def test_frame_add_datetime64_column(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') df = DataFrame(index=np.arange(len(rng))) df['A'] = rng assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
def tw_matrices_to_lists(doc_word, doc_tag):
    """
    Modification of matrix_to_lists from lda.utils into its tag-word analogue

    Convert (sparse) matrices of counts into arrays of tagwords and doc indices

    For every document, each (word, tag) combination of that document is
    emitted once per occurrence of the word (the count stored in `doc_word`).

    Parameters
    ----------
    doc_word : array or sparse matrix (D, W)
    doc_tag : array or sparse matrix (D, T)

    Returns
    -------
    (TS, WS, DS) : Tuple of three arrays
        TS[k] contains the tag in the kth tag-word in the corpus
        WS[k] contains the word in the kth tag-word in the corpus
        DS[k] contains the document index for the kth tag-word

    Raises
    ------
    ValueError
        If a sparse input matrix does not have an integer dtype.
    """
    if np.count_nonzero(doc_word.sum(axis=1)) != doc_word.shape[0]:
        logger.warning("all zero row in document-word matrix found")
    if np.count_nonzero(doc_word.sum(axis=0)) != doc_word.shape[1]:
        logger.warning("all zero column in document-word matrix found")
    if np.count_nonzero(doc_tag.sum(axis=1)) != doc_tag.shape[0]:
        logger.warning("all zero row in document-tag matrix found")
    if np.count_nonzero(doc_tag.sum(axis=0)) != doc_tag.shape[1]:
        logger.warning("all zero column in document-tag matrix found")

    # Inputs exposing `.tolil()` are treated as scipy sparse matrices and are
    # copied into LIL format; anything else is used as a dense array.
    dw_sparse = True
    try:
        doc_word = doc_word.copy().tolil()
    except AttributeError:
        dw_sparse = False

    dt_sparse = True
    try:
        doc_tag = doc_tag.copy().tolil()
    except AttributeError:
        dt_sparse = False

    # Compare against the abstract `np.integer` rather than the builtin
    # `int`: current NumPy resolves `int` to one concrete integer type, so
    # other integer widths (e.g. int32) would be rejected as non-integer.
    dw_bad = dw_sparse and not np.issubdtype(doc_word.dtype, np.integer)
    dt_bad = dt_sparse and not np.issubdtype(doc_tag.dtype, np.integer)
    if dw_bad or dt_bad:
        raise ValueError(
            "expected sparse matrix with integer values, found float values")

    # Obtain doc id + word/tag id lists for the nonzero entries of doc_word
    # and doc_tag, plus the word counts.
    dw_doc_i, dw_word_i = np.nonzero(doc_word)
    if dw_sparse:
        dw_counts_i = np.array(
            [doc_word[i, j] for i, j in zip(dw_doc_i, dw_word_i)])
    else:
        dw_counts_i = doc_word[dw_doc_i, dw_word_i]
    dt_doc_i, dt_tag_i = np.nonzero(doc_tag)

    # Group the words and tags by doc id; each iterator yields
    # (doc_id, iterator over that document's entries).
    dw_word_gb_doc = it.groupby(zip(dw_doc_i, dw_word_i, dw_counts_i),
                                lambda entry: entry[0])
    dt_tag_gb_doc = it.groupby(zip(dt_doc_i, dt_tag_i),
                               lambda entry: entry[0])

    # NOTE(review): the two group sequences are paired positionally by
    # `zip`, so a document missing from one of them (an all-zero row, which
    # is only warned about above) would pair groups of different documents
    # -- confirm callers never pass such input.
    word_tag_pairs = it.chain.from_iterable(
        it.product(doc_words[1], doc_tags[1])
        for doc_words, doc_tags in zip(dw_word_gb_doc, dt_tag_gb_doc)
    )

    # Emit (doc_id, word_id, tag_id) once per occurrence of the word.
    doc_tagword_iter = it.chain.from_iterable(
        it.repeat((word[0], word[1], tag[1]), word[2])
        for word, tag in word_tag_pairs
    )

    # doc-tagword array
    DTWS = np.array(list(doc_tagword_iter))

    # return TS, WS, DS
    return DTWS[:, 2], DTWS[:, 1], DTWS[:, 0]
def specshow(data, x_coords=None, y_coords=None,
             x_axis=None, y_axis=None,
             sr=22050, hop_length=512,
             fmin=None, fmax=None,
             bins_per_octave=12,
             **kwargs):
    '''Display a spectrogram/chromagram/cqt/etc.


    Parameters
    ----------
    data : np.ndarray [shape=(d, n)]
        Matrix to display (e.g., spectrogram).  Complex-valued input
        triggers a warning and its magnitude is displayed instead.

    sr : number > 0 [scalar]
        Sample rate used to determine time scale in x-axis.

    hop_length : int > 0 [scalar]
        Hop length, also used to determine time scale in x-axis

    x_axis : None or str

    y_axis : None or str
        Range for the x- and y-axes.

        Valid types are:

        - None, 'none', or 'off' : no axis decoration is displayed.

        Frequency types:

        - 'linear', 'fft', 'hz' : frequency range is determined by
          the FFT window and sampling rate.
        - 'log' : the spectrum is displayed on a log scale.
        - 'mel' : frequencies are determined by the mel scale.
        - 'cqt_hz' : frequencies are determined by the CQT scale.
        - 'cqt_note' : pitches are determined by the CQT scale.

        All frequency types are plotted in units of Hz.

        Categorical types:

        - 'chroma' : pitches are determined by the chroma filters.
          Pitch classes are arranged at integer locations (0-11).

        - 'tonnetz' : axes are labeled by Tonnetz dimensions (0-5)
        - 'frames' : markers are shown as frame counts.


        Time types:

        - 'time' : markers are shown as milliseconds, seconds,
          minutes, or hours
        - 'lag' : like time, but past the half-way point counts
          as negative values.

        All time types are plotted in units of seconds.

        Other:

        - 'tempo' : markers are shown as beats-per-minute (BPM)
          using a logarithmic scale.

    x_coords : np.ndarray [shape=data.shape[1]+1]
    y_coords : np.ndarray [shape=data.shape[0]+1]

        Optional positioning coordinates of the input data.
        These can be use to explicitly set the location of each
        element `data[i, j]`, e.g., for displaying beat-synchronous
        features in natural time coordinates.

        If not provided, they are inferred from `x_axis` and `y_axis`.

    fmin : float > 0 [scalar] or None
        Frequency of the lowest spectrogram bin.  Used for Mel and CQT
        scales.

        If `y_axis` is `cqt_hz` or `cqt_note` and `fmin` is not given,
        it is set by default to `note_to_hz('C1')`.

    fmax : float > 0 [scalar] or None
        Used for setting the Mel frequency scales

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave.  Used for CQT frequency scale.

    kwargs : additional keyword arguments
        Arguments passed through to `matplotlib.pyplot.pcolormesh`.
        `shading` defaults to 'flat' and `cmap` to the result of
        `cmap(data)` unless given explicitly.


    Returns
    -------
    axes
        The axis handle for the figure.


    See Also
    --------
    cmap : Automatic colormap detection

    matplotlib.pyplot.pcolormesh


    Examples
    --------
    Visualize an STFT power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> plt.figure(figsize=(12, 8))

    >>> D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
    >>> plt.subplot(4, 2, 1)
    >>> librosa.display.specshow(D, y_axis='linear')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Linear-frequency power spectrogram')


    Or on a logarithmic scale

    >>> plt.subplot(4, 2, 2)
    >>> librosa.display.specshow(D, y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Log-frequency power spectrogram')


    Or use a CQT scale

    >>> CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=sr), ref=np.max)
    >>> plt.subplot(4, 2, 3)
    >>> librosa.display.specshow(CQT, y_axis='cqt_note')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Constant-Q power spectrogram (note)')

    >>> plt.subplot(4, 2, 4)
    >>> librosa.display.specshow(CQT, y_axis='cqt_hz')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Constant-Q power spectrogram (Hz)')


    Draw a chromagram with pitch classes

    >>> C = librosa.feature.chroma_cqt(y=y, sr=sr)
    >>> plt.subplot(4, 2, 5)
    >>> librosa.display.specshow(C, y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Chromagram')


    Force a grayscale colormap (white -> black)

    >>> plt.subplot(4, 2, 6)
    >>> librosa.display.specshow(D, cmap='gray_r', y_axis='linear')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Linear power spectrogram (grayscale)')


    Draw time markers automatically

    >>> plt.subplot(4, 2, 7)
    >>> librosa.display.specshow(D, x_axis='time', y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Log power spectrogram')


    Draw a tempogram with BPM markers

    >>> plt.subplot(4, 2, 8)
    >>> Tgram = librosa.feature.tempogram(y=y, sr=sr)
    >>> librosa.display.specshow(Tgram, x_axis='time', y_axis='tempo')
    >>> plt.colorbar()
    >>> plt.title('Tempogram')
    >>> plt.tight_layout()


    Draw beat-synchronous chroma in natural time

    >>> plt.figure()
    >>> tempo, beat_f = librosa.beat.beat_track(y=y, sr=sr, trim=False)
    >>> beat_f = librosa.util.fix_frames(beat_f, x_max=C.shape[1])
    >>> Csync = librosa.util.sync(C, beat_f, aggregate=np.median)
    >>> beat_t = librosa.frames_to_time(beat_f, sr=sr)
    >>> ax1 = plt.subplot(2,1,1)
    >>> librosa.display.specshow(C, y_axis='chroma', x_axis='time')
    >>> plt.title('Chroma (linear time)')
    >>> ax2 = plt.subplot(2,1,2, sharex=ax1)
    >>> librosa.display.specshow(Csync, y_axis='chroma', x_axis='time',
    ...                          x_coords=beat_t)
    >>> plt.title('Chroma (beat time)')
    >>> plt.tight_layout()
    '''

    kwargs.setdefault('shading', 'flat')

    # Test against the abstract `np.complexfloating` type: the `np.complex`
    # alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
    if np.issubdtype(data.dtype, np.complexfloating):
        warnings.warn('Trying to display complex-valued input. '
                      'Showing magnitude instead.')
        data = np.abs(data)

    # The default colormap is chosen from the (possibly magnitude) data.
    kwargs.setdefault('cmap', cmap(data))

    all_params = dict(kwargs=kwargs,
                      sr=sr,
                      fmin=fmin,
                      fmax=fmax,
                      bins_per_octave=bins_per_octave,
                      hop_length=hop_length)

    # Get the x and y coordinates
    y_coords = __mesh_coords(y_axis, y_coords, data.shape[0], **all_params)
    x_coords = __mesh_coords(x_axis, x_coords, data.shape[1], **all_params)

    axes = plt.gca()
    out = axes.pcolormesh(x_coords, y_coords, data, **kwargs)
    plt.sci(out)

    axes.set_xlim(x_coords.min(), x_coords.max())
    axes.set_ylim(y_coords.min(), y_coords.max())

    # Set up axis scaling
    __scale_axes(axes, x_axis, 'x')
    __scale_axes(axes, y_axis, 'y')

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)
    __decorate_axis(axes.yaxis, y_axis)

    return axes
def pad_sequences(sequences, maxlen=None, dtype='int32', padding='pre',
                  truncating='pre', value=0., eos=1.):
    """ Pads sequences to the same length.

    This function transforms a list of
    `num_samples` sequences (lists of integers)
    into a 2D Numpy array of shape `(num_samples, num_timesteps + 1)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence otherwise.  The extra column
    leaves room for the end-of-sentence marker `eos`.

    Shorter sequences are padded with `value`; sequences longer than
    `num_timesteps` are truncated so that they fit the desired length.
    The position where padding or truncation happens is determined by
    the arguments `padding` and `truncating`, respectively.
    Pre-padding is the default.

    # Arguments
        sequences: List of lists, where each element is a sequence.
        maxlen: Int, maximum length of all sequences.
        dtype: Type of the output sequences.
            To pad sequences with variable length strings, you can use
            `object`.
        padding: String, 'pre' or 'post':
            pad either before or after each sequence.
        truncating: String, 'pre' or 'post':
            remove values from sequences larger than
            `maxlen`, either at the beginning or at the end of the
            sequences.
        value: Float or String, padding value.
        eos: end of sentence index to end each sentence.  It is appended
            (as a float) only when `truncating == 'post'`, which requires
            the sequences to be lists.

    # Returns
        x: Numpy array with shape `(len(sequences), maxlen + 1)`, followed
            by the per-timestep sample shape if the samples are arrays.

    # Raises
        ValueError: In case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    # `np.str_` covers unicode strings; the former extra check against the
    # `np.unicode_` alias was redundant and fails on NumPy 2.0, where the
    # alias no longer exists.
    is_dtype_str = np.issubdtype(dtype, np.str_)
    # On Python 3 `six.string_types` is just `(str,)`.
    if isinstance(value, str) and dtype != object and not is_dtype_str:
        raise ValueError(
            "`dtype` {} is not compatible with `value`'s type: {}\n"
            "You should set `dtype=object` for variable length strings.".
            format(dtype, type(value)))

    # One extra timestep is reserved for the end-of-sentence marker.
    x = np.full((num_samples, maxlen + 1) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen] + [float(eos)]
        else:
            raise ValueError('Truncating type "%s" '
                             'not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s '
                             'is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
def infer_problem_type(y: Series, silent=False) -> str:
    """Infer the type of prediction problem from the label column.

    The result is one of the module constants BINARY, MULTICLASS or
    REGRESSION, chosen from the number of unique labels and the label dtype.

    Parameters
    ----------
    y : Series
        Label values. Missing values are dropped before inspection.
    silent : bool, default False
        If False, the inferred type, the reason for it and a short summary of
        the labels are written to the module logger.

    Returns
    -------
    str
        The inferred problem type.

    Raises
    ------
    ValueError
        If `y` is empty.
    NotImplementedError
        If the label dtype is not object/category, floating or integer
        (and the labels do not have exactly two unique values).
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    # Remove missing values from y (there should not be any though as they
    # were removed in Learner.general_data_processing())
    # NOTE(review): if every label is missing, num_rows becomes 0 and the
    # numeric branches below divide by zero - confirm callers prevent this.
    y = y.dropna()
    num_rows = len(y)

    unique_values = y.unique()
    unique_count = len(unique_values)

    # if numeric and class count would be above this amount, assume it is regression
    MULTICLASS_LIMIT = 1000
    # if the unique-ratio is less than this, we assume multiclass
    # classification, even when labels are integers
    REGRESS_THRESHOLD = 0.05 if num_rows > 1000 else 0.1

    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
            except Exception:
                # Best effort: any failure to cast (e.g. infinite values)
                # means the labels are not integer-valued. Unlike a bare
                # `except:`, this lets KeyboardInterrupt/SystemExit through.
                can_convert_to_int = False
            if can_convert_to_int:
                problem_type = MULTICLASS
                reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            # TODO: Check if integers are from 0 to n-1 for n unique values,
            # if they have a wide spread, it could still be regression
            problem_type = MULTICLASS
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')

    if not silent:
        logger.log(
            25,
            f"AutoGluon infers your prediction problem is: '{problem_type}' (because {reason})."
        )

        # TODO: Move this outside of this function so it is visible even if
        # problem type was not inferred.
        if problem_type in [BINARY, MULTICLASS]:
            if unique_count > 10:
                logger.log(
                    20,
                    f'\tFirst 10 (of {unique_count}) unique label values:  {list(unique_values[:10])}'
                )
            else:
                logger.log(
                    20,
                    f'\t{unique_count} unique label values:  {list(unique_values)}'
                )
        elif problem_type == REGRESSION:
            y_max = y.max()
            y_min = y.min()
            y_mean = y.mean()
            y_stddev = y.std()
            logger.log(
                20,
                f'\tLabel info (max, min, mean, stddev): ({y_max}, {y_min}, {round(y_mean, 5)}, {round(y_stddev, 5)})'
            )

        logger.log(
            25,
            f"\tIf '{problem_type}' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})"
        )
    return problem_type
def test_frame_ctor_datetime64_column(self): rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s') dates = np.asarray(rng) df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates}) assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
def check_array(array,
                accept_sparse=False,
                accept_large_sparse=True,
                dtype="numeric",
                order=None,
                copy=False,
                force_all_finite=True,
                ensure_2d=True,
                allow_nd=False,
                ensure_min_samples=1,
                ensure_min_features=1,
                estimator=None) -> Tensor:
    """Input validation on a tensor, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the tensor is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether a tensor will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output tensor; otherwise (copy=True)
        the memory layout of the returned tensor is kept as close as possible
        to the original tensor.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in tensor. The
        possibilities are:

        - True: Force all values of tensor to be finite.
        - False: accept both np.inf and np.nan in tensor.
        - 'allow-nan': accept only np.nan values in tensor. Values cannot
          be infinite.

        For object dtyped data, only np.nan is checked and not np.inf.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if tensor is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow tensor.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the tensor has a minimum number of samples in its first
        axis (rows for a 2D tensor). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D tensor has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    array_converted : object
        The converted and validated tensor.

    Raises
    ------
    ValueError
        If ``force_all_finite`` has an unsupported value, if complex data is
        detected during conversion, or if the dimensionality, sample-count
        or feature-count checks fail.
    """

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # Resolve "numeric" into a concrete target dtype (or None to preserve).
    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    # Resolve a list/tuple of accepted dtypes into a single target dtype.
    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if force_all_finite not in (True, False, 'allow-nan'):
        raise ValueError('force_all_finite should be a bool or "allow-nan"'
                         f'. Got {force_all_finite!r} instead')

    # Name used in error messages; falls back to a generic label.
    if estimator is not None:
        if isinstance(estimator, str):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    # Suffix for the min-samples / min-features messages; empty when no
    # estimator was passed.
    context = f" by {estimator_name}" if estimator is not None else ""

    # Sparse input is detected either through the object's own `issparse()`
    # method or through the module-level `issparse` helper.
    if (hasattr(array, 'issparse') and array.issparse()) or issparse(array):
        _ensure_no_complex_data(array)
        array = mt.asarray(array)
        array = _ensure_sparse_format(array,
                                      accept_sparse=accept_sparse,
                                      dtype=dtype,
                                      copy=copy,
                                      force_all_finite=force_all_finite,
                                      accept_large_sparse=accept_large_sparse)
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter('error', ComplexWarning)
                array = mt.asarray(array, dtype=dtype, order=order)
            except ComplexWarning:
                raise ValueError(f"Complex data not supported\n{array}\n")

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    f"Expected 2D array, got scalar array instead:\narray={array}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.")
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    f"Expected 2D array, got 1D array instead:\narray={array}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.")

        # in the future np.flexible dtypes will be handled like object dtypes
        if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
            warnings.warn(
                "Beginning in version 0.22, arrays of bytes/strings will be "
                "converted to decimal numbers if dtype='numeric'. "
                "It is recommended that you convert the array to "
                "a float dtype before using it in scikit-learn, "
                "for example by using "
                "your_array = your_array.astype(np.float64).", FutureWarning)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2." %
                             (array.ndim, estimator_name))
        if force_all_finite:
            # The helper's return value replaces `array`; `allow_nan` is True
            # only for the 'allow-nan' setting.
            array = _assert_all_finite(
                array,
                allow_nan=force_all_finite == 'allow-nan',
                check_only=False)

    # Size checks apply to both the sparse and the dense path.
    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required%s." %
                             (n_samples, array.shape, ensure_min_samples,
                              context))

    # The feature-count check is only meaningful for 2D input.
    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required%s." %
                             (n_features, array.shape, ensure_min_features,
                              context))

    # Forced copy requested by the caller.
    if copy:
        array = mt.array(array, dtype=dtype, order=order)

    return array
def rdpg_corr(X, Y, r, rescale=False, directed=False, loops=False):
    r"""
    Samples a random graph pair based on the latent positions in X (and
    optionally in Y)

    If only X :math:`\in\mathbb{R}^{n\times d}` is given, the P matrix is calculated as
    :math:`P = XX^T`. If X, Y :math:`\in\mathbb{R}^{n\times d}` is given, then
    :math:`P = XY^T`. These operations correspond to the dot products between a set of
    latent positions, so each row in X or Y represents the latent positions in
    :math:`\mathbb{R}^{d}` for a single vertex in the random graph.
    Note that this function may also rescale or clip the resulting P
    matrix to get probabilities between 0 and 1, or remove loops.
    A binary random graph is then sampled from the P matrix described
    by X (and possibly Y).

    Read more in the :ref:`tutorials <simulations_tutorials>`

    Parameters
    ----------
    X: np.ndarray, shape (n_vertices, n_dimensions)
        latent position from which to generate a P matrix
        if Y is given, interpreted as the left latent position

    Y: np.ndarray, shape (n_vertices, n_dimensions) or None
        right latent position from which to generate a P matrix.
        If None, X is used for both sides.

    r: float
        The value of the correlation between the same vertices in two graphs.
        Must lie in [-1, 1].

    rescale: boolean, optional (default=False)
        when rescale is True, will subtract the minimum value in
        P (if it is below 0) and divide by the maximum (if it is
        above 1) to ensure that P has entries between 0 and 1. If
        False, elements of P outside of [0, 1] will be clipped.

    directed: boolean, optional (default=False)
        If False, output adjacency matrix will be symmetric. Otherwise, output adjacency
        matrix will be asymmetric.

    loops: boolean, optional (default=False)
        If False, no edges will be sampled in the diagonal. Diagonal elements in P
        matrix are removed prior to rescaling (see above) which may affect behavior.
        Otherwise, edges are sampled in the diagonal.

    Returns
    -------
    G1: ndarray (n_vertices, n_vertices)
        First graph of the sampled pair, as returned by ``sample_edges_corr``.

    G2: ndarray (n_vertices, n_vertices)
        Second graph of the sampled pair, as returned by ``sample_edges_corr``.

    Raises
    ------
    TypeError
        If ``r`` is not a float, ``directed`` or ``loops`` is not a bool, or
        (when Y is given) X or Y is not a ``numpy.ndarray``.
    ValueError
        If ``r`` is outside [-1, 1], or (when Y is given) X or Y is not
        2-dimensional or their shapes differ.

    References
    ----------
    .. [1] Vince Lyzinski, Donniell E. Fishkind, Carey E. Priebe.
       "Seeded graph matching for correlated Erdös-Rényi graphs".
       The Journal of Machine Learning Research, January 2014

    Examples
    --------
    >>> np.random.seed(1234)
    >>> X = np.random.dirichlet([1, 1], size=5)
    >>> Y = None

    Generate random latent positions using 2-dimensional Dirichlet distribution.
    Then sample a correlated RDPG graph pair:

    >>> rdpg_corr(X, Y, 0.3, rescale=False, directed=False, loops=False)
    (array([[0., 1., 0., 1., 0.],
           [1., 0., 0., 1., 1.],
           [0., 0., 0., 0., 0.],
           [1., 1., 0., 0., 0.],
           [0., 1., 0., 0., 0.]]), array([[0., 1., 0., 1., 0.],
           [1., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0.],
           [1., 0., 0., 0., 0.],
           [0., 1., 0., 0., 0.]]))
    """
    # check r
    if not np.issubdtype(type(r), np.floating):
        raise TypeError("r is not of type float.")
    elif r < -1 or r > 1:
        msg = "r must between -1 and 1."
        raise ValueError(msg)

    # check directed and loops
    if not isinstance(directed, bool):
        raise TypeError("directed is not of type bool.")
    if not isinstance(loops, bool):
        raise TypeError("loops is not of type bool.")

    # check dimensions of X and Y
    # `Y != None` would be evaluated element-wise for an ndarray and make the
    # `if` raise "truth value of an array is ambiguous"; use identity instead.
    if Y is not None:
        if type(X) is not np.ndarray or type(Y) is not np.ndarray:
            raise TypeError("Latent positions must be numpy.ndarray")
        if X.ndim != 2 or Y.ndim != 2:
            raise ValueError(
                "Latent positions must have dimension 2 (n_vertices, n_dimensions)"
            )
        if X.shape != Y.shape:
            raise ValueError(
                "Dimensions of latent positions X and Y must be the same")
    else:
        Y = X

    P = p_from_latent(X, Y, rescale=rescale, loops=loops)
    n = P.shape[0]
    # Constant correlation matrix: every vertex pair shares the same r.
    R = np.full((n, n), r)
    G1, G2 = sample_edges_corr(P, R, directed=directed, loops=loops)
    return G1, G2
def fit(self, df):
    """Main fit method for SAR.

    Builds the user affinity matrix, the item co-occurrence matrix and,
    from the latter, the item similarity matrix selected by
    ``self.similarity_type``. Results are stored on the instance
    (``user_affinity``, ``item_frequencies``, ``item_similarity`` and, when
    ``self.normalize`` is set, ``unity_user_affinity``).

    Args:
        df (pd.DataFrame): User item rating dataframe

    Raises:
        TypeError: If the rating column is not of a numeric dtype.
        ValueError: If ``self.similarity_type`` is not one of the supported
            similarity constants.
    """
    # generate continuous indices if this hasn't been done
    if self.index2item is None:
        self.set_index(df)

    logger.info("Collecting user affinity matrix")
    rating_dtype = df[self.col_rating].dtype
    if not np.issubdtype(rating_dtype, np.number):
        raise TypeError("Rating column data type must be numeric")

    # copy the DataFrame to avoid modification of the input
    select_columns = [self.col_user, self.col_item, self.col_rating]
    if self.time_decay_flag:
        select_columns += [self.col_timestamp]
    temp_df = df[select_columns].copy()

    if self.time_decay_flag:
        logger.info("Calculating time-decayed affinities")
        temp_df = self.compute_time_decay(df=temp_df,
                                          decay_column=self.col_rating)
    else:
        # without time decay use the latest user-item rating in the dataset
        # as the affinity score
        logger.info("De-duplicating the user-item counts")
        temp_df = temp_df.drop_duplicates([self.col_user, self.col_item],
                                          keep="last")

    logger.info("Creating index columns")
    # add mapping of user and item ids to indices; unknown ids map to NaN.
    # `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
    temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
        lambda item: self.item2index.get(item, np.nan))
    temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
        lambda user: self.user2index.get(user, np.nan))

    if self.normalize:
        logger.info("Calculating normalization factors")
        temp_df[self.col_unity_rating] = 1.0
        if self.time_decay_flag:
            temp_df = self.compute_time_decay(
                df=temp_df, decay_column=self.col_unity_rating)
        self.unity_user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_unity_rating)

    # affinity matrix
    logger.info("Building user affinity sparse matrix")
    self.user_affinity = self.compute_affinity_matrix(
        df=temp_df, rating_col=self.col_rating)

    # calculate item co-occurrence
    logger.info("Calculating item co-occurrence")
    item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

    # free up some space
    del temp_df

    self.item_frequencies = item_cooccurrence.diagonal()

    logger.info("Calculating item similarity")
    # Compare by equality, not identity: an equal value that is a distinct
    # object (e.g. read from configuration) must select the same branch.
    if self.similarity_type == COOCCUR:
        logger.info("Using co-occurrence based similarity")
        self.item_similarity = item_cooccurrence
    elif self.similarity_type == JACCARD:
        logger.info("Using jaccard based similarity")
        self.item_similarity = jaccard(item_cooccurrence).astype(rating_dtype)
    elif self.similarity_type == LIFT:
        logger.info("Using lift based similarity")
        self.item_similarity = lift(item_cooccurrence).astype(rating_dtype)
    else:
        raise ValueError("Unknown similarity type: {}".format(
            self.similarity_type))

    # free up some space
    del item_cooccurrence

    logger.info("Done training")
def get_attr_info(self, variable=None, flag=False):
    """
    Collect ARM quality control definitions stored in the ARM standard
    ``<bit|flag>_#_description`` / ``_assessment`` / ``_comment`` attributes
    and return them as a dictionary. When `flag` is not set, the method
    tries to work out from the available attributes whether the QC is
    integer or bit packed.

    Parameters
    ----------
    variable : str
        Variable name to get attribute information. If set to None
        will get global attributes.
    flag : bool
        Optional flag indicating if QC is expected to be bitpacked
        or integer. Flag = True indicates integer QC. Default
        is bitpacked or False.

    Returns
    -------
    attributes dictionary : dict or None
        A dictionary containing the attribute information converted from
        ARM QC to CF QC. Keys are 'flag_meanings', 'flag_values',
        'flag_masks', 'flag_assessments', 'flag_tests', 'flag_comments'
        and 'arm_attributes'. Returns None if no matching attributes are
        found or the variable lookup raises KeyError.

    """
    # Work out which attribute family to parse: 'bit' or 'flag'.
    kind = 'bit'
    if flag:
        kind = 'flag'
    else:
        resolved = False
        # Global comment attributes take precedence, bit before flag.
        for global_att, candidate in (('qc_bit_comment', 'bit'),
                                      ('qc_flag_comment', 'flag')):
            try:
                if self._obj.attrs[global_att]:
                    kind = candidate
                    resolved = True
                    break
            except KeyError:
                continue

        # Fall back to the variable's own 'flag_method' attribute.
        if not resolved and len(self.matched_qc_variables) > 0:
            try:
                if self._obj[variable].attrs['flag_method'] == 'integer':
                    kind = 'flag'
                # NOTE(review): the attribute is removed whether or not it
                # equals 'integer' - confirm this is the intended scope.
                del self._obj[variable].attrs['flag_method']
            except KeyError:
                pass

    # Global attributes carry an extra 'qc_' prefix in their names.
    stem = kind if variable else 'qc_' + kind
    try:
        attributes = self._obj[variable].attrs if variable else self._obj.attrs
    except KeyError:
        return None

    description_bits, meanings = [], []
    assessment_bits, assessments = [], []
    comment_bits, comments = [], []
    arm_attributes = []

    # (pattern, list receiving the bit number, list receiving the value)
    collectors = (
        (r"(^" + stem + r")_([0-9]+)_(description$)",
         description_bits, meanings),
        (r"(^" + stem + r")_([0-9]+)_(assessment$)",
         assessment_bits, assessments),
        (r"(^" + stem + r")_([0-9]+)_(comment$)",
         comment_bits, comments),
    )
    for att_name in attributes:
        for pattern, bit_numbers, values in collectors:
            found = re.match(pattern, att_name)
            if found is None:
                continue
            bit_numbers.append(int(found.groups()[1]))
            values.append(attributes[att_name])
            arm_attributes.append(att_name)

    # Use the variable's own dtype for flag values when it is an integer
    # type, otherwise fall back to int32.
    dtype = np.int32
    if variable is not None:
        try:
            dtype = self._obj[variable].values.dtype
            if not np.issubdtype(dtype, np.integer):
                dtype = np.int32
        except AttributeError:
            pass

    # Order descriptions by bit number.
    order = np.argsort(description_bits)
    meanings = np.array(meanings)[order]
    description_bits = np.array(description_bits)[order]

    # Order assessments by bit number, padding missing ones with ''.
    if len(assessments) > 0:
        if len(assessments) < len(meanings):
            for bit in range(1, len(meanings) + 1):
                if bit not in assessment_bits:
                    assessment_bits.append(bit)
                    assessments.append('')
        assessments = np.array(assessments)[np.argsort(assessment_bits)]

    # Order comments by bit number, padding missing ones with ''.
    if len(comments) > 0:
        if len(comments) < len(meanings):
            for bit in range(1, len(meanings) + 1):
                if bit not in comment_bits:
                    comment_bits.append(bit)
                    comments.append('')
        comments = np.array(comments)[np.argsort(comment_bits)]

    # Convert bit number to mask number
    masks = []
    if len(description_bits) > 0:
        masks = np.left_shift(1, np.array(description_bits) - 1)

    # If nothing to return set to None
    if not (len(masks) > 0 or len(description_bits) > 0):
        return None

    # Widen the mask dtype only when a mask does not fit in 32 bits.
    if len(masks) > 0 and max(masks) > np.iinfo(np.uint32).max:
        mask_dtype = np.uint64
    else:
        mask_dtype = np.uint32

    # Integer QC reports values, bit-packed QC reports masks; the other
    # list is returned empty.
    if flag:
        values_out = list(np.array(description_bits, dtype=dtype))
        masks_out = list(np.array([], dtype=mask_dtype))
    else:
        values_out = list(np.array([], dtype=dtype))
        masks_out = list(np.array(masks, dtype=mask_dtype))

    return {
        'flag_meanings': list(np.array(meanings, dtype=str)),
        'flag_values': values_out,
        'flag_masks': masks_out,
        'flag_assessments': list(np.array(assessments, dtype=str)),
        'flag_tests': list(np.array(description_bits, dtype=dtype)),
        'flag_comments': list(np.array(comments, dtype=str)),
        'arm_attributes': arm_attributes,
    }