def score(self, x, y):
    """Score the fitted parameters ``self.th`` against labelled samples.

    The Weld program walks over ``zip(x, y)``; for every row it computes
    ``1 / (1 + exp(-dot(self.th, row)))`` and adds 1 to a running sum when
    that value is ``>= 0.5`` *and* the label equals ``1.0``; otherwise it
    adds 0.  The sum is then divided by ``len(x)``.

    NOTE(review): rows with label 0 never contribute to the sum, even when
    the prediction is below 0.5 -- confirm this is the intended metric.

    Parameters
    ----------
    x : sequence of rows, encoded for Weld as ``vec[vec[f32]]``
    y : labels, encoded for Weld as ``vec[f32]``

    Returns
    -------
    The Weld result (evaluated as a float) divided by ``len(x)``.
    """
    weld_program = (
        " @(loopsize: %(xlen)sL) result(for( zip(%(x)s, %(y)s), merger[f32, +], "
        "|b, i, e| let res = f32(1) / (f32(1) + exp(f32(0) - result( "
        "@(loopsize: %(thlen)sL) for( zip(%(th)s, e.$0), merger[f32, +], "
        "|b2, i2, e2| merge(b2, e2.$0 * e2.$1) ) ))); "
        "if(res >= f32(0.5) && e.$1 == f32(1.0), merge(b, f32(1)), "
        "merge(b, f32(0))) )) "
    )

    obj = WeldObject(NumPyEncoder(), NumPyDecoder())

    # Inputs are registered in a fixed order: th, then x, then y.
    th_name = obj.update(self.th, WeldVec(WeldFloat()))
    x_name = obj.update(x, WeldVec(WeldVec(WeldFloat())))
    y_name = obj.update(y, WeldVec(WeldFloat()))

    substitutions = {
        'th': th_name,
        'x': x_name,
        'y': y_name,
        'xlen': str(len(x)),
        'thlen': str(len(self.th)),
    }
    obj.weld_code = weld_program % substitutions

    matched = obj.evaluate(WeldFloat())
    return matched / len(x)
def fit(self, x, y, weldobj=None):
    """Fit the parameter vector with a Weld ``iterate`` loop and store it.

    A sample index is drawn (with replacement) for each of ``self.n_iters``
    iterations up front.  Each iteration of the Weld program then:

    * looks up the sampled row ``xi`` of ``x``;
    * uses step ``1 / sqrt(k)`` for iteration counter ``k > 0``, else 1;
    * computes ``hx = 1 / (1 + exp(-dot(theta, xi))) - y[i]``;
    * replaces every ``theta[j]`` with
      ``theta[j] - step * (hx * xi[j] / m + lam / m * theta[j])``.

    The loop condition in the program is ``k < n_iters - 1``.

    Parameters
    ----------
    x : 2-d array (its ``.shape`` is unpacked into rows ``m`` and columns ``n``)
    y : labels, encoded for Weld as ``vec[f32]``
    weldobj : WeldObject, optional
        object to build the program in; a new one is created when falsy

    Returns
    -------
    self, with ``self.th`` set to the evaluated parameter vector and
    ``self.weldobj`` set to the WeldObject that was used.
    """
    # todo x is required to be a matrix here (i think this is ok, just have brittle types)
    if not weldobj:
        weldobj = WeldObject(NumPyEncoder(), NumPyDecoder())
    self.weldobj = weldobj

    n_rows, n_features = x.shape
    theta0 = np.zeros(n_features, dtype=np.float32)

    # pregenerate idxs
    row_ids = np.arange(n_rows, dtype=np.int64)
    sample_ids = np.random.choice(row_ids, self.n_iters, replace=True)

    weld_program = (
        " @(loopsize: %(niters)sL) iterate( {%(isamps)s, i64(0), %(th)s}, |p| { "
        "{ p.$0, p.$1 + i64(1), let i = lookup(p.$0, p.$1); "
        "let xi = lookup(%(x)s, i); "
        "let step = if(p.$1 > i64(0), f32(1) / sqrt(f32(p.$1)), f32(1)); "
        "let hx = f32(1) / (f32(1) + exp(f32(0) - f32(result( "
        "@(loopsize: %(th_len)sL) for( zip(p.$2, xi), merger[f32, +], "
        "|b, ii, e| merge(b, e.$0 * e.$1) ) )))) - f32(lookup(%(y)s, i)); "
        "result(@(loopsize: %(th_len)sL) for( p.$2, appender[f32], "
        "|b, j, e| merge(b, e - f32(step) * (f32(hx) * lookup(xi, j) / f32(%(m)s) "
        "+ f32(%(lam)s) / f32(%(m)s) * e)) )) }, "
        "p.$1 < i64(%(niters)s) - 1L }).$2"
    )

    # Inputs are registered in a fixed order: isamps, th, x, y.
    isamps_name = self.weldobj.update(sample_ids, WeldVec(WeldLong()))
    th_name = self.weldobj.update(theta0, WeldVec(WeldFloat()))
    x_name = self.weldobj.update(x, WeldVec(WeldVec(WeldFloat())))
    y_name = self.weldobj.update(y, WeldVec(WeldFloat()))

    substitutions = {
        'niters': str(self.n_iters),
        'isamps': isamps_name,
        'th': th_name,
        'th_len': str(len(theta0)),
        'x': x_name,
        'y': y_name,
        'm': str(float(n_rows)),
        'lam': str(float(self.lam)),
    }
    self.weldobj.weld_code = weld_program % substitutions

    self.th = self.weldobj.evaluate(WeldVec(WeldFloat()))
    return self
def weld_subset(array, slice_):
    """ Return a subset of the input array

    Parameters
    ----------
    array : np.array or WeldObject
        1-dimensional array
    slice_ : slice
        subset to return; ``start`` defaults to 0 and ``step`` to 1 when
        left as None, ``stop`` must be given

    Returns
    -------
    WeldObject
        representation of this computation

    Raises
    ------
    TypeError
        if ``slice_.stop`` is None

    """
    # slice(0, 5) has step=None and slice(None, 5) has start=None; formatting
    # those straight into the template would emit e.g. `i64(None)`, which is
    # not valid Weld code, so fill in the usual Python defaults first.
    start = 0 if slice_.start is None else slice_.start
    step = 1 if slice_.step is None else slice_.step
    if slice_.stop is None:
        # the length of a (possibly lazy) input is not known here
        raise TypeError('slice_.stop must be an integer, got None')

    weld_obj = WeldObject(NumPyEncoder(), NumPyDecoder())

    array_var = weld_obj.update(array)
    if isinstance(array, WeldObject):
        # lazy input: refer to it by its id and record the dependency
        array_var = array.obj_id
        weld_obj.dependencies[array_var] = array

    if step == 1:
        # contiguous subset: slice(vector, start, number of elements)
        weld_template = " slice( %(array)s, %(slice_start)s, %(slice_stop)s )"
    else:
        # NOTE(review): the third argument passed to iter is the length
        # (stop - start), same as for slice above -- confirm whether Weld's
        # iter expects an end index instead when start != 0.
        weld_template = (
            " result( for( iter(%(array)s, %(slice_start)s, %(slice_stop)s, "
            "%(slice_step)s), appender, |b, i, n| merge(b, n) ) )"
        )

    weld_obj.weld_code = weld_template % {
        'array': array_var,
        'slice_start': 'i64(%s)' % start,
        'slice_stop': 'i64(%s)' % (slice_.stop - start),
        'slice_step': 'i64(%s)' % step,
    }

    return weld_obj
class Column(LazyData):
    """ Lazily-read column of a table backed by a file.

    Parameters
    ----------
    name : str
        column name, used to index the DataFrame read from the file
    table : object
        owning table; this class reads and writes its ``file_id``,
        ``slice_start``, ``nrows`` and ``usecols`` attributes and calls its
        ``read_file`` method
    data_id : str
        identifier stored on the instance as ``data_id``
    dtype : np.dtype
        type of the elements in this column

    """
    encoder = NumPyEncoder()
    decoder = NumPyDecoder()

    def __init__(self, name, table, data_id, dtype):
        self.name = name
        self.table = table
        self.data_id = data_id
        self.dtype = dtype

    def _object_to_str(self, data):
        # treat any object dtype as str; the builtin `str` is used because the
        # `np.str` alias for it was removed from NumPy (1.24)
        if self.dtype.char == 'O':
            return data.astype(str)

        return data

    def eager_read(self):
        """ Read the column values for rows ``table.slice_start:table.nrows``.

        Returns
        -------
        np.ndarray
            the values, converted to str when the column dtype is object

        """
        # make use of cache by retrieving
        df = LazyResult.retrieve_file(self.table.file_id)

        slice_ = slice(self.table.slice_start, self.table.nrows, 1)

        return self._object_to_str(df[self.name][slice_].values)

    def eager_head(self, n=10):
        """ Read only the first n values of the column.

        Parameters
        ----------
        n : int, optional
            number of rows to read

        Returns
        -------
        np.ndarray
            the values, converted to str when the column dtype is object

        """
        # skip the cache and re-use read_file method with param from Table
        # which will now only read first n rows
        df = self.table.read_file(n)

        return self._object_to_str(df[self.name][:n].values)

    def lazy_skip_columns(self, columns):
        """ Remove the given columns from the table's ``usecols``. """
        # pandas allows skipping some columns efficiently through the usecols parameter
        for column in columns:
            self.table.usecols.remove(column)

    def lazy_slice_rows(self, slice_):
        """ Record the requested row range on the table for the next read. """
        # the parser needs to read until stop anyway, and filter later through eager_read
        self.table.slice_start = slice_.start
        self.table.nrows = slice_.stop
def predict(self, x):
    """Classify a single sample with the fitted parameters ``self.th``.

    The Weld program evaluates ``1 / (1 + exp(-dot(self.th, x)))``; the
    result is thresholded at 0.5.

    Parameters
    ----------
    x : one sample, encoded for Weld as ``vec[f32]``

    Returns
    -------
    float
        ``1.0`` when the evaluated value is ``>= 0.5``, otherwise ``0.0``
    """
    weld_program = (
        " f32(1) / (f32(1) + exp(f32(0) - result( "
        "@(loopsize: %(th_len)sL) for( zip(%(th)s, %(x)s), merger[f32, +], "
        "|b, i, e| merge(b, e.$0 * e.$1) ) ))) "
    )

    obj = WeldObject(NumPyEncoder(), NumPyDecoder())

    # Inputs are registered in a fixed order: th, then x.
    th_name = obj.update(self.th, WeldVec(WeldFloat()))
    x_name = obj.update(x, WeldVec(WeldFloat()))

    obj.weld_code = weld_program % {
        'th': th_name,
        'x': x_name,
        'th_len': str(len(self.th)),
    }

    probability = obj.evaluate(WeldFloat(), verbose=False)
    if probability >= 0.5:
        return 1.0
    return 0.0
class Variable(LazyData, LazyResult):
    """ Weld-ed netCDF4.Variable.

    Functionality is currently (very) restricted to an example operation, printing, and evaluating.

    Parameters
    ----------
    file_id : str
        generated by Dataset from FileMapping
    column_name : str
        the variable name in the dataset
    dimensions : tuple
        same as netCDF4.Variable.dimensions
    shape : tuple
        same as netCDF4.Variable.shape
    attributes : OrderedDict
        all Variable metadata
    expression : str or WeldObject
        str if created by netCDF4_weld.Dataset, else WeldObject tracking the computations created by operations
        on this variable; note that expression must be == column_name if created by Dataset!
    dtype : np.dtype
        type of the elements in this variable

    See also
    --------
    netCDF4.Variable

    """
    encoder = NumPyEncoder()
    decoder = NumPyDecoder()

    def __init__(self, file_id, column_name, dimensions, shape, attributes, expression, dtype):
        inferred_dtype = self._infer_dtype(dtype, attributes)
        weld_type = numpy_to_weld_type(inferred_dtype)
        LazyResult.__init__(self, expression, weld_type, 1)

        self.file_id = file_id
        self.column_name = column_name
        self.dimensions = dimensions
        self.shape = shape
        self.attributes = attributes
        # when reading data with netCDF4, the values are multiplied by the scale_factor if it exists,
        # which means that even if data is of type int, the scale factor is often float making the result a float
        self.dtype = inferred_dtype

        # same as [:]
        # the param used to lazy_slice_rows
        self.tuple_slices = slice(None)
        self._slice = None

    @staticmethod
    def _infer_dtype(dtype, attributes):
        """ Return the dtype the data will have once read, based on the variable attributes. """
        # TODO: can it be float64?
        if 'scale_factor' in attributes:
            return np.dtype(np.float32)

        # calendar is stored as int in netCDF4, but we want the datetime format later which is encoded as a str(?)
        # (builtin `str` is used because the `np.str` alias for it was removed from NumPy 1.24)
        if 'calendar' in attributes:
            return np.dtype(str)

        return dtype

    def eager_read(self, slice_=None):
        """ Read the variable from file as a flat np.ndarray.

        Parameters
        ----------
        slice_ : slice or tuple of slices, optional
            what to read; defaults to the slices recorded by lazy_slice_rows

        Returns
        -------
        np.ndarray
            1-dimensional data; date strings if the variable has a calendar attribute

        """
        ds = LazyResult.retrieve_file(self.file_id)

        # implemented like this to allow re-use of this method from eager_head
        if slice_ is None:
            slice_ = self.tuple_slices

        # want just np.array, no MaskedArray; let netCDF4 do the work of replacing missing values
        ds.variables[self.column_name].set_auto_mask(False)
        # the actual read from file call
        data = ds.variables[self.column_name][slice_]

        # TODO: transpose might be required when data variables have dimensions in a different order than the
        # dimensions declarations
        # want dimension = 1
        data = data.reshape(-1)

        attributes = ds.variables[self.column_name].__dict__
        # xarray creates a pandas DatetimeIndex with Timestamps (as it should); to save time however,
        # a shortcut is taken to convert netCDF4 python date -> pandas timestamp -> py datetime
        # TODO: weld pandas DatetimeIndex & Timestamp
        if 'calendar' in attributes:
            dates = netCDF4.num2date(data, attributes['units'], calendar=attributes['calendar'])
            data = np.array([str(pd.Timestamp(k).date()) for k in dates], dtype=str)

        # at this point, netcdf is expected to read a subset; however, it reads slightly more at the end, so slice;
        # self._slice is empty when using eager head
        if self._slice is not None and self.column_name not in self.dimensions:
            len_slice = self._slice.stop - self._slice.start
            return data[:len_slice]

        return data

    def eager_head(self, n=10):
        """ Read only the first n rows, converted to slices over all dimensions. """
        tuple_slices = convert_row_to_nd_slices(slice(0, n, 1), self.shape)

        # bypass the cache and call directly
        return self.eager_read(slice_=tuple_slices)

    def lazy_skip_columns(self, columns):
        """ No-op for netCDF4 variables. """
        # nothing to do since netcdf is able to read specific columns only
        pass

    def lazy_slice_rows(self, slice_):
        """ Record which rows to read on the next eager_read.

        Parameters
        ----------
        slice_ : slice or tuple of slices
            a row slice is converted to slices for all dimensions; a tuple is stored as given

        Raises
        ------
        TypeError
            if slice_ is neither a slice nor a tuple

        """
        # user wants a slice of rows, so convert to netCDF4 slices for all dimensions
        if isinstance(slice_, slice):
            slice_ = replace_slice_defaults(slice_)
            self._slice = slice_
            self.tuple_slices = convert_row_to_nd_slices(slice_, self.shape)
        elif isinstance(slice_, tuple):
            # assumed correct
            self.tuple_slices = slice_
        else:
            raise TypeError('expected either slice or tuple of slices')

    def __repr__(self):
        return "{}(column_name={}, dtype={}, dimensions={}, attributes={})".format(self.__class__.__name__,
                                                                                   self.column_name,
                                                                                   self.dtype,
                                                                                   repr(self.dimensions),
                                                                                   repr(self.attributes))

    def __str__(self):
        return str(self.expr)

    # this and add are to show that one could also implement/do Weld operations at this level, not just in pandas
    def _element_wise_op(self, array, value, operation):
        """ Build a WeldObject applying ``<element> <operation> <value>`` to every element of array. """
        weld_obj = WeldObject(Variable.encoder, Variable.decoder)

        array_var = weld_obj.update(array)
        if isinstance(array, WeldObject):
            # lazy input: refer to it by its id and record the dependency
            array_var = array.obj_id
            weld_obj.dependencies[array_var] = array

        weld_template = (
            " result( for(%(array)s, appender[%(type)s], "
            "|b: appender[%(type)s], i: i64, n: %(type)s| "
            "merge(b, n %(operation)s %(value)s) ) )"
        )

        weld_obj.weld_code = weld_template % {'array': array_var,
                                              'value': value,
                                              'operation': operation,
                                              'type': numpy_to_weld_type(self.dtype)}

        return weld_obj

    def __add__(self, value):
        """ Return a new Variable whose expression adds value to every element. """
        # arguments follow the constructor order: dimensions first, then shape
        return Variable(self.file_id,
                        self.column_name,
                        self.dimensions,
                        self.shape,
                        self.attributes,
                        self._element_wise_op(self.expr, value, '+'),
                        self.dtype)
import numpy as np from grizzly.encoders import NumPyEncoder, NumPyDecoder, numpy_to_weld_type from weld.types import WeldLong from weld.weldobject import WeldObject from lazy_result import LazyResult # the methods are only intended to work with numpy, so have a single encoder/decoder _encoder = NumPyEncoder() _decoder = NumPyDecoder() # TODO: could generalize to return either values or indices def _duplicate_elements_indices(array, n, weld_type, cartesian=False): weld_obj = WeldObject(_encoder, _decoder) array_var = weld_obj.update(array) if isinstance(array, WeldObject): array_var = array.obj_id weld_obj.dependencies[array_var] = array if isinstance(n, WeldObject): weld_obj.update(n) weld_obj.dependencies[n.obj_id] = n n = 'len(%s)' % n.obj_id elif isinstance(n, np.ndarray): array_var = weld_obj.update(n) n = 'len(%s)' % array_var