def simple_serialiser(node, table):
    ''' Serialize using the given table. '''
    stream = leaves(node, Node)
    converter = make_converter(table)
    return reduce_(add, [converter(value)[1] for value in stream])
def mcompose(*funcs):
    def _composer(f, g):
        def inner(*args, **kwargs):
            return f(*g(*args, **kwargs))
        return inner
    return reduce_(_composer, funcs)
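# Minimal usage sketch (not from the source) for the mcompose above: because
# _composer unpacks the inner result with *, every function except the
# outermost must return an iterable of positional arguments for the next call.
# The swap/diff helpers are hypothetical.
swap = lambda a, b: (b, a)            # returns a tuple so it can be *-unpacked
diff = lambda a, b: a - b             # outermost function may return anything

swapped_diff = mcompose(diff, swap)   # swap runs first, diff consumes its result
assert swapped_diff(3, 10) == 7       # diff(*swap(3, 10)) == 10 - 3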
def count_sold(iterable: list, default=0):
    function = (lambda a, b: a + b.quantity if type(a) is int
                else a.quantity + b.quantity)
    if not iterable:
        return default
    elif len(iterable) == 1:
        return iterable[0].quantity
    else:
        return reduce_(function, iterable)
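# Usage sketch (not from the source; the Sale type is hypothetical, standing in
# for any object with a .quantity attribute): on the first reduction step both
# arguments are items, afterwards the accumulator is already an int.
from collections import namedtuple

Sale = namedtuple('Sale', 'quantity')

assert count_sold([]) == 0                            # falls back to default
assert count_sold([Sale(4)]) == 4                     # single-item short-circuit
assert count_sold([Sale(4), Sale(2), Sale(5)]) == 11  # reduce_ path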
def build_row(cell):
    cidxs = list(reversed([c[1] for c in cell]))
    cm = [c[0] for c in cell]
    mvalues = [
        reduce_(
            lambda memo, cur: memo[cur],  # navigate to values[coords]
            cidxs + [mi],
            values)
        for mi, m in enumerate(measures)
    ]
    return cm + mvalues
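# Illustration (standalone, with made-up data) of the reduce_ navigation above:
# folding successive indices over a nested structure walks down to the leaf value.
from functools import reduce as reduce_

values_demo = {'US': {'2020': [10, 20]}}
coords = ['US', '2020', 1]
assert reduce_(lambda memo, cur: memo[cur], coords, values_demo) == 20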
def post_process(self, rows):
    """
    Build a table including:
        Project name | Bug pts | Story pts | Debt %
        Name1        |
        NameN        |
        Grand Total  |
    """
    project_points_by_type = reduce_(self._to_story_points, rows, {})
    results = ({
        'project_name': k,
        'bug_points': self._format_points(v[0]),
        'story_points': self._format_points(v[1]),
        'tech_debt': self._tech_debt_perc(v)
    } for k, v in project_points_by_type.items())
    sort_by_name = partial(self._sort_by_name, 'project_name')
    return sorted(results, key=cmp_to_key(sort_by_name))
def read_time_series(
        self,
        variables=(),
        start_time=pytz.utc.localize(datetime.min),
        end_time=pytz.utc.localize(datetime.max),
        selectdim=None,
        size_limit=1000 * 1000 * 1000,
        series=None,
        series_name_fmt=None):
    """
    Read a list of time-series variables from this fileset.

    Args:
        variables: A list of strs containing time series variable names
            to be read.
        start_time: A timezone-aware datetime, the start time of the
            series to read.
        end_time: A timezone-aware datetime, the end time of the series
            to read.
        selectdim: A dict containing, for each dimension name (a str),
            the indices of the dimension to read. For example:
            {"station": [3, 4, 5]} to read indices 3, 4 and 5 (indexed
            from 0) of the station dimension for variables which have
            that dimension.
        size_limit: Limit on the total size in bytes to read, used to
            screen huge requests.
        series: A list of series to be read by name.
        series_name_fmt: A datetime.strftime format to create a series
            name for the data found in each file, based on the time
            associated with the file. If series_name_fmt is None, all
            data is put in a dictionary element named ''.

    Returns:
        A dict containing, by series name:
            'time': list of UTC timestamps,
            'data': list of numpy.ndarray containing the data for each
                variable,
            'vmap': dict by variable name, containing the index into
                the series data for the variable,
            'dim2': dict by variable name, of values for the second
                dimension of the data, such as height.

    Raises:
        nc_exc.NoDataFoundException
        nc_exc.NoDataException

    The 'data' element in the returned dict is a list of numpy arrays,
    and not a dict by variable name. The 'vmap' element provides the
    mapping from a variable name to an index into 'data'. The data
    object is typically JSON-ified and sent to a browser. If it were a
    dict, the variable names might contain characters which cause
    headaches with JSON and javascript in django templates. For
    example, the JSON-ified string is typically passed to javascript in
    a django template by surrounding it with single quotes:
        var data = jQuery.parseJSON('{{ data }}');
    A single quote within the data JSON string causes grief, and we
    want to support single quotes in variable names. The only
    workaround I know of is to convert the single quotes within the
    string to '\u0027'. This is, of course, a time-consuming step we
    want to avoid when JSON-ifying a large chunk of data. It is less
    time-consuming to replace the quotes in the smaller vmap. The
    series names will not contain single quotes.
""" debug = False dsinfo = self.get_dataset_info() if not dsinfo['time_name']: self.get_variables() dsinfo = self.get_dataset_info() dsinfo_vars = dsinfo['variables'] if not selectdim: selectdim = {} vshapes = self.resolve_variable_shapes(variables, selectdim) res_data = {} total_size = 0 ntimes = 0 files = self.get_files(start_time, end_time) if debug: _logger.debug( "len(files)=%d, series_name_fmt=%s", len(files), series_name_fmt) if series_name_fmt: file_tuples = [(f.time.strftime(series_name_fmt), f.path) \ for f in files] else: file_tuples = [("", f.path) for f in files] for (series_name, ncpath) in file_tuples: if series and not series_name in series: continue if debug: _logger.debug("series=%s", str(series)) _logger.debug("series_name=%s ,ncpath=%s", series_name, ncpath) # the files might be in the process of being moved, deleted, etc fileok = False exc = None for itry in range(0, 3): try: ncfile = netCDF4.Dataset(ncpath) fileok = True break except (OSError, RuntimeError) as exc: time.sleep(itry) if not fileok: _logger.error("%s: %s", ncpath, exc) continue if not series_name in res_data: res_data[series_name] = { 'time': [], 'data': [], 'vmap': {}, 'dim2': {}, } otime = res_data[series_name]['time'] odata = res_data[series_name]['data'] ovmap = res_data[series_name]['vmap'] odim2 = res_data[series_name]['dim2'] try: size1 = sys.getsizeof(otime) # times are apended to otime time_slice = self.read_times( ncfile, ncpath, start_time, end_time, otime, size_limit - total_size) # time_slice.start is None if nothing to read if time_slice.start is None or \ time_slice.stop <= time_slice.start: continue total_size += sys.getsizeof(otime) - size1 for exp_vname in variables: # skip if variable is not a time series or # doesn't have a selected dimension if not exp_vname in dsinfo_vars or not exp_vname in vshapes: continue # selected shape of this variable vshape = vshapes[exp_vname] vsize = reduce_( operator.mul, vshape, 1) * \ dsinfo_vars[exp_vname]["dtype"].itemsize if total_size + vsize > size_limit: raise nc_exc.TooMuchDataException( "too much data requested, will exceed {} mbytes". format(size_limit/(1000 * 1000))) dim2 = {} vdata = self.read_time_series_data( ncfile, ncpath, exp_vname, time_slice, vshape, selectdim, dim2) if not exp_vname in odim2: odim2[exp_vname] = dim2 if not exp_vname in ovmap: size1 = 0 vindex = len(odata) odata.append(vdata) ovmap[exp_vname] = vindex else: if debug: _logger.debug( "odata[%s].shape=%s, vdata.shape=%s", exp_vname, odata[exp_vname].shape, vdata.shape) vindex = ovmap[exp_vname] size1 = sys.getsizeof(odata[vindex]) time_index = dsinfo_vars[exp_vname]["time_index"] odata[vindex] = np.append( odata[vindex], vdata, axis=time_index) total_size += sys.getsizeof(odata[vindex]) - size1 finally: ncfile.close() ntimes += len(otime) if ntimes == 0: exc = nc_exc.NoDataException( "No data found between {} and {}". format( start_time.isoformat(), end_time.isoformat())) # _logger.warning("%s: %s", str(self), repr(exc)) raise exc ncol_read = sum([len(cdata) for (i, cdata) in res_data.items()]) if ncol_read == 0: exc = nc_exc.NoDataException( "No variables named {} found between {} and {}". 
format( repr(variables), start_time.isoformat(), end_time.isoformat())) # _logger.warning("%s: %s", str(self), repr(exc)) raise exc if debug: for series_name in res_data.keys(): for exp_vname in res_data[series_name]['vmap']: var_index = res_data[series_name]['vmap'][exp_vname] _logger.debug( "res_data[%s][%d].shape=%s, exp_vname=%s", series_name, var_index, repr(res_data[series_name][var_index].shape), exp_vname) _logger.debug( "total_size=%d", total_size) return res_data
def reduce(f, initializer, iterable):
    return reduce_(f, iterable, initializer)
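# Usage sketch (not from the source): unlike functools.reduce, which takes
# (function, iterable, initializer), this wrapper puts the initializer first.
from operator import add

assert reduce(add, 0, [1, 2, 3]) == 6
assert reduce(add, 10, []) == 10    # an empty iterable yields the initializer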
def _fill_data(self, name, index, dest):
    var = self.variables[name]
    shape = tuple(self.coordinates[dim].length for dim in var.dimensions)
    size = reduce_(lambda x, y: x*y, shape, 1)
    numpy.copyto(dest, numpy.arange(size).reshape(shape)[index])
def add_all(iterable):
    return reduce_(operator.add, iterable)
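# Usage sketch (not from the source): works for any type supporting "+", but
# raises TypeError on an empty iterable because no initializer is supplied.
assert add_all([1, 2, 3]) == 6
assert add_all(["a", "b", "c"]) == "abc"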
import numpy as np
import pyopencl as cl

from .tensor import OpenCLTensor
# tools
from functools import lru_cache, reduce as reduce_
from itertools import zip_longest, chain
# utils
from pyopencl.tools import dtype_to_ctype
from numpy import uint32 as i32
from math import ceil, log2
# typing
from typing import Tuple

__all__ = ['atom', 'dot', 'reduce', 'conv']

prod = lambda arr: reduce_(lambda x, y: x * y, arr, 1)
nl = lambda i=0: ('\n' + ' ' * i)

#
# Elementwise Kernel
#

@lru_cache(maxsize=None)
def cache_build_atom_kernel(
        context: cl.Context,
        op: str,               # operation to execute on variables
        # buffers
        buffers: tuple,        # unique names of variables / buffers
        buffer_dtypes: tuple,  # types of variables / buffers
        ndim: int,             # number of dimensions
def prod(a):
    """Product of a sequence"""
    return reduce_(mul_op, a, 1)
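# Usage sketch (not from the source), assuming mul_op is operator.mul and
# reduce_ is functools.reduce, as in the other snippets:
assert prod([2, 3, 4]) == 24
assert prod([]) == 1    # the initializer makes the empty product well defined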
def _fill_data(self, name, index, dest):
    var = self.variables[name]
    shape = tuple(self.coordinates[dim].length for dim in var.dimensions)
    size = reduce_(lambda x, y: x * y, shape, 1)
    numpy.copyto(dest, numpy.arange(size).reshape(shape)[index])
def to_pandas(self, filter_empty_measures=True):
    tidy = self.tidy
    columns = []
    table = []

    properties = self._agg_params.get('properties', [])
    measures = self._agg_params['measures']
    props = parse_properties(properties)
    pnames = [Identifier.parse(i).segments[-1].name for i in properties]

    # header row
    if self._agg_params['parents']:
        slices = []
        for dd in tidy['axes']:
            slices.append(dd['level_depth'])
            for ancestor_level in self._cube.dimensions_by_name[
                    dd['name']]['hierarchies'][0]['levels'][
                        1:dd['level_depth']]:
                columns += [
                    'ID %s' % ancestor_level['caption'],
                    ancestor_level['caption']
                ]
            columns += ['ID %s' % dd['level'], dd['level']]

        # property names
        columns += pnames
        # measure names
        columns += [m['caption'] for m in measures]

        for row in tidy['data']:
            r = []
            for j, cell in enumerate(row[:len(tidy['axes'])]):
                for ancestor in reversed(cell['ancestors'][:slices[j] - 1]):
                    r += [ancestor['key'], ancestor['caption']]
                r += [cell['key'], cell['caption']]
            r += get_props(row[:-len(measures)], pnames, props, tidy['axes'])
            for mvalue in row[len(tidy['axes']):]:
                r.append(mvalue)
            table.append(r)

    else:  # no parents
        for dd in tidy['axes']:
            columns += ['ID %s' % dd['level'], dd['level']]

        # measure names
        columns += [m['caption'] for m in self._agg_params['measures']]

        for row in tidy['data']:
            r = []
            for cell in row[:len(tidy['axes'])]:
                r += [cell['key'], cell['caption']]
            r += get_props(row[:-len(measures)], pnames, props, tidy['axes'])
            for mvalue in row[len(tidy['axes']):]:
                r.append(mvalue)
            table.append(r)

    df = pd.DataFrame(table, columns=columns) \
        .set_index(columns[:-len(self._agg_params['measures'])])

    if filter_empty_measures:
        df = df[reduce_(
            np.logical_and,
            [df[msr['name']].notnull() for msr in self.measures])]

    return df
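# Illustration (standalone, with made-up data) of the reduce_(np.logical_and, ...)
# mask used above: it keeps only rows where every listed column is non-null.
import numpy as np
import pandas as pd
from functools import reduce as reduce_

df_demo = pd.DataFrame({'a': [1, None, 3], 'b': [4, None, None]})
mask = reduce_(np.logical_and, [df_demo[c].notnull() for c in ['a', 'b']])
print(df_demo[mask])    # keeps only the first row, where both columns are non-null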
def read_time_series(
        self,
        variables=(),
        start_time=pytz.utc.localize(datetime.min),
        end_time=pytz.utc.localize(datetime.max),
        selectdim=None,
        size_limit=1000 * 1000 * 1000,
        series=None,
        series_name_fmt=None):
    """
    Read a list of time-series variables from this fileset.

    Args:
        variables: A list of strs containing time series variable names
            to be read.
        start_time: A timezone-aware datetime, the start time of the
            series to read.
        end_time: A timezone-aware datetime, the end time of the series
            to read.
        selectdim: A dict containing, for each dimension name (a str),
            the indices of the dimension to read. For example:
            {"station": [3, 4, 5]} to read indices 3, 4 and 5 (indexed
            from 0) of the station dimension for variables which have
            that dimension.
        size_limit: Limit on the total size in bytes to read, used to
            screen huge requests.
        series: A list of series to be read by name. For soundings a
            series name is something like "Aug23_0000Z", as created by
            series_name_fmt from the time associated with a file. In
            this way the data read can be split into named series. If
            series_name_fmt is None, series should be a list containing
            one empty string, [''], and all data are concatenated
            together in time order.
        series_name_fmt: A datetime.strftime format to create a series
            name for the data found in each file, based on the time
            associated with the file. If series_name_fmt is None, all
            data is put in a dictionary element named ''.

    Returns:
        A dict containing, by series name:
            'time': list of UTC timestamps,
            'data': list of numpy.ndarray containing the data for each
                variable,
            'vmap': dict by variable name, containing the index into
                the series data for the variable,
            'dim2': dict by variable name, of values for the second
                dimension of the data, such as height,
            'stnnames': dict by variable name, of the list of the
                station names for the variable that were read, as
                selected by selectdim. A list of length 1 containing an
                empty string indicates the variable does not have a
                station dimension.

    Raises:
        OSError
        nc_exc.NoDataException

    The 'data' element in the returned dict is a list of numpy arrays,
    and not a dict by variable name. The 'vmap' element provides the
    mapping from a variable name to an index into 'data'. The data
    object is typically JSON-ified and sent to a browser. If it were a
    dict, the variable names might contain characters which cause
    headaches with JSON and javascript in django templates. For
    example, the JSON-ified string is typically passed to javascript in
    a django template by surrounding it with single quotes:
        var data = jQuery.parseJSON('{{ data }}');
    A single quote within the data JSON string causes grief, and we
    want to support single quotes in variable names. The only
    workaround I know of is to convert the single quotes within the
    string to '\u0027'. This is, of course, a time-consuming step we
    want to avoid when JSON-ifying a large chunk of data. It is less
    time-consuming to replace the quotes in the smaller vmap. The
    series names will not contain single quotes.
""" debug = False dsinfo = self.get_dataset_info() if not dsinfo['time_name']: self.scan_files() dsinfo = self.get_dataset_info() dsinfo_vars = dsinfo['variables'] if not selectdim: selectdim = {} vshapes = self.resolve_variable_shapes(variables, selectdim) res_data = {} total_size = 0 ntimes = 0 files = self.get_files(start_time, end_time) if debug: _logger.debug( "len(files)=%d, series_name_fmt=%s", len(files), series_name_fmt) if series_name_fmt: file_tuples = [(f.time.strftime(series_name_fmt), f.path) \ for f in files] else: file_tuples = [("", f.path) for f in files] for (series_name, ncpath) in file_tuples: if series and not series_name in series: continue if debug: _logger.debug("series=%s", str(series)) _logger.debug("series_name=%s ,ncpath=%s", series_name, ncpath) # the files might be in the process of being moved, deleted, etc fileok = False exc = None for itry in range(0, 3): try: ncfile = netCDF4.Dataset(ncpath) fileok = True break except (OSError, RuntimeError) as excx: exc = excx time.sleep(itry) if not fileok: _logger.error("%s: %s", ncpath, exc) continue if not series_name in res_data: res_data[series_name] = { 'time': [], 'data': [], 'vmap': {}, 'dim2': {}, 'stnnames': {}, } otime = res_data[series_name]['time'] odata = res_data[series_name]['data'] ovmap = res_data[series_name]['vmap'] odim2 = res_data[series_name]['dim2'] ostns = res_data[series_name]['stnnames'] try: size1 = sys.getsizeof(otime) # times are apended to otime time_slice = self.read_times( ncfile, ncpath, start_time, end_time, otime, size_limit - total_size) # time_slice.start is None if nothing to read if time_slice.start is None or \ time_slice.stop <= time_slice.start: continue total_size += sys.getsizeof(otime) - size1 for exp_vname in variables: # skip if variable is not a time series or # doesn't have a selected dimension if not exp_vname in dsinfo_vars or not exp_vname in vshapes: continue # selected shape of this variable vshape = vshapes[exp_vname] vsize = reduce_( operator.mul, vshape, 1) * \ dsinfo_vars[exp_vname]["dtype"].itemsize if not vsize: continue if total_size + vsize > size_limit: raise nc_exc.TooMuchDataException( "too much data requested, will exceed {} mbytes". format(size_limit/(1000 * 1000))) dim2 = {} stnnames = [] vdata = self.read_time_series_data( ncfile, ncpath, exp_vname, time_slice, vshape, selectdim, dim2, stnnames) if vdata is None: continue # dim2 will be empty if variable is not found in file if dim2 and not exp_vname in odim2: odim2[exp_vname] = dim2 # stnnames will be empty if variable is not found in file if stnnames and not exp_vname in ostns: ostns[exp_vname] = stnnames if not exp_vname in ovmap: size1 = 0 vindex = len(odata) odata.append(vdata) ovmap[exp_vname] = vindex else: if debug: _logger.debug( "odata[%s].shape=%s, vdata.shape=%s", exp_vname, odata[vindex].shape, vdata.shape) vindex = ovmap[exp_vname] size1 = sys.getsizeof(odata[vindex]) time_index = dsinfo_vars[exp_vname]["time_index"] odata[vindex] = np.append( odata[vindex], vdata, axis=time_index) total_size += sys.getsizeof(odata[vindex]) - size1 finally: ncfile.close() ntimes += len(otime) if ntimes == 0: exc = nc_exc.NoDataException( "No data between {} and {}". 
format( start_time.isoformat(), end_time.isoformat())) # _logger.warning("%s: %s", str(self), repr(exc)) raise exc if debug: for series_name in res_data: for exp_vname in res_data[series_name]['vmap']: var_index = res_data[series_name]['vmap'][exp_vname] _logger.debug( "res_data[%s]['data'][%d].shape=%s, exp_vname=%s", series_name, var_index, repr(res_data[series_name]['data'][var_index].shape), exp_vname) _logger.debug( "total_size=%d", total_size) return res_data
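# Hypothetical access pattern (not from the source), following the docstring's
# description of the returned dict: 'vmap' maps a variable name to an index
# into the 'data' list. The fileset instance and the t1/t2 datetimes are
# assumptions, not defined here.
res = fileset.read_time_series(
    variables=["T"], start_time=t1, end_time=t2,
    selectdim={"station": [3, 4, 5]})

series = res[""]                   # default series name when series_name_fmt is None
t_index = series["vmap"]["T"]
t_array = series["data"][t_index]  # numpy.ndarray holding the "T" data
times = series["time"]             # list of UTC timestamps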