def iterrecast(source, key, variablefield, valuefield,
               samplesize, reducers, missing):
    """
    Recast (pivot) `source` from long to wide format, yielding rows.

    Values found in the `variablefield` column(s) become new output
    fields; the `valuefield` column supplies the cell values.  Rows are
    grouped by the `key` field(s); within a group, multiple values for
    the same variable are combined via `reducers` (a mapping from
    variable name to a callable taking a list of values), defaulting to
    `list`.  A group with no value for a variable emits `missing`.

    N.B., this is Python 2 code (`it.next()`, `basestring`) and relies
    on module-level helpers `sort` and `sortable_itemgetter` defined
    elsewhere in this file.
    """
    #
    # TODO implementing this by making two passes through the data is a bit
    # ugly, and could be costly if there are several upstream transformations
    # that would need to be re-executed each pass - better to make one pass,
    # caching the rows sampled to discover variables to be recast as fields?
    #

    it = iter(source)
    fields = it.next()  # first row is the header

    # normalise some stuff
    keyfields = key
    variablefields = variablefield  # N.B., could be more than one
    if isinstance(keyfields, basestring):
        keyfields = (keyfields,)
    if isinstance(variablefields, basestring):
        variablefields = (variablefields,)
    if not keyfields:
        # assume keyfields is fields not in variables
        keyfields = [f for f in fields
                     if f not in variablefields and f != valuefield]
    if not variablefields:
        # assume variables are fields not in keyfields
        variablefields = [f for f in fields
                          if f not in keyfields and f != valuefield]

    # sanity checks
    assert valuefield in fields, 'invalid value field: %s' % valuefield
    assert valuefield not in keyfields, 'value field cannot be keyfields'
    assert valuefield not in variablefields, \
        'value field cannot be variable field'
    for f in keyfields:
        assert f in fields, 'invalid keyfields field: %s' % f
    for f in variablefields:
        assert f in fields, 'invalid variable field: %s' % f

    # we'll need these later
    valueindex = fields.index(valuefield)
    keyindices = [fields.index(f) for f in keyfields]
    variableindices = [fields.index(f) for f in variablefields]

    # determine the actual variable names to be cast as fields
    if isinstance(variablefields, dict):
        # user supplied dictionary mapping variable field -> variable names;
        # no sampling pass is needed in this case
        variables = variablefields
    else:
        variables = collections.defaultdict(set)
        # sample the data to discover variables to be cast as fields
        # NOTE(review): only the first `samplesize` rows are inspected, so
        # variables appearing later in the data will be silently dropped
        for row in itertools.islice(it, 0, samplesize):
            for i, f in zip(variableindices, variablefields):
                variables[f].add(row[i])
        for f in variables:
            variables[f] = sorted(variables[f])  # turn from sets to sorted lists

    # finished the first pass

    # determine the output fields: key fields first, then one field per
    # discovered variable value
    outfields = list(keyfields)
    for f in variablefields:
        outfields.extend(variables[f])
    yield tuple(outfields)

    # output data

    # second pass: re-iterate the source, sorted so groupby sees each key once
    source = sort(source, key=keyfields)
    it = itertools.islice(source, 1, None)  # skip header row
    getsortablekey = sortable_itemgetter(*keyindices)
    getactualkey = operator.itemgetter(*keyindices)

    # process sorted data in newfields
    groups = itertools.groupby(it, key=getsortablekey)
    for _, group in groups:
        # may need to iterate over the group more than once
        group = list(group)
        # N.B., key returned by groupby may be wrapped as SortableItem, we want
        # to output the actual key value, get it from the first row in the group
        key_value = getactualkey(group[0])
        if len(keyfields) > 1:
            out_row = list(key_value)  # composite key: itemgetter gave a tuple
        else:
            out_row = [key_value]
        for f, i in zip(variablefields, variableindices):
            for variable in variables[f]:
                # collect all values for the current variable
                values = [r[valueindex] for r in group if r[i] == variable]
                if len(values) == 0:
                    value = missing
                elif len(values) == 1:
                    value = values[0]
                else:
                    if variable in reducers:
                        redu = reducers[variable]
                    else:
                        redu = list  # list all values
                    value = redu(values)
                out_row.append(value)
        yield tuple(out_row)
def _iternocache(self, source, key, reverse):
    """
    Sort `source` and yield its rows without consulting any prior cache.

    Rows are sorted by the `key` field selection (or by all fields when
    `key` is None), in chunks of at most `self.buffersize` rows.  If the
    data fits in one chunk it is sorted in memory; otherwise each sorted
    chunk is pickled to a temporary file and the chunks are merge-sorted
    on the way out.  When `self.cache` is set, the sorted result (rows or
    chunk files) is retained on `self` for reuse by a later iteration.
    """
    debug('iterate without cache')
    self._clearcache()
    it = iter(source)
    flds = it.next()  # header row passes through unsorted
    yield tuple(flds)

    if key is not None:
        # convert field selection into field indices
        indices = asindices(flds, key)
    else:
        # no key given: sort on all fields, left to right
        indices = range(len(flds))

    # now use field indices to construct a _getkey function
    # N.B., this will probably raise an exception on short rows
    getkey = sortable_itemgetter(*indices)

    # initialise the first chunk
    # NOTE(review): islice with stop=None (buffersize None) consumes the
    # whole iterator, i.e. a single in-memory sort
    rows = list(itertools.islice(it, 0, self.buffersize))
    rows.sort(key=getkey, reverse=reverse)

    # have we exhausted the source iterator?
    if self.buffersize is None or len(rows) < self.buffersize:
        # all data fits in one buffer - sort entirely in memory
        if self.cache:
            debug('caching mem')
            self._fldcache = flds
            self._memcache = rows
            # actually not needed to iterate from memcache
            self._getkey = getkey
        for row in rows:
            yield tuple(row)
    else:
        # too much data for one buffer - external merge sort via temp files
        chunkfiles = []

        while rows:
            # dump the chunk
            f = NamedTemporaryFile(dir=self.tempdir)
            for row in rows:
                pickle.dump(row, f, protocol=-1)
            f.flush()
            # N.B., do not close the file! Closing will delete
            # the file, and we might want to keep it around
            # if it can be cached. We'll let garbage collection
            # deal with this, i.e., when no references to the
            # chunk files exist any more, garbage collection
            # should be an implicit close, which will cause file
            # deletion.
            chunkfiles.append(f)

            # grab the next chunk
            rows = list(itertools.islice(it, 0, self.buffersize))
            rows.sort(key=getkey, reverse=reverse)

        if self.cache:
            debug('caching files %r', chunkfiles)
            self._fldcache = flds
            self._filecache = chunkfiles
            self._getkey = getkey

        # merge the sorted chunk files back into one sorted stream
        chunkiters = [iterchunk(f) for f in chunkfiles]
        for row in _mergesorted(getkey, reverse, *chunkiters):
            yield tuple(row)
def iterrecast(source, key, variablefield, valuefield,
               samplesize, reducers, missing):
    """
    Recast (pivot) `source` from long to wide format, yielding rows.

    Distinct values of the variable field(s) become output fields,
    populated from `valuefield` and grouped by the `key` field(s).
    Groups with several values for one variable are combined via
    `reducers` (variable name -> callable on a list of values),
    defaulting to `list`; groups with none emit `missing`.

    NOTE(review): this definition duplicates an earlier `iterrecast` in
    this file and shadows it — consider removing one copy.
    """
    # TODO a single cached pass would avoid re-running upstream
    # transformations; for now the data is traversed twice (sample pass
    # to discover variables, sorted pass to emit output).

    it = iter(source)
    fields = it.next()  # header row

    # normalise the key/variable selections to sequences of field names
    keyfields = (key,) if isinstance(key, basestring) else key
    variablefields = ((variablefield,)
                      if isinstance(variablefield, basestring)
                      else variablefield)  # N.B., could be more than one
    if not keyfields:
        # default key: everything that is neither variable nor value
        keyfields = [f for f in fields
                     if f not in variablefields and f != valuefield]
    if not variablefields:
        # default variables: everything that is neither key nor value
        variablefields = [f for f in fields
                          if f not in keyfields and f != valuefield]

    # sanity checks
    assert valuefield in fields, 'invalid value field: %s' % valuefield
    assert valuefield not in keyfields, 'value field cannot be keyfields'
    assert valuefield not in variablefields, \
        'value field cannot be variable field'
    for f in keyfields:
        assert f in fields, 'invalid keyfields field: %s' % f
    for f in variablefields:
        assert f in fields, 'invalid variable field: %s' % f

    # resolve field names to column positions once, up front
    valueindex = fields.index(valuefield)
    keyindices = [fields.index(f) for f in keyfields]
    variableindices = [fields.index(f) for f in variablefields]

    # determine the variable names to be cast as output fields
    if isinstance(variablefields, dict):
        # caller supplied the mapping directly - no sampling needed
        variables = variablefields
    else:
        # first pass: sample rows to discover the variable values
        variables = collections.defaultdict(set)
        for row in itertools.islice(it, 0, samplesize):
            for idx, name in zip(variableindices, variablefields):
                variables[name].add(row[idx])
        # fix an output order by sorting each discovered set
        for name in variables:
            variables[name] = sorted(variables[name])

    # emit the output header: key fields then one field per variable
    header = list(keyfields)
    for name in variablefields:
        header.extend(variables[name])
    yield tuple(header)

    # second pass: sort so each key forms one contiguous group
    source = sort(source, key=keyfields)
    body = itertools.islice(source, 1, None)  # skip header row
    getsortablekey = sortable_itemgetter(*keyindices)
    getactualkey = operator.itemgetter(*keyindices)

    for _, grp in itertools.groupby(body, key=getsortablekey):
        # materialise - the group is scanned once per variable below
        rows = list(grp)
        # groupby's key may be wrapped as a SortableItem; take the
        # actual key value from the group's first row instead
        keyval = getactualkey(rows[0])
        outrow = list(keyval) if len(keyfields) > 1 else [keyval]
        for name, idx in zip(variablefields, variableindices):
            for variable in variables[name]:
                # gather every value recorded for this variable
                matches = [r[valueindex] for r in rows if r[idx] == variable]
                if not matches:
                    value = missing
                elif len(matches) == 1:
                    value = matches[0]
                else:
                    # several values: reduce them (default: keep the list)
                    redu = reducers[variable] if variable in reducers else list
                    value = redu(matches)
                outrow.append(value)
        yield tuple(outrow)