def itermelt(source, key, variables, variablefield, valuefield):
    """Yield the rows of `source` reshaped from wide to long format.

    The first item of `source` is taken as the header. The output header is
    the key fields followed by `variablefield` and `valuefield`. For every
    data row, one output row is produced per variable field, holding the
    row's key values, the variable field name and that field's value.

    :param source: iterable of rows, header first
    :param key: field name, or sequence of field names, kept in every
        output row; if falsy, all fields not in `variables` are used
    :param variables: field name, or sequence of field names, to unpivot;
        if falsy, all fields not in `key` are used
    :param variablefield: name of the output field holding variable names
    :param valuefield: name of the output field holding values

    At least one of `key` and `variables` is expected to be given.
    A field name missing from the header raises ValueError (from
    ``flds.index``) once iteration passes the header row. An empty
    `source` (no header row) produces no output at all.
    """
    it = iter(source)
    try:
        flds = next(it)
    except StopIteration:
        # No header row, so there is nothing to melt. Returning explicitly
        # keeps the StopIteration from escaping the generator body.
        return

    # basestring only exists on Python 2; fall back to str elsewhere
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # normalise a single field name to a one-element tuple
    if isinstance(key, string_types):
        key = (key,)
    if isinstance(variables, string_types):  # shouldn't expect this, but...
        variables = (variables,)

    # whichever of key/variables was not given defaults to the complement
    if not key:
        key = [f for f in flds if f not in variables]
    if not variables:
        variables = [f for f in flds if f not in key]

    # output header
    yield tuple(key) + (variablefield, valuefield)

    getkey = rowgetter(*[flds.index(k) for k in key])
    # pair each variable name with its column position once, up front
    var_columns = [(v, flds.index(v)) for v in variables]

    # output data: one row per (input row, variable) combination
    for row in it:
        prefix = tuple(getkey(row))
        for name, col in var_columns:
            yield prefix + (name, row[col])
def itermelt(source, key, variables, variablefield, valuefield):
    """Yield the rows of `source` reshaped from wide to long format.

    The first item of `source` is taken as the header. The output header is
    the key fields followed by `variablefield` and `valuefield`. For every
    data row, one output row is produced per variable field, holding the
    row's key values, the variable field name and that field's value.

    :param source: iterable of rows, header first
    :param key: field name, or sequence of field names, kept in every
        output row; if falsy, all fields not in `variables` are used
    :param variables: field name, or sequence of field names, to unpivot;
        if falsy, all fields not in `key` are used
    :param variablefield: name of the output field holding variable names
    :param valuefield: name of the output field holding values

    At least one of `key` and `variables` is expected to be given.
    A field name missing from the header raises ValueError (from
    ``flds.index``) once iteration passes the header row. An empty
    `source` (no header row) produces no output at all.
    """
    it = iter(source)
    try:
        flds = next(it)
    except StopIteration:
        # No header row, so there is nothing to melt. Returning explicitly
        # keeps the StopIteration from escaping the generator body.
        return

    # basestring only exists on Python 2; fall back to str elsewhere
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # normalise a single field name to a one-element tuple
    if isinstance(key, string_types):
        key = (key,)
    if isinstance(variables, string_types):  # shouldn't expect this, but...
        variables = (variables,)

    # whichever of key/variables was not given defaults to the complement
    if not key:
        key = [f for f in flds if f not in variables]
    if not variables:
        variables = [f for f in flds if f not in key]

    # output header
    out_header = tuple(key) + (variablefield, valuefield)
    yield out_header

    key_indices = [flds.index(k) for k in key]
    getkey = rowgetter(*key_indices)
    # resolve every variable's column position once, before the row loop
    var_columns = [(v, flds.index(v)) for v in variables]

    # output data: one row per (input row, variable) combination
    for row in it:
        key_values = tuple(getkey(row))
        for name, col in var_columns:
            yield key_values + (name, row[col])
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Yield the rows of a right join driven by a pre-built left lookup.

    The first item of each table is taken as its header. The output header
    is every left field followed by the non-key right fields, each
    optionally prefixed. Every right row is emitted at least once: joined
    to each left row found under its key in `llookup`, or, when the key is
    absent, padded with `missing` in the left positions (except the key
    positions, which are filled from the right row).

    :param left: left table (only its header is read here)
    :param right: right table, iterated row by row
    :param lkey: key field spec for the left table, passed to `asindices`
    :param rkey: key field spec for the right table, passed to `asindices`
    :param missing: filler for left values of unmatched right rows
    :param llookup: container supporting ``in`` and indexing that maps a
        key to an iterable of left rows; keys must have the form produced
        by ``operator.itemgetter(*rkind)`` (a bare value for a single key
        field, a tuple for several)
    :param lprefix: if not None, prepended (via ``str``) to left field names
    :param rprefix: if not None, prepended (via ``str``) to right field names

    If either table has no header row, nothing is yielded.
    """
    lit = iter(left)
    rit = iter(right)
    try:
        lflds = next(lit)
        rflds = next(rit)
    except StopIteration:
        # A table without a header gives no output. Returning explicitly
        # keeps the StopIteration from escaping the generator body.
        return

    # determine indices of the key fields in left and right tables
    lkind = asindices(lflds, lkey)
    rkind = asindices(rflds, rkey)

    # function to extract the key value(s) from a *right* row
    rgetk = operator.itemgetter(*rkind)

    # indices of the non-key fields in the right table (the output only
    # carries the key fields once, in the left positions)
    rvind = [i for i in range(len(rflds)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outflds = list(lflds)
    else:
        outflds = [str(lprefix) + str(f) for f in lflds]
    if rprefix is None:
        outflds.extend(rgetv(rflds))
    else:
        outflds.extend(str(rprefix) + str(f) for f in rgetv(rflds))
    yield tuple(outflds)

    left_width = len(lflds)
    key_positions = list(zip(lkind, rkind))

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            # one output row per matching left row
            for lrow in llookup[k]:
                yield tuple(lrow) + tuple(rgetv(rrow))
        else:
            # no match: missing values stand in for the left row ...
            outrow = [missing] * left_width
            # ... except the key fields, which are known from the right row
            for li, ri in key_positions:
                outrow[li] = rrow[ri]
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Yield the rows of a left join against a one-row-per-key right lookup.

    The first item of each table is taken as its header. The right table
    is loaded into a lookup via ``lookupone(..., strict=False)``. Each left
    row is emitted exactly once: extended with the non-key values of the
    right row stored under its key, or with `missing` values when the key
    is not in the lookup.

    :param left: left table, iterated row by row
    :param right: right table, consumed in full to build the lookup
    :param lkey: key field spec for the left table, passed to `asindices`
    :param rkey: key field spec for the right table
    :param missing: filler for right values of unmatched left rows
    :param lprefix: if not None, prepended (via ``str``) to left field names
    :param rprefix: if not None, prepended (via ``str``) to right field names

    If the left table has no header row, nothing is yielded and the right
    table is not read.
    """
    lit = iter(left)
    try:
        lflds = next(lit)
    except StopIteration:
        # No left header gives no output. Returning explicitly keeps the
        # StopIteration from escaping the generator body.
        return

    rflds, rit = iterpeek(right)  # need the whole lot to pass to lookup
    from petl.util import lookupone
    rlookup = lookupone(rit, rkey, strict=False)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lflds, lkey)
    rkind = asindices(rflds, rkey)

    # function to extract the key value(s) from a left row
    lgetk = operator.itemgetter(*lkind)

    # indices of the non-key fields in the right table (the output only
    # carries the key fields once, from the left table)
    rvind = [i for i in range(len(rflds)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outflds = list(lflds)
    else:
        outflds = [str(lprefix) + str(f) for f in lflds]
    if rprefix is None:
        outflds.extend(rgetv(rflds))
    else:
        outflds.extend(str(rprefix) + str(f) for f in rgetv(rflds))
    yield tuple(outflds)

    # right-hand filler for left rows with no match; same for every row
    padding = (missing,) * len(rvind)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            # left row followed by the non-key values of the right row
            yield tuple(lrow) + tuple(rgetv(rlookup[k]))
        else:
            yield tuple(lrow) + padding
def __iter__(self):
    """Yield the rows of ``self.table`` with ``self.field`` repositioned.

    The first item of the table is taken as the header. ``self.field`` is
    removed from the header and re-inserted at position ``self.index``;
    every data row is reordered to match. Rows too short to supply every
    field are filled out with ``self.missing``. A table with no header row
    produces no output.
    """
    it = iter(self.table)
    try:
        fields = list(next(it))
    except StopIteration:
        # No header row gives no output. Returning explicitly keeps the
        # StopIteration from escaping the generator body.
        return

    # determine output fields: drop the field, then put it back where wanted
    newfields = [f for f in fields if f != self.field]
    newfields.insert(self.index, self.field)
    yield tuple(newfields)

    # map the new field order back onto source column positions
    indices = asindices(fields, newfields)
    transform = rowgetter(*indices)

    # construct the transformed data
    for row in it:
        try:
            outrow = transform(row)
        except IndexError:
            # row is short, let's be kind and fill in any missing fields
            width = len(row)
            outrow = tuple(row[i] if i < width else self.missing
                           for i in indices)
        yield outrow
def itercut(source, spec, missing=None):
    """Yield the rows of `source` restricted to the fields chosen by `spec`.

    The first item of `source` is taken as the header; `spec` is resolved
    against it with `asindices`, and the header and every data row are
    projected onto the resulting positions, in `spec` order.

    :param source: iterable of rows, header first
    :param spec: iterable of field selectors; copied to a tuple up front
    :param missing: filler for selected positions beyond the end of a
        short row

    An empty `source` (no header row) produces no output.
    """
    it = iter(source)
    spec = tuple(spec)  # make sure no-one can change midstream

    try:
        flds = next(it)
    except StopIteration:
        # No header row gives no output. Returning explicitly keeps the
        # StopIteration from escaping the generator body.
        return

    # convert field selection into field indices, and those into a
    # function that projects a row onto the selection
    indices = asindices(flds, spec)
    transform = rowgetter(*indices)

    # yield the transformed field names
    yield transform(flds)

    # construct the transformed data
    for row in it:
        try:
            outrow = transform(row)
        except IndexError:
            # row is short, let's be kind and fill in any missing fields
            width = len(row)
            outrow = tuple(row[i] if i < width else missing
                           for i in indices)
        yield outrow
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """Yield the rows of a left merge-join using one right row per key.

    The first item of each table is taken as its header. Rows of each
    table are grouped by key with ``itertools.groupby`` and the two group
    streams are walked in step, advancing whichever has the smaller key;
    this presumes both tables are already sorted by their key. Every left
    row is emitted exactly once: extended with the non-key values of the
    first row of the matching right group, or with `missing` values when
    no right group has the same key.

    :param left: left table, header first
    :param right: right table, header first
    :param lkey: key field spec for the left table, passed to `asindices`
    :param rkey: key field spec for the right table, passed to `asindices`
    :param missing: filler for right values of unmatched left rows
    :param lprefix: if not None, prepended (via ``str``) to left field names
    :param rprefix: if not None, prepended (via ``str``) to right field names

    If either table has no header row, nothing is yielded.
    """
    lit = iter(left)
    rit = iter(right)
    try:
        lflds = next(lit)
        rflds = next(rit)
    except StopIteration:
        # A table without a header gives no output. Returning explicitly
        # keeps the StopIteration from escaping the generator body.
        return

    # determine indices of the key fields in left and right tables
    lkind = asindices(lflds, lkey)
    rkind = asindices(rflds, rkey)

    # functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # indices of the non-key fields in the right table (the output only
    # carries the key fields once, from the left table)
    rvind = [i for i in range(len(rflds)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outflds = list(lflds)
    else:
        outflds = [str(lprefix) + str(f) for f in lflds]
    if rprefix is None:
        outflds.extend(rgetv(rflds))
    else:
        outflds.extend(str(rprefix) + str(f) for f in rgetv(rflds))
    yield tuple(outflds)

    # right-hand filler for left rows with no match; same for every row
    padding = (missing,) * len(rvind)

    def unmatched(lgroup):
        # left rows with no right counterpart, padded with missing values
        for lrow in lgroup:
            yield tuple(lrow) + padding

    def matched(lgroup, rgroup):
        # left rows extended with the first right row of the group
        rvals = tuple(rgetv(next(iter(rgroup))))  # pick first arbitrarily
        for lrow in lgroup:
            yield tuple(lrow) + rvals

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)

    # `pending` is True while a left group has been fetched but not yet
    # emitted. Tracking this explicitly (rather than comparing the last
    # key values after the loop) keeps the group from being lost when the
    # right table runs out before any comparison has been made.
    pending = False
    lrowgrp = None
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        pending = True
        rkval, rrowgrp = next(rgit)

        # loop until *either* of the iterators is exhausted
        while True:
            if lkval < rkval:
                for row in unmatched(lrowgrp):
                    yield row
                pending = False
                # advance left
                lkval, lrowgrp = next(lgit)
                pending = True
            elif lkval > rkval:
                # advance right; the left group stays pending
                rkval, rrowgrp = next(rgit)
            else:
                for row in matched(lrowgrp, rrowgrp):
                    yield row
                pending = False
                # advance both
                lkval, lrowgrp = next(lgit)
                pending = True
                rkval, rrowgrp = next(rgit)
    except StopIteration:
        pass

    # make sure any left rows remaining are yielded
    if pending:
        # the left group that was in hand when the right table ran out
        for row in unmatched(lrowgrp):
            yield row
    # the rest of the left table has no possible match
    for _, lrowgrp in lgit:
        for row in unmatched(lrowgrp):
            yield row