def iterunique(source, key):
    """Yield the header, then only those rows whose key occurs exactly once.

    Assumes `source` is already sorted by `key`. If `key` is None the
    whole row is used as the key. May raise on short rows, depending on
    the field selection.
    """
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    # FIX: guard against a header-only table; an uncaught StopIteration
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+)
    try:
        prev = next(it)
    except StopIteration:
        return
    prev_key = getkey(prev)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = (curr_key != prev_key)
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # last one?
    if prev_comp_ne:
        # FIX: yield a tuple for consistency with every other row yielded
        yield tuple(prev)
def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Construct faceted interval trees for the given table, where each node
    in the tree is a row of the table.
    """
    import intervaltree
    it = iter(table)
    hdr = next(it)
    flds = [text_type(f) for f in hdr]
    assert start in flds, 'start field not recognised'
    assert stop in flds, 'stop field not recognised'
    # getters for the interval bounds
    getstart = itemgetter(flds.index(start))
    getstop = itemgetter(flds.index(stop))
    # getter for the value stored on each interval
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    # getter for the facet key
    keyindices = asindices(hdr, key)
    assert len(keyindices) > 0, 'invalid key'
    getkey = itemgetter(*keyindices)
    # build one interval tree per distinct key value
    trees = dict()
    for row in it:
        k = getkey(row)
        tree = trees.setdefault(k, intervaltree.IntervalTree())
        tree.addi(getstart(row), getstop(row), getvalue(row))
    return trees
def __init__(self, default_connections, keyed_connections, fields, key):
    """Set up key extraction and duplicate-tracking state.

    FIX: the original body was duplicated verbatim — the whole
    initialisation sequence ran twice with identical effect; the
    redundant copy has been removed.
    """
    super(DuplicatesConnection, self).__init__(default_connections,
                                               keyed_connections, fields)

    # convert field selection into field indices
    indices = asindices(fields, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    self.getkey = itemgetter(*indices)

    # initial state
    self.previous = None
    self.previous_is_duplicate = False
def iterhashantijoin(left, right, lkey, rkey):
    """Yield left-table rows whose key does not occur in the right table."""
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)
    yield tuple(lhdr)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # collect the set of key values present in the right table
    seen = {rgetk(rrow) for rrow in rit}

    # pass through only left rows whose key is absent from the right
    for lrow in lit:
        if lgetk(lrow) not in seen:
            yield tuple(lrow)
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Right join using a prebuilt lookup of left rows keyed by `lkey`."""
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # key field indices on each side
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # key extractor for the right table
    rgetk = operator.itemgetter(*rkind)

    # non-key field indices in the right table (key fields are carried
    # from the left side only, to avoid duplicating them in the output)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # assemble the output header, applying prefixes where given
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            # matched: emit one output row per matching left row
            for lrow in llookup[k]:
                outrow = list(lrow)
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # unmatched: pad the left side with missing values, copying
            # the key values across from the right row
            outrow = [missing] * len(lhdr)
            for li, ri in zip(lkind, rkind):
                outrow[li] = rrow[ri]
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """Right join using a prebuilt lookup of left rows keyed by `lkey`."""
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # key field indices on each side
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # key extractor for the right table
    rgetk = operator.itemgetter(*rkind)

    # non-key field indices in the right table (key fields are carried
    # from the left side only, to avoid duplicating them in the output)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # assemble the output header, applying prefixes where given
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    for rrow in rit:
        k = rgetk(rrow)
        if k in llookup:
            # matched: emit one output row per matching left row
            for lrow in llookup[k]:
                outrow = list(lrow)
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
        else:
            # unmatched: pad the left side with missing values, copying
            # the key values across from the right row
            outrow = [missing] * len(lhdr)
            for li, ri in zip(lkind, rkind):
                outrow[li] = rrow[ri]
            outrow.extend(rgetv(rrow))
            yield tuple(outrow)
def iterantijoin(left, right, lkey, rkey):
    # Merge-based anti-join: yield rows from the left table whose key does
    # not appear in the right table. Both inputs are assumed to be sorted
    # by their respective keys — TODO confirm against callers.
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)
    yield tuple(lhdr)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = comparable_itemgetter(*lkind)
    rgetk = comparable_itemgetter(*rkind)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []

    # loop until *either* of the iterators is exhausted
    # NOTE(review): Comparable(None) presumably acts as a "no key seen yet"
    # sentinel for the leftover comparison below — verify its ordering
    # semantics against the Comparable implementation
    lkval, rkval = Comparable(None), Comparable(None)
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, _ = next(rgit)

        while True:
            if lkval < rkval:
                # left key has no right-side match: emit the whole group
                for row in lrowgrp:
                    yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                # advance right
                rkval, _ = next(rgit)
            else:
                # keys match: suppress the left group; advance both
                lkval, lrowgrp = next(lgit)
                rkval, _ = next(rgit)
    except StopIteration:
        pass

    # any left over?
    if lkval > rkval:
        # yield anything that got left hanging
        for row in lrowgrp:
            yield tuple(row)
    # and the rest... (remaining left groups can have no right match
    # because the right iterator is exhausted)
    for lkval, lrowgrp in lgit:
        for row in lrowgrp:
            yield tuple(row)
def iterfilldown(table, fillfields, missing, where, anchorfields, until):
    # Fill-down with an optional row predicate (`where`), anchor fields
    # that must match between consecutive rows for filling to occur, and
    # an `until` predicate that resets the fill values from the next row.

    # prepare where function
    if isinstance(where, string_types):
        where = expr(where)
    elif where is not None:
        assert callable(where), 'expected callable for "where" argument, found %r' % where
    else:
        where = lambda r: True  # default where callable returns True

    # prepare until function
    if isinstance(until, string_types):
        until = expr(until)
    elif until is not None:
        assert callable(until), 'expected callable for "until" argument, found %r' % until
    else:
        until = lambda r: False  # default until callable returns False

    # normal iter function
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    if anchorfields:
        anchorindices = asindices(hdr, anchorfields)

    fill = list(next(it))  # fill values
    prev = fill
    untilfunctiontriggered = False
    yield tuple(fill)

    for row in it:
        outrow = list(row)
        if untilfunctiontriggered:
            # previous row triggered `until`: restart filling from this
            # row; note `fill` now aliases `outrow`, so in-place updates
            # below also seed the new fill values
            fill = outrow
            untilfunctiontriggered = False  # reset
        if anchorfields:
            # only fill when anchor field values match the previous row
            row_values = [row[i] for i in anchorindices]
            prev_values = [prev[i] for i in anchorindices]
            check_anchor = row_values == prev_values
        else:
            check_anchor = True
        # loop through fill-down fields
        for idx in fillindices:
            if row[idx] == missing and where(Record(row, flds)) and check_anchor:
                outrow[idx] = fill[idx]  # fill down
            elif row[idx] == missing and check_anchor:
                # missing but `where` rejected the row: leave it missing
                pass
            else:
                fill[idx] = row[idx]  # new fill value
        prev = outrow
        yield tuple(outrow)
        # found stop point, reset fill with next row's contents
        if until(Record(row, flds)):
            untilfunctiontriggered = True
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Left join via a one-row-per-key lookup built from the right table."""
    lit = iter(left)
    lhdr = next(lit)

    rhdr, rit = iterpeek(right)  # need the whole lot to pass to lookup
    rlookup = lookupone(rit, rkey, strict=False)

    # key field indices on each side
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # key extractor for the left table
    lgetk = operator.itemgetter(*lkind)

    # non-key field indices in the right table (only the left table's key
    # fields are carried through, avoiding duplication)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # output header with optional prefixes
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    pad = len(rvind)
    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            # matched: left row extended with the right non-key values
            yield tuple(list(lrow) + list(rgetv(rlookup[k])))
        else:
            # unmatched: left row padded with missing values
            yield tuple(list(lrow) + [missing] * pad)
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """Left join via a one-row-per-key lookup built from the right table."""
    lit = iter(left)
    lhdr = next(lit)

    rhdr, rit = iterpeek(right)  # need the whole lot to pass to lookup
    rlookup = lookupone(rit, rkey, strict=False)

    # key field indices on each side
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # key extractor for the left table
    lgetk = operator.itemgetter(*lkind)

    # non-key field indices in the right table (only the left table's key
    # fields are carried through, avoiding duplication)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # output header with optional prefixes
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(text_type(lprefix) + text_type(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(text_type(rprefix) + text_type(f))
                       for f in rgetv(rhdr)])
    yield tuple(outhdr)

    pad = len(rvind)
    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            # matched: left row extended with the right non-key values
            yield tuple(list(lrow) + list(rgetv(rlookup[k])))
        else:
            # unmatched: left row padded with missing values
            yield tuple(list(lrow) + [missing] * pad)
def tupletree(table, start='start', stop='stop', value=None):
    """
    Construct an interval tree for the given table, where each node in the
    tree is a row of the table.
    """
    import intervaltree
    tree = intervaltree.IntervalTree()
    it = iter(table)
    hdr = next(it)
    flds = [text_type(f) for f in hdr]
    assert start in flds, 'start field not recognised'
    assert stop in flds, 'stop field not recognised'
    # getters for the interval bounds
    getstart = itemgetter(flds.index(start))
    getstop = itemgetter(flds.index(stop))
    # by default the whole row (as a tuple) is stored on each interval
    getvalue = tuple
    if value is not None:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    for r in it:
        tree.addi(getstart(r), getstop(r), getvalue(r))
    return tree
def itersearch(table, pattern, field, flags, complement):
    """Yield rows where the regex matches (or, if complement, does not)."""
    prog = re.compile(pattern, flags)
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    # build a match predicate appropriate to the field selection
    if field is None:
        # search the whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    if not complement:
        # return rows that match
        for row in it:
            if test(row):
                yield tuple(row)
    else:
        # return rows that do not match
        for row in it:
            if not test(row):
                yield tuple(row)
def iterunique(source, key):
    """Yield the header, then only those rows whose key occurs exactly once.

    Assumes `source` is already sorted by `key`. If `key` is None the
    whole row is used as the key. May raise on short rows, depending on
    the field selection.
    """
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    # FIX: guard against a header-only table; an uncaught StopIteration
    # inside a generator is a RuntimeError under PEP 479 (Python 3.7+)
    try:
        prev = next(it)
    except StopIteration:
        return
    prev_key = getkey(prev)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = (curr_key != prev_key)
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # last one?
    if prev_comp_ne:
        # FIX: yield a tuple for consistency with every other row yielded
        yield tuple(prev)
def recordlookup(table, key, dictionary=None):
    """
    Load a dictionary with data from the given table, mapping to record
    objects.
    """
    if dictionary is None:
        dictionary = dict()

    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    keyindices = asindices(hdr, key)
    assert len(keyindices) > 0, 'no key selected'
    getkey = operator.itemgetter(*keyindices)

    for row in it:
        k = getkey(row)
        rec = Record(row, flds)
        if k in dictionary:
            # read-modify-write so this also works with shelve-backed
            # dictionaries, which do not observe in-place mutation
            existing = dictionary[k]
            existing.append(rec)
            dictionary[k] = existing
        else:
            dictionary[k] = [rec]
    return dictionary
def iterhashjoin(left, right, lkey, rkey, rlookup, lprefix, rprefix):
    """Inner join using a prebuilt lookup of right rows keyed by `rkey`."""
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # key field indices on each side
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # key extractor for the left table
    lgetk = operator.itemgetter(*lkind)

    # non-key field indices in the right table (key fields are carried
    # from the left side only, to avoid duplicating them in the output)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # assemble the output header, applying prefixes where given
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    for lrow in lit:
        k = lgetk(lrow)
        if k in rlookup:
            # emit one output row per matching right row
            for rrow in rlookup[k]:
                outrow = list(lrow)
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)
def __init__(self, default_connections, keyed_connections, fields,
             discriminator):
    # route rows by the discriminator: either a callable applied to each
    # row, or a field selection turned into an itemgetter
    super(PartitionConnection, self).__init__(default_connections,
                                              keyed_connections, fields)
    if not callable(discriminator):
        # assume field or fields
        discriminator = itemgetter(*asindices(fields, discriminator))
    self.discriminator = discriminator
def itermelt(source, key, variables, variablefield, valuefield):
    """Unpivot `variables` columns into (variable, value) pairs per row."""
    if key is None and variables is None:
        raise ValueError('either key or variables must be specified')

    it = iter(source)
    hdr = next(it)

    # resolve key and variable field indices; whichever is unspecified
    # defaults to the complement of the other
    key_indices = variables_indices = None
    if key is not None:
        key_indices = asindices(hdr, key)
    if variables is not None:
        if not isinstance(variables, (list, tuple)):
            variables = (variables,)
        variables_indices = asindices(hdr, variables)
    if key is None:
        # assume key is fields not in variables
        key_indices = [i for i in range(len(hdr))
                       if i not in variables_indices]
    if variables is None:
        # assume variables are fields not in key
        variables_indices = [i for i in range(len(hdr))
                             if i not in key_indices]
        variables = [hdr[i] for i in variables_indices]

    getkey = rowgetter(*key_indices)

    # output header: key fields, then the variable and value columns
    outhdr = [hdr[i] for i in key_indices]
    outhdr.append(variablefield)
    outhdr.append(valuefield)
    yield tuple(outhdr)

    # emit one output row per (row, variable) pair
    for row in it:
        k = getkey(row)
        for v, i in zip(variables, variables_indices):
            try:
                yield tuple(list(k) + [v, row[i]])
            except IndexError:
                # row is missing this value, and melt() should yield no row
                pass
def itermergesort(sources, key, header, missing, reverse):
    # Merge multiple sorted sources into one sorted stream, first
    # normalising each source onto a common output header.

    # first need to standardise headers of all input tables
    # borrow this from itercat - TODO remove code smells
    its = [iter(t) for t in sources]
    src_hdrs = [next(it) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the
        # sources
        outhdr = list()
        for hdr in src_hdrs:
            for f in list(map(text_type, hdr)):
                if f not in outhdr:
                    # add any new fields as we find them
                    outhdr.append(f)
    else:
        # predetermined output fields
        outhdr = header
    yield tuple(outhdr)

    def _standardisedata(it, hdr, ofs):
        # map rows from a source with header `hdr` onto the output fields
        # `ofs`, substituting `missing` for fields the source lacks
        flds = list(map(text_type, hdr))
        # now construct and yield the data rows
        for _row in it:
            try:
                # should be quickest to do this way
                yield tuple(_row[flds.index(fo)] if fo in flds else missing
                            for fo in ofs)
            except IndexError:
                # handle short rows
                outrow = [missing] * len(ofs)
                for i, fi in enumerate(flds):
                    try:
                        outrow[ofs.index(fi)] = _row[i]
                    except IndexError:
                        pass  # be relaxed about short rows
                yield tuple(outrow)

    # wrap all iterators to standardise fields
    sits = [_standardisedata(it, hdr, outhdr)
            for hdr, it in zip(src_hdrs, its)]

    # now determine key function
    getkey = None
    if key is not None:
        # convert field selection into field indices
        indices = asindices(outhdr, key)
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = comparable_itemgetter(*indices)

    # OK, do the merge sort
    for row in _shortlistmergesorted(getkey, reverse, *sits):
        yield row
def _setup_lookup(table, key, value):
    """Return (data iterator, key getter, value getter) for `table`."""
    it = iter(table)
    hdr = next(it)

    # key getter from the field selection
    kind = asindices(hdr, key)
    assert len(kind) > 0, 'no key selected'
    getkey = operator.itemgetter(*kind)

    # value getter; by default the complete row is the value
    if value is None:
        getvalue = rowgetter(*range(len(hdr)))
    else:
        vind = asindices(hdr, value)
        assert len(vind) > 0, 'no value selected'
        getvalue = operator.itemgetter(*vind)

    return it, getkey, getvalue
def iterconflicts(source, key, missing, exclude, include):
    """Yield rows sharing a key but disagreeing on some considered field."""
    # normalise arguments
    if exclude and not isinstance(exclude, (list, tuple)):
        exclude = (exclude,)
    if include and not isinstance(include, (list, tuple)):
        include = (include,)
    # exclude overrides include
    if include and exclude:
        include = None

    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    # build a key getter from the field selection; N.B., this may raise
    # an exception on short rows, depending on the field selection
    getkey = operator.itemgetter(*asindices(hdr, key))

    def considered(f):
        # does this field participate in conflict detection?
        return ((exclude and f not in exclude)
                or (include and f in include)
                or (not exclude and not include))

    previous = None
    previous_yielded = False

    for row in it:
        if previous is None:
            previous = row
        else:
            if getkey(previous) == getkey(row):
                # same key: is there a conflict in any considered field?
                conflict = any(
                    missing not in (x, y) and x != y
                    for x, y, f in zip(previous, row, flds)
                    if considered(f))
                if conflict:
                    if not previous_yielded:
                        yield tuple(previous)
                        previous_yielded = True
                    yield tuple(row)
            else:
                # reset
                previous_yielded = False
            previous = row
def iterconflicts(source, key, missing, exclude, include):
    """Yield rows sharing a key but disagreeing on some considered field."""
    # coerce single-field exclude/include specs to tuples
    if exclude and not isinstance(exclude, (list, tuple)):
        exclude = (exclude,)
    if include and not isinstance(include, (list, tuple)):
        include = (include,)
    if include and exclude:
        include = None  # exclude overrides include

    it = iter(source)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    # key getter; may raise on short rows, depending on field selection
    indices = asindices(hdr, key)
    getkey = operator.itemgetter(*indices)

    prev = None
    prev_yielded = False
    for row in it:
        if prev is None:
            prev = row
            continue
        if getkey(prev) == getkey(row):
            # same key: look for a conflicting field value
            found = False
            for x, y, f in zip(prev, row, flds):
                relevant = ((exclude and f not in exclude)
                            or (include and f in include)
                            or (not exclude and not include))
                if relevant and missing not in (x, y) and x != y:
                    found = True
                    break
            if found:
                if not prev_yielded:
                    yield tuple(prev)
                    prev_yielded = True
                yield tuple(row)
        else:
            prev_yielded = False  # reset on key change
        prev = row
def iterfieldselect(source, field, where, complement, missing):
    """Yield the header, then rows where `where(value)` XOR `complement`.

    Short rows that cannot supply the selected field(s) are tested with
    the `missing` value instead.
    """
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)
    indices = asindices(hdr, field)
    getv = operator.itemgetter(*indices)
    for row in it:
        try:
            v = getv(row)
        except IndexError:
            # row too short to supply the selected field(s)
            v = missing
        # FIX: coerce the predicate result to bool so the XOR with
        # `complement` is reliable even when `where` returns a truthy
        # non-bool value (matches the sibling implementation of this
        # function elsewhere in the file)
        if bool(where(v)) != complement:  # XOR
            yield tuple(row)
def iterfieldselect(source, field, where, complement, missing):
    """Yield the header, then rows where `where(value)` XOR `complement`."""
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)

    getv = operator.itemgetter(*asindices(hdr, field))

    for row in it:
        try:
            value = getv(row)
        except IndexError:
            # short row: treat the selected value as missing
            value = missing
        # XOR of predicate result and complement flag
        if bool(where(value)) != complement:
            yield tuple(row)
def iterfillright(table, fillfields, missing):
    """Fill missing values from the field immediately to the left."""
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    yield tuple(hdr)

    if not fillfields:
        # fill all fields by default
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)

    for row in it:
        outrow = list(row)
        # copy each missing value from its left-hand neighbour, working
        # left to right so fills can cascade across the row
        for i in range(1, len(outrow)):
            if (outrow[i] == missing and outrow[i - 1] != missing
                    and i in fillindices):
                outrow[i] = outrow[i - 1]
        yield tuple(outrow)
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    """

    # choose the comparison that must hold between each successive pair
    # of rows (or keys) for the table to count as sorted
    if reverse:
        op = operator.lt if strict else operator.le
    else:
        op = operator.gt if strict else operator.ge

    it = iter(table)
    flds = [text_type(f) for f in next(it)]
    if key is None:
        # compare whole rows
        prev = next(it)
        for curr in it:
            if not op(curr, prev):
                return False
            prev = curr
    else:
        # compare key values only
        getkey = comparable_itemgetter(*asindices(flds, key))
        prev = next(it)
        prevkey = getkey(prev)
        for curr in it:
            currkey = getkey(curr)
            if not op(currkey, prevkey):
                return False
            prevkey = currkey
    return True
def iterfilldown(table, fillfields, missing):
    """Replace missing values with the last non-missing value above them."""
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)

    if not fillfields:
        # fill all fields by default
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)

    # the first data row seeds the fill values
    fill = list(next(it))
    yield tuple(fill)

    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                # carry the previous value down
                outrow[idx] = fill[idx]
            else:
                # new fill value
                fill[idx] = row[idx]
        yield tuple(outrow)
def __iter__(self):
    # emit the table with its fields rearranged into sorted order
    it = iter(self.table)
    hdr = next(it)
    shdr = sorted(hdr)
    indices = asindices(hdr, shdr)
    transform = rowgetter(*indices)

    # transformed header first
    yield tuple(shdr)

    missing = self.missing
    for row in it:
        try:
            yield transform(row)
        except IndexError:
            # short row: pad the absent fields with the missing value
            yield tuple(row[i] if i < len(row) else missing
                        for i in indices)
def __iter__(self):
    # De-duplicate consecutive rows with equal keys; optionally add a
    # count column (self.count) recording the size of each run.
    it = iter(self.table)
    hdr = next(it)

    # convert field selection into field indices
    if self.key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, self.key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    # sentinel distinct from any possible row or key value (including
    # None), so the first row is never mistaken for a duplicate
    INIT = object()

    if self.count:
        # emit each distinct key's first row once, with a count column
        hdr = tuple(hdr) + (self.count,)
        yield hdr
        previous = INIT
        n_dup = 1
        for row in it:
            if previous is INIT:
                previous = row
            else:
                kprev = getkey(previous)
                kcurr = getkey(row)
                if kprev == kcurr:
                    n_dup += 1
                else:
                    yield tuple(previous) + (n_dup,)
                    n_dup = 1
                    previous = row
        # deal with last row
        # NOTE(review): assumes at least one data row in count mode —
        # tuple(previous) fails on a header-only table; confirm intended
        yield tuple(previous) + (n_dup,)
    else:
        # emit only the first row of each run of equal keys
        yield tuple(hdr)
        previous_keys = INIT
        for row in it:
            keys = getkey(row)
            if keys != previous_keys:
                yield tuple(row)
            previous_keys = keys
def __iter__(self):
    # De-duplicate consecutive rows with equal keys; optionally add a
    # count column (self.count) recording the size of each run.
    it = iter(self.table)
    hdr = next(it)

    # convert field selection into field indices
    if self.key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, self.key)

    # now use field indices to construct a _getkey function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    # FIX: use a dedicated sentinel instead of None for the initial
    # state, so a first key value that is literally None is not silently
    # treated as "already seen" and the row dropped; this matches the
    # sibling implementation of this method elsewhere in the file
    INIT = object()

    if self.count:
        hdr = tuple(hdr) + (self.count,)
        yield hdr
        previous = INIT
        n_dup = 1
        for row in it:
            if previous is INIT:
                previous = row
            else:
                kprev = getkey(previous)
                kcurr = getkey(row)
                if kprev == kcurr:
                    n_dup += 1
                else:
                    yield tuple(previous) + (n_dup,)
                    n_dup = 1
                    previous = row
        # deal with last row
        yield tuple(previous) + (n_dup,)
    else:
        yield tuple(hdr)
        previous_keys = INIT
        for row in it:
            keys = getkey(row)
            if keys != previous_keys:
                yield tuple(row)
            previous_keys = keys
def facetcolumns(table, key, missing=None):
    """
    Like :func:`petl.util.materialise.columns` but stratified by values of the given
    key field. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['a', 1, True],
        ...          ['b', 2, True],
        ...          ['b', 3]]
        >>> fc = etl.facetcolumns(table, 'foo')
        >>> fc['a']
        {'foo': ['a'], 'bar': [1], 'baz': [True]}
        >>> fc['b']
        {'foo': ['b', 'b'], 'bar': [2, 3], 'baz': [True, None]}

    """
    fct = dict()
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))

    indices = asindices(hdr, key)
    assert len(indices) > 0, 'no key field selected'
    getkey = operator.itemgetter(*indices)

    for row in it:
        kv = getkey(row)
        if kv in fct:
            cols = fct[kv]
        else:
            # first row for this key value: start empty column lists
            cols = {f: [] for f in flds}
            fct[kv] = cols
        # append values column-wise, padding short rows with `missing`
        for f, v in izip_longest(flds, row, fillvalue=missing):
            if f in cols:
                cols[f].append(v)
    return fct
def __init__(self, default_connections, keyed_connections, fields, key,
             reverse, buffersize):
    # set up sort key, direction, buffer size and chunk state
    super(SortConnection, self).__init__(default_connections,
                                         keyed_connections, fields)
    self.getkey = None
    if key is not None:
        # convert the field selection into indices and build a key
        # function; N.B., this will probably raise an exception on
        # short rows
        self.getkey = comparable_itemgetter(*asindices(fields, key))
    self.reverse = reverse
    # fall back to the package-wide default buffer size
    if buffersize is None:
        self.buffersize = petl.config.sort_buffersize
    else:
        self.buffersize = buffersize
    self.cache = list()
    self.chunkfiles = list()
def itercut(source, spec, missing=None):
    """Yield only the fields named in `spec`, in the order given."""
    it = iter(source)
    spec = tuple(spec)  # make sure no-one can change midstream

    # resolve the field selection against the header
    hdr = next(it)
    indices = asindices(hdr, spec)

    # row transform for the selected fields
    transform = rowgetter(*indices)

    # transformed header first
    yield transform(hdr)

    for row in it:
        try:
            yield transform(row)
        except IndexError:
            # short row: fill absent fields with the missing value
            yield tuple(row[i] if i < len(row) else missing
                        for i in indices)
def __iter__(self):
    # move self.field to position self.index in the header, shifting the
    # remaining fields accordingly
    it = iter(self.table)
    hdr = next(it)
    outhdr = [f for f in hdr if f != self.field]
    outhdr.insert(self.index, self.field)
    yield tuple(outhdr)

    # transform each row according to the new field order
    outflds = list(map(str, outhdr))
    indices = asindices(hdr, outflds)
    transform = rowgetter(*indices)

    for row in it:
        try:
            yield transform(row)
        except IndexError:
            # short row: pad absent fields with the missing value
            yield tuple(row[i] if i < len(row) else self.missing
                        for i in indices)
def iterduplicates(source, key):
    """Yield the header, then all rows belonging to runs of equal keys.

    Assumes `source` is already sorted by `key`; if `key` is None the
    whole row is the key.
    """
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # key function; N.B., this may raise an exception on short rows,
    # depending on the field selection
    getkey = operator.itemgetter(*indices)

    previous = None
    previous_yielded = False
    for row in it:
        if previous is None:
            previous = row
            continue
        if getkey(previous) == getkey(row):
            # part of a run of duplicates: yield the run's first row once,
            # then every subsequent row
            if not previous_yielded:
                yield tuple(previous)
                previous_yielded = True
            yield tuple(row)
        else:
            # new key value; reset
            previous_yielded = False
        previous = row
def recordlookupone(table, key, dictionary=None, strict=False):
    """
    Load a dictionary with data from the given table, mapping to record
    objects, assuming there is at most one row for each key.
    """
    if dictionary is None:
        dictionary = dict()

    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    keyindices = asindices(hdr, key)
    assert len(keyindices) > 0, 'no key selected'
    getkey = operator.itemgetter(*keyindices)

    for row in it:
        k = getkey(row)
        if strict and k in dictionary:
            raise DuplicateKeyError(k)
        elif k not in dictionary:
            # first occurrence wins when strict checking is off
            dictionary[k] = Record(row, flds)
    return dictionary
def iterintervaljoin(left, right, lstart, lstop, rstart, rstop, lkey, rkey,
                     include_stop, missing, lprefix, rprefix, leftouter,
                     anti=False):
    """Generate rows joining `left` against `right` on interval overlap.

    Each left row's (`lstart`, `lstop`) interval is searched against an
    interval lookup built from the right table.  When `rkey` is given the
    lookup is faceted by key, so only right rows sharing the left row's
    `lkey` value are candidates.  `leftouter` keeps unmatched left rows
    (padded with `missing`); `anti` yields only unmatched left rows and
    omits right-hand fields from the output entirely.
    """
    # create iterators and obtain fields
    lit = iter(left)
    lhdr = next(lit)
    lflds = list(map(text_type, lhdr))
    rit = iter(right)
    rhdr = next(rit)
    rflds = list(map(text_type, rhdr))

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lhdr, lstart)
    asindices(lhdr, lstop)
    if lkey is not None:
        asindices(lhdr, lkey)
    asindices(rhdr, rstart)
    asindices(rhdr, rstop)
    if rkey is not None:
        asindices(rhdr, rkey)

    # determine output fields (right fields are only included when not an
    # anti-join; prefixes, if given, are prepended to every field name)
    if lprefix is None:
        outhdr = list(lflds)
        if not anti:
            outhdr.extend(rflds)
    else:
        outhdr = list(lprefix + f for f in lflds)
        if not anti:
            outhdr.extend(rprefix + f for f in rflds)
    yield tuple(outhdr)

    # create getters for start and stop positions
    getlstart = itemgetter(lflds.index(lstart))
    getlstop = itemgetter(lflds.index(lstop))

    if rkey is None:
        # build interval lookup for right table
        lookup = intervallookup(right, rstart, rstop,
                                include_stop=include_stop)
        search = lookup.search
        # main loop
        for lrow in lit:
            start = getlstart(lrow)
            stop = getlstop(lrow)
            # find right rows overlapping this left row's interval
            rrows = search(start, stop)
            if rrows:
                if not anti:
                    for rrow in rrows:
                        outrow = list(lrow)
                        outrow.extend(rrow)
                        yield tuple(outrow)
            elif leftouter:
                # no match: keep the left row, padded unless anti-joining
                outrow = list(lrow)
                if not anti:
                    outrow.extend([missing] * len(rflds))
                yield tuple(outrow)
    else:
        # build interval lookup for right table, faceted by key
        lookup = facetintervallookup(right, key=rkey, start=rstart,
                                     stop=rstop, include_stop=include_stop)
        search = dict()
        for f in lookup:
            search[f] = lookup[f].search
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lflds, lkey))
        # main loop
        for lrow in lit:
            # N.B., this rebinds the `lkey` parameter, which has already
            # been fully consumed above, so no harm — but beware on edit
            lkey = getlkey(lrow)
            start = getlstart(lrow)
            stop = getlstop(lrow)
            try:
                rrows = search[lkey](start, stop)
            except KeyError:
                # no facet for this key in the right table
                rrows = None
            except AttributeError:
                # NOTE(review): presumably guards a facet whose lookup has
                # no usable search attribute — TODO confirm which case
                # actually produces this
                rrows = None
            if rrows:
                if not anti:
                    for rrow in rrows:
                        outrow = list(lrow)
                        outrow.extend(rrow)
                        yield tuple(outrow)
            elif leftouter:
                # no match: keep the left row, padded unless anti-joining
                outrow = list(lrow)
                if not anti:
                    outrow.extend([missing] * len(rflds))
                yield tuple(outrow)
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """Generate a left lookup join of two tables sorted by their key fields.

    Every left row is yielded exactly once: a matched row is extended with
    the non-key fields of the *first* matching right row (right duplicates
    are ignored), and an unmatched row is padded with `missing`.  Key
    fields appear only once in the output, taken from the left table.
    Optional `lprefix`/`rprefix` are prepended to output field names.

    Fix vs. previous revision: the post-loop guard compared key values
    directly, which raised TypeError on Python 3 when either table was
    empty (comparing None); it now guards None explicitly.
    """
    lit = iter(left)
    rit = iter(right)
    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            for lrow in _lrowgrp:
                outrow = list(lrow)  # start with the left row
                # extend with missing values in place of the right row
                outrow.extend([missing] * len(rvind))
                yield tuple(outrow)
        else:
            rrow = next(iter(_rrowgrp))  # pick first arbitrarily
            for lrow in _lrowgrp:
                # start with the left row
                outrow = list(lrow)
                # extend with non-key values from the right row
                outrow.extend(rgetv(rrow))
                yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)
    lrowgrp = []

    # loop until *either* of the iterators is exhausted
    lkval, rkval = None, None  # initialise here to handle empty tables
    try:
        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        rkval, rrowgrp = next(rgit)
        while True:
            if lkval < rkval:
                for row in joinrows(lrowgrp, None):
                    yield tuple(row)
                # advance left
                lkval, lrowgrp = next(lgit)
            elif lkval > rkval:
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield tuple(row)
                # advance both
                lkval, lrowgrp = next(lgit)
                rkval, rrowgrp = next(rgit)
    except StopIteration:
        pass

    # make sure any left rows remaining are yielded; the current left group
    # is still pending when the right iterator ran out first.  rkval is None
    # iff the right table was empty (and lkval is None iff the left table
    # was too); comparing None on Python 3 raises TypeError, so guard the
    # None cases explicitly instead of relying on `lkval > rkval` alone.
    if rkval is None or (lkval is not None and lkval > rkval):
        # yield anything that got left hanging
        for row in joinrows(lrowgrp, None):
            yield tuple(row)
    # yield the rest
    for lkval, lrowgrp in lgit:
        for row in joinrows(lrowgrp, None):
            yield tuple(row)
def _iternocache(self, source, key, reverse):
    """Sort `source` by `key` without consulting any previously built cache.

    Rows are gathered in chunks of at most ``self.buffersize``.  If the
    whole table fits in a single chunk it is sorted in memory; otherwise
    each sorted chunk is pickled to a temporary file and the chunks are
    merge-sorted lazily on iteration.  When ``self.cache`` is set, the
    sorted rows (or chunk files) and the key function are stashed on
    ``self`` for reuse by later iterations.
    """
    debug('iterate without cache')
    self.clearcache()
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)

    if key is not None:
        # convert field selection into field indices
        indices = asindices(hdr, key)
    else:
        # no key given: sort on all fields in order
        indices = range(len(hdr))

    # now use field indices to construct a _getkey function
    # TODO check if this raises an exception on short rows
    getkey = comparable_itemgetter(*indices)  # TODO support native comparison

    # initialise the first chunk
    rows = list(itertools.islice(it, 0, self.buffersize))
    rows.sort(key=getkey, reverse=reverse)

    # have we exhausted the source iterator?
    # (buffersize None means islice took everything, so always exhausted)
    if self.buffersize is None or len(rows) < self.buffersize:
        # yes, table fits within sort buffer
        if self.cache:
            debug('caching mem')
            self._hdrcache = hdr
            self._memcache = rows
            # actually not needed to iterate from memcache
            self._getkey = getkey
        for row in rows:
            yield tuple(row)
    else:
        # no, table is too big, need to sort in chunks
        chunkfiles = []
        while rows:
            # dump the chunk
            with NamedTemporaryFile(dir=self.tempdir, delete=False,
                                    mode='wb') as f:
                # N.B., we **don't** want the file to be deleted on close,
                # but we **do** want the file to be deleted when self
                # is garbage collected, or when the program exits. When
                # all references to the wrapper are gone, the file should
                # get deleted.
                wrapper = _NamedTempFileDeleteOnGC(f.name)
                debug('created temporary chunk file %s' % f.name)
                for row in rows:
                    pickle.dump(row, f, protocol=-1)
                f.flush()
                chunkfiles.append(wrapper)
            # grab the next chunk
            rows = list(itertools.islice(it, 0, self.buffersize))
            rows.sort(key=getkey, reverse=reverse)
        if self.cache:
            debug('caching files')
            self._hdrcache = hdr
            self._filecache = chunkfiles
            self._getkey = getkey
        # lazily merge the sorted chunk files back into one sorted stream
        chunkiters = [_iterchunk(f.name) for f in chunkfiles]
        for row in _mergesorted(getkey, reverse, *chunkiters):
            yield tuple(row)
def iterproblems(table, constraints, expected_header):
    """Generate a ``(name, row, field, value, error)`` tuple for each
    constraint violation found in `table`.

    Three kinds of problem are reported: a header that differs from
    `expected_header` (name ``'__header__'``, row 0); rows of unexpected
    length (name ``'__len__'``); and failures of the user-supplied
    constraints, each of which may define a ``field`` selection, a ``test``
    callable and/or an ``assertion`` predicate.  Row numbers are 1-based
    data-row indices and the error column holds the exception type name.
    """
    outhdr = ('name', 'row', 'field', 'value', 'error')
    yield outhdr
    it = iter(table)
    actual_header = next(it)
    if expected_header is None:
        # no expectation given: validate against the actual header
        flds = list(map(text_type, actual_header))
    else:
        expected_flds = list(map(text_type, expected_header))
        actual_flds = list(map(text_type, actual_header))
        try:
            assert expected_flds == actual_flds
        except Exception as e:
            # header mismatch is reported as a problem on row 0
            yield ('__header__', 0, None, None, type(e).__name__)
        # subsequent checks use the *expected* fields regardless
        flds = expected_flds
    local_constraints = normalize_constraints(constraints, flds)
    # setup getters
    for constraint in local_constraints:
        if 'getter' not in constraint:
            if 'field' in constraint:
                # should ensure FieldSelectionError if bad field in constraint
                indices = asindices(flds, constraint['field'])
                getter = operator.itemgetter(*indices)
                constraint['getter'] = getter
    # generate problems
    expected_len = len(flds)
    for i, row in enumerate(it):
        row = tuple(row)
        # row length constraint
        l = None
        try:
            l = len(row)
            assert l == expected_len
        except Exception as e:
            yield ('__len__', i + 1, None, l, type(e).__name__)
        # user defined constraints
        row = Record(row, flds)
        for constraint in local_constraints:
            name = constraint.get('name', None)
            field = constraint.get('field', None)
            assertion = constraint.get('assertion', None)
            test = constraint.get('test', None)
            # default getter passes the whole record through
            getter = constraint.get('getter', lambda x: x)
            try:
                target = getter(row)
            except Exception as e:
                # getting target value failed, report problem
                yield (name, i + 1, field, None, type(e).__name__)
            else:
                # only report the value when a specific field was selected
                value = target if field else None
                if test is not None:
                    try:
                        test(target)
                    except Exception as e:
                        # test raised exception, report problem
                        yield (name, i + 1, field, value, type(e).__name__)
                if assertion is not None:
                    try:
                        assert assertion(target)
                    except Exception as e:
                        # assertion raised exception, report problem
                        yield (name, i + 1, field, value, type(e).__name__)