Exemple #1
1
def iterunique(source, key):
    """
    Yield the header, then only the rows whose key occurs exactly once.

    `source` is assumed to be already sorted by `key`, so that rows sharing
    a key are adjacent; no sorting is done here. If `key` is None, every
    field of the header takes part in the key.

    All yielded rows, including the last one, are tuples. A completely empty
    source yields nothing, and a source holding only a header yields just
    that header.
    """
    it = iter(source)

    try:
        hdr = next(it)
    except StopIteration:
        # no header at all: finish cleanly rather than leaking StopIteration
        # out of the generator (a RuntimeError under PEP 479)
        return
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a key function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    try:
        prev = next(it)
    except StopIteration:
        # header but no data rows
        return
    prev_key = getkey(prev)
    # True when `prev` differs from the row before it (trivially true for
    # the first data row)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = curr_key != prev_key
        # `prev` is unique only if it differs from both of its neighbours
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # the final row has no successor, so it only needs to differ from its
    # predecessor
    if prev_comp_ne:
        yield tuple(prev)
Exemple #2
0
def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Build one interval tree per distinct key found in the given table.

    The result is a dict mapping each key (taken from the `key` field or
    fields) to an ``intervaltree.IntervalTree``. Every data row is added to
    the tree for its key as an interval from the row's `start` field to its
    `stop` field. The data attached to the interval is the whole row as a
    tuple or, when `value` is given, the selected value field(s).

    """

    import intervaltree

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    # locate the interval bounds
    assert start in fieldnames, 'start field not recognised'
    assert stop in fieldnames, 'stop field not recognised'
    begin_of = itemgetter(fieldnames.index(start))
    end_of = itemgetter(fieldnames.index(stop))

    # decide what gets stored on each interval
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'invalid value field specification'
        data_of = itemgetter(*value_positions)
    else:
        data_of = tuple

    # facet (key) extraction
    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'invalid key'
    facet_of = itemgetter(*key_positions)

    facets = {}
    for record in rows:
        facet = facet_of(record)
        tree = facets.get(facet)
        if tree is None:
            # first row seen for this facet
            tree = facets[facet] = intervaltree.IntervalTree()
        tree.addi(begin_of(record), end_of(record), data_of(record))
    return facets
Exemple #3
0
    def __init__(self, default_connections, keyed_connections, fields, key):
        """
        Initialise the connection and the state used to track duplicates.

        `default_connections`, `keyed_connections` and `fields` are passed
        unchanged to the parent initialiser. `key` is a field selection
        resolved against `fields` to build the `getkey` row-key extractor.
        """
        super(DuplicatesConnection, self).__init__(default_connections,
                                                   keyed_connections, fields)

        # convert field selection into field indices
        indices = asindices(fields, key)

        # now use field indices to construct the key function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        self.getkey = itemgetter(*indices)

        # initial state: no row seen yet, so nothing is flagged as duplicate
        # (this setup used to be repeated verbatim a second time; doing it
        # once is sufficient)
        self.previous = None
        self.previous_is_duplicate = False
Exemple #4
0
def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Build one interval tree per distinct key found in the given table.

    The result is a dict mapping each key (taken from the `key` field or
    fields) to an ``intervaltree.IntervalTree``. Every data row is added to
    the tree for its key as an interval from the row's `start` field to its
    `stop` field. The data attached to the interval is the whole row as a
    tuple or, when `value` is given, the selected value field(s).

    """

    import intervaltree

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    # locate the interval bounds
    assert start in fieldnames, 'start field not recognised'
    assert stop in fieldnames, 'stop field not recognised'
    begin_of = itemgetter(fieldnames.index(start))
    end_of = itemgetter(fieldnames.index(stop))

    # decide what gets stored on each interval
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'invalid value field specification'
        data_of = itemgetter(*value_positions)
    else:
        data_of = tuple

    # facet (key) extraction
    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'invalid key'
    facet_of = itemgetter(*key_positions)

    facets = {}
    for record in rows:
        facet = facet_of(record)
        tree = facets.get(facet)
        if tree is None:
            # first row seen for this facet
            tree = facets[facet] = intervaltree.IntervalTree()
        tree.addi(begin_of(record), end_of(record), data_of(record))
    return facets
Exemple #5
0
def iterhashantijoin(left, right, lkey, rkey):
    """
    Yield the left header, then every left row whose key is not in `right`.

    The keys of all right-hand rows are gathered into a set first, so the
    right table is held (as keys only) in memory while the left table is
    streamed. Key values therefore need to be hashable.
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve the key selections to positions in each header
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)

    # key extractors for rows of either table
    left_key_of = operator.itemgetter(*left_positions)
    right_key_of = operator.itemgetter(*right_positions)

    # every key present on the right-hand side
    right_keys = {right_key_of(row) for row in right_rows}

    # keep only the left rows without a partner
    for candidate in left_rows:
        if left_key_of(candidate) not in right_keys:
            yield tuple(candidate)
Exemple #6
0
    def __init__(self, default_connections, keyed_connections, fields, key):
        """
        Initialise the connection and the state used to track duplicates.

        `default_connections`, `keyed_connections` and `fields` are passed
        unchanged to the parent initialiser. `key` is a field selection
        resolved against `fields` to build the `getkey` row-key extractor.
        """
        super(DuplicatesConnection, self).__init__(default_connections,
                                                   keyed_connections, fields)

        # convert field selection into field indices
        indices = asindices(fields, key)

        # now use field indices to construct the key function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        self.getkey = itemgetter(*indices)

        # initial state: no row seen yet, so nothing is flagged as duplicate
        # (this setup used to be repeated verbatim a second time; doing it
        # once is sufficient)
        self.previous = None
        self.previous_is_duplicate = False
Exemple #7
0
def iterhashantijoin(left, right, lkey, rkey):
    """
    Yield the left header, then every left row whose key is not in `right`.

    The keys of all right-hand rows are gathered into a set first, so the
    right table is held (as keys only) in memory while the left table is
    streamed. Key values therefore need to be hashable.
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve the key selections to positions in each header
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)

    # key extractors for rows of either table
    left_key_of = operator.itemgetter(*left_positions)
    right_key_of = operator.itemgetter(*right_positions)

    # every key present on the right-hand side
    right_keys = {right_key_of(row) for row in right_rows}

    # keep only the left rows without a partner
    for candidate in left_rows:
        if left_key_of(candidate) not in right_keys:
            yield tuple(candidate)
Exemple #8
0
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """
    Yield the rows of a right outer join driven by the right table.

    `llookup` maps key values to the left rows carrying that key. For each
    right row, one output row is produced per matching left row; a right
    row with no match is emitted once, with `missing` in every left field
    except the key fields, which are copied over from the right row.

    The output header is the left header followed by the non-key fields of
    the right header, each side prefixed by `lprefix` / `rprefix` when that
    prefix is not None.
    """
    left_iter = iter(left)
    right_iter = iter(right)

    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the join key in each header
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the right table
    right_key_of = operator.itemgetter(*right_key_pos)

    # right-hand fields outside the key; key fields are only output once,
    # in their left-hand position
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # assemble the output header
    if lprefix is not None:
        out_header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        out_header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    out_header.extend(right_names)
    yield tuple(out_header)

    for right_row in right_iter:
        join_key = right_key_of(right_row)

        if join_key not in llookup:
            # no partner: pad the left side, then carry the key across
            padded = [missing] * len(left_header)
            for left_pos, right_pos in zip(left_key_pos, right_key_pos):
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_values_of(right_row))
            yield tuple(padded)
            continue

        # one output row per matching left row
        for left_row in llookup[join_key]:
            merged = list(left_row)
            merged.extend(right_values_of(right_row))
            yield tuple(merged)
Exemple #9
0
def iterhashrightjoin(left, right, lkey, rkey, missing, llookup, lprefix,
                      rprefix):
    """
    Yield the rows of a right outer join driven by the right table.

    `llookup` maps key values to the left rows carrying that key. For each
    right row, one output row is produced per matching left row; a right
    row with no match is emitted once, with `missing` in every left field
    except the key fields, which are copied over from the right row.

    The output header is the left header followed by the non-key fields of
    the right header, each side prefixed by `lprefix` / `rprefix` when that
    prefix is not None.
    """
    left_iter = iter(left)
    right_iter = iter(right)

    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the join key in each header
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the right table
    right_key_of = operator.itemgetter(*right_key_pos)

    # right-hand fields outside the key; key fields are only output once,
    # in their left-hand position
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # assemble the output header
    if lprefix is not None:
        out_header = [(text_type(lprefix) + text_type(name))
                      for name in left_header]
    else:
        out_header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(text_type(rprefix) + text_type(name))
                       for name in right_names]
    out_header.extend(right_names)
    yield tuple(out_header)

    for right_row in right_iter:
        join_key = right_key_of(right_row)

        if join_key not in llookup:
            # no partner: pad the left side, then carry the key across
            padded = [missing] * len(left_header)
            for left_pos, right_pos in zip(left_key_pos, right_key_pos):
                padded[left_pos] = right_row[right_pos]
            padded.extend(right_values_of(right_row))
            yield tuple(padded)
            continue

        # one output row per matching left row
        for left_row in llookup[join_key]:
            merged = list(left_row)
            merged.extend(right_values_of(right_row))
            yield tuple(merged)
Exemple #10
0
def iterantijoin(left, right, lkey, rkey):
    """
    Yield the left header, then left rows whose key has no match in `right`.

    Both inputs are walked in step as groups of consecutive rows sharing a
    key, comparing the current left and right keys with ``<`` / ``>``. This
    relies on both tables already being ordered by their key (TODO confirm
    against callers).
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve key selections to header positions
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)

    # key extractors producing mutually comparable values
    left_key_of = comparable_itemgetter(*left_positions)
    right_key_of = comparable_itemgetter(*right_positions)

    # group consecutive rows by key on both sides
    left_groups = itertools.groupby(left_rows, key=left_key_of)
    right_groups = itertools.groupby(right_rows, key=right_key_of)

    # placeholders in case either side turns out to have no groups at all
    left_group = []
    left_key, right_key = Comparable(None), Comparable(None)

    # step through both sides until one of them runs out
    try:

        left_key, left_group = next(left_groups)
        right_key, _ = next(right_groups)

        while True:
            if left_key < right_key:
                # left key cannot appear on the right any more: emit it
                for unmatched in left_group:
                    yield tuple(unmatched)
                left_key, left_group = next(left_groups)
            elif left_key > right_key:
                # right side is behind, let it catch up
                right_key, _ = next(right_groups)
            else:
                # keys match: drop the left group and move both sides on
                left_key, left_group = next(left_groups)
                right_key, _ = next(right_groups)

    except StopIteration:
        pass

    # a left group may still be pending if the right side ran out first
    if left_key > right_key:
        for unmatched in left_group:
            yield tuple(unmatched)
    # everything still unread on the left is unmatched as well
    for left_key, left_group in left_groups:
        for unmatched in left_group:
            yield tuple(unmatched)
Exemple #11
0
def iterantijoin(left, right, lkey, rkey):
    """
    Yield the left header, then left rows whose key has no match in `right`.

    Both inputs are walked in step as groups of consecutive rows sharing a
    key, comparing the current left and right keys with ``<`` / ``>``. This
    relies on both tables already being ordered by their key (TODO confirm
    against callers).
    """
    left_rows = iter(left)
    right_rows = iter(right)

    left_header = next(left_rows)
    right_header = next(right_rows)
    yield tuple(left_header)

    # resolve key selections to header positions
    left_positions = asindices(left_header, lkey)
    right_positions = asindices(right_header, rkey)

    # key extractors producing mutually comparable values
    left_key_of = comparable_itemgetter(*left_positions)
    right_key_of = comparable_itemgetter(*right_positions)

    # group consecutive rows by key on both sides
    left_groups = itertools.groupby(left_rows, key=left_key_of)
    right_groups = itertools.groupby(right_rows, key=right_key_of)

    # placeholders in case either side turns out to have no groups at all
    left_group = []
    left_key, right_key = Comparable(None), Comparable(None)

    # step through both sides until one of them runs out
    try:

        left_key, left_group = next(left_groups)
        right_key, _ = next(right_groups)

        while True:
            if left_key < right_key:
                # left key cannot appear on the right any more: emit it
                for unmatched in left_group:
                    yield tuple(unmatched)
                left_key, left_group = next(left_groups)
            elif left_key > right_key:
                # right side is behind, let it catch up
                right_key, _ = next(right_groups)
            else:
                # keys match: drop the left group and move both sides on
                left_key, left_group = next(left_groups)
                right_key, _ = next(right_groups)

    except StopIteration:
        pass

    # a left group may still be pending if the right side ran out first
    if left_key > right_key:
        for unmatched in left_group:
            yield tuple(unmatched)
    # everything still unread on the left is unmatched as well
    for left_key, left_group in left_groups:
        for unmatched in left_group:
            yield tuple(unmatched)
def iterfilldown(table, fillfields, missing, where, anchorfields, until):
	"""
	Yield the header and rows of `table`, filling `missing` values downwards.

	The first data row is yielded unchanged and provides the initial fill
	values. In each later row, a value equal to `missing` in one of the
	`fillfields` (all fields when `fillfields` is empty) is replaced by the
	current fill value for that field, provided `where` accepts the row and
	the anchor check passes.

	`where` and `until` may each be None, a callable taking a record, or a
	string that is passed to `expr` (presumably compiling it to such a
	callable -- verify against `expr`). `where` defaults to always true and
	`until` to always false.

	When `anchorfields` is given, the anchor check passes only if the row's
	values in those fields equal those of the previously yielded row; when
	it fails, the fill values are overwritten from the current row instead.

	After a row for which `until` is true, the next row becomes the new
	source of fill values and is itself left unfilled.
	"""
	# prepare where function
	if isinstance(where, string_types):
		where = expr(where)
	elif where is not None:
		assert callable(where), 'expected callable for "where" argument, found %r' % where
	else:
		where = lambda r: True # default where callable returns True
	# prepare until function
	if isinstance(until, string_types):
		until = expr(until)
	elif until is not None:
		assert callable(until), 'expected callable for "until" argument, found %r' % until
	else:
		until = lambda r: False # default until callable returns False
	# normal iter function
	it = iter(table)
	hdr = next(it)
	flds = list(map(text_type, hdr))
	yield tuple(hdr)
	if not fillfields:  # fill down all fields
		fillfields = hdr
	fillindices = asindices(hdr, fillfields)
	if anchorfields:
		anchorindices = asindices(hdr, anchorfields)
	# NOTE(review): next(it) raises StopIteration when the table has no data
	# rows; inside a generator that surfaces as RuntimeError on Python 3.7+
	fill = list(next(it))  # fill values
	prev = fill  # previously yielded row (same list object as fill here)
	untilfunctiontriggered = False
	yield tuple(fill)
	for row in it:
		outrow = list(row)
		if untilfunctiontriggered:
			# restart from this row: fill now aliases outrow, so the
			# assignments in the loop below leave this row unchanged
			fill = outrow
			untilfunctiontriggered = False # reset
		if anchorfields:
			# compare this row's anchor values with the previous output row
			row_values = [row[i] for i in anchorindices]
			prev_values = [prev[i] for i in anchorindices]
			check_anchor = row_values == prev_values
		else:
			check_anchor = True
		# loop through fill-down fields
		for idx in fillindices:
			if row[idx] == missing and where(Record(row, flds)) and check_anchor: 
				outrow[idx] = fill[idx]  # fill down
			elif row[idx] == missing and check_anchor:
				# missing but rejected by where: keep the current fill value
				pass
			else:
				# value present, or anchor changed: this becomes the fill value
				fill[idx] = row[idx]  # new fill value
		prev = outrow
		yield tuple(outrow)
		# found stop point, reset fill with next row's contents
		if until(Record(row, flds)):
			untilfunctiontriggered = True
Exemple #13
0
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """
    Yield the rows of a left join that uses one right row per key.

    The right table is loaded via `lookupone(..., strict=False)` into a
    key-to-row mapping. Each left row is extended with the non-key values
    of its matching right row, or with `missing` values when there is no
    match, so every left row yields exactly one output row.

    The output header is the left header followed by the non-key fields of
    the right header, each side prefixed by `lprefix` / `rprefix` when that
    prefix is not None.
    """
    left_iter = iter(left)
    left_header = next(left_iter)

    # peek at the right header, then hand the table on to the lookup builder
    right_header, right_iter = iterpeek(right)
    right_by_key = lookupone(right_iter, rkey, strict=False)

    # positions of the join key in each header
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand fields outside the key; key fields are only output once,
    # from the left table
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # assemble the output header
    if lprefix is not None:
        out_header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        out_header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    out_header.extend(right_names)
    yield tuple(out_header)

    for left_row in left_iter:
        join_key = left_key_of(left_row)
        if join_key in right_by_key:
            partner = right_by_key[join_key]
            # left row followed by the partner's non-key values
            joined = list(left_row)
            joined.extend(right_values_of(partner))
        else:
            # left row followed by placeholders for the right-hand fields
            joined = list(left_row)
            joined.extend([missing] * len(right_value_pos))
        yield tuple(joined)
Exemple #14
0
def iterhashlookupjoin(left, right, lkey, rkey, missing, lprefix, rprefix):
    """
    Yield the rows of a left join that uses one right row per key.

    The right table is loaded via `lookupone(..., strict=False)` into a
    key-to-row mapping. Each left row is extended with the non-key values
    of its matching right row, or with `missing` values when there is no
    match, so every left row yields exactly one output row.

    The output header is the left header followed by the non-key fields of
    the right header, each side prefixed by `lprefix` / `rprefix` when that
    prefix is not None.
    """
    left_iter = iter(left)
    left_header = next(left_iter)

    # peek at the right header, then hand the table on to the lookup builder
    right_header, right_iter = iterpeek(right)
    right_by_key = lookupone(right_iter, rkey, strict=False)

    # positions of the join key in each header
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand fields outside the key; key fields are only output once,
    # from the left table
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # assemble the output header
    if lprefix is not None:
        out_header = [(text_type(lprefix) + text_type(name))
                      for name in left_header]
    else:
        out_header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(text_type(rprefix) + text_type(name))
                       for name in right_names]
    out_header.extend(right_names)
    yield tuple(out_header)

    for left_row in left_iter:
        join_key = left_key_of(left_row)
        if join_key in right_by_key:
            partner = right_by_key[join_key]
            # left row followed by the partner's non-key values
            joined = list(left_row)
            joined.extend(right_values_of(partner))
        else:
            # left row followed by placeholders for the right-hand fields
            joined = list(left_row)
            joined.extend([missing] * len(right_value_pos))
        yield tuple(joined)
Exemple #15
0
def tupletree(table, start='start', stop='stop', value=None):
    """
    Build a single interval tree from the rows of the given table.

    Each data row is added as an interval from its `start` field to its
    `stop` field. The data attached to the interval is the whole row as a
    tuple or, when `value` is given, the selected value field(s).

    """

    import intervaltree
    result = intervaltree.IntervalTree()

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    # locate the interval bounds
    assert start in fieldnames, 'start field not recognised'
    assert stop in fieldnames, 'stop field not recognised'
    begin_of = itemgetter(fieldnames.index(start))
    end_of = itemgetter(fieldnames.index(stop))

    # decide what gets stored on each interval
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'invalid value field specification'
        data_of = itemgetter(*value_positions)
    else:
        data_of = tuple

    for record in rows:
        result.addi(begin_of(record), end_of(record), data_of(record))
    return result
Exemple #16
0
def itersearch(table, pattern, field, flags, complement):
    """
    Yield the header, then the rows selected by a regular-expression search.

    `pattern` is compiled with `flags` and searched for in the text form of
    the values. With `field` None every value of a row is searched and one
    hit is enough; otherwise only the selected field or fields are searched.

    Matching rows are yielded when `complement` is false, and non-matching
    rows when it is true. An empty table (no header) yields nothing.
    """
    prog = re.compile(pattern, flags)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # nothing to search; returning here avoids a RuntimeError from
        # StopIteration escaping the generator (PEP 479)
        return
    yield tuple(hdr)

    if field is None:
        # search whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            # single field: index directly instead of building a tuple
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    # complement false -> keep rows that match; true -> keep those that don't
    keep_matches = not complement
    for row in it:
        if bool(test(row)) == keep_matches:
            yield tuple(row)
Exemple #17
0
def tupletree(table, start='start', stop='stop', value=None):
    """
    Build a single interval tree from the rows of the given table.

    Each data row is added as an interval from its `start` field to its
    `stop` field. The data attached to the interval is the whole row as a
    tuple or, when `value` is given, the selected value field(s).

    """

    import intervaltree
    result = intervaltree.IntervalTree()

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    # locate the interval bounds
    assert start in fieldnames, 'start field not recognised'
    assert stop in fieldnames, 'stop field not recognised'
    begin_of = itemgetter(fieldnames.index(start))
    end_of = itemgetter(fieldnames.index(stop))

    # decide what gets stored on each interval
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'invalid value field specification'
        data_of = itemgetter(*value_positions)
    else:
        data_of = tuple

    for record in rows:
        result.addi(begin_of(record), end_of(record), data_of(record))
    return result
def iterunique(source, key):
    """
    Yield the header, then only the rows whose key occurs exactly once.

    `source` is assumed to be already sorted by `key`, so that rows sharing
    a key are adjacent; no sorting is done here. If `key` is None, every
    field of the header takes part in the key.

    All yielded rows, including the last one, are tuples. A completely empty
    source yields nothing, and a source holding only a header yields just
    that header.
    """
    it = iter(source)

    try:
        hdr = next(it)
    except StopIteration:
        # no header at all: finish cleanly rather than leaking StopIteration
        # out of the generator (a RuntimeError under PEP 479)
        return
    yield tuple(hdr)

    # convert field selection into field indices
    if key is None:
        indices = range(len(hdr))
    else:
        indices = asindices(hdr, key)

    # now use field indices to construct a key function
    # N.B., this may raise an exception on short rows, depending on
    # the field selection
    getkey = operator.itemgetter(*indices)

    try:
        prev = next(it)
    except StopIteration:
        # header but no data rows
        return
    prev_key = getkey(prev)
    # True when `prev` differs from the row before it (trivially true for
    # the first data row)
    prev_comp_ne = True

    for curr in it:
        curr_key = getkey(curr)
        curr_comp_ne = (curr_key != prev_key)
        # `prev` is unique only if it differs from both of its neighbours
        if prev_comp_ne and curr_comp_ne:
            yield tuple(prev)
        prev = curr
        prev_key = curr_key
        prev_comp_ne = curr_comp_ne

    # the final row has no successor, so it only needs to differ from its
    # predecessor
    if prev_comp_ne:
        yield tuple(prev)
Exemple #19
0
def recordlookup(table, key, dictionary=None):
    """
    Fill a dictionary with lists of record objects, grouped by key.

    Each data row is wrapped in a `Record` built from the row and the
    header's field names, and appended to the list stored under the row's
    key. A new dict is created when `dictionary` is None; otherwise the
    given mapping is updated in place and returned.

    """

    if dictionary is None:
        dictionary = {}

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'no key selected'
    key_of = operator.itemgetter(*key_positions)

    for current in rows:
        lookup_key = key_of(current)
        record = Record(current, fieldnames)
        if lookup_key not in dictionary:
            dictionary[lookup_key] = [record]
            continue
        # read, modify and store back explicitly rather than appending in
        # place, so that shelve-backed mappings persist the change
        bucket = dictionary[lookup_key]
        bucket.append(record)
        dictionary[lookup_key] = bucket
    return dictionary
Exemple #20
0
def itersearch(table, pattern, field, flags, complement):
    """
    Yield the header, then the rows selected by a regular-expression search.

    `pattern` is compiled with `flags` and searched for in the text form of
    the values. With `field` None every value of a row is searched and one
    hit is enough; otherwise only the selected field or fields are searched.

    Matching rows are yielded when `complement` is false, and non-matching
    rows when it is true. An empty table (no header) yields nothing.
    """
    prog = re.compile(pattern, flags)
    it = iter(table)
    try:
        hdr = next(it)
    except StopIteration:
        # nothing to search; returning here avoids a RuntimeError from
        # StopIteration escaping the generator (PEP 479)
        return
    yield tuple(hdr)

    if field is None:
        # search whole row
        def test(r):
            return any(prog.search(text_type(v)) for v in r)
    else:
        indices = asindices(hdr, field)
        if len(indices) == 1:
            # single field: index directly instead of building a tuple
            index = indices[0]

            def test(r):
                return prog.search(text_type(r[index]))
        else:
            getvals = operator.itemgetter(*indices)

            def test(r):
                return any(prog.search(text_type(v)) for v in getvals(r))

    # complement false -> keep rows that match; true -> keep those that don't
    keep_matches = not complement
    for row in it:
        if bool(test(row)) == keep_matches:
            yield tuple(row)
def recordlookup(table, key, dictionary=None):
    """
    Fill a dictionary with lists of record objects, grouped by key.

    Each data row is wrapped in a `Record` built from the row and the
    header's field names, and appended to the list stored under the row's
    key. A new dict is created when `dictionary` is None; otherwise the
    given mapping is updated in place and returned.

    """

    if dictionary is None:
        dictionary = {}

    rows = iter(table)
    header = next(rows)
    fieldnames = [text_type(name) for name in header]

    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'no key selected'
    key_of = operator.itemgetter(*key_positions)

    for current in rows:
        lookup_key = key_of(current)
        record = Record(current, fieldnames)
        if lookup_key not in dictionary:
            dictionary[lookup_key] = [record]
            continue
        # read, modify and store back explicitly rather than appending in
        # place, so that shelve-backed mappings persist the change
        bucket = dictionary[lookup_key]
        bucket.append(record)
        dictionary[lookup_key] = bucket
    return dictionary
Exemple #22
0
def iterhashjoin(left, right, lkey, rkey, rlookup, lprefix, rprefix):
    """
    Yield the rows of an inner join, looking right rows up in `rlookup`.

    `rlookup` maps key values to the right rows carrying that key. Each
    left row produces one output row per matching right row; left rows
    without a match are dropped.

    The output header is the left header followed by the non-key fields of
    the right header, each side prefixed by `lprefix` / `rprefix` when that
    prefix is not None.
    """
    left_iter = iter(left)
    right_iter = iter(right)

    left_header = next(left_iter)
    right_header = next(right_iter)

    # positions of the join key in each header
    left_key_pos = asindices(left_header, lkey)
    right_key_pos = asindices(right_header, rkey)

    # key extractor for rows of the left table
    left_key_of = operator.itemgetter(*left_key_pos)

    # right-hand fields outside the key; key fields are only output once,
    # from the left table
    right_value_pos = [pos for pos in range(len(right_header))
                       if pos not in right_key_pos]
    right_values_of = rowgetter(*right_value_pos)

    # assemble the output header
    if lprefix is not None:
        out_header = [(str(lprefix) + str(name)) for name in left_header]
    else:
        out_header = list(left_header)
    right_names = right_values_of(right_header)
    if rprefix is not None:
        right_names = [(str(rprefix) + str(name)) for name in right_names]
    out_header.extend(right_names)
    yield tuple(out_header)

    for left_row in left_iter:
        join_key = left_key_of(left_row)
        if join_key in rlookup:
            # pair the left row with each of its right-hand partners
            for right_row in rlookup[join_key]:
                merged = list(left_row)
                merged.extend(right_values_of(right_row))
                yield tuple(merged)
Exemple #23
0
 def __init__(self, default_connections, keyed_connections, fields,
              discriminator):
     """
     Initialise the connection and store the row discriminator.

     `discriminator` is used as given when it is callable; otherwise it is
     treated as a field selection and resolved against `fields` into an
     item getter.
     """
     super(PartitionConnection, self).__init__(default_connections,
                                               keyed_connections, fields)
     if not callable(discriminator):
         # a field or fields: build an extractor for their positions
         positions = asindices(fields, discriminator)
         discriminator = itemgetter(*positions)
     self.discriminator = discriminator
Exemple #24
0
def itermelt(source, key, variables, variablefield, valuefield):
    """
    Yield a melted (long-format) version of the source table.

    The output header consists of the key fields followed by
    `variablefield` and `valuefield`. Each data row yields one output row
    per variable field: the key values, the variable's name and its value.
    Variables whose value is absent from a short row are skipped.

    At least one of `key` and `variables` must be given; whichever is None
    defaults to all fields not covered by the other. Raises ValueError when
    both are None.
    """
    if key is None and variables is None:
        raise ValueError('either key or variables must be specified')

    rows = iter(source)
    header = next(rows)
    all_positions = range(len(header))

    # resolve whatever was given explicitly
    key_positions = None
    var_positions = None
    if key is not None:
        key_positions = asindices(header, key)
    if variables is not None:
        # a single variable may be passed bare
        if not isinstance(variables, (list, tuple)):
            variables = (variables,)
        var_positions = asindices(header, variables)

    # derive the side that was left out from the other one
    if key is None:
        key_positions = [pos for pos in all_positions
                         if pos not in var_positions]
    if variables is None:
        var_positions = [pos for pos in all_positions
                         if pos not in key_positions]
        variables = [header[pos] for pos in var_positions]

    key_of = rowgetter(*key_positions)

    # header: key fields plus the two melt columns
    out_header = [header[pos] for pos in key_positions]
    out_header.extend([variablefield, valuefield])
    yield tuple(out_header)

    # one output row per (input row, variable) pair
    for record in rows:
        key_values = key_of(record)
        for var_name, var_pos in zip(variables, var_positions):
            try:
                melted = list(key_values)
                melted.append(var_name)
                melted.append(record[var_pos])
                yield tuple(melted)
            except IndexError:
                # short row without this value: emit nothing for it
                pass
Exemple #25
0
 def __init__(self, default_connections, keyed_connections, fields,
              discriminator):
     """
     Initialise the connection and store the row discriminator.

     `discriminator` is used as given when it is callable; otherwise it is
     treated as a field selection and resolved against `fields` into an
     item getter.
     """
     super(PartitionConnection, self).__init__(default_connections,
                                               keyed_connections, fields)
     if not callable(discriminator):
         # a field or fields: build an extractor for their positions
         positions = asindices(fields, discriminator)
         discriminator = itemgetter(*positions)
     self.discriminator = discriminator
def itermergesort(sources, key, header, missing, reverse):
    """
    Yield a common header, then the merged rows of all source tables.

    When `header` is None the output fields are the union of the sources'
    field names in order of first appearance; otherwise `header` is used
    as given. Every source row is rearranged to the output fields, with
    `missing` standing in for fields (or short-row values) it lacks. The
    standardised row streams are then handed to `_shortlistmergesorted`
    together with the key function (None when `key` is None) and `reverse`.
    """
    # standardising headers is borrowed from itercat - TODO remove code smells
    iterators = [iter(table) for table in sources]
    source_headers = [next(rows) for rows in iterators]

    if header is not None:
        # caller fixed the output fields
        outhdr = header
    else:
        # collect every field name, keeping first-seen order
        outhdr = []
        for source_header in source_headers:
            for name in map(text_type, source_header):
                if name not in outhdr:
                    outhdr.append(name)
    yield tuple(outhdr)

    def _standardisedata(rows, source_header, out_fields):
        source_fields = list(map(text_type, source_header))
        for source_row in rows:
            try:
                # fast path: pick each output field straight from the row
                yield tuple(
                    source_row[source_fields.index(name)]
                    if name in source_fields else missing
                    for name in out_fields
                )
            except IndexError:
                # short row: start from all-missing and copy what exists
                padded = [missing] * len(out_fields)
                for position, name in enumerate(source_fields):
                    try:
                        padded[out_fields.index(name)] = source_row[position]
                    except IndexError:
                        pass  # value absent from the short row
                yield tuple(padded)

    # one standardising wrapper per source
    standardised = [_standardisedata(rows, source_header, outhdr)
                    for source_header, rows in zip(source_headers, iterators)]

    # key function over the output fields, if a key was requested
    # N.B., this will probably raise an exception on short rows
    if key is None:
        getkey = None
    else:
        key_positions = asindices(outhdr, key)
        getkey = comparable_itemgetter(*key_positions)

    # merge the (assumed pre-sorted) streams
    for merged_row in _shortlistmergesorted(getkey, reverse, *standardised):
        yield merged_row
Exemple #27
0
def _setup_lookup(table, key, value):
    """
    Prepare the pieces needed to build a lookup from a table.

    Returns a 3-tuple: the table iterator positioned after the header, a
    key getter for the `key` field(s), and a value getter. The value getter
    covers the complete row when `value` is None, otherwise just the
    selected value field(s).
    """
    # header first; the iterator is handed back for the data rows
    rows = iter(table)
    header = next(rows)

    # key extraction
    key_positions = asindices(header, key)
    assert len(key_positions) > 0, 'no key selected'
    key_of = operator.itemgetter(*key_positions)

    # value extraction
    if value is not None:
        value_positions = asindices(header, value)
        assert len(value_positions) > 0, 'no value selected'
        value_of = operator.itemgetter(*value_positions)
    else:
        # no selection: the value is the complete row
        value_of = rowgetter(*range(len(header)))

    return rows, key_of, value_of
Exemple #28
0
def iterconflicts(source, key, missing, exclude, include):
    """
    Yield the header of `source`, then every row that conflicts with the
    row immediately before it under the same key.

    Two adjacent rows with equal keys conflict when, in at least one
    compared field, both values differ and neither is `missing`. Fields
    are restricted by `exclude` (skip these) or `include` (only these);
    if both are given `exclude` wins. Only adjacent rows are compared.

    """

    # a bare field name is treated as a one-element selection
    if exclude and not isinstance(exclude, (list, tuple)):
        exclude = (exclude,)
    if include and not isinstance(include, (list, tuple)):
        include = (include,)

    # exclude overrides include
    if include and exclude:
        include = None

    rows = iter(source)
    header = next(rows)
    names = list(map(text_type, header))
    yield tuple(header)

    # N.B., the key getter may raise on short rows, depending on the
    # field selection
    keyof = operator.itemgetter(*asindices(header, key))

    def compared(name):
        # decide whether a field takes part in the conflict check
        if exclude:
            return name not in exclude
        if include:
            return name in include
        return True

    def differs(a, b):
        # true if any compared field holds two distinct, non-missing values
        return any(missing not in (x, y) and x != y
                   for x, y, name in zip(a, b, names)
                   if compared(name))

    last = None
    emitted = False

    for current in rows:
        if last is not None:
            if keyof(last) == keyof(current):
                if differs(last, current):
                    if not emitted:
                        yield tuple(last)
                        emitted = True
                    yield tuple(current)
            else:
                # key changed, start a fresh run
                emitted = False
        last = current
def iterconflicts(source, key, missing, exclude, include):
    """
    Yield the header of `source`, then every row that conflicts with the
    row immediately before it under the same key.

    Two adjacent rows with equal keys conflict when, in at least one
    compared field, both values differ and neither is `missing`. Fields
    are restricted by `exclude` (skip these) or `include` (only these);
    if both are given `exclude` wins. Only adjacent rows are compared.

    """

    # a bare field name is treated as a one-element selection
    if exclude and not isinstance(exclude, (list, tuple)):
        exclude = (exclude,)
    if include and not isinstance(include, (list, tuple)):
        include = (include,)

    # exclude overrides include
    if include and exclude:
        include = None

    rows = iter(source)
    header = next(rows)
    names = list(map(text_type, header))
    yield tuple(header)

    # N.B., the key getter may raise on short rows, depending on the
    # field selection
    keyof = operator.itemgetter(*asindices(header, key))

    def compared(name):
        # decide whether a field takes part in the conflict check
        if exclude:
            return name not in exclude
        if include:
            return name in include
        return True

    def differs(a, b):
        # true if any compared field holds two distinct, non-missing values
        return any(missing not in (x, y) and x != y
                   for x, y, name in zip(a, b, names)
                   if compared(name))

    last = None
    emitted = False

    for current in rows:
        if last is not None:
            if keyof(last) == keyof(current):
                if differs(last, current):
                    if not emitted:
                        yield tuple(last)
                        emitted = True
                    yield tuple(current)
            else:
                # key changed, start a fresh run
                emitted = False
        last = current
Exemple #30
0
def iterfieldselect(source, field, where, complement, missing):
    """
    Yield the header of `source`, then the rows whose `field` value
    satisfies `where` (or fails it, when `complement` is true).

    `where` is called with the value(s) selected by `field`; if the row
    is too short for that selection, `missing` is passed instead. The
    predicate result is interpreted by truthiness.

    """
    it = iter(source)
    hdr = next(it)
    yield tuple(hdr)
    indices = asindices(hdr, field)
    getv = operator.itemgetter(*indices)
    for row in it:
        try:
            v = getv(row)
        except IndexError:
            v = missing
        # Coerce both sides to bool before the XOR: comparing a raw
        # truthy result such as 2 or 'x' against True would otherwise
        # report "not equal" and let the row through under complement.
        if bool(where(v)) != bool(complement):
            yield tuple(row)
Exemple #31
0
def iterfieldselect(source, field, where, complement, missing):
    """
    Yield the header of `source`, then the rows selected by applying
    `where` to the value of `field`.

    A row is kept when the truthiness of ``where(value)`` differs from
    `complement`. Rows too short for the field selection are tested with
    `missing` as the value.

    """
    rows = iter(source)
    header = next(rows)
    yield tuple(header)
    pick = operator.itemgetter(*asindices(header, field))
    for record in rows:
        try:
            candidate = pick(record)
        except IndexError:
            # short row: fall back to the missing value
            candidate = missing
        matched = bool(where(candidate))
        if matched != complement:  # XOR
            yield tuple(record)
def iterfillright(table, fillfields, missing):
    """
    Yield the header of `table`, then each row with missing values
    replaced by the value to their left.

    A cell is filled when it equals `missing`, its left neighbour does
    not, and its position is among `fillfields`. If `fillfields` is
    empty, all header fields are selected. Filling proceeds left to
    right, so a value can propagate across several missing cells.

    """
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)
    if not fillfields:  # fill right in all fields
        fillfields = hdr
    # set membership keeps the per-cell test cheap on wide tables
    fillindices = set(asindices(hdr, fillfields))
    for row in it:
        outrow = list(row)
        # start at 1: the first cell has no left neighbour to copy from
        for i in range(1, len(outrow)):
            if (outrow[i] == missing and outrow[i - 1] != missing
                    and i in fillindices):
                outrow[i] = outrow[i - 1]
        yield tuple(outrow)
def issorted(table, key=None, reverse=False, strict=False):
    """
    Return True if the table is ordered (i.e., sorted) by the given key. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar', 'baz'],
        ...           ['a', 1, True],
        ...           ['b', 3, True],
        ...           ['b', 2]]
        >>> etl.issorted(table1, key='foo')
        True
        >>> etl.issorted(table1, key='bar')
        False
        >>> etl.issorted(table1, key='foo', strict=True)
        False
        >>> etl.issorted(table1, key='foo', reverse=True)
        False

    If `key` is None, whole rows are compared. With `reverse` the table
    must be in descending order; with `strict` equal neighbours count as
    unsorted. A table with a header but no data rows is sorted.

    """

    # determine the operator to use when comparing rows
    if reverse and strict:
        op = operator.lt
    elif reverse:
        op = operator.le
    elif strict:
        op = operator.gt
    else:
        op = operator.ge

    it = iter(table)
    hdr = next(it)

    if key is None:
        # compare complete rows as they are
        def getkey(row):
            return row
    else:
        flds = [text_type(f) for f in hdr]
        getkey = comparable_itemgetter(*asindices(flds, key))

    # no data rows: trivially sorted (previously raised StopIteration)
    try:
        prev = next(it)
    except StopIteration:
        return True

    prevkey = getkey(prev)
    for curr in it:
        currkey = getkey(curr)
        if not op(currkey, prevkey):
            return False
        prevkey = currkey
    return True
Exemple #34
0
def iterfilldown(table, fillfields, missing):
    """
    Yield the header of `table`, then each row with missing values
    replaced by the most recent value seen above them in the same field.

    Only the fields selected by `fillfields` are filled; if `fillfields`
    is empty, all header fields are selected. The first data row is
    yielded unchanged and seeds the fill values. A table with no data
    rows yields just the header.

    """
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    # A bare next() here would leak StopIteration out of the generator
    # on a header-only table, which is a RuntimeError under PEP 479.
    try:
        fill = list(next(it))  # fill values
    except StopIteration:
        return
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                outrow[idx] = fill[idx]  # fill down
            else:
                fill[idx] = row[idx]  # new fill value
        yield tuple(outrow)
Exemple #35
0
def iterfilldown(table, fillfields, missing):
    """
    Yield the header of `table`, then each row with missing values
    replaced by the most recent value seen above them in the same field.

    Only the fields selected by `fillfields` are filled; if `fillfields`
    is empty, all header fields are selected. The first data row is
    yielded unchanged and seeds the fill values. A table with no data
    rows yields just the header.

    """
    it = iter(table)
    hdr = next(it)
    yield tuple(hdr)
    if not fillfields:  # fill down all fields
        fillfields = hdr
    fillindices = asindices(hdr, fillfields)
    # A bare next() here would leak StopIteration out of the generator
    # on a header-only table, which is a RuntimeError under PEP 479.
    try:
        fill = list(next(it))  # fill values
    except StopIteration:
        return
    yield tuple(fill)
    for row in it:
        outrow = list(row)
        for idx in fillindices:
            if row[idx] == missing:
                outrow[idx] = fill[idx]  # fill down
            else:
                fill[idx] = row[idx]  # new fill value
        yield tuple(outrow)
Exemple #36
0
    def __iter__(self):
        """
        Yield the table with its fields rearranged into sorted header
        order.

        The sorted header comes first, then each row reordered to match.
        Rows too short for the header are padded with ``self.missing``.

        """
        rows = iter(self.table)
        header = next(rows)
        ordered = sorted(header)
        positions = asindices(header, ordered)
        reorder = rowgetter(*positions)

        # header first
        yield tuple(ordered)

        # then the data
        fill = self.missing
        for record in rows:
            try:
                out = reorder(record)
            except IndexError:
                # short row: take what exists, pad the rest
                out = tuple(record[p] if p < len(record) else fill
                            for p in positions)
            yield out
Exemple #37
0
    def __iter__(self):
        """
        Yield the table with its fields rearranged into sorted header
        order.

        The sorted header comes first, then each row reordered to match.
        Rows too short for the header are padded with ``self.missing``.

        """
        rows = iter(self.table)
        header = next(rows)
        ordered = sorted(header)
        positions = asindices(header, ordered)
        reorder = rowgetter(*positions)

        # header first
        yield tuple(ordered)

        # then the data
        fill = self.missing
        for record in rows:
            try:
                out = reorder(record)
            except IndexError:
                # short row: take what exists, pad the rest
                out = tuple(record[p] if p < len(record) else fill
                            for p in positions)
            yield out
Exemple #38
0
    def __iter__(self):
        """
        Yield the table with runs of adjacent rows sharing the same key
        collapsed to a single row.

        The key is ``self.key``, or all header fields when that is None.
        If ``self.count`` is set it names an extra output field holding
        the length of each run, and the first row of each run is kept.
        Only adjacent rows are compared. A table with no data rows
        yields just the header.

        """
        it = iter(self.table)
        hdr = next(it)

        # convert field selection into field indices
        if self.key is None:
            indices = range(len(hdr))
        else:
            indices = asindices(hdr, self.key)

        # now use field indices to construct a _getkey function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        getkey = operator.itemgetter(*indices)

        # private sentinel: cannot collide with any row or key value
        INIT = object()
        if self.count:
            yield tuple(hdr) + (self.count,)
            previous = INIT
            n_dup = 1
            for row in it:
                if previous is INIT:
                    previous = row
                elif getkey(previous) == getkey(row):
                    n_dup += 1
                else:
                    yield tuple(previous) + (n_dup,)
                    n_dup = 1
                    previous = row
            # deal with last row; skipped when there were no data rows,
            # where tuple(INIT) used to raise TypeError
            if previous is not INIT:
                yield tuple(previous) + (n_dup,)
        else:
            yield tuple(hdr)
            previous_keys = INIT
            for row in it:
                keys = getkey(row)
                if keys != previous_keys:
                    yield tuple(row)
                previous_keys = keys
Exemple #39
0
    def __iter__(self):
        """
        Yield the table with runs of adjacent rows sharing the same key
        collapsed to a single row.

        The key is ``self.key``, or all header fields when that is None.
        If ``self.count`` is set it names an extra output field holding
        the length of each run, and the first row of each run is kept.
        Only adjacent rows are compared. A table with no data rows
        yields just the header.

        """
        it = iter(self.table)
        hdr = next(it)

        # convert field selection into field indices
        if self.key is None:
            indices = range(len(hdr))
        else:
            indices = asindices(hdr, self.key)

        # now use field indices to construct a _getkey function
        # N.B., this may raise an exception on short rows, depending on
        # the field selection
        getkey = operator.itemgetter(*indices)

        # Private sentinel instead of None: a single-field key may
        # legitimately be None, and comparing against a None placeholder
        # would silently drop a first row with such a key.
        unset = object()
        if self.count:
            yield tuple(hdr) + (self.count,)
            previous = unset
            n_dup = 1
            for row in it:
                if previous is unset:
                    previous = row
                elif getkey(previous) == getkey(row):
                    n_dup += 1
                else:
                    yield tuple(previous) + (n_dup,)
                    n_dup = 1
                    previous = row
            # deal with last row; skipped when there were no data rows,
            # where tuple(None) used to raise TypeError
            if previous is not unset:
                yield tuple(previous) + (n_dup,)
        else:
            yield tuple(hdr)
            previous_keys = unset
            for row in it:
                keys = getkey(row)
                if keys != previous_keys:
                    yield tuple(row)
                previous_keys = keys
Exemple #40
0
def facetcolumns(table, key, missing=None):
    """
    Like :func:`petl.util.materialise.columns` but stratified by values of the
    given key field. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['a', 1, True],
        ...          ['b', 2, True],
        ...          ['b', 3]]
        >>> fc = etl.facetcolumns(table, 'foo')
        >>> fc['a']
        {'foo': ['a'], 'bar': [1], 'baz': [True]}
        >>> fc['b']
        {'foo': ['b', 'b'], 'bar': [2, 3], 'baz': [True, None]}

    Rows shorter than the header are padded with `missing`; values
    beyond the end of the header are ignored.

    """

    fct = dict()
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    indices = asindices(hdr, key)
    assert len(indices) > 0, 'no key field selected'
    getkey = operator.itemgetter(*indices)

    for row in it:
        kv = getkey(row)
        try:
            cols = fct[kv]
        except KeyError:
            # first row for this key: start an empty column per field
            cols = fct[kv] = dict((f, list()) for f in flds)
        # Walk the header rather than zipping with a fill value: padding
        # the *field* side with `missing` let surplus values of long
        # rows land in a column whose name happened to equal `missing`.
        nvals = len(row)
        for i, f in enumerate(flds):
            cols[f].append(row[i] if i < nvals else missing)

    return fct
Exemple #41
0
    def __init__(self, default_connections, keyed_connections, fields, key,
                 reverse, buffersize):
        """
        Set up a sort connection.

        `key` selects the sort fields within `fields`; when it is None
        no key function is stored. `buffersize` falls back to
        ``petl.config.sort_buffersize`` when None. The row cache and the
        chunk file list both start out empty.

        """
        super(SortConnection, self).__init__(default_connections,
                                             keyed_connections, fields)

        if key is None:
            self.getkey = None
        else:
            # N.B., the key getter will probably raise on short rows
            self.getkey = comparable_itemgetter(*asindices(fields, key))

        self.reverse = reverse
        self.buffersize = (petl.config.sort_buffersize
                           if buffersize is None else buffersize)

        self.cache = []
        self.chunkfiles = []
Exemple #42
0
    def __init__(self, default_connections, keyed_connections, fields, key,
                 reverse, buffersize):
        """
        Set up a sort connection.

        `key` selects the sort fields within `fields`; when it is None
        no key function is stored. `buffersize` falls back to
        ``petl.config.sort_buffersize`` when None. The row cache and the
        chunk file list both start out empty.

        """
        super(SortConnection, self).__init__(default_connections,
                                             keyed_connections, fields)

        if key is None:
            self.getkey = None
        else:
            # N.B., the key getter will probably raise on short rows
            self.getkey = comparable_itemgetter(*asindices(fields, key))

        self.reverse = reverse
        self.buffersize = (petl.config.sort_buffersize
                           if buffersize is None else buffersize)

        self.cache = []
        self.chunkfiles = []
Exemple #43
0
def itercut(source, spec, missing=None):
    """
    Yield `source` restricted to the fields selected by `spec`, in the
    order given by `spec`.

    The header is transformed the same way as the data rows. Rows too
    short for the selection are padded with `missing`.

    """
    rows = iter(source)
    # freeze the selection so later changes by the caller have no effect
    spec = tuple(spec)

    header = next(rows)
    positions = asindices(header, spec)
    select = rowgetter(*positions)

    # header first
    yield select(header)

    # then the data
    for record in rows:
        try:
            picked = select(record)
        except IndexError:
            # short row: take what exists, pad the rest
            picked = tuple(record[p] if p < len(record) else missing
                           for p in positions)
        yield picked
Exemple #44
0
def itercut(source, spec, missing=None):
    """
    Yield `source` restricted to the fields selected by `spec`, in the
    order given by `spec`.

    The header is transformed the same way as the data rows. Rows too
    short for the selection are padded with `missing`.

    """
    rows = iter(source)
    # freeze the selection so later changes by the caller have no effect
    spec = tuple(spec)

    header = next(rows)
    positions = asindices(header, spec)
    select = rowgetter(*positions)

    # header first
    yield select(header)

    # then the data
    for record in rows:
        try:
            picked = select(record)
        except IndexError:
            # short row: take what exists, pad the rest
            picked = tuple(record[p] if p < len(record) else missing
                           for p in positions)
        yield picked
Exemple #45
0
    def __iter__(self):
        """
        Yield the table with ``self.field`` moved to position
        ``self.index`` in every row.

        The new header is the old one with the field removed and then
        re-inserted at the target index. Rows too short for the header
        are padded with ``self.missing``.

        """
        rows = iter(self.table)

        # work out the new field order
        header = next(rows)
        reordered = [name for name in header if name != self.field]
        reordered.insert(self.index, self.field)
        yield tuple(reordered)

        # build the row transformation matching that order
        positions = asindices(header, list(map(str, reordered)))
        shuffle = rowgetter(*positions)

        for record in rows:
            try:
                moved = shuffle(record)
            except IndexError:
                # short row: take what exists, pad the rest
                moved = tuple(record[p] if p < len(record) else self.missing
                              for p in positions)
            yield moved
Exemple #46
0
    def __iter__(self):
        """
        Yield the table with ``self.field`` moved to position
        ``self.index`` in every row.

        The new header is the old one with the field removed and then
        re-inserted at the target index. Rows too short for the header
        are padded with ``self.missing``.

        """
        rows = iter(self.table)

        # work out the new field order
        header = next(rows)
        reordered = [name for name in header if name != self.field]
        reordered.insert(self.index, self.field)
        yield tuple(reordered)

        # build the row transformation matching that order
        positions = asindices(header, list(map(str, reordered)))
        shuffle = rowgetter(*positions)

        for record in rows:
            try:
                moved = shuffle(record)
            except IndexError:
                # short row: take what exists, pad the rest
                moved = tuple(record[p] if p < len(record) else self.missing
                              for p in positions)
            yield moved
def iterduplicates(source, key):
    """
    Yield the header of `source`, then every row belonging to a run of
    two or more adjacent rows with equal keys.

    The key is the selection given by `key`, or all header fields when
    `key` is None. Only adjacent rows are compared, so rows sharing a
    key must already be next to each other in `source`.

    """
    rows = iter(source)

    header = next(rows)
    yield tuple(header)

    # resolve the key selection to positions in the row
    if key is not None:
        positions = asindices(header, key)
    else:
        positions = range(len(header))

    # N.B., the key getter may raise on short rows, depending on the
    # field selection
    keyof = operator.itemgetter(*positions)

    last = None
    emitted = False  # has `last` already been yielded for this run?

    for current in rows:
        if last is not None:
            if keyof(last) == keyof(current):
                if not emitted:
                    yield tuple(last)
                    emitted = True
                yield tuple(current)
            else:
                # key changed, start a fresh run
                emitted = False
        last = current
Exemple #48
0
def iterduplicates(source, key):
    """
    Yield the header of `source`, then every row belonging to a run of
    two or more adjacent rows with equal keys.

    The key is the selection given by `key`, or all header fields when
    `key` is None. Only adjacent rows are compared, so rows sharing a
    key must already be next to each other in `source`.

    """
    rows = iter(source)

    header = next(rows)
    yield tuple(header)

    # resolve the key selection to positions in the row
    if key is not None:
        positions = asindices(header, key)
    else:
        positions = range(len(header))

    # N.B., the key getter may raise on short rows, depending on the
    # field selection
    keyof = operator.itemgetter(*positions)

    last = None
    emitted = False  # has `last` already been yielded for this run?

    for current in rows:
        if last is not None:
            if keyof(last) == keyof(current):
                if not emitted:
                    yield tuple(last)
                    emitted = True
                yield tuple(current)
            else:
                # key changed, start a fresh run
                emitted = False
        last = current
def recordlookupone(table, key, dictionary=None, strict=False):
    """
    Fill a dictionary from the given table, mapping each key to a record
    object built from the first row seen with that key.

    A new dict is created when `dictionary` is None; otherwise the given
    object is updated and returned. With `strict`, meeting a key that is
    already present raises DuplicateKeyError; without it, later rows for
    that key are ignored.

    """

    target = dict() if dictionary is None else dictionary

    rows = iter(table)
    header = next(rows)
    names = list(map(text_type, header))
    positions = asindices(header, key)
    assert len(positions) > 0, 'no key selected'
    keyof = operator.itemgetter(*positions)

    for record in rows:
        k = keyof(record)
        if k in target:
            if strict:
                raise DuplicateKeyError(k)
            continue  # keep the entry already stored for this key
        target[k] = Record(record, names)
    return target
Exemple #50
0
def recordlookupone(table, key, dictionary=None, strict=False):
    """
    Fill a dictionary from the given table, mapping each key to a record
    object built from the first row seen with that key.

    A new dict is created when `dictionary` is None; otherwise the given
    object is updated and returned. With `strict`, meeting a key that is
    already present raises DuplicateKeyError; without it, later rows for
    that key are ignored.

    """

    target = dict() if dictionary is None else dictionary

    rows = iter(table)
    header = next(rows)
    names = list(map(text_type, header))
    positions = asindices(header, key)
    assert len(positions) > 0, 'no key selected'
    keyof = operator.itemgetter(*positions)

    for record in rows:
        k = keyof(record)
        if k in target:
            if strict:
                raise DuplicateKeyError(k)
            continue  # keep the entry already stored for this key
        target[k] = Record(record, names)
    return target
Exemple #51
0
def iterintervaljoin(left,
                     right,
                     lstart,
                     lstop,
                     rstart,
                     rstop,
                     lkey,
                     rkey,
                     include_stop,
                     missing,
                     lprefix,
                     rprefix,
                     leftouter,
                     anti=False):
    """
    Yield the rows of an interval join between `left` and `right`.

    For each left row, the right-hand lookup is searched with the row's
    `lstart`/`lstop` values (within the facet given by `lkey`/`rkey`
    when `rkey` is not None). Each match yields the left row followed by
    the matching right row. With `leftouter`, unmatched left rows are
    yielded padded with `missing`. With `anti`, matched rows are
    suppressed and neither the header nor the rows carry right-hand
    fields.

    `lprefix` and `rprefix` are prepended to the left and right field
    names in the output header. Each is applied independently; None
    means "no prefix" for that side.

    """

    # create iterators and obtain fields
    lit = iter(left)
    lhdr = next(lit)
    lflds = list(map(text_type, lhdr))
    rit = iter(right)
    rhdr = next(rit)
    rflds = list(map(text_type, rhdr))

    # check fields via petl.util.asindices (raises FieldSelectionError if spec
    # is not valid)
    asindices(lhdr, lstart)
    asindices(lhdr, lstop)
    if lkey is not None:
        asindices(lhdr, lkey)
    asindices(rhdr, rstart)
    asindices(rhdr, rstop)
    if rkey is not None:
        asindices(rhdr, rkey)

    # Determine output fields. The prefixes are handled separately:
    # previously rprefix was only looked at when lprefix was set, so
    # lprefix alone raised TypeError (None + str) and rprefix alone was
    # silently ignored.
    if lprefix is None:
        outhdr = list(lflds)
    else:
        outhdr = [lprefix + f for f in lflds]
    if not anti:
        if rprefix is None:
            outhdr.extend(rflds)
        else:
            outhdr.extend(rprefix + f for f in rflds)
    yield tuple(outhdr)

    # create getters for start and stop positions
    getlstart = itemgetter(lflds.index(lstart))
    getlstop = itemgetter(lflds.index(lstop))

    if rkey is None:
        # build a single interval lookup for the right table
        lookup = intervallookup(right,
                                rstart,
                                rstop,
                                include_stop=include_stop)
        search = lookup.search

        def find(lrow):
            return search(getlstart(lrow), getlstop(lrow))

    else:
        # build one interval lookup per facet of the right table
        lookup = facetintervallookup(right,
                                     key=rkey,
                                     start=rstart,
                                     stop=rstop,
                                     include_stop=include_stop)
        searches = dict((f, lookup[f].search) for f in lookup)
        # getter for facet key values in left table
        getlkey = itemgetter(*asindices(lflds, lkey))

        def find(lrow):
            # a local name keeps the `lkey` parameter from being shadowed
            facet = getlkey(lrow)
            start = getlstart(lrow)
            stop = getlstop(lrow)
            try:
                return searches[facet](start, stop)
            except (KeyError, AttributeError):
                # no lookup for this facet: treat as "no match"
                return None

    # main loop, shared by both lookup flavours
    padding = [missing] * len(rflds)
    for lrow in lit:
        rrows = find(lrow)
        if rrows:
            if not anti:
                for rrow in rrows:
                    outrow = list(lrow)
                    outrow.extend(rrow)
                    yield tuple(outrow)
        elif leftouter:
            outrow = list(lrow)
            if not anti:
                outrow.extend(padding)
            yield tuple(outrow)
Exemple #52
0
def iterlookupjoin(left, right, lkey, rkey, missing=None, lprefix=None,
                   rprefix=None):
    """
    Yield the rows of a left join in which each left row is extended
    with the non-key fields of one matching right row.

    Both tables are walked in step as groups of adjacent rows with equal
    keys, comparing keys with ``<`` and ``>``, so both inputs must
    already be ordered by their key. When several right rows share a
    key, the first one of the group is used. Left rows with no match are
    padded with `missing`. `lprefix` and `rprefix`, when given, are
    prepended to the left and right field names in the output header.

    """
    lit = iter(left)
    rit = iter(right)

    lhdr = next(lit)
    rhdr = next(rit)

    # determine indices of the key fields in left and right tables
    lkind = asindices(lhdr, lkey)
    rkind = asindices(rhdr, rkey)

    # construct functions to extract key values from both tables
    lgetk = operator.itemgetter(*lkind)
    rgetk = operator.itemgetter(*rkind)

    # determine indices of non-key fields in the right table
    # (in the output, we only include key fields from the left table - we
    # don't want to duplicate fields)
    rvind = [i for i in range(len(rhdr)) if i not in rkind]
    rgetv = rowgetter(*rvind)

    # determine the output fields
    if lprefix is None:
        outhdr = list(lhdr)
    else:
        outhdr = [(str(lprefix) + str(f)) for f in lhdr]
    if rprefix is None:
        outhdr.extend(rgetv(rhdr))
    else:
        outhdr.extend([(str(rprefix) + str(f)) for f in rgetv(rhdr)])
    yield tuple(outhdr)

    # define a function to join two groups of rows
    def joinrows(_lrowgrp, _rrowgrp):
        if _rrowgrp is None:
            # no match: missing values stand in for the right row
            extra = [missing] * len(rvind)
        else:
            rrow = next(iter(_rrowgrp))  # pick first arbitrarily
            extra = list(rgetv(rrow))
        for lrow in _lrowgrp:
            outrow = list(lrow)  # start with the left row
            outrow.extend(extra)
            yield tuple(outrow)

    # construct group iterators for both tables
    lgit = itertools.groupby(lit, key=lgetk)
    rgit = itertools.groupby(rit, key=rgetk)

    # `pending` holds a left group that has been fetched but not yet
    # emitted. Tracking it explicitly replaces the old post-loop test
    # `lkval > rkval`, which compared against None (TypeError on
    # Python 3) whenever either table had no data rows.
    pending = None

    # loop until *either* of the iterators is exhausted
    try:

        # pick off initial row groups
        lkval, lrowgrp = next(lgit)
        pending = lrowgrp
        rkval, rrowgrp = next(rgit)

        while True:
            if lkval < rkval:
                for row in joinrows(lrowgrp, None):
                    yield row
                pending = None
                # advance left
                lkval, lrowgrp = next(lgit)
                pending = lrowgrp
            elif lkval > rkval:
                # advance right
                rkval, rrowgrp = next(rgit)
            else:
                for row in joinrows(lrowgrp, rrowgrp):
                    yield row
                pending = None
                # advance both
                lkval, lrowgrp = next(lgit)
                pending = lrowgrp
                rkval, rrowgrp = next(rgit)

    except StopIteration:
        pass

    # the right table ran out while a left group was still waiting
    if pending is not None:
        for row in joinrows(pending, None):
            yield row
    # yield the rest of the left table, unmatched
    for lkval, lrowgrp in lgit:
        for row in joinrows(lrowgrp, None):
            yield row
    def _iternocache(self, source, key, reverse):
        """
        Sort `source` from scratch and yield the header followed by the
        rows in sorted order.

        The cache is cleared first. Rows are read in chunks of at most
        ``self.buffersize`` and each chunk is sorted in memory by `key`
        (all fields when `key` is None), descending if `reverse`. If the
        first chunk holds fewer than ``self.buffersize`` rows (or
        ``self.buffersize`` is None) it is yielded directly. Otherwise
        every chunk is pickled to a temporary file in ``self.tempdir``
        and the files are merged via ``_mergesorted``.

        When ``self.cache`` is true, the header, the key function and
        either the sorted rows or the chunk file wrappers are stored on
        ``self``.

        """
        debug('iterate without cache')
        self.clearcache()
        it = iter(source)

        hdr = next(it)
        yield tuple(hdr)

        if key is not None:
            # convert field selection into field indices
            indices = asindices(hdr, key)
        else:
            # no key given: sort on every field, in header order
            indices = range(len(hdr))
        # now use field indices to construct a _getkey function
        # TODO check if this raises an exception on short rows
        getkey = comparable_itemgetter(*indices)

        # TODO support native comparison

        # initialise the first chunk
        # (with buffersize None, islice reads the whole source)
        rows = list(itertools.islice(it, 0, self.buffersize))
        rows.sort(key=getkey, reverse=reverse)

        # have we exhausted the source iterator?
        # N.B., a source with exactly buffersize rows fails this test
        # and takes the chunked path below
        if self.buffersize is None or len(rows) < self.buffersize:
            # yes, table fits within sort buffer

            if self.cache:
                debug('caching mem')
                self._hdrcache = hdr
                self._memcache = rows
                # actually not needed to iterate from memcache
                self._getkey = getkey

            for row in rows:
                yield tuple(row)

        else:
            # no, table is too big, need to sort in chunks

            chunkfiles = []

            while rows:

                # dump the chunk
                with NamedTemporaryFile(dir=self.tempdir, delete=False,
                                        mode='wb') as f:
                    # N.B., we **don't** want the file to be deleted on close,
                    # but we **do** want the file to be deleted when self
                    # is garbage collected, or when the program exits. When
                    # all references to the wrapper are gone, the file should
                    # get deleted.
                    wrapper = _NamedTempFileDeleteOnGC(f.name)
                    debug('created temporary chunk file %s' % f.name)
                    # one pickle per row; protocol=-1 selects the highest
                    # protocol available
                    for row in rows:
                        pickle.dump(row, f, protocol=-1)
                    f.flush()
                    chunkfiles.append(wrapper)

                # grab the next chunk
                rows = list(itertools.islice(it, 0, self.buffersize))
                rows.sort(key=getkey, reverse=reverse)

            if self.cache:
                debug('caching files')
                self._hdrcache = hdr
                self._filecache = chunkfiles
                self._getkey = getkey

            # merge the individually sorted chunks back into one stream
            chunkiters = [_iterchunk(f.name) for f in chunkfiles]
            for row in _mergesorted(getkey, reverse, *chunkiters):
                yield tuple(row)
Exemple #54
0
def iterproblems(table, constraints, expected_header):
    """Yield a header row followed by one row per validation problem.

    The first item yielded is always the output header
    ``('name', 'row', 'field', 'value', 'error')``. Every subsequent item
    describes one problem found in `table`:

    * ``('__header__', 0, None, None, 'AssertionError')`` when
      `expected_header` is given and differs from the table's actual header
      (both compared after conversion with ``text_type``).
    * ``('__len__', row_number, None, actual_length, 'AssertionError')`` when
      a data row's length differs from the number of fields.
    * ``(name, row_number, field, value, error)`` when a constraint's getter
      or ``test`` callable raises, or its ``assertion`` callable raises or
      returns a falsy result. ``error`` is the exception class name, and
      ``value`` is the extracted target only if the constraint names a
      ``field`` (otherwise ``None``).

    Data rows are numbered from 1.

    :param table: iterable whose first item is the header row and whose
        remaining items are data rows. An empty iterable is treated as
        having an empty header.
    :param constraints: constraint specifications; they are passed through
        ``normalize_constraints`` and each result is read as a mapping with
        the optional keys ``'name'``, ``'field'``, ``'test'``,
        ``'assertion'`` and ``'getter'``.
    :param expected_header: expected header row, or ``None`` to accept
        whatever header the table has. When given, it (not the actual
        header) defines the field names used for the remaining checks.
    """

    outhdr = ('name', 'row', 'field', 'value', 'error')
    yield outhdr

    # Explicit checks are used instead of ``assert`` statements so that
    # validation still works under ``python -O``; the reported error name is
    # kept as 'AssertionError' for compatibility with existing consumers.
    assertion_error = AssertionError.__name__

    it = iter(table)
    try:
        actual_header = next(it)
    except StopIteration:
        # Empty table: letting StopIteration escape a generator becomes a
        # RuntimeError (PEP 479), so fall back to an empty header instead.
        actual_header = []

    actual_flds = list(map(text_type, actual_header))
    if expected_header is None:
        flds = actual_flds
    else:
        flds = list(map(text_type, expected_header))
        if flds != actual_flds:
            yield ('__header__', 0, None, None, assertion_error)

    local_constraints = normalize_constraints(constraints, flds)

    # setup getters
    for constraint in local_constraints:
        if 'getter' not in constraint and 'field' in constraint:
            # should ensure FieldSelectionError if bad field in constraint
            indices = asindices(flds, constraint['field'])
            constraint['getter'] = operator.itemgetter(*indices)

    # generate problems
    expected_len = len(flds)
    for rownum, row in enumerate(it, 1):
        row = tuple(row)

        # row length constraint
        row_len = len(row)
        if row_len != expected_len:
            yield ('__len__', rownum, None, row_len, assertion_error)

        # user defined constraints
        record = Record(row, flds)
        for constraint in local_constraints:
            name = constraint.get('name', None)
            field = constraint.get('field', None)
            assertion = constraint.get('assertion', None)
            test = constraint.get('test', None)
            # constraints without a getter are applied to the whole record
            getter = constraint.get('getter', lambda x: x)

            try:
                target = getter(record)
            except Exception as e:
                # getting target value failed, report problem and skip the
                # test/assertion since there is nothing to apply them to
                yield (name, rownum, field, None, type(e).__name__)
                continue

            value = target if field else None

            if test is not None:
                try:
                    test(target)
                except Exception as e:
                    # test raised exception, report problem
                    yield (name, rownum, field, value, type(e).__name__)

            if assertion is not None:
                try:
                    # raise explicitly rather than ``assert`` so the check
                    # is not compiled away in optimized mode
                    if not assertion(target):
                        raise AssertionError
                except Exception as e:
                    # assertion failed or raised, report problem
                    yield (name, rownum, field, value, type(e).__name__)