Example #1 (0)
File: sorts.py — Project: talwai/petl
def itermergesort(sources, key, header, missing, reverse):
    """Merge rows from pre-sorted `sources` into a single sorted stream.

    First yields a header tuple, then data rows merged via
    `shortlistmergesorted`.

    Parameters:
    - sources: iterable of row containers; the first row of each is its header
    - key: field selection passed to `asindices`, or None for whole-row order
    - header: explicit output header, or None to union the source headers
    - missing: fill value for fields absent from a source (or short rows)
    - reverse: merge in descending order when True
    """

    # first need to standardise headers of all input tables
    # borrow this from itercat - TODO remove code smells

    its = [iter(t) for t in sources]
    # next(it) instead of it.next(): equivalent on Python 2.6+ and also
    # works on Python 3, where .next() was removed
    source_flds_lists = [next(it) for it in its]

    if header is None:
        # determine output fields by gathering all fields found in the
        # sources, preserving first-seen order
        outflds = list()
        for flds in source_flds_lists:
            for f in flds:
                if f not in outflds:
                    # add any new fields as we find them
                    outflds.append(f)
    else:
        # predetermined output fields
        outflds = header
    yield tuple(outflds)

    def _standardisedata(it, flds, outflds):
        # Map each output field to its position in this source's rows,
        # computed once per source instead of calling flds.index(f) for
        # every field of every row; None marks a field this source lacks.
        positions = [flds.index(f) if f in flds else None for f in outflds]
        # now construct and yield the data rows
        for row in it:
            try:
                # fast path: direct positional lookup
                yield tuple(row[i] if i is not None else missing
                            for i in positions)
            except IndexError:
                # handle short rows: pad with `missing`, copy what exists
                outrow = [missing] * len(outflds)
                for i, f in enumerate(flds):
                    try:
                        outrow[outflds.index(f)] = row[i]
                    except IndexError:
                        pass  # be relaxed about short rows
                yield tuple(outrow)

    # wrap all iterators to standardise fields
    sits = [
        _standardisedata(it, flds, outflds)
        for flds, it in zip(source_flds_lists, its)
    ]

    # now determine key function
    getkey = None
    if key is not None:
        # convert field selection into field indices
        indices = asindices(outflds, key)
        # now use field indices to construct a _getkey function
        # N.B., this will probably raise an exception on short rows
        getkey = operator.itemgetter(*indices)

    # OK, do the merge sort
    for row in shortlistmergesorted(getkey, reverse, *sits):
        yield row
Example #2 (0)
def _mergesorted(key=None, reverse=False, *iterables):
    """Merge the pre-sorted `iterables` into one sorted iterator.

    Ascending merges go through `heapqmergesorted` (heapq-based);
    `heapq.merge` cannot sort descending, so reverse merges use
    `shortlistmergesorted` instead. Casual profiling suggested the two
    are comparable in speed, though more careful measurement may be
    worthwhile.
    """
    return (shortlistmergesorted(key, True, *iterables)
            if reverse
            else heapqmergesorted(key, *iterables))
Example #3 (0)
 def close(self):
     # Flush everything still held by this connection downstream, in
     # sorted order, then close. Rows buffered in self.cache are sorted
     # in memory; if earlier rows were spilled to self.chunkfiles
     # (presumably each chunk already sorted — TODO confirm against the
     # spill path), the chunks and the cache are merge-sorted together.
     # sort anything remaining in the cache
     self.cache.sort(key=self.getkey, reverse=self.reverse)
     if self.chunkfiles:
         chunkiters = [iterchunk(f) for f in self.chunkfiles]
         chunkiters.append(self.cache) # make sure any left in cache are included
         for row in shortlistmergesorted(self.getkey, self.reverse, *chunkiters):
             self.broadcast(row)
     else:
         # no spill files: the cache alone holds all remaining rows
         for row in self.cache:
             self.broadcast(row)
     super(SortConnection, self).close()
Example #4 (0)
File: push.py — Project: talwai/petl
 def close(self):
     # Push any rows this connection still holds downstream, in sorted
     # order, before closing. In-memory rows live in self.cache; rows
     # spilled earlier live in self.chunkfiles.
     self.cache.sort(key=self.getkey, reverse=self.reverse)
     if not self.chunkfiles:
         # everything fit in memory: emit the sorted cache directly
         for record in self.cache:
             self.broadcast(record)
     else:
         # merge the spilled chunks together with the in-memory remainder
         streams = [iterchunk(fn) for fn in self.chunkfiles]
         streams.append(self.cache)  # don't lose what's still cached
         for record in shortlistmergesorted(self.getkey, self.reverse, *streams):
             self.broadcast(record)
     super(SortConnection, self).close()
Example #5 (0)
def itermergesort(sources, key, header, missing, reverse):
    """Merge rows from pre-sorted `sources` into one sorted stream.

    Yields a header tuple first (either `header`, or the union of the
    source headers in first-seen order), then the merged data rows.
    Fields a source lacks are filled with `missing`; `key` selects the
    sort fields via `asindices`, and `reverse` flips the order.
    """

    # standardise the headers of all input tables first
    # (borrowed from itercat - TODO remove code smells)
    its = [iter(src) for src in sources]
    source_flds_lists = [it.next() for it in its]

    if header is not None:
        # caller dictated the output fields
        outflds = header
    else:
        # gather every field seen across the sources, first-seen order
        outflds = list()
        for flds in source_flds_lists:
            for f in flds:
                if f not in outflds:
                    outflds.append(f)
    yield tuple(outflds)

    def _standardisedata(it, flds, outflds):
        # yield each source row reshaped to the output field order
        for row in it:
            try:
                # quickest path: index straight into the row
                yield tuple(row[flds.index(f)] if f in flds else missing
                            for f in outflds)
            except IndexError:
                # short row: pad with `missing`, copy what is present
                padded = [missing] * len(outflds)
                for i, f in enumerate(flds):
                    try:
                        padded[outflds.index(f)] = row[i]
                    except IndexError:
                        pass  # be relaxed about short rows
                yield tuple(padded)

    # wrap every iterator so all sources present identical fields
    sits = [_standardisedata(it, flds, outflds)
            for flds, it in zip(source_flds_lists, its)]

    # build the key function, if any
    getkey = None
    if key is not None:
        # translate the field selection into indices, then into a getter
        # N.B., this will probably raise an exception on short rows
        indices = asindices(outflds, key)
        getkey = operator.itemgetter(*indices)

    # finally, perform the merge sort
    for row in shortlistmergesorted(getkey, reverse, *sits):
        yield row