def _yieldPartitions(self, iterable, func, fieldnameIndexes, length):
    """
    Yield successive partitions of ``iterable`` as lists of key-sorted pairs.

    At most ``length`` lines are taken from ``iterable`` at a time. Every line of
    a chunk is tokenised with ``MutUtils.getTokens``, the tokens selected by
    ``fieldnameIndexes`` are passed to ``func`` to obtain the sort key, and the
    resulting ``self._Pair(key, line)`` objects are sorted on ``key`` before the
    chunk is yielded. Through this method, we obtain several sorted chunks.

    :param iterable: lines of text; may be an iterator or a re-iterable
        sequence (it is wrapped with ``iter()`` once, so each chunk resumes
        where the previous one stopped)
    :param func: function that receives a mapping of fieldname -> token for a
        row and returns the key for that row; the key must be a tuple
    :param fieldnameIndexes: dictionary of fieldnames and corresponding indexes
        into the token list of a line
    :param length: determines the maximum number of lines in one partition
    :returns: generator yielding one sorted list of ``self._Pair`` per partition
    :raises CallbackException: if the first key returned by ``func`` is not a
        tuple
    """
    # Obtain a single iterator up front. Calling islice() directly on a
    # re-iterable object (e.g. a list) would restart from its beginning on every
    # call and the loop below would never terminate.
    lineIterator = iter(iterable)

    # Set once a tuple key has been seen; the type check is skipped from then on,
    # so in practice only the very first key is validated.
    isKeyTuple = False

    # One mapping is reused for every row: each field in fieldnameIndexes is
    # overwritten per line, so the callback always sees the current row's values.
    data = collections.OrderedDict()

    # Take the first "length" number of items and return them as list.
    lines = list(itertools.islice(lineIterator, length))
    while lines:
        # Create a list of (key, value) pairs.
        # Each key consists of a tuple, value is the corresponding text.
        # Note: CSV dictionary reader is NOT used because a chunk of text is
        # parsed at a time, rather than a line of text.
        pairs = []
        for line in lines:
            tokens = MutUtils.getTokens(line, self.delimiter, self.lineterminator)
            for fieldname, index in fieldnameIndexes.items():
                data[fieldname] = tokens[index]
            key = func(data)
            if not isKeyTuple:
                isKeyTuple = isinstance(key, tuple)
                if not isKeyTuple:
                    raise CallbackException(
                        "The value returned by the callback must be a tuple. Instead, a value "
                        "of %s was returned." % (type(key)))
            pairs.append(self._Pair(key, line))

        partition = sorted(pairs, key=operator.attrgetter("key"))
        # The next chunk is fetched before the current partition is handed out,
        # so the input is always read one chunk ahead of the consumer.
        lines = list(itertools.islice(lineIterator, length))
        yield partition
def _yieldPartitions(self, iterable, func, fieldnameIndexes, length):
    """
    Yield successive partitions of ``iterable`` as lists of key-sorted pairs.

    At most ``length`` lines are taken from ``iterable`` at a time. Every line of
    a chunk is tokenised with ``MutUtils.getTokens``, the tokens selected by
    ``fieldnameIndexes`` are passed to ``func`` to obtain the sort key, and the
    resulting ``self._Pair(key, line)`` objects are sorted on ``key`` before the
    chunk is yielded. Through this method, we obtain several sorted chunks.

    :param iterable: lines of text; may be an iterator or a re-iterable
        sequence (it is wrapped with ``iter()`` once, so each chunk resumes
        where the previous one stopped)
    :param func: function that receives a mapping of fieldname -> token for a
        row and returns the key for that row; the key must be a tuple
    :param fieldnameIndexes: dictionary of fieldnames and corresponding indexes
        into the token list of a line
    :param length: determines the maximum number of lines in one partition
    :returns: generator yielding one sorted list of ``self._Pair`` per partition
    :raises CallbackException: if the first key returned by ``func`` is not a
        tuple
    """
    # Obtain a single iterator up front. Calling islice() directly on a
    # re-iterable object (e.g. a list) would restart from its beginning on every
    # call and the loop below would never terminate.
    lineIterator = iter(iterable)

    # Set once a tuple key has been seen; the type check is skipped from then on,
    # so in practice only the very first key is validated.
    isKeyTuple = False

    # One mapping is reused for every row: each field in fieldnameIndexes is
    # overwritten per line, so the callback always sees the current row's values.
    data = collections.OrderedDict()

    # Take the first "length" number of items and return them as list.
    lines = list(itertools.islice(lineIterator, length))
    while lines:
        # Create a list of (key, value) pairs.
        # Each key consists of a tuple, value is the corresponding text.
        # Note: CSV dictionary reader is NOT used because a chunk of text is
        # parsed at a time, rather than a line of text.
        pairs = []
        for line in lines:
            tokens = MutUtils.getTokens(line, self.delimiter, self.lineterminator)
            for fieldname, index in fieldnameIndexes.items():
                data[fieldname] = tokens[index]
            key = func(data)
            if not isKeyTuple:
                isKeyTuple = isinstance(key, tuple)
                if not isKeyTuple:
                    raise CallbackException("The value returned by the callback must be a tuple. Instead, a value "
                                            "of %s was returned." % (type(key)))
            pairs.append(self._Pair(key, line))

        partition = sorted(pairs, key=operator.attrgetter("key"))
        # The next chunk is fetched before the current partition is handed out,
        # so the input is always read one chunk ahead of the consumer.
        lines = list(itertools.islice(lineIterator, length))
        yield partition