def _yieldPartitions(self, iterable, func, fieldnameIndexes, length):
        """
        Split the input into sorted partitions of at most ``length`` lines each.

        Up to ``length`` lines at a time are pulled from ``iterable``. Each line is tokenized with
        ``MutUtils.getTokens``, the tokens selected by ``fieldnameIndexes`` are handed to ``func`` as a dictionary,
        and the tuple returned by ``func`` becomes the sort key of that line. Every partition is yielded as a list
        of ``self._Pair(key, line)`` objects sorted on their ``key`` attribute. Through this method, we obtain
        several sorted chunks.

        :param iterable: lines of text; any iterable is accepted and it is consumed exactly once
        :param func: function that converts each row of the input file to a key of type tuple
        :param fieldnameIndexes: dictionary of fieldnames and corresponding indexes
        :param length: determines the number of lines in the buffer
        :raises CallbackException: if ``func`` returns anything other than a tuple for any line
        """
        # Obtain a single iterator up front. Calling islice() directly on a re-iterable object (e.g. a list) would
        # restart from its first item on every call, and the loop below would never terminate.
        lineIterator = iter(iterable)

        # One dictionary is refilled for every line; func only sees the fields named in fieldnameIndexes.
        data = collections.OrderedDict()

        # Take the first "length" number of items and return them as list.
        lines = list(itertools.islice(lineIterator, length))

        while lines:
            # Create a list of (key, value) pairs
            # Each key consists of a tuple, value is the corresponding text
            pairs = []

            # Note: CSV dictionary reader is NOT used because a chunk of text is parsed at a time, rather than a
            # line of text
            for line in lines:
                tokens = MutUtils.getTokens(line, self.delimiter,
                                            self.lineterminator)

                for fieldname, index in fieldnameIndexes.items():
                    data[fieldname] = tokens[index]

                key = func(data)

                # Check every key, not only the first one: a later non-tuple key would otherwise be sorted
                # against tuples, which gives a meaningless order or fails inside sorted().
                if not isinstance(key, tuple):
                    raise CallbackException(
                        "The value returned by the callback must be a tuple. Instead, a value "
                        "of %s was returned." % (type(key)))
                pairs.append(self._Pair(key, line))

            partition = sorted(pairs, key=operator.attrgetter("key"))

            # The next chunk is fetched before the current partition is handed to the consumer.
            lines = list(itertools.islice(lineIterator, length))

            yield partition
Exemple #2
0
    def _yieldPartitions(self, iterable, func, fieldnameIndexes, length):
        """
        This method parses a set of lines for a partition, applies an anonymous function that converts each line of the
        partition to a key-value pair where key is of type tuple, sorts the key-value pairs on the keys and then yields
        the partition. Through this method, we obtain several sorted chunks.

        Each yielded partition is a list of ``self._Pair(key, line)`` objects holding at most ``length`` entries,
        ordered by their ``key`` attribute.

        :param iterable: lines of text; any iterable is accepted and it is consumed exactly once
        :param func: function that converts each row of the input file to a key of type tuple
        :param fieldnameIndexes: dictionary of fieldnames and corresponding indexes
        :param length: determines the number of lines in the buffer
        :raises CallbackException: if the callback returns a non-tuple key for any line
        """
        # islice() must always be applied to the same iterator. Slicing a re-iterable object such as a list
        # directly would return its first "length" items on every call and loop forever.
        source = iter(iterable)

        # Reused for every line; the entries are overwritten with the current line's tokens before func is called.
        data = collections.OrderedDict()

        # Take the first "length" number of items and return them as list.
        lines = list(itertools.islice(source, length))

        while lines:
            # Create a list of (key, value) pairs
            # Each key consists of a tuple, value is the corresponding text
            pairs = []

            # Note: CSV dictionary reader is NOT used because a chunk of text is parsed at a time, rather than a
            # line of text
            for line in lines:
                tokens = MutUtils.getTokens(line, self.delimiter, self.lineterminator)

                for fieldname, index in fieldnameIndexes.items():
                    data[fieldname] = tokens[index]

                key = func(data)

                # Every key is validated: checking only the first row would let later non-tuple keys reach
                # sorted(), where they cannot be ordered meaningfully against tuple keys.
                if not isinstance(key, tuple):
                    raise CallbackException("The value returned by the callback must be a tuple. Instead, a value "
                                            "of %s was returned." % (type(key)))
                pairs.append(self._Pair(key, line))

            partition = sorted(pairs, key=operator.attrgetter("key"))

            # Read the following chunk before yielding the current partition.
            lines = list(itertools.islice(source, length))

            yield partition