Example #1
    def _split_mapper_input(self, input_paths, step_num):
        """Take one or more input paths (which may be compressed) and split
        it to create the input files for the map tasks.

        Yields "splits", which are dictionaries with the following keys:

        input: path of input for one mapper
        file: path of original file
        start, length: chunk of original file in *input*

        Compressed files will not be split (even ``.bz2`` files);
        uncompressed files will be split so as to attempt to create
        twice as many input files as there are mappers.
        """
        input_paths = list(input_paths)
        manifest = (step_num == 0 and self._uses_input_manifest())

        # determine split size
        if manifest:
            split_size = 1  # one line per mapper
        else:
            split_size = self._pick_mapper_split_size(input_paths, step_num)

        # yield output fileobjs as needed
        split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

        results = []

        for path in input_paths:
            with open(path, 'rb') as src:
                if is_compressed(path):
                    if manifest:
                        raise Exception('input manifest %s should not be'
                                        ' compressed!' % path)

                    # if file is compressed, uncompress it into a single split

                    # Hadoop tracks the compressed file's size
                    size = os.stat(path)[stat.ST_SIZE]

                    with next(split_fileobj_gen) as dest:
                        for chunk in decompress(src, path):
                            dest.write(chunk)

                    results.append(dict(
                        file=path,
                        start=0,
                        length=size,
                    ))
                else:
                    # otherwise, split into one or more input files
                    start = 0
                    length = 0

                    for lines in _split_records(src, split_size):
                        with next(split_fileobj_gen) as dest:
                            for line in lines:
                                # simulate NLineInputFormat by prefixing
                                # each line with its byte offset
                                if manifest:
                                    i = start + length
                                    dest.write(('%d\t' % i).encode('ascii'))
                                dest.write(line)
                                length += len(line)

                        results.append(dict(
                            file=path,
                            start=start,
                            length=length,
                        ))

                        start += length
                        length = 0

        return results
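
Both examples on this page rely on helpers that aren't shown. From the way `_split_records(src, split_size)` is consumed (iterated chunk by chunk, with a split size of 1 meaning one line per chunk in manifest mode), a minimal sketch might look like the following; the chunking policy here is an assumption, not mrjob's actual implementation:

def _split_records(record_gen, split_size):
    # Sketch (assumption): group records (e.g. lines of bytes) into
    # chunks of at least *split_size* bytes each; a record is never
    # broken across two chunks.
    chunk = []
    num_bytes = 0

    for record in record_gen:
        chunk.append(record)
        num_bytes += len(record)

        if num_bytes >= split_size:
            yield chunk
            chunk = []
            num_bytes = 0

    if chunk:  # don't drop a final, partial chunk
        yield chunk

Similarly, `_yield_split_fileobjs` only needs to be an endless generator of writable file objects, since the caller wraps each one in a ``with`` block. A hypothetical stand-in (the 'part-NNNNN' naming and directory layout are made up here, not mrjob's scheme):

import itertools
import os

def _yield_split_fileobjs(task_type, step_num, base_dir='splits'):
    # Sketch (assumption): endlessly open numbered files for writing.
    # Plain file objects work with ``with next(gen) as dest`` because
    # they are context managers.
    os.makedirs(base_dir, exist_ok=True)

    for i in itertools.count():
        yield open(os.path.join(base_dir, 'part-%05d' % i), 'wb')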
Example #2
    def _split_mapper_input(self, input_paths, step_num):
        """Take one or more input paths (which may be compressed) and split
        it to create the input files for the map tasks.

        Yields "splits", which are dictionaries with the following keys:

        input: path of input for one mapper
        file: path of original file
        start, length: chunk of original file in *input*

        Compressed files will not be split (even ``.bz2`` files);
        uncompressed files will be split so as to attempt to create
        twice as many input files as there are mappers.
        """
        input_paths = list(input_paths)

        # determine split size
        split_size = self._pick_mapper_split_size(input_paths, step_num)

        # yield output fileobjs as needed
        split_fileobj_gen = self._yield_split_fileobjs('mapper', step_num)

        results = []

        for path in input_paths:
            with open_input(path) as src:
                if is_compressed(path):
                    # if file is compressed, uncompress it into a single split

                    # Hadoop tracks the compressed file's size
                    size = os.stat(path)[stat.ST_SIZE]

                    with next(split_fileobj_gen) as dest:
                        shutil.copyfileobj(src, dest)

                    results.append(dict(
                        file=path,
                        start=0,
                        length=size,
                    ))
                else:
                    # otherwise, split into one or more input files
                    start = 0
                    length = 0

                    for lines in _split_records(src, split_size):
                        with next(split_fileobj_gen) as dest:
                            for line in lines:
                                dest.write(line)
                                length += len(line)

                        results.append(dict(
                            file=path,
                            start=start,
                            length=length,
                        ))

                        start += length
                        length = 0

        return results
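
Unlike the first example, this variant opens its source with `open_input`, which evidently decompresses transparently: that's why a plain `shutil.copyfileobj` suffices for the compressed branch, while `size` is still the compressed on-disk size that Hadoop tracks. A minimal sketch of such a helper, assuming only gzip and bzip2 support:

import bz2
import gzip

def open_input(path):
    # Sketch (assumption): open *path* for reading as bytes,
    # transparently decompressing .gz and .bz2 files; mrjob's real
    # helper may support more formats.
    if path.endswith('.gz'):
        return gzip.open(path, 'rb')
    elif path.endswith('.bz2'):
        return bz2.open(path, 'rb')
    return open(path, 'rb')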