コード例 #1
0
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

        input_path2 = os.path.join(self.tmp_dir, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b'foo\nbar\nbar\n')

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                content.extend(f.readlines())

        self.assertEqual(sorted(content), [
            b'bar\n', b'bar\n', b'bar\n', b'bar\n', b'foo\n', b'foo\n',
            b'foo\n', b'qux\n', b'qux\n'
        ])
コード例 #2
0
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'wb') as input_file:
            input_file.write(
                b'1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
                b'3\tqux\n3\tqux\n')

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path],
                                              3,
                                              keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            with open(file_name, 'rb') as f:
                content.extend(f.readlines())

        self.assertEqual(content, [
            b'1\tbar\n', b'1\tbar\n', b'1\tbar\n', b'2\tfoo\n', b'2\tfoo\n',
            b'2\tfoo\n', b'3\tqux\n', b'3\tqux\n', b'3\tqux\n'
        ])
コード例 #3
0
ファイル: test_local.py プロジェクト: eklitzke/mrjob
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write('bar\nqux\nfoo\nbar\nqux\nfoo\n')

        input_path2 = os.path.join(self.tmp_dir, 'input2')
        with open(input_path2, 'w') as input_file:
            input_file.write('foo\nbar\nbar\n')

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            f = open(file_name)
            content.extend(f.readlines())

        self.assertEqual(sorted(content),
                         ['bar\n', 'bar\n', 'bar\n', 'bar\n', 'foo\n',
                          'foo\n', 'foo\n', 'qux\n', 'qux\n'])
コード例 #4
0
ファイル: test_local.py プロジェクト: eklitzke/mrjob
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, 'input')
        with open(input_path, 'w') as input_file:
            input_file.write(
                '1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n'
                '3\tqux\n3\tqux\n')

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path], 3,
                                              keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            f = open(file_name, 'r')
            content.extend(f.readlines())

        self.assertEqual(content,
                         ['1\tbar\n', '1\tbar\n', '1\tbar\n',
                          '2\tfoo\n', '2\tfoo\n', '2\tfoo\n',
                          '3\tqux\n', '3\tqux\n', '3\tqux\n'])
コード例 #5
0
ファイル: test_local.py プロジェクト: alanhdu/mrjob
    def test_get_file_splits_sorted_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "wb") as input_file:
            input_file.write(b"1\tbar\n1\tbar\n1\tbar\n2\tfoo\n2\tfoo\n2\tfoo\n3\tqux\n" b"3\tqux\n3\tqux\n")

        runner = LocalMRJobRunner(conf_paths=[])

        file_splits = runner._get_file_splits([input_path], 3, keep_sorted=True)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved in sorted order
        content = []
        for file_name in sorted(file_splits.keys()):
            with open(file_name, "rb") as f:
                content.extend(f.readlines())

        self.assertEqual(
            content,
            [
                b"1\tbar\n",
                b"1\tbar\n",
                b"1\tbar\n",
                b"2\tfoo\n",
                b"2\tfoo\n",
                b"2\tfoo\n",
                b"3\tqux\n",
                b"3\tqux\n",
                b"3\tqux\n",
            ],
        )
コード例 #6
0
ファイル: test_local.py プロジェクト: alanhdu/mrjob
    def test_get_file_splits_test(self):
        # set up input paths
        input_path = os.path.join(self.tmp_dir, "input")
        with open(input_path, "w") as input_file:
            input_file.write("bar\nqux\nfoo\nbar\nqux\nfoo\n")

        input_path2 = os.path.join(self.tmp_dir, "input2")
        with open(input_path2, "wb") as input_file:
            input_file.write(b"foo\nbar\nbar\n")

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_path, input_path2], 3)

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, "rb") as f:
                content.extend(f.readlines())

        self.assertEqual(
            sorted(content), [b"bar\n", b"bar\n", b"bar\n", b"bar\n", b"foo\n", b"foo\n", b"foo\n", b"qux\n", b"qux\n"]
        )
コード例 #7
0
    def gz_test(self, dir_path_name):
        contents_gz = [
            b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
        ]
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content), all_contents_sorted)
コード例 #8
0
ファイル: test_local.py プロジェクト: Affirm/mrjob
    def gz_test(self, dir_path_name):
        contents_gz = [b'bar\n', b'qux\n', b'foo\n', b'bar\n',
                       b'qux\n', b'foo\n']
        contents_normal = [b'foo\n', b'bar\n', b'bar\n']
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = join(dir_path_name, 'input.gz')
        input_gz = gzip.GzipFile(input_gz_path, 'wb')
        input_gz.write(b''.join(contents_gz))
        input_gz.close()
        input_path2 = join(dir_path_name, 'input2')
        with open(input_path2, 'wb') as input_file:
            input_file.write(b''.join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info['orig_name'] == input_gz_path:
                self.assertEqual(split_info['start'], 0)
                self.assertEqual(split_info['length'],
                                 os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            with open(file_name, 'rb') as f:
                lines = list(to_lines(decompress(f, file_name)))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content),
                         all_contents_sorted)
コード例 #9
0
ファイル: test_local.py プロジェクト: alanhdu/mrjob
    def gz_test(self, dir_path_name):
        contents_gz = [b"bar\n", b"qux\n", b"foo\n", b"bar\n", b"qux\n", b"foo\n"]
        contents_normal = [b"foo\n", b"bar\n", b"bar\n"]
        all_contents_sorted = sorted(contents_gz + contents_normal)

        input_gz_path = os.path.join(dir_path_name, "input.gz")
        input_gz = gzip.GzipFile(input_gz_path, "wb")
        input_gz.write(b"".join(contents_gz))
        input_gz.close()
        input_path2 = os.path.join(dir_path_name, "input2")
        with open(input_path2, "wb") as input_file:
            input_file.write(b"".join(contents_normal))

        runner = LocalMRJobRunner(conf_paths=[])

        # split into 3 files
        file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

        # Make sure that input.gz occurs in a single split that starts at
        # its beginning and ends at its end
        for split_info in file_splits.values():
            if split_info["orig_name"] == input_gz_path:
                self.assertEqual(split_info["start"], 0)
                self.assertEqual(split_info["length"], os.stat(input_gz_path)[stat.ST_SIZE])

        # make sure we get 3 files
        self.assertEqual(len(file_splits), 3)

        # make sure all the data is preserved
        content = []
        for file_name in file_splits:
            lines = list(read_file(file_name))

            # make sure the input_gz split got its entire contents
            if file_name == input_gz_path:
                self.assertEqual(lines, contents_gz)

            content.extend(lines)

        self.assertEqual(sorted(content), all_contents_sorted)