Beispiel #1
0
        def line_group_generator(input_path):
            """Yield groups of lines read from *input_path*.

            When ``keep_sorted`` is false, each line is yielded as its own
            one-element tuple. Otherwise consecutive lines that share the
            same key (the text before the first tab) are yielded together
            as a single ``itertools.groupby`` group.
            """
            if not keep_sorted:
                for single_line in read_input(input_path):
                    yield (single_line,)
                return

            # input is assumed to be a collection of key <tab> value pairs
            def key_of(raw_line):
                return raw_line.split('\t', 1)[0]

            grouped = itertools.groupby(read_input(input_path), key=key_of)
            for _key, group in grouped:
                # each group is a lazy iterator; it must be consumed before
                # the next group is requested from groupby
                yield group
Beispiel #2
0
Datei: job.py Projekt: mtai/mrjob
    def _read_input(self):
        """Read from stdin, or one or more files, or directories.
        Yield one line at a time.

        Each path in ``self.args`` (or ``-`` when there are none) is
        handed to ``read_input``, which is documented to:

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - Read from STDIN if the path is ``-``.
        - Recursively read all files in a directory.
        """
        # fall back to STDIN when no input paths were given
        for input_path in (self.args or ['-']):
            for input_line in read_input(input_path, stdin=self.stdin):
                yield input_line
Beispiel #3
0
    def _read_input(self):
        """Yield input lines one at a time.

        Input comes from stdin, or from one or more files or directories
        named in ``self.args``; with no args, the single path ``-`` is
        used. Every path is passed to ``read_input``, which is documented
        to:

        - Resolve globs (``foo_*.gz``).
        - Decompress ``.gz`` and ``.bz2`` files.
        - Read from STDIN if the path is ``-``.
        - Recursively read all files in a directory.
        """
        sources = self.args or ['-']
        for source in sources:
            line_iter = read_input(source, stdin=self.stdin)
            for current_line in line_iter:
                yield current_line
Beispiel #4
0
 def test_stdin(self):
     # the path '-' should read from the stream passed as ``stdin``
     fake_stdin = BytesIO(self.BEAVER_DATA)
     output = list(read_input('-', stdin=fake_stdin))
     self.assertEqual(output, [self.BEAVER_DATA])
Beispiel #5
0
 def test_glob(self):
     # the glob is expected to produce three copies of the fixture data
     pattern = os.path.join(self.tmpdir, 'beavers.*')
     output = list(read_input(pattern))
     assert_equal(output, [self.BEAVER_DATA] * 3)
Beispiel #6
0
 def test_stdin(self):
     # the path '-' should read from the stream passed as ``stdin``
     fake_stdin = StringIO(self.BEAVER_DATA)
     output = list(read_input('-', stdin=fake_stdin))
     assert_equal(output, [self.BEAVER_DATA])
Beispiel #7
0
 def test_bz2_file(self):
     # reading the .bz2 fixture should give back the plain fixture data
     bz2_path = os.path.join(self.tmpdir, 'beavers.bz2')
     output = list(read_input(bz2_path))
     assert_equal(output, [self.BEAVER_DATA])
Beispiel #8
0
 def test_dir_recursion(self):
     # reading the whole tmpdir is expected to yield the fixture data
     # four times
     output = list(read_input(self.tmpdir))
     expected = [self.BEAVER_DATA] * 4
     self.assertEqual(output, expected)
Beispiel #9
0
 def test_bad_glob(self):
     # read_input is a generator, so the IOError only surfaces once the
     # generator is consumed; let assertRaises drive it through list()
     unmatched_pattern = os.path.join(self.tmpdir, 'lions*')
     lazy_lines = read_input(unmatched_pattern)
     self.assertRaises(IOError, list, lazy_lines)
Beispiel #10
0
 def test_dir_recursion(self):
     # four copies of the fixture data are expected from tmpdir as a whole
     collected = list(read_input(self.tmpdir))
     self.assertEqual(collected, [self.BEAVER_DATA] * 4)
Beispiel #11
0
 def test_glob_including_dir(self):
     # 'beavers*' is expected to yield four copies of the fixture data
     pattern = os.path.join(self.tmpdir, 'beavers*')
     output = list(read_input(pattern))
     self.assertEqual(output, [self.BEAVER_DATA] * 4)
Beispiel #12
0
 def test_stdin_can_be_iterator(self):
     # stdin is given as a plain list of lines rather than a file object
     fake_stdin = [self.BEAVER_DATA] * 5
     output = list(read_input('-', stdin=fake_stdin))
     self.assertEqual(output, [self.BEAVER_DATA] * 5)
Beispiel #13
0
 def test_dir(self):
     # a directory path with a trailing slash should be read as well
     dir_path = os.path.join(self.tmpdir, 'beavers/')
     output = list(read_input(dir_path))
     self.assertEqual(output, [self.BEAVER_DATA])
Beispiel #14
0
 def test_stdin(self):
     # feed the fixture bytes in through a fake stdin and read path '-'
     stream = BytesIO(self.BEAVER_DATA)
     collected = list(read_input('-', stdin=stream))
     self.assertEqual(collected, [self.BEAVER_DATA])
Beispiel #15
0
 def test_glob(self):
     # three copies of the fixture data are expected from 'beavers.*'
     glob_path = os.path.join(self.tmpdir, 'beavers.*')
     collected = list(read_input(glob_path))
     assert_equal(collected, [self.BEAVER_DATA] * 3)
Beispiel #16
0
 def test_bz2_file(self):
     # the .bz2 fixture should come back as the plain fixture data
     compressed = os.path.join(self.tmpdir, 'beavers.bz2')
     collected = list(read_input(compressed))
     assert_equal(collected, [self.BEAVER_DATA])
Beispiel #17
0
 def test_stdin_can_be_iterator(self):
     # pass a list of five lines as stdin; all five should be yielded back
     expected = [self.BEAVER_DATA] * 5
     collected = list(read_input('-', stdin=[self.BEAVER_DATA] * 5))
     self.assertEqual(collected, expected)
Beispiel #18
0
 def test_bad_glob(self):
     # Creating the generator does not raise; the IOError is only
     # expected once list() starts pulling lines from it.
     bad_pattern = os.path.join(self.tmpdir, 'lions*')
     pending = read_input(bad_pattern)
     self.assertRaises(IOError, list, pending)
Beispiel #19
0
 def test_dir(self):
     # read the 'beavers/' directory and expect one copy of the fixture
     target = os.path.join(self.tmpdir, 'beavers/')
     collected = list(read_input(target))
     self.assertEqual(collected, [self.BEAVER_DATA])
Beispiel #20
0
 def test_bz2_file(self):
     # reading the .bz2 fixture should give back the plain fixture data
     bz2_path = os.path.join(self.tmpdir, "beavers.bz2")
     output = list(read_input(bz2_path))
     self.assertEqual(output, [self.BEAVER_DATA])
Beispiel #21
0
 def test_glob_including_dir(self):
     # four copies of the fixture data are expected from 'beavers*'
     glob_path = os.path.join(self.tmpdir, 'beavers*')
     collected = list(read_input(glob_path))
     self.assertEqual(collected, [self.BEAVER_DATA] * 4)
Beispiel #22
0
 def test_stdin(self):
     # feed the fixture text in through a fake stdin and read path '-'
     stream = StringIO(self.BEAVER_DATA)
     collected = list(read_input('-', stdin=stream))
     assert_equal(collected, [self.BEAVER_DATA])
Beispiel #23
0
 def test_stdin(self):
     # the path "-" should read from the stream passed as ``stdin``
     fake_stdin = StringIO(self.BEAVER_DATA)
     output = list(read_input("-", stdin=fake_stdin))
     self.assertEqual(output, [self.BEAVER_DATA])
Beispiel #24
0
 def test_glob(self):
     # the glob is expected to produce three copies of the fixture data
     pattern = os.path.join(self.tmpdir, "beavers.*")
     output = list(read_input(pattern))
     self.assertEqual(output, [self.BEAVER_DATA] * 3)