def test_mapper_pre_filter(self):
    # Run the job locally with "cat -e" as the mapper pre-filter; the
    # expected output below has "$" appended to every input line.
    stdin_bytes = b'x\ny\nz\n'
    expected_steps = [{
        'type': 'streaming',
        'mapper': {
            'type': 'script',
            'pre_filter': 'cat -e',
        },
    }]

    job = MRFilterJob(['--mapper-filter', 'cat -e', '--runner=local'])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        # the filter must show up in the step description before we run
        self.assertEqual(runner._get_steps(), expected_steps)

        runner.run()

        output = sorted(
            raw.strip() for raw in to_lines(runner.cat_output()))
        self.assertEqual(output, [b'x$', b'y$', b'z$'])
def test_pre_filter_failure(self):
    # regression test for #1524
    stdin_bytes = b'x\ny\nz\n'
    expected_steps = [{
        'type': 'streaming',
        'mapper': {
            'type': 'script',
            'pre_filter': 'grep w',
        },
    }]

    # grep will return exit code 1 because there are no matches
    job = MRFilterJob(['--mapper-filter', 'grep w', '--runner=local'])
    job.sandbox(stdin=BytesIO(stdin_bytes))

    with job.make_runner() as runner:
        self.assertEqual(runner._get_steps(), expected_steps)

        runner.run()

        # nothing matches the filter, so the job should produce no output
        output = sorted(
            raw.strip() for raw in to_lines(runner.cat_output()))
        self.assertEqual(output, [])
def test_pre_filter_on_compressed_data(self):
    # regression test for #1061
    #
    # NOTE(review): a second method with this exact name appears to be
    # defined right after this one; if both live in the same class, the
    # later definition replaces this one. Confirm and de-duplicate.
    input_gz_path = self.makefile('data.gz')

    # write the gzipped input; the with-block closes the file even if
    # write() raises
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'x\ny\nz\n')

    job = MRFilterJob([
        '--mapper-filter', 'cat -e', '--runner=local', input_gz_path])
    job.sandbox()

    with job.make_runner() as r:
        self.assertEqual(
            r._get_steps(),
            [{
                'type': 'streaming',
                'mapper': {
                    'type': 'script',
                    'pre_filter': 'cat -e'}}])

        r.run()

        # the expected lines are the uncompressed input with "$" appended
        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
def test_pre_filter_on_compressed_data(self):
    # regression test for #1061
    #
    # NOTE(review): a method with this exact name also appears to be
    # defined just before this one; if both live in the same class, this
    # definition shadows the earlier one. Confirm and de-duplicate.
    input_gz_path = self.makefile('data.gz')

    # use a context manager so the gzip stream is closed (and flushed)
    # even if the write fails
    with gzip.GzipFile(input_gz_path, 'wb') as input_gz:
        input_gz.write(b'x\ny\nz\n')

    job = MRFilterJob(
        ['--mapper-filter', 'cat -e', '--runner=local', input_gz_path])
    job.sandbox()

    with job.make_runner() as r:
        self.assertEqual(r._get_steps(), [{
            'type': 'streaming',
            'mapper': {
                'type': 'script',
                'pre_filter': 'cat -e'
            }
        }])

        r.run()

        # the expected lines are the uncompressed input with "$" appended
        lines = [line.strip() for line in to_lines(r.cat_output())]
        self.assertEqual(sorted(lines), [b'x$', b'y$', b'z$'])
def test_no_pre_filters(self):
    # Asking for a mapper filter together with "-r inline" is expected to
    # fail with NotImplementedError when the runner is created.
    job = MRFilterJob(['-r', 'inline', '--mapper-filter', 'grep foo'])
    job.sandbox()

    with self.assertRaises(NotImplementedError):
        job.make_runner()