def test_cat_compressed_stream(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    # restrict a file object to only the read() method
    class OnlyReadWrapper(object):

        def __init__(self, fp):
            self.fp = fp

        def read(self, *args, **kwargs):
            return self.fp.read(*args, **kwargs)

    output = []
    with open(input_gz_path) as f:
        for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
            output.append(line)
    self.assertEqual(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    for line in read_file(input_bz2_path, fileobj=open(input_bz2_path)):
        output.append(line)
    self.assertEqual(output, ['bar\n', 'bar\n', 'foo\n'])
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self.get_s3_key(filename)
    # yields_lines=False: warn read_file that s3_key yields chunks of bytes
    return read_file(s3_key_to_uri(s3_key), fileobj=s3_key, yields_lines=False)
def _cat_file(self, filename):
    if is_uri(filename):
        # stream from HDFS
        cat_args = self._opts['hadoop_bin'] + ['fs', '-cat', filename]
        log.debug('> %s' % cmd_line(cat_args))

        cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

        def stream():
            for line in cat_proc.stdout:
                yield line

            # there shouldn't be any stderr
            for line in cat_proc.stderr:
                log.error('STDERR: ' + line)

            returncode = cat_proc.wait()
            if returncode != 0:
                raise CalledProcessError(returncode, cat_args)

        return read_file(filename, stream())
    else:
        # read from local filesystem
        return super(HadoopJobRunner, self)._cat_file(filename)
def test_read_file_uncompressed_stream(self): input_path = os.path.join(self.tmp_dir, "input") with open(input_path, "w") as input_file: input_file.write("bar\nfoo\n") output = [] for line in read_file(input_path, fileobj=open(input_path)): output.append(line) self.assertEqual(output, ["bar\n", "foo\n"])
def test_read_file_uncompressed_stream(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nfoo\n')

    output = []
    for line in read_file(input_path, fileobj=open(input_path)):
        output.append(line)

    assert_equal(output, ['bar\n', 'foo\n'])
def test_read_uncompressed_file(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'wb') as input_file:
        input_file.write(b'bar\nfoo\n')

    output = []
    for line in read_file(input_path):
        output.append(line)

    self.assertEqual(output, [b'bar\n', b'foo\n'])
def test_read_file_uncompressed_stream(self):
    input_path = os.path.join(self.tmp_dir, 'input')
    with open(input_path, 'w') as input_file:
        input_file.write('bar\nfoo\n')

    output = []
    for line in read_file(input_path, fileobj=open(input_path)):
        output.append(line)

    self.assertEqual(output, ['bar\n', 'foo\n'])
def test_read_uncompressed_file(self): input_path = os.path.join(self.tmp_dir, "input") with open(input_path, "wb") as input_file: input_file.write(b"bar\nfoo\n") output = [] for line in read_file(input_path): output.append(line) self.assertEqual(output, [b"bar\n", b"foo\n"])
def test_read_bz2_file(self):
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
    input_bz2.write(b'bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    for line in read_file(input_bz2_path):
        output.append(line)

    self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
def test_read_gz_file(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b'foo\nbar\n')
    input_gz.close()

    output = []
    for line in read_file(input_gz_path):
        output.append(line)

    self.assertEqual(output, [b'foo\n', b'bar\n'])
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self.get_s3_key(filename)

    # stream the key to a fileobj
    stream = StringIO()
    s3_key.get_file(stream)
    stream.seek(0)

    buffer_iterator = read_file(s3_key_to_uri(s3_key), fileobj=stream)
    return buffer_iterator_to_line_iterator(buffer_iterator)
def test_read_gz_file(self): input_gz_path = os.path.join(self.tmp_dir, "input.gz") input_gz = gzip.GzipFile(input_gz_path, "wb") input_gz.write(b"foo\nbar\n") input_gz.close() output = [] for line in read_file(input_gz_path): output.append(line) self.assertEqual(output, [b"foo\n", b"bar\n"])
def test_read_bz2_file(self): input_bz2_path = os.path.join(self.tmp_dir, "input.bz2") input_bz2 = bz2.BZ2File(input_bz2_path, "wb") input_bz2.write(b"bar\nbar\nfoo\n") input_bz2.close() output = [] for line in read_file(input_bz2_path): output.append(line) self.assertEqual(output, [b"bar\n", b"bar\n", b"foo\n"])
def _cat_file(self, gcs_uri):
    tmp_fd, tmp_path = tempfile.mkstemp()
    with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
        self._download_io(gcs_uri, tmp_fileobj)

        tmp_fileobj.seek(0)

        line_gen = read_file(
            gcs_uri, fileobj=tmp_fileobj, yields_lines=False)
        for current_line in line_gen:
            yield current_line
def test_read_gz_file_from_fileobj(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    output = []
    with open(input_gz_path) as f:
        for line in read_file(input_gz_path, fileobj=OnlyReadWrapper(f)):
            output.append(line)

    self.assertEqual(output, ['foo\n', 'bar\n'])
def test_read_bz2_file_from_fileobj(self): input_bz2_path = os.path.join(self.tmp_dir, "input.bz2") input_bz2 = bz2.BZ2File(input_bz2_path, "wb") input_bz2.write(b"bar\nbar\nfoo\n") input_bz2.close() output = [] with open(input_bz2_path, "rb") as f: for line in read_file(input_bz2_path, fileobj=OnlyReadWrapper(f)): output.append(line) self.assertEqual(output, [b"bar\n", b"bar\n", b"foo\n"])
def test_read_bz2_file_from_fileobj(self):
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')
    input_bz2.write(b'bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    with open(input_bz2_path, 'rb') as f:
        for line in read_file(input_bz2_path, fileobj=OnlyReadWrapper(f)):
            output.append(line)

    self.assertEqual(output, [b'bar\n', b'bar\n', b'foo\n'])
def _cat_file(self, gcs_uri):
    tmp_fd, tmp_path = tempfile.mkstemp()
    with os.fdopen(tmp_fd, 'w+b') as tmp_fileobj:
        self._download_io(gcs_uri, tmp_fileobj)

        tmp_fileobj.seek(0)

        line_gen = read_file(gcs_uri, fileobj=tmp_fileobj, yields_lines=False)
        for current_line in line_gen:
            yield current_line
def test_cat_compressed_stream(self): input_gz_path = os.path.join(self.tmp_dir, "input.gz") input_gz = gzip.GzipFile(input_gz_path, "w") input_gz.write("foo\nbar\n") input_gz.close() output = [] for line in read_file(input_gz_path, fileobj=open(input_gz_path)): output.append(line) self.assertEqual(output, ["foo\n", "bar\n"]) input_bz2_path = os.path.join(self.tmp_dir, "input.bz2") input_bz2 = bz2.BZ2File(input_bz2_path, "w") input_bz2.write("bar\nbar\nfoo\n") input_bz2.close() output = [] for line in read_file(input_bz2_path, fileobj=open(input_bz2_path)): output.append(line) self.assertEqual(output, ["bar\n", "bar\n", "foo\n"])
def _cat_file(self, filename):
    ssh_match = SSH_URI_RE.match(filename)
    addr = ssh_match.group('hostname') or self._address_of_master()
    if '!' in addr and self.ssh_key_name is None:
        raise ValueError('ssh_key_name must not be None')
    output = ssh_cat(
        self._ssh_bin,
        addr,
        self._ec2_key_pair_file,
        ssh_match.group('filesystem_path'),
        self.ssh_key_name,
    )
    return read_file(filename, fileobj=StringIO(output))
def test_cat_compressed_stream(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    output = []
    for line in read_file(input_gz_path, fileobj=open(input_gz_path)):
        output.append(line)

    assert_equal(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    for line in read_file(input_bz2_path, fileobj=open(input_bz2_path)):
        output.append(line)

    assert_equal(output, ['bar\n', 'bar\n', 'foo\n'])
def test_cat_compressed_stream(self):
    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write('foo\nbar\n')
    input_gz.close()

    output = []
    for line in read_file(input_gz_path, fileobj=open(input_gz_path)):
        output.append(line)

    self.assertEqual(output, ['foo\n', 'bar\n'])

    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')
    input_bz2.write('bar\nbar\nfoo\n')
    input_bz2.close()

    output = []
    for line in read_file(input_bz2_path, fileobj=open(input_bz2_path)):
        output.append(line)

    self.assertEqual(output, ['bar\n', 'bar\n', 'foo\n'])
def _cat_file(self, filename):
    ssh_match = _SSH_URI_RE.match(filename)
    addr = ssh_match.group('hostname') or self._address_of_master()
    keyfile = self._key_filename_for(addr)
    output = _ssh_cat(
        self._ssh_bin,
        addr,
        self._ec2_key_pair_file,
        ssh_match.group('filesystem_path'),
        keyfile,
    )
    return read_file(filename, fileobj=BytesIO(output))
def _cat_file(self, filename):
    ssh_match = _SSH_URI_RE.match(filename)
    addr = ssh_match.group('hostname') or self._address_of_master()
    keyfile = self._key_filename_for(addr)
    output = ssh_cat(
        self._ssh_bin,
        addr,
        self._ec2_key_pair_file,
        ssh_match.group('filesystem_path'),
        keyfile,
    )
    return read_file(filename, fileobj=BytesIO(output))
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self._hadoop_bin + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # there shouldn't be any stderr
        for line in cat_proc.stderr:
            log.error('STDERR: ' + line)

        returncode = cat_proc.wait()
        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
def test_dont_split_gz(self):
    contents_gz = ['bar\n', 'qux\n', 'foo\n', 'bar\n', 'qux\n', 'foo\n']
    contents_normal = ['foo\n', 'bar\n', 'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = os.path.join(self.tmp_dir, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'w')
    input_gz.write(''.join(contents_gz))
    input_gz.close()

    input_path2 = os.path.join(self.tmp_dir, 'input2')
    with open(input_path2, 'w') as input_file:
        input_file.write(''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        lines = list(read_file(file_name))
        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)
        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)
def gz_test(self, dir_path_name):
    contents_gz = [
        b'bar\n', b'qux\n', b'foo\n', b'bar\n', b'qux\n', b'foo\n'
    ]
    contents_normal = [b'foo\n', b'bar\n', b'bar\n']
    all_contents_sorted = sorted(contents_gz + contents_normal)

    input_gz_path = os.path.join(dir_path_name, 'input.gz')
    input_gz = gzip.GzipFile(input_gz_path, 'wb')
    input_gz.write(b''.join(contents_gz))
    input_gz.close()

    input_path2 = os.path.join(dir_path_name, 'input2')
    with open(input_path2, 'wb') as input_file:
        input_file.write(b''.join(contents_normal))

    runner = LocalMRJobRunner(conf_paths=[])

    # split into 3 files
    file_splits = runner._get_file_splits([input_gz_path, input_path2], 3)

    # Make sure that input.gz occurs in a single split that starts at
    # its beginning and ends at its end
    for split_info in file_splits.values():
        if split_info['orig_name'] == input_gz_path:
            self.assertEqual(split_info['start'], 0)
            self.assertEqual(split_info['length'],
                             os.stat(input_gz_path)[stat.ST_SIZE])

    # make sure we get 3 files
    self.assertEqual(len(file_splits), 3)

    # make sure all the data is preserved
    content = []
    for file_name in file_splits:
        lines = list(read_file(file_name))
        # make sure the input_gz split got its entire contents
        if file_name == input_gz_path:
            self.assertEqual(lines, contents_gz)
        content.extend(lines)

    self.assertEqual(sorted(content), all_contents_sorted)
def test_read_large_bz2_file(self):
    # catch incorrect use of bz2 library (Issue #814)
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    random.seed(0)
    for _ in xrange(100000):
        input_bz2.write('%016x\n' % random.randint(0, 2**64 - 1))
    input_bz2.close()

    random.seed(0)
    num_lines = 0
    for line in read_file(input_bz2_path):
        self.assertEqual(line, '%016x\n' % random.randint(0, 2**64 - 1))
        num_lines += 1

    self.assertEqual(num_lines, 100000)
def test_read_large_bz2_file(self):
    # catch incorrect use of bz2 library (Issue #814)
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'w')

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    random.seed(0)
    for _ in xrange(100000):
        input_bz2.write('%016x\n' % random.randint(0, 2 ** 64 - 1))
    input_bz2.close()

    random.seed(0)
    num_lines = 0
    for line in read_file(input_bz2_path):
        self.assertEqual(line, '%016x\n' % random.randint(0, 2 ** 64 - 1))
        num_lines += 1

    self.assertEqual(num_lines, 100000)
def test_read_large_bz2_file(self): # catch incorrect use of bz2 library (Issue #814) input_bz2_path = os.path.join(self.tmp_dir, "input.bz2") input_bz2 = bz2.BZ2File(input_bz2_path, "wb") # can't just repeat same value, because we need the file to be # compressed! 50000 lines is too few to catch the bug. with random_seed(0): for _ in range(100000): input_bz2.write((random_identifier() + "\n").encode("ascii")) input_bz2.close() # now expect to read back the same bytes with random_seed(0): num_lines = 0 for line in read_file(input_bz2_path): self.assertEqual(line, (random_identifier() + "\n").encode("ascii")) num_lines += 1 self.assertEqual(num_lines, 100000)
def _cat_file(self, filename):
    # stream from HDFS
    cat_args = self.get_hadoop_bin() + ['fs', '-cat', filename]
    log.debug('> %s' % cmd_line(cat_args))

    cat_proc = Popen(cat_args, stdout=PIPE, stderr=PIPE)

    def cleanup():
        # this does sometimes happen; see #1396
        for line in cat_proc.stderr:
            log.error('STDERR: ' + to_string(line.rstrip(b'\r\n')))

        cat_proc.stdout.close()
        cat_proc.stderr.close()

        returncode = cat_proc.wait()
        if returncode != 0:
            raise IOError("Could not stream %s" % filename)

    return read_file(filename, cat_proc.stdout, cleanup=cleanup)
def gz_test(self, dir_path_name): contents_gz = [b"bar\n", b"qux\n", b"foo\n", b"bar\n", b"qux\n", b"foo\n"] contents_normal = [b"foo\n", b"bar\n", b"bar\n"] all_contents_sorted = sorted(contents_gz + contents_normal) input_gz_path = os.path.join(dir_path_name, "input.gz") input_gz = gzip.GzipFile(input_gz_path, "wb") input_gz.write(b"".join(contents_gz)) input_gz.close() input_path2 = os.path.join(dir_path_name, "input2") with open(input_path2, "wb") as input_file: input_file.write(b"".join(contents_normal)) runner = LocalMRJobRunner(conf_paths=[]) # split into 3 files file_splits = runner._get_file_splits([input_gz_path, input_path2], 3) # Make sure that input.gz occurs in a single split that starts at # its beginning and ends at its end for split_info in file_splits.values(): if split_info["orig_name"] == input_gz_path: self.assertEqual(split_info["start"], 0) self.assertEqual(split_info["length"], os.stat(input_gz_path)[stat.ST_SIZE]) # make sure we get 3 files self.assertEqual(len(file_splits), 3) # make sure all the data is preserved content = [] for file_name in file_splits: lines = list(read_file(file_name)) # make sure the input_gz split got its entire contents if file_name == input_gz_path: self.assertEqual(lines, contents_gz) content.extend(lines) self.assertEqual(sorted(content), all_contents_sorted)
def test_read_large_bz2_file(self):
    # catch incorrect use of bz2 library (Issue #814)
    input_bz2_path = os.path.join(self.tmp_dir, 'input.bz2')
    input_bz2 = bz2.BZ2File(input_bz2_path, 'wb')

    # can't just repeat same value, because we need the file to be
    # compressed! 50000 lines is too few to catch the bug.
    with random_seed(0):
        for _ in range(100000):
            input_bz2.write((random_identifier() + '\n').encode('ascii'))
    input_bz2.close()

    # now expect to read back the same bytes
    with random_seed(0):
        num_lines = 0
        for line in read_file(input_bz2_path):
            self.assertEqual(line,
                             (random_identifier() + '\n').encode('ascii'))
            num_lines += 1

    self.assertEqual(num_lines, 100000)
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self.get_s3_key(filename)
    buffer_iterator = read_file(s3_key_to_uri(s3_key), fileobj=s3_key)
    return buffer_iterator_to_line_iterator(buffer_iterator)
def _cat_file(self, filename): """cat a file, decompress if necessary.""" for line in read_file(filename): yield line
def _cat_file(self, filename):
    for line in read_file(filename):
        yield line
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self.get_s3_key(filename)

    # yields_lines=False: warn read_file that s3_key yields chunks of bytes
    return read_file(
        s3_key_to_uri(s3_key), fileobj=s3_key, yields_lines=False)
def _cat_file(self, filename):
    return read_file(filename)
def _cat_file(self, filename):
    # stream lines from the s3 key
    s3_key = self._get_s3_key(filename)
    body = s3_key.get()['Body']
    return read_file(filename, fileobj=body, yields_lines=False)