def test_s3_iter_moto(self):
    """Are S3 files iterated over correctly?"""
    # a list of byte strings to test with: one huge line (forces a full
    # multipart upload), many small lines, and a trailing line with no newline
    expected = [b"*" * 5 * 1024**2] + [b'0123456789'] * 1024 + [b"test"]

    # create fake bucket and fake key
    conn = boto.connect_s3()
    conn.create_bucket("mybucket")
    # lower the multipart upload size, to speed up these tests
    smart_open_lib.S3_MIN_PART_SIZE = 5 * 1024**2

    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        # write a single huge line (=full multipart upload)
        fout.write(expected[0] + b'\n')

        # write lots of small lines
        # (plain iteration: the loop index was unused)
        for line in expected[1:-1]:
            fout.write(line + b'\n')

        # ...and write the last line too, no newline at the end
        fout.write(expected[-1])

    # connect to fake s3 and read from the fake key we filled above
    smart_open_object = smart_open.S3OpenRead(
        smart_open.ParseUri("s3://mybucket/mykey"))
    output = [line.rstrip(b'\n') for line in smart_open_object]
    self.assertEqual(output, expected)

    # same thing but using a context manager
    with smart_open.S3OpenRead(smart_open.ParseUri(
            "s3://mybucket/mykey")) as smart_open_object:
        output = [line.rstrip(b'\n') for line in smart_open_object]
        self.assertEqual(output, expected)
def test_hdfs(self, mock_subprocess):
    """Is HDFS write called correctly"""
    expected_cmd = ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"]

    # triple-slash form of the scheme
    writer = smart_open.HdfsOpenWrite(smart_open.ParseUri("hdfs:///tmp/test.txt"))
    writer.write("test")
    # was the subprocess spawned with the correct command line?
    mock_subprocess.Popen.assert_called_with(
        expected_cmd, stdin=mock_subprocess.PIPE)

    # double-slash form of the scheme must behave identically
    writer = smart_open.HdfsOpenWrite(smart_open.ParseUri("hdfs://tmp/test.txt"))
    writer.write("test")
    mock_subprocess.Popen.assert_called_with(
        expected_cmd, stdin=mock_subprocess.PIPE)
def test_webhdfs_uri(self):
    """Do webhdfs URIs parse correctly"""
    # plain path, no query string
    uri = smart_open.ParseUri("webhdfs://host:port/path/file")
    self.assertEqual("webhdfs", uri.scheme)
    self.assertEqual("host:port/webhdfs/v1/path/file", uri.uri_path)

    # a query string must be carried through untouched
    uri = smart_open.ParseUri("webhdfs://host:port/path/file?query_part_1&query_part2")
    self.assertEqual("webhdfs", uri.scheme)
    self.assertEqual("host:port/webhdfs/v1/path/file?query_part_1&query_part2",
                     uri.uri_path)
def test_hdfs(self, mock_subprocess):
    """Is HDFS line iterator called correctly?"""
    mock_subprocess.PIPE.return_value = "test"
    expected_cmd = ["hdfs", "dfs", "-cat", "/tmp/test.txt"]

    # triple-slash form of the scheme
    reader = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs:///tmp/test.txt"))
    reader.__iter__()
    # was the subprocess spawned with the correct command line?
    mock_subprocess.Popen.assert_called_with(
        expected_cmd, stdout=mock_subprocess.PIPE)

    # double-slash form of the scheme must behave identically
    reader = smart_open.HdfsOpenRead(smart_open.ParseUri("hdfs://tmp/test.txt"))
    reader.__iter__()
    mock_subprocess.Popen.assert_called_with(
        expected_cmd, stdout=mock_subprocess.PIPE)
def test_scheme(self):
    """Do URIs schemes parse correctly?"""
    # every supported scheme should round-trip through the parser
    for supported_scheme in ("s3", "s3n", "hdfs", "file", "http", "https"):
        result = smart_open.ParseUri(supported_scheme + "://mybucket/mykey")
        self.assertEqual(result.scheme, supported_scheme)

    # unsupported scheme => NotImplementedError
    self.assertRaises(
        NotImplementedError, smart_open.ParseUri, "foobar://mybucket/mykey")

    # no recognizable scheme at all => falls back to the default ("file")
    result = smart_open.ParseUri("blah blah")
    self.assertEqual(result.scheme, "file")
def convert(self, value, param, ctx):
    """Convert a URI string into an open file object.

    Local ``file://`` URIs are delegated to the parent class; every other
    scheme is handed to smart_open directly.
    """
    parsed = smart_open.ParseUri(value)
    if parsed.scheme != 'file':
        return smart_open.smart_open(value, self.mode)
    return super(SmartFile, self).convert(parsed.uri_path, param, ctx)
def test_write(self):
    """Does a webhdfs write issue the expected HTTP calls?"""
    def redirect_callback(request):
        # namenode answers the PUT with a 307 redirect to a datanode
        return (307, {'location': 'http://127.0.0.1:8440/file'}, "")

    responses.add_callback(
        responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file",
        callback=redirect_callback)
    responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201)
    writer = smart_open.WebHdfsOpenWrite(
        smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))

    payload = u"žluťoučký koníček".encode('utf8')

    def body_check_callback(request):
        # the datanode must receive the exact utf-8 payload
        assert request.body == u"žluťoučký koníček".encode('utf8')
        return (200, {}, "")

    responses.add_callback(
        responses.POST, "http://127.0.0.1:8440/webhdfs/v1/path/file",
        callback=redirect_callback)
    responses.add_callback(
        responses.POST, "http://127.0.0.1:8440/file",
        callback=body_check_callback)

    writer.write(payload)
    writer.close()

    # two calls for CREATE, two for APPEND
    assert len(responses.calls) == 4
    assert responses.calls[2].request.url == \
        "http://127.0.0.1:8440/webhdfs/v1/path/file?op=APPEND"
    assert responses.calls[3].request.url == "http://127.0.0.1:8440/file"
def test_webhdfs(self):
    """Is webhdfs line iterator called correctly"""
    responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file",
                  body='line1\nline2')
    reader = smart_open.WebHdfsOpenRead(
        smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))
    # the object should yield the response body line by line
    lines = iter(reader)
    self.assertEqual("line1", next(lines).decode("utf-8"))
    self.assertEqual("line2", next(lines).decode("utf-8"))
def test_rw_encoding(self):
    """Should read and write text, respecting encodings, etc."""
    connection = boto.connect_s3()
    connection.create_bucket("bucket")
    parsed = smart_open.ParseUri("s3://bucket/key")
    expected = u"расцветали яблони и груши"

    # write in koi8-r, read it back in the same encoding
    with smart_open.s3_open_uri(parsed, "w", encoding="koi8-r") as writer:
        writer.write(expected)
    with smart_open.s3_open_uri(parsed, "r", encoding="koi8-r") as reader:
        self.assertEqual(expected, reader.read())

    # binary read must yield the raw koi8-r bytes
    with smart_open.s3_open_uri(parsed, "rb") as reader:
        self.assertEqual(expected.encode("koi8-r"), reader.read())

    # reading with the wrong encoding must blow up...
    with smart_open.s3_open_uri(parsed, "r", encoding="euc-jp") as reader:
        self.assertRaises(UnicodeDecodeError, reader.read)

    # ...unless errors="replace" is requested
    with smart_open.s3_open_uri(parsed, "r", encoding="euc-jp",
                                errors="replace") as reader:
        reader.read()
def test_https_readline(self):
    """Does https readline method work correctly"""
    responses.add(responses.GET, "https://127.0.0.1/index.html",
                  body='line1\nline2')
    reader = smart_open.HttpOpenRead(
        smart_open.ParseUri("https://127.0.0.1/index.html"))
    # readline should return only the first line of the body
    first_line = reader.readline().decode("utf-8")
    self.assertEqual("line1", first_line)
def test_http_pass(self):
    """Does http authentication work correctly"""
    responses.add(responses.GET, "http://127.0.0.1/index.html",
                  body='line1\nline2')
    _ = smart_open.HttpOpenRead(
        smart_open.ParseUri("http://127.0.0.1/index.html"),
        user='******', password='******')
    # exactly one request, carrying a Basic auth header
    # (assertEquals/assert_ are deprecated aliases; use the modern names)
    self.assertEqual(len(responses.calls), 1)
    actual_request = responses.calls[0].request
    self.assertTrue('Authorization' in actual_request.headers)
    self.assertTrue(
        actual_request.headers['Authorization'].startswith('Basic '))
def test_gzip_write_mode(self):
    """Should always open in binary mode when writing through a codec."""
    resource = boto3.resource('s3')
    resource.create_bucket(Bucket='bucket')
    parsed = smart_open.ParseUri("s3://bucket/key.gz")
    # writing through the gzip codec must pass "wb" to the S3 layer
    with mock.patch('smart_open.smart_open_s3.open') as mock_open:
        smart_open.s3_open_uri(parsed, "wb")
        mock_open.assert_called_with('bucket', 'key.gz', 'wb')
def test_s3_boto(self, mock_s3_iter_lines, mock_boto):
    """Is S3 line iterator called correctly?"""
    # no credentials in the URI => connect with None/None
    reader = smart_open.S3OpenRead(smart_open.ParseUri("s3://mybucket/mykey"))
    reader.__iter__()
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id=None, aws_secret_access_key=None)

    # credentials embedded in the URI => forwarded to connect_s3
    reader = smart_open.S3OpenRead(
        smart_open.ParseUri("s3://access_id:access_secret@mybucket/mykey"))
    reader.__iter__()
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id="access_id", aws_secret_access_key="access_secret")

    # bucket and key are looked up, then s3_iter_lines is invoked
    reader = smart_open.S3OpenRead(
        smart_open.ParseUri("s3://access_id:access_secret@mybucket/mykey"))
    reader.__iter__()
    mock_boto.connect_s3().get_bucket.assert_called_with("mybucket")
    mock_boto.connect_s3().get_bucket().lookup.assert_called_with("mykey")
    self.assertTrue(mock_s3_iter_lines.called)
def test_gzip_write_mode(self):
    """Should always open in binary mode when writing through a codec."""
    connection = boto.connect_s3()
    connection.create_bucket("bucket")
    parsed = smart_open.ParseUri("s3://bucket/key.gz")
    # writing through the gzip codec must pass "wb" to the S3 layer
    with mock.patch('smart_open.smart_open_s3.open') as mock_open:
        smart_open.s3_open_uri(parsed, "wb")
        mock_open.assert_called_with('bucket', 'key.gz', 'wb')
def test_webhdfs_read(self):
    """Does webhdfs read method work correctly"""
    responses.add(responses.GET, "http://127.0.0.1:8440/webhdfs/v1/path/file",
                  body='line1\nline2')
    reader = smart_open.WebHdfsOpenRead(
        smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))
    # read() should return the entire response body
    self.assertEqual("line1\nline2", reader.read().decode("utf-8"))
def test_http_gz(self):
    """Can open gzip via http?"""
    fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
    # use a context manager so the fixture file handle is not leaked
    with open(fpath, 'rb') as infile:
        data = infile.read()
    responses.add(responses.GET, "http://127.0.0.1/data.gz",
                  body=data)
    smart_open_object = smart_open.HttpOpenRead(
        smart_open.ParseUri("http://127.0.0.1/data.gz"))
    m = hashlib.md5(smart_open_object.read())
    # decompress the gzip and get the same md5 hash
    self.assertEqual(m.hexdigest(), '18473e60f8c7c98d29d65bf805736a0d')
def test_s3_uri(self):
    """Do S3 URIs parse correctly?"""
    def check(uri, bucket, key, access_id, access_secret):
        # every parsed S3 URI must expose these five attributes
        parsed = smart_open.ParseUri(uri)
        self.assertEqual(parsed.scheme, "s3")
        self.assertEqual(parsed.bucket_id, bucket)
        self.assertEqual(parsed.key_id, key)
        self.assertEqual(parsed.access_id, access_id)
        self.assertEqual(parsed.access_secret, access_secret)

    # correct uri without credentials
    check("s3://mybucket/mykey", "mybucket", "mykey", None, None)
    # correct uri, key contains slash
    check("s3://mybucket/mydir/mykey", "mybucket", "mydir/mykey", None, None)
    # correct uri with credentials (secret may contain '/' and '-')
    check("s3://ACCESSID456:acces/sse_cr-et@mybucket/mykey",
          "mybucket", "mykey", "ACCESSID456", "acces/sse_cr-et")
    check("s3://accessid:access/secret@mybucket/mykey",
          "mybucket", "mykey", "accessid", "access/secret")

    # incorrect uri - only one '@' in uri is allowed
    self.assertRaises(RuntimeError, smart_open.ParseUri,
                      "s3://access_id@access_secret@mybucket/mykey")
def test_initialize_write(self):
    """Does opening a webhdfs writer issue the CREATE handshake?"""
    def redirect_callback(request):
        # namenode redirects the CREATE request to a datanode
        return (307, {'location': 'http://127.0.0.1:8440/file'}, "")

    responses.add_callback(
        responses.PUT, "http://127.0.0.1:8440/webhdfs/v1/path/file",
        callback=redirect_callback)
    responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201)

    smart_open.WebHdfsOpenWrite(
        smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))

    assert len(responses.calls) == 2
    path, params = responses.calls[0].request.url.split("?")
    assert path == "http://127.0.0.1:8440/webhdfs/v1/path/file"
    # query-parameter order is not guaranteed
    assert params in ("overwrite=True&op=CREATE", "op=CREATE&overwrite=True")
    assert responses.calls[1].request.url == "http://127.0.0.1:8440/file"
def test_gzip_read_mode(self):
    """Should always open in binary mode when reading through a codec."""
    connection = boto.connect_s3()
    connection.create_bucket("bucket")
    parsed = smart_open.ParseUri("s3://bucket/key.gz")
    payload = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён"

    with smart_open.s3_open_uri(parsed, "wb") as writer:
        writer.write(payload.encode("utf-8"))

    # a text-mode read of a .gz key must still open the S3 layer with "rb"
    with mock.patch('smart_open.smart_open_s3.open') as mock_open:
        smart_open.s3_open_uri(parsed, "r")
        mock_open.assert_called_with('bucket', 'key.gz', 'rb')
def stream_other(self, path, fn):
    """Stream lines from a path and call a function on each line.

    If *path* is a local directory, each file inside it is streamed in
    turn; otherwise the path is handed to smart_open directly.

    Args:
        path (str): local directory or any smart_open-able URI
        fn (function): called once per line
    """
    parsed_uri = smart_open.ParseUri(path)
    if parsed_uri.scheme in ("file", ) and os.path.isdir(path):
        # stream each file from directory
        for f in glob.glob(path + "/*"):
            for line in smart_open.smart_open(f):
                fn(line)
    else:
        # let smart_open handle streaming of file from location
        for line in smart_open.smart_open(path):
            fn(line)
def test_r(self):
    """Reading a UTF string should work."""
    expected = u"физкульт-привет!"

    connection = boto.connect_s3()
    connection.create_bucket("bucket")
    bucket = connection.get_bucket("bucket")
    key = boto.s3.key.Key(bucket)
    key.key = "key"
    key.set_contents_from_string(expected.encode("utf-8"))

    # reading via an existing key object
    with smart_open.s3_open_key(key, "r") as reader:
        self.assertEqual(u"физкульт-привет!", reader.read())

    # reading via a parsed URI
    with smart_open.s3_open_uri(smart_open.ParseUri("s3://bucket/key"),
                                "r") as reader:
        self.assertEqual(u"физкульт-привет!", reader.read())
def test_r(self):
    """Reading a UTF string should work."""
    expected = u"физкульт-привет!"
    resource = boto3.resource('s3')
    resource.create_bucket(Bucket='bucket')
    key = resource.Object('bucket', 'key')
    key.put(Body=expected.encode('utf-8'))

    # binary read returns the raw utf-8 bytes
    with smart_open.s3_open_key(key, "rb") as reader:
        self.assertEqual(expected.encode('utf-8'), reader.read())

    # text read with explicit encoding decodes back to unicode
    with smart_open.s3_open_key(key, "r", encoding='utf-8') as reader:
        self.assertEqual(expected, reader.read())

    # same thing through a parsed URI
    with smart_open.s3_open_uri(smart_open.ParseUri("s3://bucket/key"),
                                "r", encoding='utf-8') as reader:
        self.assertEqual(expected, reader.read())
def test_s3_read_moto(self):
    """Are S3 files read correctly?"""
    connection = boto.connect_s3()
    connection.create_bucket("mybucket")

    # write some bogus key so we can check it below
    content = "hello wořld\nhow are you?"
    with smart_open.smart_open("s3://mybucket/mykey", "wb") as writer:
        writer.write(content)

    reader = smart_open.S3OpenRead(smart_open.ParseUri("s3://mybucket/mykey"))
    self.assertEqual(reader.read(6), content[:6])
    self.assertEqual(reader.read(8), content[6:14])  # ř is 2 bytes

    # make sure iteration does not affect read()
    for _ in reader:
        pass

    # read the rest
    self.assertEqual(reader.read(), content[14:])
def test_s3_seek_moto(self):
    """Does seeking in S3 files work correctly?"""
    connection = boto.connect_s3()
    connection.create_bucket("mybucket")

    # write some bogus key so we can check it below
    content = "hello wořld\nhow are you?"
    with smart_open.smart_open("s3://mybucket/mykey", "wb") as writer:
        writer.write(content)

    reader = smart_open.S3OpenRead(smart_open.ParseUri("s3://mybucket/mykey"))
    self.assertEqual(reader.read(6), content[:6])
    self.assertEqual(reader.read(8), content[6:14])  # ř is 2 bytes

    # rewind, then read everything: no size => whole file
    reader.seek(0)
    self.assertEqual(reader.read(), content)

    # rewind again; an explicit -1 means the same thing
    reader.seek(0)
    self.assertEqual(reader.read(-1), content)
def test_rw_gzip(self):
    """Should read/write gzip files, implicitly and explicitly."""
    connection = boto.connect_s3()
    connection.create_bucket("bucket")
    parsed = smart_open.ParseUri("s3://bucket/key.gz")
    expected = u"не слышны в саду даже шорохи"

    with smart_open.s3_open_uri(parsed, "wb") as writer:
        writer.write(expected.encode("utf-8"))

    # ignore_extension bypasses the codec, so what is stored on S3 must
    # actually be a gzip stream we can decompress ourselves
    with smart_open.s3_open_uri(parsed, "rb", ignore_extension=True) as reader:
        decompressed = gzip.GzipFile(fileobj=reader)
        self.assertEqual(expected, decompressed.read().decode("utf-8"))

    # implicit codec: reading back through smart_open transparently unzips
    with smart_open.s3_open_uri(parsed, "rb") as reader:
        self.assertEqual(expected, reader.read().decode("utf-8"))
def test_http_bz2(self):
    """Can open bz2 via http?"""
    expected = b'Hello World Compressed.'

    # round-trip the payload through smart_open's bz2 writer to obtain
    # a compressed blob we can serve over fake http
    tmp_path = tempfile.NamedTemporaryFile('wb', suffix='.bz2',
                                           delete=False).name
    with smart_open.smart_open(tmp_path, 'wb') as writer:
        writer.write(expected)
    with open(tmp_path, 'rb') as reader:
        compressed_data = reader.read()
    if os.path.isfile(tmp_path):
        os.unlink(tmp_path)

    responses.add(responses.GET, "http://127.0.0.1/data.bz2",
                  body=compressed_data)
    http_reader = smart_open.HttpOpenRead(
        smart_open.ParseUri("http://127.0.0.1/data.bz2"))

    # decompressing over http should recover the original payload
    self.assertEqual(expected, http_reader.read())
def test_bad_mode(self):
    """An unsupported mode should raise an exception."""
    parsed = smart_open.ParseUri("s3://bucket/key")
    # "x" is not a mode s3_open_uri understands
    self.assertRaises(NotImplementedError,
                      smart_open.s3_open_uri, parsed, "x")
def test_webhdfs_uri(self):
    """Do webhdfs URIs parse correctly"""
    uri = smart_open.ParseUri("webhdfs://host:port/path/file")
    # host:port is folded into the REST path under /webhdfs/v1
    self.assertEqual("webhdfs", uri.scheme)
    self.assertEqual("host:port/webhdfs/v1/path/file", uri.uri_path)