Exemple #1
0
    def test_s3_iter_moto(self):
        """Iterating over an S3 key should yield exactly the lines written to it."""
        # fixture: one huge line, many short lines, one final line
        big_line = b"*" * 5 * 1024**2
        small_lines = [b'0123456789'] * 1024
        last_line = b"test"
        expected = [big_line] + small_lines + [last_line]

        # set up the fake bucket that will hold the key
        boto.connect_s3().create_bucket("mybucket")
        # lower the multipart upload size, to speed up these tests
        smart_open_lib.S3_MIN_PART_SIZE = 5 * 1024**2
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            # the huge line alone amounts to a full multipart upload
            fout.write(big_line + b'\n')

            for small in small_lines:
                fout.write(small + b'\n')

            # the final line is written without a trailing newline
            fout.write(last_line)

        # plain iteration over a freshly opened reader
        reader = smart_open.S3OpenRead(
            smart_open.ParseUri("s3://mybucket/mykey"))
        self.assertEqual([line.rstrip(b'\n') for line in reader], expected)

        # the same check, this time through the context-manager protocol
        with smart_open.S3OpenRead(
                smart_open.ParseUri("s3://mybucket/mykey")) as reader:
            lines = [line.rstrip(b'\n') for line in reader]
            self.assertEqual(lines, expected)
Exemple #2
0
    def test_hdfs(self, mock_subprocess):
        """Writing to HDFS should spawn `hdfs dfs -put` with the parsed path."""
        expected_cmd = ["hdfs", "dfs", "-put", "-f", "-", "/tmp/test.txt"]

        # both spellings of the scheme prefix must produce the same command
        for uri in ("hdfs:///tmp/test.txt", "hdfs://tmp/test.txt"):
            writer = smart_open.HdfsOpenWrite(smart_open.ParseUri(uri))
            writer.write("test")
            mock_subprocess.Popen.assert_called_with(
                expected_cmd, stdin=mock_subprocess.PIPE)
Exemple #3
0
    def test_webhdfs_uri(self):
        """webhdfs URIs should keep their scheme and gain the /webhdfs/v1 prefix."""
        cases = (
            # valid uri, no query
            ("webhdfs://host:port/path/file",
             "host:port/webhdfs/v1/path/file"),
            # valid uri, with query
            ("webhdfs://host:port/path/file?query_part_1&query_part2",
             "host:port/webhdfs/v1/path/file?query_part_1&query_part2"),
        )
        for uri, expected_path in cases:
            parsed = smart_open.ParseUri(uri)
            self.assertEqual(parsed.scheme, "webhdfs")
            self.assertEqual(parsed.uri_path, expected_path)
Exemple #4
0
    def test_hdfs(self, mock_subprocess):
        """Iterating an HDFS reader should spawn `hdfs dfs -cat` on the parsed path."""
        mock_subprocess.PIPE.return_value = "test"
        expected_cmd = ["hdfs", "dfs", "-cat", "/tmp/test.txt"]

        # both spellings of the scheme prefix must produce the same command
        for uri in ("hdfs:///tmp/test.txt", "hdfs://tmp/test.txt"):
            reader = smart_open.HdfsOpenRead(smart_open.ParseUri(uri))
            # call the dunder directly: only the Popen invocation matters here
            reader.__iter__()
            mock_subprocess.Popen.assert_called_with(
                expected_cmd, stdout=mock_subprocess.PIPE)
Exemple #5
0
    def test_scheme(self):
        """URI schemes should be recognised, rejected, or defaulted as appropriate."""
        # every supported scheme is reported back unchanged
        supported = ("s3", "s3n", "hdfs", "file", "http", "https")
        for scheme in supported:
            uri = scheme + "://mybucket/mykey"
            self.assertEqual(smart_open.ParseUri(uri).scheme, scheme)

        # an unsupported scheme is refused outright
        with self.assertRaises(NotImplementedError):
            smart_open.ParseUri("foobar://mybucket/mykey")

        # input without any scheme falls back to "file"
        self.assertEqual(smart_open.ParseUri("blah blah").scheme, "file")
Exemple #6
0
    def convert(self, value, param, ctx):
        """Turn ``value`` into an opened resource.

        Anything whose parsed scheme is not ``file`` is opened through
        smart_open with this object's ``mode``; ``file`` URIs are delegated
        to the parent class's ``convert`` using the parsed path.
        """
        parsed = smart_open.ParseUri(value)

        if parsed.scheme != 'file':
            return smart_open.smart_open(value, self.mode)

        return super(SmartFile, self).convert(parsed.uri_path, param, ctx)
Exemple #7
0
    def test_write(self):
        """A WebHDFS write should follow the redirect and send the payload as-is."""
        redirect_source = "http://127.0.0.1:8440/webhdfs/v1/path/file"
        redirect_target = "http://127.0.0.1:8440/file"
        payload = u"žluťoučký koníček".encode('utf8')

        def request_callback(request):
            # answer with a 307 pointing at the redirect target
            return (307, {'location': 'http://127.0.0.1:8440/file'}, "")

        def write_callback(request):
            # the body must arrive byte-for-byte
            assert request.body == payload
            return (200, {}, "")

        # responses needed while the writer is being constructed
        responses.add_callback(responses.PUT, redirect_source,
                               callback=request_callback)
        responses.add(responses.PUT, redirect_target, status=201)
        writer = smart_open.WebHdfsOpenWrite(
            smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))

        # responses needed for the actual write
        responses.add_callback(responses.POST, redirect_source,
                               callback=request_callback)
        responses.add_callback(responses.POST, redirect_target,
                               callback=write_callback)
        writer.write(payload)
        writer.close()

        # two calls from construction plus two from the write
        assert len(responses.calls) == 4
        append_call, upload_call = responses.calls[2], responses.calls[3]
        assert append_call.request.url == "http://127.0.0.1:8440/webhdfs/v1/path/file?op=APPEND"
        assert upload_call.request.url == "http://127.0.0.1:8440/file"
Exemple #8
0
 def test_webhdfs(self):
     """Iterating a WebHDFS reader should yield the body one line at a time."""
     responses.add(
         responses.GET,
         "http://127.0.0.1:8440/webhdfs/v1/path/file",
         body='line1\nline2')
     reader = smart_open.WebHdfsOpenRead(
         smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))
     lines = iter(reader)
     for expected in ("line1", "line2"):
         self.assertEqual(next(lines).decode("utf-8"), expected)
Exemple #9
0
    def test_rw_encoding(self):
        """Text written with an encoding should read back correctly, or fail loudly."""
        boto.connect_s3().create_bucket("bucket")

        key_uri = smart_open.ParseUri("s3://bucket/key")
        original = u"расцветали яблони и груши"

        with smart_open.s3_open_uri(key_uri, "w", encoding="koi8-r") as fout:
            fout.write(original)

        # text mode with the matching codec gives the text back
        with smart_open.s3_open_uri(key_uri, "r", encoding="koi8-r") as fin:
            self.assertEqual(original, fin.read())

        # binary mode exposes the encoded bytes
        with smart_open.s3_open_uri(key_uri, "rb") as fin:
            self.assertEqual(original.encode("koi8-r"), fin.read())

        # a mismatching codec must raise by default ...
        with smart_open.s3_open_uri(key_uri, "r", encoding="euc-jp") as fin:
            with self.assertRaises(UnicodeDecodeError):
                fin.read()

        # ... but must not raise once errors="replace" is requested
        lenient = dict(encoding="euc-jp", errors="replace")
        with smart_open.s3_open_uri(key_uri, "r", **lenient) as fin:
            fin.read()
Exemple #10
0
 def test_https_readline(self):
     """readline() on an https reader should return the first line only."""
     url = "https://127.0.0.1/index.html"
     responses.add(responses.GET, url, body='line1\nline2')
     reader = smart_open.HttpOpenRead(smart_open.ParseUri(url))
     first_line = reader.readline()
     self.assertEqual(first_line.decode("utf-8"), "line1")
Exemple #11
0
 def test_http_pass(self):
     """Opening an http URI with user/password should send a Basic auth header."""
     responses.add(responses.GET, "http://127.0.0.1/index.html", body='line1\nline2')
     # constructing the reader is what triggers the request; the object itself is unused
     _ = smart_open.HttpOpenRead(smart_open.ParseUri("http://127.0.0.1/index.html"), user='******', password='******')
     # assertEquals / assert_ are deprecated aliases (removed in Python 3.12)
     self.assertEqual(len(responses.calls), 1)
     actual_request = responses.calls[0].request
     self.assertIn('Authorization', actual_request.headers)
     self.assertTrue(actual_request.headers['Authorization'].startswith('Basic '))
Exemple #12
0
    def test_gzip_write_mode(self):
        """Writing a .gz key must reach the underlying S3 open in binary mode."""
        boto3.resource('s3').create_bucket(Bucket='bucket')
        gz_uri = smart_open.ParseUri("s3://bucket/key.gz")

        # intercept the low-level open and inspect the arguments it receives
        with mock.patch('smart_open.smart_open_s3.open') as patched_open:
            smart_open.s3_open_uri(gz_uri, "wb")
            patched_open.assert_called_with('bucket', 'key.gz', 'wb')
Exemple #13
0
    def test_s3_boto(self, mock_s3_iter_lines, mock_boto):
        """Iterating an S3 reader should connect, look up the key and iterate its lines."""
        anonymous_uri = "s3://mybucket/mykey"
        credentials_uri = "s3://access_id:access_secret@mybucket/mykey"

        # no credentials in the URI => both connection arguments are None
        reader = smart_open.S3OpenRead(smart_open.ParseUri(anonymous_uri))
        reader.__iter__()
        mock_boto.connect_s3.assert_called_with(
            aws_access_key_id=None, aws_secret_access_key=None)

        # credentials embedded in the URI are forwarded to the connection
        reader = smart_open.S3OpenRead(smart_open.ParseUri(credentials_uri))
        reader.__iter__()
        mock_boto.connect_s3.assert_called_with(
            aws_access_key_id="access_id", aws_secret_access_key="access_secret")

        # bucket and key are looked up by name, then s3_iter_lines is invoked
        reader = smart_open.S3OpenRead(smart_open.ParseUri(credentials_uri))
        reader.__iter__()
        get_bucket = mock_boto.connect_s3().get_bucket
        get_bucket.assert_called_with("mybucket")
        get_bucket().lookup.assert_called_with("mykey")
        self.assertTrue(mock_s3_iter_lines.called)
Exemple #14
0
    def test_gzip_write_mode(self):
        """Writing a .gz key must reach the underlying S3 open in binary mode."""
        boto.connect_s3().create_bucket("bucket")
        gz_uri = smart_open.ParseUri("s3://bucket/key.gz")

        # intercept the low-level open and inspect the arguments it receives
        with mock.patch('smart_open.smart_open_s3.open') as patched_open:
            smart_open.s3_open_uri(gz_uri, "wb")
            patched_open.assert_called_with('bucket', 'key.gz', 'wb')
Exemple #15
0
 def test_webhdfs_read(self):
     """read() on a WebHDFS reader should return the whole response body."""
     body = 'line1\nline2'
     responses.add(responses.GET,
                   "http://127.0.0.1:8440/webhdfs/v1/path/file",
                   body=body)
     reader = smart_open.WebHdfsOpenRead(
         smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))
     received = reader.read().decode("utf-8")
     self.assertEqual(received, "line1\nline2")
Exemple #16
0
    def test_http_gz(self):
        """Can open gzip via http?"""
        fpath = os.path.join(CURR_DIR, 'test_data/crlf_at_1k_boundary.warc.gz')
        # read the fixture inside a context manager so the handle is closed promptly
        with open(fpath, 'rb') as infile:
            data = infile.read()

        responses.add(responses.GET, "http://127.0.0.1/data.gz", body=data)
        smart_open_object = smart_open.HttpOpenRead(
            smart_open.ParseUri("http://127.0.0.1/data.gz"))

        # hash whatever the reader returns and compare with the known digest
        m = hashlib.md5(smart_open_object.read())
        self.assertEqual(m.hexdigest(), '18473e60f8c7c98d29d65bf805736a0d')
Exemple #17
0
    def test_s3_uri(self):
        """S3 URIs should split into bucket, key and optional credentials."""
        # columns: uri, bucket_id, key_id, access_id, access_secret
        cases = (
            # correct uri without credentials
            ("s3://mybucket/mykey",
             "mybucket", "mykey", None, None),
            # correct uri, key contains slash
            ("s3://mybucket/mydir/mykey",
             "mybucket", "mydir/mykey", None, None),
            # correct uri with credentials
            ("s3://ACCESSID456:acces/sse_cr-et@mybucket/mykey",
             "mybucket", "mykey", "ACCESSID456", "acces/sse_cr-et"),
            # correct uri, contains credentials
            ("s3://accessid:access/secret@mybucket/mykey",
             "mybucket", "mykey", "accessid", "access/secret"),
        )
        for uri, bucket_id, key_id, access_id, access_secret in cases:
            parsed = smart_open.ParseUri(uri)
            self.assertEqual(parsed.scheme, "s3")
            self.assertEqual(parsed.bucket_id, bucket_id)
            self.assertEqual(parsed.key_id, key_id)
            self.assertEqual(parsed.access_id, access_id)
            self.assertEqual(parsed.access_secret, access_secret)

        # incorrect uri - only one '@' in uri is allowed
        with self.assertRaises(RuntimeError):
            smart_open.ParseUri("s3://access_id@access_secret@mybucket/mykey")
Exemple #18
0
 def test_initialize_write(self):
     """Constructing a WebHDFS writer should issue a CREATE and follow its redirect."""
     redirect_source = "http://127.0.0.1:8440/webhdfs/v1/path/file"

     def request_callback(request):
         # answer with a 307 pointing at the redirect target
         return (307, {'location': 'http://127.0.0.1:8440/file'}, "")

     responses.add_callback(responses.PUT, redirect_source, callback=request_callback)
     responses.add(responses.PUT, "http://127.0.0.1:8440/file", status=201)
     # keep a reference so the writer stays alive during the assertions
     writer = smart_open.WebHdfsOpenWrite(smart_open.ParseUri("webhdfs://127.0.0.1:8440/path/file"))

     assert len(responses.calls) == 2
     create_call, redirected_call = responses.calls[0], responses.calls[1]
     path, params = create_call.request.url.split("?")
     assert path == "http://127.0.0.1:8440/webhdfs/v1/path/file"
     # query parameter order is not fixed, so accept either ordering
     assert params in ("overwrite=True&op=CREATE", "op=CREATE&overwrite=True")
     assert redirected_call.request.url == "http://127.0.0.1:8440/file"
Exemple #19
0
    def test_gzip_read_mode(self):
        """Reading a .gz key in text mode must reach the underlying S3 open as 'rb'."""
        boto.connect_s3().create_bucket("bucket")
        gz_uri = smart_open.ParseUri("s3://bucket/key.gz")

        # put some content under the key first
        payload = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён"
        with smart_open.s3_open_uri(gz_uri, "wb") as fout:
            fout.write(payload.encode("utf-8"))

        # intercept the low-level open and inspect the mode it receives
        with mock.patch('smart_open.smart_open_s3.open') as patched_open:
            smart_open.s3_open_uri(gz_uri, "r")
            patched_open.assert_called_with('bucket', 'key.gz', 'rb')
Exemple #20
0
    def stream_other(self, path, fn):
        """Stream lines from a location and call a function on each line.

        If ``path`` parses as a ``file`` URI and is a local directory, every
        entry matched by ``path/*`` is streamed in turn; otherwise ``path``
        itself is handed to smart_open.

        Args:
            path (str): local file, local directory, or other location that
                smart_open can open
            fn (function): function to call with each line
        """
        parsed_uri = smart_open.ParseUri(path)
        if parsed_uri.scheme == "file" and os.path.isdir(path):
            # stream each file from directory
            sources = glob.glob(path + "/*")
        else:
            # let smart_open handle streaming of file from location
            sources = [path]

        for source in sources:
            # the context manager releases each handle as soon as it is consumed,
            # instead of leaving it open until garbage collection
            with smart_open.smart_open(source) as fin:
                for line in fin:
                    fn(line)
Exemple #21
0
    def test_r(self):
        """A UTF-8 string stored under a key should read back as the same text."""
        expected = u"физкульт-привет!"

        # store the encoded text under bucket/key using boto directly
        conn = boto.connect_s3()
        conn.create_bucket("bucket")
        key = boto.s3.key.Key(conn.get_bucket("bucket"))
        key.key = "key"
        key.set_contents_from_string(expected.encode("utf-8"))

        # read through the key object
        with smart_open.s3_open_key(key, "r") as fin:
            self.assertEqual(fin.read(), expected)

        # read through a parsed URI
        key_uri = smart_open.ParseUri("s3://bucket/key")
        with smart_open.s3_open_uri(key_uri, "r") as fin:
            self.assertEqual(fin.read(), expected)
Exemple #22
0
    def test_r(self):
        """A UTF-8 string stored under a key should read back in binary and text mode."""
        expected = u"физкульт-привет!"
        encoded = expected.encode('utf-8')

        # store the encoded text under bucket/key using boto3 directly
        resource = boto3.resource('s3')
        resource.create_bucket(Bucket='bucket')
        key = resource.Object('bucket', 'key')
        key.put(Body=encoded)

        # binary read through the key object
        with smart_open.s3_open_key(key, "rb") as fin:
            self.assertEqual(fin.read(), expected.encode('utf-8'))

        # text read through the key object
        with smart_open.s3_open_key(key, "r", encoding='utf-8') as fin:
            self.assertEqual(fin.read(), expected)

        # text read through a parsed URI
        key_uri = smart_open.ParseUri("s3://bucket/key")
        with smart_open.s3_open_uri(key_uri, "r", encoding='utf-8') as fin:
            self.assertEqual(fin.read(), expected)
Exemple #23
0
    def test_s3_read_moto(self):
        """Are S3 files read correctly?"""
        conn = boto.connect_s3()
        conn.create_bucket("mybucket")

        # write some bogus key so we can check it below;
        # the payload is explicit UTF-8 bytes so that the "wb" write and the
        # byte-offset slices below mean the same thing on Python 2 and 3
        content = u"hello wořld\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(content)

        smart_open_object = smart_open.S3OpenRead(smart_open.ParseUri("s3://mybucket/mykey"))
        self.assertEqual(content[:6], smart_open_object.read(6))
        self.assertEqual(content[6:14], smart_open_object.read(8))  # ř is 2 bytes

        # make sure iteration does not affect read()
        for line in smart_open_object:
            pass
        self.assertEqual(content[14:], smart_open_object.read())  # read the rest
Exemple #24
0
    def test_s3_seek_moto(self):
        """Does seeking in S3 files work correctly?"""
        conn = boto.connect_s3()
        conn.create_bucket("mybucket")

        # write some bogus key so we can check it below;
        # the payload is explicit UTF-8 bytes so that the "wb" write and the
        # byte-offset slices below mean the same thing on Python 2 and 3
        content = u"hello wořld\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(content)

        smart_open_object = smart_open.S3OpenRead(smart_open.ParseUri("s3://mybucket/mykey"))
        self.assertEqual(content[:6], smart_open_object.read(6))
        self.assertEqual(content[6:14], smart_open_object.read(8))  # ř is 2 bytes

        # rewinding and reading with no size must return the whole file
        smart_open_object.seek(0)
        self.assertEqual(content, smart_open_object.read())

        # a size of -1 must behave the same as no size at all
        smart_open_object.seek(0)
        self.assertEqual(content, smart_open_object.read(-1))
Exemple #25
0
    def test_rw_gzip(self):
        """A .gz key should be gzip-compressed when written and readable back as text."""
        boto.connect_s3().create_bucket("bucket")
        gz_uri = smart_open.ParseUri("s3://bucket/key.gz")

        expected = u"не слышны в саду даже шорохи"
        with smart_open.s3_open_uri(gz_uri, "wb") as fout:
            fout.write(expected.encode("utf-8"))

        # explicit path: ignore the extension and decompress by hand,
        # which confirms that the stored object really is a gzip
        with smart_open.s3_open_uri(gz_uri, "rb", ignore_extension=True) as raw:
            unpacked = gzip.GzipFile(fileobj=raw).read()
            self.assertEqual(unpacked.decode("utf-8"), expected)

        # implicit path: a plain read must give the original text back as well
        with smart_open.s3_open_uri(gz_uri, "rb") as fin:
            payload = fin.read()
            self.assertEqual(payload.decode("utf-8"), expected)
Exemple #26
0
    def test_http_bz2(self):
        """Can open bz2 via http?"""
        test_string = b'Hello World Compressed.'

        # only the name is needed; close the handle right away instead of
        # leaving it to the garbage collector (delete=False keeps the file)
        with tempfile.NamedTemporaryFile('wb', suffix='.bz2',
                                         delete=False) as tmp:
            test_file = tmp.name

        try:
            # write through smart_open under a .bz2 name, then grab the raw bytes on disk
            with smart_open.smart_open(test_file, 'wb') as outfile:
                outfile.write(test_string)

            with open(test_file, 'rb') as infile:
                compressed_data = infile.read()
        finally:
            # remove the temporary file even if writing or reading failed
            if os.path.isfile(test_file):
                os.unlink(test_file)

        responses.add(responses.GET,
                      "http://127.0.0.1/data.bz2",
                      body=compressed_data)
        smart_open_object = smart_open.HttpOpenRead(
            smart_open.ParseUri("http://127.0.0.1/data.bz2"))

        # reading over http must give back the original uncompressed string
        self.assertEqual(smart_open_object.read(), test_string)
Exemple #27
0
 def test_bad_mode(self):
     """An unsupported mode should raise an exception."""
     key_uri = smart_open.ParseUri("s3://bucket/key")
     with self.assertRaises(NotImplementedError):
         smart_open.s3_open_uri(key_uri, "x")
Exemple #28
0
 def test_webhdfs_uri(self):
     """A webhdfs URI should keep its scheme and gain the /webhdfs/v1 prefix."""
     parsed = smart_open.ParseUri("webhdfs://host:port/path/file")
     observed = (parsed.scheme, parsed.uri_path)
     self.assertEqual(observed[0], "webhdfs")
     self.assertEqual(observed[1], "host:port/webhdfs/v1/path/file")