def test_do_not_crash_on_utf8_encoded_content_disposition_header(self):
    """Decode a UTF-8-mangled header per spec: headers are latin-1."""
    # If the server responded with a UTF-8-encoded header, that's a bug
    # on the server: the author didn't realize all headers are
    # latin1-encoded, so the header is actually double-encoded.
    #
    # The result: if a developer unwittingly utf8-encodes a filename, then
    # the result is _unambiguously_ something else. For instance, "café"
    # encodes to "cafÃ©".
    #
    # We're spec-compliant here. We will correctly return "cafÃ©". The
    # caller can second-guess us if it sees fit.
    #
    # https://www.pivotaltracker.com/story/show/174715741
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        path.write_bytes(
            gzip.compress(b"".join([
                b'{"url":"http://example.com/hello"}\r\n',
                b"200 OK\r\n",
                b"content-disposition: attachment; filename=caf\xc3\xa9\r\n",
                b"\r\n",
                b"Some text",
            ])))
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            # b"\xc3\xa9" decoded as latin-1 is "Ã©", not "é" -- the
            # double-encoding is preserved, as the comment above promises.
            # (Compare test_latin1_headers, where b"\xe9" yields "é".)
            assert headers == [
                ("content-disposition", "attachment; filename=cafÃ©")
            ]
def _render_file(path: Path, output_path: Path, params: Dict[str, Any]):
    """Parse the stored HTTP response at `path`, writing a table to `output_path`.

    Returns a one-element error list when the server's Content-Type is one
    we cannot parse; otherwise delegates to `parse_file`.
    """
    with httpfile.read(path) as (parameters, status_line, headers, body_path):
        content_disposition = httpfile.extract_first_header(
            headers, "Content-Disposition")
        content_type = (
            httpfile.extract_first_header(headers, "Content-Type") or "")
        mime_type = guess_mime_type_or_none(
            content_type, content_disposition, parameters["url"])
        if not mime_type:
            # Unrecognized type: report it rather than guess at a parser.
            error = trans(
                "error.unhandledContentType",
                "Server responded with unhandled Content-Type {content_type}. "
                "Please use a different URL.",
                {"content_type": content_type},
            )
            return [error]
        return parse_file(
            body_path,
            output_path=output_path,
            encoding=guess_charset_or_none(content_type),
            mime_type=mime_type,
            has_header=params["has_header"],
        )
def test_fetch_xlsx_file(self):
    """An .xlsx download round-trips body and headers through the httpfile."""
    xlsx_mime = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
    body = b"abcd"
    self.mock_http_response = MockHttpResponse.ok(
        body, [("Content-Type", xlsx_mime)])
    params = P(file={**default_file, "mimeType": xlsx_mime})
    with self.fetch(params, secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), body)
            self.assertEqual(
                headers,
                [("content-type", xlsx_mime), ("content-length", "4")],
            )
        # The file itself (not an export) should be requested.
        self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")
def test_fetch_chunked_csv(self):
    """Chunked transfer-encoding is reassembled into a single body."""
    chunks = [b"A,B\nx", b",y\nz,", b"a"]
    self.mock_http_response = MockHttpResponse.ok(
        chunks, [("Content-Type", "text/csv; charset=utf-8")])
    url = self.build_url("/path/to.csv.chunks")
    with call_fetch(url) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), b"A,B\nx,y\nz,a")
def test_fetch_gzip_encoded_csv(self):
    """A gzip Content-Encoding response is decoded before storage."""
    plain_body = b"A,B\nx,y\nz,a"
    self.mock_http_response = MockHttpResponse.ok(
        gzip.compress(plain_body),
        [
            ("Content-Type", "text/csv; charset=utf-8"),
            ("Content-Encoding", "gzip"),
        ],
    )
    with call_fetch(self.build_url("/path/to.csv.gz")) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            # Stored body is the *decoded* bytes.
            self.assertEqual(body_path.read_bytes(), plain_body)
def test_fetch_native_sheet(self):
    """A native Google Sheet is fetched via the CSV export endpoint."""
    body = b"A,B\nx,y\nz,a"
    self.mock_http_response = MockHttpResponse.ok(
        body, [("Content-Type", "text/csv")])
    with self.fetch(P(), secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), body)
            self.assertEqual(
                headers,
                [("content-type", "text/csv"), ("content-length", "11")])
        # Raw string: "\?" in a plain literal is an invalid escape sequence
        # (SyntaxWarning on Python 3.12+). The regex needs a literal "?".
        self.assertRegex(self.last_http_requestline,
                         r"/files/.*/export\?mimeType=text%2Fcsv")
def _render_file(path: Path, params: Dict[str, Any], output_path: Path):
    """Parse the stored Google-Drive response at `path` into `output_path`."""
    with httpfile.read(path) as (parameters, status_line, headers, body_path):
        mime_type = _calculate_mime_type(
            httpfile.extract_first_header(headers, "Content-Type"))
        # Ignore Google-reported charset. Google's headers imply latin-1 when
        # their data is utf-8.
        return parse_file(
            body_path,
            output_path=output_path,
            mime_type=mime_type,
            encoding=None,
            has_header=params["has_header"],
        )
def test_fetch_csv_file(self):
    """A plain CSV file download round-trips body and headers."""
    body = b"A,B\nx,y\nz,a"
    self.mock_http_response = MockHttpResponse.ok(
        body, [("Content-Type", "text/csv")])
    params = P(file={**default_file, "mimeType": "text/csv"})
    with self.fetch(params, secrets(DEFAULT_SECRET)) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), body)
            self.assertEqual(
                headers,
                [("content-type", "text/csv"), ("content-length", "11")])
        # Non-native files are requested directly, not exported.
        self.assertRegex(self.last_http_requestline, "/files/.*?alt=media")
def test_latin1_headers(self):
    """Header values are decoded as latin-1, per the HTTP spec."""
    raw = (b'{"url":"http://example.com/hello"}\r\n'
           b"200 OK\r\n"
           b"content-disposition: attachment; filename=caf\xe9\r\n"
           b"\r\n"
           b"Some text")
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        path.write_bytes(gzip.compress(raw))
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            # b"\xe9" decoded as latin-1 is "é".
            assert headers == [
                ("content-disposition", "attachment; filename=café")
            ]
def test_special_headers(self):
    """Cjw-Original-* headers are unprefixed when read back."""
    # Content-Length doesn't get stored in the httpfile format, because it
    # would be ambiguous. (It does not specify the number of bytes of body.
    # That's because httpfile stores *decoded* body, and it stores headers
    # as passed over HTTP.)
    raw = (b'{"url":"http://example.com/hello"}\r\n'
           b"200 OK\r\n"
           b"Cjw-Original-content-length: 9\r\n"
           b"\r\n"
           b"Some text")
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        path.write_bytes(gzip.compress(raw))
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            assert headers == [("content-length", "9")]
def test_fetch_deflate_encoded_csv(self):
    """A raw-deflate Content-Encoding response is decoded before storage."""
    plain_body = b"A,B\nx,y\nz,a"
    # wbits=-MAX_WBITS produces a raw deflate stream (no zlib header).
    compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS)
    deflated = compressor.compress(plain_body) + compressor.flush()
    self.mock_http_response = MockHttpResponse.ok(
        deflated,
        [
            ("Content-Type", "text/csv; charset=utf-8"),
            ("Content-Encoding", "deflate"),
        ],
    )
    with call_fetch(self.build_url("/path/to.csv.gz")) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), plain_body)
def test_fetch_csv(self):
    """A plain CSV fetch stores body, content-type, and content-length."""
    body = b"A,B\nx,y\nz,a"
    self.mock_http_response = MockHttpResponse.ok(
        body, [("Content-Type", "text/csv; charset=utf-8")])
    with call_fetch(self.build_url("/path/to.csv")) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (_, __, headers, body_path):
            self.assertEqual(body_path.read_bytes(), body)
            expected_headers = [
                ("content-type", "text/csv; charset=utf-8"),
                ("content-length", "11"),
            ]
            self.assertEqual(headers, expected_headers)
async def test_decode_chunked_csv(self, http_server):
    """Chunked responses are joined; the original encoding is recorded."""
    chunks = [b"A,B\nx", b",y\nz,", b"a"]
    http_server.mock_response(
        MockHttpResponse.ok(
            chunks, [("content-type", "text/csv; charset=utf-8")]))
    url = http_server.build_url("/path/to.csv.chunks")
    async with self.download(url) as path:
        # The raw httpfile remembers how the body arrived on the wire.
        raw = gzip.decompress(path.read_bytes())
        assert b"\r\nCjw-Original-transfer-encoding: chunked\r\n" in raw
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            assert body_path.read_bytes() == b"A,B\nx,y\nz,a"
            assert headers == [
                ("content-type", "text/csv; charset=utf-8"),
                ("transfer-encoding", "chunked"),
            ]
def test_fetch_follow_redirect(self):
    """Redirects are followed; parameters record the *original* URL."""
    url1 = self.build_url("/url1.csv")
    url2 = self.build_url("/url2.csv")
    url3 = self.build_url("/url3.csv")
    responses = [
        MockHttpResponse(302, [("Location", url2)]),
        MockHttpResponse(302, [("Location", url3)]),
        MockHttpResponse.ok(b"A,B\n1,2", [("Content-Type", "text/csv")]),
    ]
    self.mock_http_response = iter(responses)
    with call_fetch(url1) as result:
        self.assertEqual(result.errors, [])
        with httpfile.read(result.path) as (parameters, __, headers,
                                            body_path):
            self.assertEqual(body_path.read_bytes(), b"A,B\n1,2")
            # The stored URL is the one the user asked for, not the target.
            self.assertEqual(parameters, {"url": url1})
        self.assertIn("/url1.csv", self.http_requestlines[0])
        self.assertIn("/url2.csv", self.http_requestlines[1])
        self.assertIn("/url3.csv", self.http_requestlines[2])
async def test_follow_redirect(self, http_server):
    """A chain of 302s is followed to the final body."""
    url1 = http_server.build_url("/url1.csv")
    url2 = http_server.build_url("/url2.csv")
    url3 = http_server.build_url("/url3.csv")
    responses = [
        MockHttpResponse(302, [("location", url2)]),
        MockHttpResponse(302, [("location", url3)]),
        MockHttpResponse.ok(b"A,B\n1,2", [("content-type", "text/csv")]),
    ]
    http_server.mock_response(iter(responses))
    async with self.download(url1) as path:
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            assert body_path.read_bytes() == b"A,B\n1,2"
        assert http_server.requested_paths == [
            "/url1.csv",
            "/url2.csv",
            "/url3.csv",
        ]
def _render_file(path: Path, output_path: Path, params: Dict[str, Any]):
    """Parse the stored HTTP response at `path`, writing a table to `output_path`.

    Returns a RenderResult with one error when the Content-Type is unhandled.
    """
    with httpfile.read(path) as (parameters, status_line, headers, body_path):
        content_type = httpfile.extract_first_header(headers, "Content-Type")
        mime_type = guess_mime_type_or_none(content_type, parameters["url"])
        if not mime_type:
            # Unrecognized type: report it rather than guess at a parser.
            message = (
                "Server responded with unhandled Content-Type %r. "
                "Please use a different URL.") % content_type
            return RenderResult(errors=[
                RenderError(I18nMessage.TODO_i18n(message))
            ])
        return parse_file(
            body_path,
            output_path=output_path,
            encoding=guess_charset_or_none(content_type),
            mime_type=mime_type,
            has_header=params["has_header"],
        )
def test_happy_path(self):
    """A well-formed httpfile yields parameters, status, headers, and body."""
    raw = (b'{"url":"http://example.com/hello"}\r\n'
           b"200 OK\r\n"
           b"content-type: text/plain; charset=utf-8\r\n"
           b"content-disposition: inline\r\n"
           b"\r\n"
           b"Some text")
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        path.write_bytes(gzip.compress(raw))
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            assert parameters == {"url": "http://example.com/hello"}
            assert status_line == "200 OK"
            assert headers == [
                ("content-type", "text/plain; charset=utf-8"),
                ("content-disposition", "inline"),
            ]
            assert body_path.read_bytes() == b"Some text"
async def test_gunzip_encoded_body(self, http_server):
    """gzip-encoded bodies are decoded; the original encoding is recorded."""
    plain_body = b"A,B\nx,y\nz,a"
    gzipped = gzip.compress(plain_body)
    url = http_server.build_url("/path/to.csv.gz")
    http_server.mock_response(
        MockHttpResponse.ok(
            gzipped,
            [
                ("content-type", "text/csv; charset=utf-8"),
                ("content-encoding", "gzip"),
            ],
        ))
    async with self.download(url) as path:
        raw = gzip.decompress(path.read_bytes())
        assert b"\r\nCjw-Original-content-encoding: gzip\r\n" in raw
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            # Body is decoded, but content-length reflects the wire bytes.
            assert body_path.read_bytes() == plain_body
            assert headers == [
                ("content-type", "text/csv; charset=utf-8"),
                ("content-encoding", "gzip"),
                ("content-length", str(len(gzipped))),
            ]
async def test_deflate_encoded_body(self, http_server):
    """deflate-encoded bodies are decoded; the original encoding is recorded."""
    plain_body = b"A,B\nx,y\nz,a"
    # wbits=-MAX_WBITS produces a raw deflate stream (no zlib header).
    compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS)
    deflated = compressor.compress(plain_body) + compressor.flush()
    url = http_server.build_url("/path/to.csv.gz")
    http_server.mock_response(
        MockHttpResponse.ok(
            deflated,
            [
                ("content-type", "text/csv; charset=utf-8"),
                ("content-encoding", "deflate"),
            ],
        ))
    async with self.download(url) as path:
        raw = gzip.decompress(path.read_bytes())
        assert b"\r\nCjw-Original-content-encoding: deflate\r\n" in raw
        with httpfile.read(path) as (parameters, status_line, headers,
                                     body_path):
            # Body is decoded, but content-length reflects the wire bytes.
            assert body_path.read_bytes() == plain_body
            assert headers == [
                ("content-type", "text/csv; charset=utf-8"),
                ("content-encoding", "deflate"),
                ("content-length", str(len(deflated))),
            ]