def test_prefix_special_headers(self):
    """Check how transfer-encoding, content-encoding and content-length are written.

    The decompressed file is expected to list each of the three headers under
    a "Cjw-Original-" name, followed by the body bytes unchanged.
    """
    # Content-Length doesn't get stored (under its own name) in the httpfile
    # format, because it would be ambiguous. (It does not specify the number
    # of bytes of body. That's because httpfile stores *decoded* body, and it
    # stores headers as passed over HTTP.)
    response_headers = [
        ("transfer-encoding", "chunked"),
        ("content-encoding", "gzip"),
        ("content-length", "3"),
    ]
    expected = b"".join(
        [
            b'{"url":"http://example.com/hello"}\r\n',
            b"200 OK\r\n",
            b"Cjw-Original-transfer-encoding: chunked\r\n",
            b"Cjw-Original-content-encoding: gzip\r\n",
            b"Cjw-Original-content-length: 3\r\n",
            b"\r\n",
            b"\x00\x01\x02",
        ]
    )
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        httpfile.write(
            path,
            {"url": "http://example.com/hello"},
            "200 OK",
            response_headers,
            io.BytesIO(b"\x00\x01\x02"),
        )
        # Read back while the temporary file still exists.
        written = gzip.decompress(path.read_bytes())
        assert written == expected
def test_render_xlsx_bad_content(self):
    """Render a response labelled as XLSX whose body is plain text.

    Expects an empty ArrowTable and a single "Error reading Excel file"
    RenderError.
    """
    not_xlsx = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
    with tempfile_context("fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            not_xlsx,
        )
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            FetchResult(http_path),
            self.output_path,
        )
        expected_error = RenderError(
            I18nMessage.TODO_i18n(
                'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
            )
        )
        expected = RenderResult(ArrowTable(), [expected_error])
        self.assertEqual(result, expected)
def test_render_has_header_true(self):
    """With has_header=True, the first CSV line supplies the column names."""
    request_params = {"url": "http://example.com/hello"}
    response_headers = [("content-type", "text/csv")]
    with tempfile_context(prefix="http-") as http_path:
        httpfile.write(
            http_path,
            request_params,
            "200 OK",
            response_headers,
            io.BytesIO(b"A,B\na,b"),
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        assert_arrow_table_equals(arrow_table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(render_errors, [])
def test_render_csv_use_content_type_charset(self):
    """The charset declared in Content-Type must be used to decode the CSV body.

    The body bytes form valid UTF-8, but the response declares
    ``charset=iso-8859-1``. Decoding with the declared charset maps each byte
    to its own code point, so the cell must come back as two characters
    (U+00C3, U+00A1) followed by "a" -- not as the single UTF-8 character "á".
    """
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/file.unknownext"},
            "200 OK",
            [("content-type", "text/csv; charset=iso-8859-1")],
            io.BytesIO(b"A,B\n\xc3\xa1a,b"),  # looks like UTF-8; force latin1
        )
        table, errors = call_render(P(has_header=True), FetchResult(http_path))
        # b"\xc3\xa1a".decode("iso-8859-1") == "\u00c3\u00a1a" ("Ã¡a").
        # Expecting "áa" here would only pass if the declared charset were
        # ignored, which is the opposite of what this test is named for.
        assert_arrow_table_equals(table, {"A": ["\u00c3\u00a1a"], "B": ["b"]})
        self.assertEqual(errors, [])
def test_render_json(self):
    """An application/json body holding one record renders as a one-row table."""
    json_body = io.BytesIO(b'[{"A": "a"}]')
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "application/json")],
            json_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        # Errors are checked first, as in the original ordering.
        self.assertEqual(render_errors, [])
        assert_arrow_table_equals(arrow_table, {"A": ["a"]})
def test_render_xlsx(self):
    """The example.xlsx fixture renders as float columns "foo" and "bar"."""
    fixture_path = TestDataPath / "example.xlsx"
    with tempfile_context(prefix="fetch-") as http_path:
        # The fixture file is streamed in as the HTTP response body.
        with fixture_path.open("rb") as xlsx_f:
            httpfile.write(
                http_path,
                {"url": "http://example.com/hello"},
                "200 OK",
                [("content-type", XLSX_MIME_TYPE)],
                xlsx_f,
            )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        self.assertEqual(render_errors, [])
        assert_arrow_table_equals(
            arrow_table,
            {"foo": [1.0, 2.0], "bar": [2.0, 3.0]},
        )
def test_render_text_plain(self):
    """A text/plain response with a semicolon-separated body yields columns A and B."""
    # guess_mime_type_or_none() treats text/plain specially.
    semicolon_body = io.BytesIO(b"A;B\na;b")
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/file.unknownext"},
            "200 OK",
            [("content-type", "text/plain")],
            semicolon_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        assert_arrow_table_equals(arrow_table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(render_errors, [])
def test_render_csv_handle_nonstandard_mime_type(self):
    """A body served as application/x-csv is expected to render like text/csv."""
    # Transform 'application/csv' into 'text/csv', etc.
    #
    # Sysadmins sometimes invent MIME types. We hard-code to rewrite fake
    # MIME types we've seen in the wild that seem unambiguous.
    invented_type_headers = [("content-type", "application/x-csv")]
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            invented_type_headers,
            io.BytesIO(b"A,B\na,b"),
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        assert_arrow_table_equals(arrow_table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(render_errors, [])
def test_render_has_header_true(self):
    """With has_header=True, self.render() yields columns named by the first CSV line."""
    csv_body = io.BytesIO(b"A,B\na,b")
    expected_columns = {"A": ["a"], "B": ["b"]}
    with tempfile_context("http") as http_path:
        httpfile.write(
            http_path,
            {"url": "https://blah"},
            "200 OK",
            [("content-type", "text/csv")],
            csv_body,
        )
        fetch_result = FetchResult(http_path)
        # The result is inspected while the render context is still open.
        with self.render(P(has_header=True), fetch_result) as result:
            assert_arrow_table_equals(result.table, expected_columns)
            self.assertEqual(result.errors, [])
def test_render_csv_use_url_ext_given_bad_content_type(self):
    """A text/plain response for a ".csv" URL is expected to be parsed as comma CSV."""
    # Use text/plain type and rely on filename detection, as
    # https://raw.githubusercontent.com/ does
    #
    # bytes will prove we used "csv" explicitly -- we didn't
    # take "text/plain" and decide to use a CSV sniffer to
    # find the delimiter.
    mixed_delimiter_body = io.BytesIO(b"A;B;C,D\na;b;c,d")
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/file.csv"},
            "200 OK",
            [("content-type", "text/plain")],
            mixed_delimiter_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        # Only the comma splits columns; semicolons stay inside the values.
        assert_arrow_table_equals(
            arrow_table,
            {"A;B;C": ["a;b;c"], "D": ["d"]},
        )
        self.assertEqual(render_errors, [])
def test_render_json(self):
    """render_arrow() on an application/json body returns a one-row table and no errors."""
    json_body = io.BytesIO(b'[{"A": "a"}]')
    with tempfile_context("fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "application/json")],
            json_body,
        )
        fetch_result = FetchResult(http_path)
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        self.assertEqual(result.errors, [])
        assert_arrow_table_equals(result.table, {"A": ["a"]})
def test_render_has_header_false(self):
    """With has_header=False, columns are named "Column N" and every line is data."""
    numeric_body = io.BytesIO(b"1,2\n3,4")
    with tempfile_context(prefix="http-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "text/csv")],
            numeric_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=False), fetch_result)
        # Both input lines are expected as int8 data rows.
        expected_columns = {
            "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
            "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
        }
        assert_arrow_table_equals(arrow_table, expected_columns)
        self.assertEqual(render_errors, [])
def test_render_has_header_true(self):
    """render_arrow() with has_header=True names columns from the first CSV line."""
    csv_body = io.BytesIO(b"A,B\na,b")
    with tempfile_context("http") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "text/csv")],
            csv_body,
        )
        fetch_result = FetchResult(http_path)
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
        self.assertEqual(result.errors, [])
def test_render_xlsx(self):
    """render_arrow() on the example.xlsx fixture returns columns "foo" and "bar"."""
    fixture_path = TestDataPath / "example.xlsx"
    with tempfile_context("fetch-") as http_path:
        # The fixture file is streamed in as the HTTP response body.
        with fixture_path.open("rb") as xlsx_f:
            httpfile.write(
                http_path,
                {"url": "http://example.com/hello"},
                "200 OK",
                [("content-type", XLSX_MIME_TYPE)],
                xlsx_f,
            )
        fetch_result = FetchResult(http_path)
        result = render_arrow(
            ArrowTable(),
            P(has_header=True),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        self.assertEqual(result.errors, [])
        assert_arrow_table_equals(
            result.table,
            {"foo": [1, 2], "bar": [2, 3]},
        )
def test_render_has_header_false(self):
    """With has_header=False, self.render() yields "Column N" names and int8 data."""
    numeric_body = io.BytesIO(b"1,2\n3,4")
    with tempfile_context("http") as http_path:
        httpfile.write(
            http_path,
            {"url": "https://blah"},
            "200 OK",
            [("content-type", "text/csv")],
            numeric_body,
        )
        fetch_result = FetchResult(http_path)
        # The result is inspected while the render context is still open.
        with self.render(P(has_header=False), fetch_result) as result:
            expected_columns = {
                "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
                "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
            }
            assert_arrow_table_equals(result.table, expected_columns)
            self.assertEqual(result.errors, [])
def test_render_csv_use_content_disposition_given_bad_content_type(self):
    """An octet-stream response with a "file.csv" attachment name is parsed as comma CSV."""
    response_headers = [
        ("content-type", "application/octet-stream"),
        ("content-disposition", 'attachment; filename="file.csv"; size=4405'),
    ]
    # bytes will prove we used "file.csv", not a sniffer.
    mixed_delimiter_body = io.BytesIO(b"A;B;C,D\na;b;c,d")
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/file"},
            "200 OK",
            response_headers,
            mixed_delimiter_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        assert_arrow_table_equals(
            arrow_table,
            {"A;B;C": ["a;b;c"], "D": ["d"]},
        )
        self.assertEqual(render_errors, [])
def test_render_prefer_content_disposition_to_url_ext(self):
    """The attachment name "file.tsv" is expected to win over the ".csv" URL suffix."""
    # When content-disposition uses a different name, prefer that name.
    response_headers = [
        # Wrong MIME type -- so we detect from filename
        ("content-type", "application/octet-stream"),
        ("content-disposition", 'attachment; filename="file.tsv"'),
    ]
    # bytes will prove we used "file.tsv", not "file.csv".
    tab_and_comma_body = io.BytesIO(b"A,B\tC,D\na,b\tc,d")
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/file.csv"},
            "200 OK",
            response_headers,
            tab_and_comma_body,
        )
        fetch_result = FetchResult(http_path)
        arrow_table, render_errors = call_render(P(has_header=True), fetch_result)
        # Only the tab splits columns; commas stay inside the values.
        assert_arrow_table_equals(
            arrow_table,
            {"A,B": ["a,b"], "C,D": ["c,d"]},
        )
        self.assertEqual(render_errors, [])
def test_render_xlsx_bad_content(self):
    """A plain-text body served as XLSX yields an empty table and one "Invalid XLSX file" message."""
    not_xlsx = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
    with tempfile_context(prefix="fetch-") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", XLSX_MIME_TYPE)],
            not_xlsx,
        )
        arrow_table, render_errors = call_render(P(), FetchResult(http_path))
        assert_arrow_table_equals(arrow_table, {})
        expected_message = I18nMessage(
            "TODO_i18n",
            {
                "text": "Invalid XLSX file: xlnt::exception : failed to find zip header"
            },
            None,
        )
        self.assertEqual(render_errors, [expected_message])
def test_ignore_most_headers(self):
    """Check which response headers survive httpfile.write().

    "date", "server" and "ETag" are passed in; the expected file content
    contains only the "server" line between the status line and the body.
    """
    # a header like "ETag", "Date" or "Last-Modified" is unreliable. It
    # doesn't reliably indicate new data from the server. On the flipside,
    # it _does_ mean that two files won't compare as equivalent. So we
    # nix these headers.
    response_headers = [
        ("date", "Wed, 21 Oct 2015 07:28:00 GMT"),
        ("server", "custom-server 0.1"),
        ("ETag", "some-etag"),
    ]
    expected = b"".join(
        [
            b'{"url":"http://example.com/hello"}\r\n',
            b"200 OK\r\n",
            b"server: custom-server 0.1\r\n",
            b"\r\n",
            b"\x00\x01\x02",
        ]
    )
    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        httpfile.write(
            path,
            {"url": "http://example.com/hello"},
            "200 OK",
            response_headers,
            io.BytesIO(b"\x00\x01\x02"),
        )
        # Read back while the temporary file still exists.
        written = gzip.decompress(path.read_bytes())
        assert written == expected
def test_render_has_header_false(self):
    """render_arrow() with has_header=False yields "Column N" names and int8 data."""
    numeric_body = io.BytesIO(b"1,2\n3,4")
    with tempfile_context("http") as http_path:
        httpfile.write(
            http_path,
            {"url": "http://example.com/hello"},
            "200 OK",
            [("content-type", "text/csv")],
            numeric_body,
        )
        fetch_result = FetchResult(http_path)
        result = render_arrow(
            ArrowTable(),
            P(has_header=False),
            "tab-x",
            fetch_result,
            self.output_path,
        )
        expected_columns = {
            "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
            "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
        }
        assert_arrow_table_equals(result.table, expected_columns)
        self.assertEqual(result.errors, [])