예제 #1
0
 def test_prefix_special_headers(self):
     # The bare transfer-encoding, content-encoding and content-length
     # headers are not stored under their own names in the httpfile format,
     # because they would be ambiguous. (Content-Length, for instance, does
     # not specify the number of bytes of body: httpfile stores the *decoded*
     # body.) This test expects each of them to be written with a
     # "Cjw-Original-" prefix instead.
     with tempfile.NamedTemporaryFile() as tf:
         path = Path(tf.name)
         httpfile.write(
             path,  # reuse the Path built above rather than building a second one
             {"url": "http://example.com/hello"},
             "200 OK",
             [
                 ("transfer-encoding", "chunked"),
                 ("content-encoding", "gzip"),
                 ("content-length", "3"),
             ],
             io.BytesIO(b"\x00\x01\x02"),
         )
         # The file on disk is gzipped; compare its decompressed bytes.
         assert gzip.decompress(path.read_bytes()) == (
             b'{"url":"http://example.com/hello"}\r\n'
             b"200 OK\r\n"
             b"Cjw-Original-transfer-encoding: chunked\r\n"
             b"Cjw-Original-content-encoding: gzip\r\n"
             b"Cjw-Original-content-length: 3\r\n"
             b"\r\n"
             b"\x00\x01\x02"
         )
예제 #2
0
 def test_render_xlsx_bad_content(self):
     # Serve non-Excel bytes with the XLSX MIME type and expect a single
     # RenderError alongside an empty ArrowTable.
     bogus_body = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
     with tempfile_context("fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", XLSX_MIME_TYPE)],
             bogus_body,
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     expected_error = RenderError(
         I18nMessage.TODO_i18n(
             'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
         )
     )
     self.assertEqual(result, RenderResult(ArrowTable(), [expected_error]))
예제 #3
0
 def test_render_has_header_true(self):
     # With has_header=True the first CSV row supplies the column names.
     csv_body = io.BytesIO(b"A,B\na,b")
     response_headers = [("content-type", "text/csv")]
     with tempfile_context(prefix="http-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             csv_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A": ["a"], "B": ["b"]})
         self.assertEqual(errors, [])
예제 #4
0
 def test_render_csv_use_content_type_charset(self):
     # The body looks like UTF-8, but the content-type header declares
     # iso-8859-1; the expected table reflects a latin1 decode.
     latin1_body = io.BytesIO(b"A,B\n\xc3\xa1a,b")  # looks like UTF-8; force latin1
     response_headers = [("content-type", "text/csv; charset=iso-8859-1")]
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.unknownext"},
             "200 OK",
             response_headers,
             latin1_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A": ["áa"], "B": ["b"]})
         self.assertEqual(errors, [])
예제 #5
0
 def test_render_json(self):
     # A JSON array of records renders to a table with one column per key.
     json_body = io.BytesIO(b'[{"A": "a"}]')
     response_headers = [("content-type", "application/json")]
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             json_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         self.assertEqual(errors, [])
         assert_arrow_table_equals(table, {"A": ["a"]})
예제 #6
0
 def test_render_xlsx(self):
     # Copy the example.xlsx fixture into an httpfile, then render it.
     xlsx_fixture = TestDataPath / "example.xlsx"
     with tempfile_context(prefix="fetch-") as http_path:
         # The fixture handle is only needed while httpfile.write() runs.
         with xlsx_fixture.open("rb") as xlsx_f:
             httpfile.write(
                 http_path,
                 {"url": "http://example.com/hello"},
                 "200 OK",
                 [("content-type", XLSX_MIME_TYPE)],
                 xlsx_f,
             )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         self.assertEqual(errors, [])
         assert_arrow_table_equals(table, {"foo": [1.0, 2.0], "bar": [2.0, 3.0]})
예제 #7
0
 def test_render_text_plain(self):
     # guess_mime_type_or_none() treats text/plain specially.
     #
     # Here the URL extension is unknown and the body is semicolon-separated;
     # the expected table has the two columns split on ";".
     semicolon_body = io.BytesIO(b"A;B\na;b")
     response_headers = [("content-type", "text/plain")]
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.unknownext"},
             "200 OK",
             response_headers,
             semicolon_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A": ["a"], "B": ["b"]})
         self.assertEqual(errors, [])
예제 #8
0
 def test_render_csv_handle_nonstandard_mime_type(self):
     # Transform 'application/csv' into 'text/csv', etc.
     #
     # Sysadmins sometimes invent MIME types. We hard-code to rewrite fake
     # MIME types we've seen in the wild that seem unambiguous.
     csv_body = io.BytesIO(b"A,B\na,b")
     response_headers = [("content-type", "application/x-csv")]
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             csv_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A": ["a"], "B": ["b"]})
         self.assertEqual(errors, [])
예제 #9
0
 def test_render_has_header_true(self):
     # With has_header=True the first CSV row supplies the column names.
     csv_body = io.BytesIO(b"A,B\na,b")
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "https://blah"},
             "200 OK",
             [("content-type", "text/csv")],
             csv_body,
         )
         params = P(has_header=True)
         fetch_result = FetchResult(http_path)
         # Assertions stay inside the render context, as in the original.
         with self.render(params, fetch_result) as result:
             assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
             self.assertEqual(result.errors, [])
예제 #10
0
 def test_render_csv_use_url_ext_given_bad_content_type(self):
     # Use text/plain type and rely on filename detection, as
     # https://raw.githubusercontent.com/ does
     #
     # bytes will prove we used "csv" explicitly -- we didn't
     # take "text/plain" and decide to use a CSV sniffer to
     # find the delimiter.
     mixed_delimiter_body = io.BytesIO(b"A;B;C,D\na;b;c,d")
     response_headers = [("content-type", "text/plain")]
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.csv"},
             "200 OK",
             response_headers,
             mixed_delimiter_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A;B;C": ["a;b;c"], "D": ["d"]})
         self.assertEqual(errors, [])
예제 #11
0
 def test_render_json(self):
     # A JSON array of records renders to a table with one column per key.
     json_body = io.BytesIO(b'[{"A": "a"}]')
     with tempfile_context("fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", "application/json")],
             json_body,
         )
         params = P(has_header=True)
         fetch_result = FetchResult(http_path)
         result = render_arrow(
             ArrowTable(), params, "tab-x", fetch_result, self.output_path
         )
     # The result is inspected after the temporary httpfile context exits.
     self.assertEqual(result.errors, [])
     assert_arrow_table_equals(result.table, {"A": ["a"]})
예제 #12
0
 def test_render_has_header_false(self):
     # With has_header=False every CSV row is data; the expected columns are
     # named "Column 1" / "Column 2" and hold int8 values.
     csv_body = io.BytesIO(b"1,2\n3,4")
     response_headers = [("content-type", "text/csv")]
     with tempfile_context(prefix="http-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             csv_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=False), fetch_result)
         expected_columns = {
             "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
             "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
         }
         assert_arrow_table_equals(table, expected_columns)
         self.assertEqual(errors, [])
예제 #13
0
 def test_render_has_header_true(self):
     # With has_header=True the first CSV row supplies the column names.
     csv_body = io.BytesIO(b"A,B\na,b")
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", "text/csv")],
             csv_body,
         )
         params = P(has_header=True)
         fetch_result = FetchResult(http_path)
         result = render_arrow(
             ArrowTable(), params, "tab-x", fetch_result, self.output_path
         )
     # The result is inspected after the temporary httpfile context exits.
     assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
     self.assertEqual(result.errors, [])
예제 #14
0
 def test_render_xlsx(self):
     # Copy the example.xlsx fixture into an httpfile, then render it.
     xlsx_fixture = TestDataPath / "example.xlsx"
     with tempfile_context("fetch-") as http_path:
         # The fixture handle is only needed while httpfile.write() runs.
         with xlsx_fixture.open("rb") as xlsx_f:
             httpfile.write(
                 http_path,
                 {"url": "http://example.com/hello"},
                 "200 OK",
                 [("content-type", XLSX_MIME_TYPE)],
                 xlsx_f,
             )
         params = P(has_header=True)
         fetch_result = FetchResult(http_path)
         result = render_arrow(
             ArrowTable(), params, "tab-x", fetch_result, self.output_path
         )
     # The result is inspected after the temporary httpfile context exits.
     self.assertEqual(result.errors, [])
     assert_arrow_table_equals(result.table, {"foo": [1, 2], "bar": [2, 3]})
예제 #15
0
 def test_render_has_header_false(self):
     # With has_header=False every CSV row is data; the expected columns are
     # named "Column 1" / "Column 2" and hold int8 values.
     csv_body = io.BytesIO(b"1,2\n3,4")
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "https://blah"},
             "200 OK",
             [("content-type", "text/csv")],
             csv_body,
         )
         params = P(has_header=False)
         fetch_result = FetchResult(http_path)
         # Assertions stay inside the render context, as in the original.
         with self.render(params, fetch_result) as result:
             expected_columns = {
                 "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
                 "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
             }
             assert_arrow_table_equals(result.table, expected_columns)
             self.assertEqual(result.errors, [])
예제 #16
0
 def test_render_csv_use_content_disposition_given_bad_content_type(self):
     # The content-type is generic and the URL has no extension; only the
     # content-disposition filename ends in ".csv".
     response_headers = [
         ("content-type", "application/octet-stream"),
         (
             "content-disposition",
             'attachment; filename="file.csv"; size=4405',
         ),
     ]
     # bytes will prove we used "file.csv", not a sniffer.
     mixed_delimiter_body = io.BytesIO(b"A;B;C,D\na;b;c,d")
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file"},
             "200 OK",
             response_headers,
             mixed_delimiter_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A;B;C": ["a;b;c"], "D": ["d"]})
         self.assertEqual(errors, [])
예제 #17
0
 def test_render_prefer_content_disposition_to_url_ext(self):
     # When content-disposition uses a different name, prefer that name.
     response_headers = [
         # Wrong MIME type -- so we detect from filename
         ("content-type", "application/octet-stream"),
         (
             "content-disposition",
             'attachment; filename="file.tsv"',
         ),
     ]
     # bytes will prove we used "file.tsv", not "file.csv".
     tab_separated_body = io.BytesIO(b"A,B\tC,D\na,b\tc,d")
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.csv"},
             "200 OK",
             response_headers,
             tab_separated_body,
         )
         fetch_result = FetchResult(http_path)
         table, errors = call_render(P(has_header=True), fetch_result)
         assert_arrow_table_equals(table, {"A,B": ["a,b"], "C,D": ["c,d"]})
         self.assertEqual(errors, [])
예제 #18
0
 def test_render_xlsx_bad_content(self):
     # Serve non-Excel bytes with the XLSX MIME type and expect an empty
     # table plus a single "Invalid XLSX file" message.
     bogus_body = io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8"))
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", XLSX_MIME_TYPE)],
             bogus_body,
         )
         table, errors = call_render(P(), FetchResult(http_path))
         assert_arrow_table_equals(table, {})
         expected_message = I18nMessage(
             "TODO_i18n",
             {
                 "text": "Invalid XLSX file: xlnt::exception : failed to find zip header"
             },
             None,
         )
         self.assertEqual(errors, [expected_message])
예제 #19
0
 def test_ignore_most_headers(self):
     # a header like "ETag", "Date" or "Last-Modified" is unreliable. It
     # doesn't reliably indicate new data from the server. On the flipside,
     # it _does_ mean that two files won't compare as equivalent. So we
     # nix these headers.
     response_headers = [
         ("date", "Wed, 21 Oct 2015 07:28:00 GMT"),
         ("server", "custom-server 0.1"),
         ("ETag", "some-etag"),
     ]
     with tempfile.NamedTemporaryFile() as tf:
         path = Path(tf.name)
         httpfile.write(
             path,
             {"url": "http://example.com/hello"},
             "200 OK",
             response_headers,
             io.BytesIO(b"\x00\x01\x02"),
         )
         # The file on disk is gzipped; compare its decompressed bytes.
         written = gzip.decompress(path.read_bytes())
         # Of the three headers passed in, only "server" is expected to remain.
         assert written == (
             b'{"url":"http://example.com/hello"}\r\n'
             b"200 OK\r\n"
             b"server: custom-server 0.1\r\n"
             b"\r\n"
             b"\x00\x01\x02"
         )
예제 #20
0
 def test_render_has_header_false(self):
     # With has_header=False every CSV row is data; the expected columns are
     # named "Column 1" / "Column 2" and hold int8 values.
     csv_body = io.BytesIO(b"1,2\n3,4")
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", "text/csv")],
             csv_body,
         )
         params = P(has_header=False)
         fetch_result = FetchResult(http_path)
         result = render_arrow(
             ArrowTable(), params, "tab-x", fetch_result, self.output_path
         )
     # The result is inspected after the temporary httpfile context exits.
     expected_columns = {
         "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
         "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
     }
     assert_arrow_table_equals(result.table, expected_columns)
     self.assertEqual(result.errors, [])