Example #1
def render_arrow(
    table, params, tab_name, fetch_result: Optional[FetchResult], output_path: Path
) -> RenderResult:
    # Must perform header operation here in the event the header checkbox
    # state changes
    if fetch_result is None:
        # empty table
        return RenderResult(ArrowTable())
    elif fetch_result.path is not None and parquet.file_has_parquet_magic_number(
        fetch_result.path
    ):
        # Deprecated files: we used to parse in fetch() and store the result
        # as Parquet. Now we've lost the original file data, and we need to
        # support our oldest users.
        #
        # In this deprecated format, parse errors were written as
        # fetch_result.errors.
        return _render_deprecated_parquet(
            fetch_result.path, fetch_result.errors, output_path, params
        )
    elif fetch_result.errors:
        # We've never stored errors+data. If there are errors, assume
        # there's no data.
        return RenderResult(ArrowTable(), fetch_result.errors)
    else:
        assert not fetch_result.errors  # we've never stored errors+data.
        return _render_file(fetch_result.path, params, output_path)
 def test_render_xlsx_bad_content(self):
     with tempfile_context("fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", XLSX_MIME_TYPE)],
             io.BytesIO("ceçi n'est pas une .xlsx".encode("utf-8")),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     self.assertEqual(
         result,
         RenderResult(
             ArrowTable(),
             [
                 RenderError(
                     I18nMessage.TODO_i18n(
                         'Error reading Excel file: Unsupported format, or corrupt file: Expected BOF record; found b"ce\\xc3\\xa7i n\'"'
                     )
                 )
             ],
         ),
     )
Example #3
def parse_csv(
    path: Path,
    *,
    output_path: Path,
    encoding: Optional[str],
    delimiter: Optional[str],
    has_header: bool,
    autoconvert_text_to_numbers: bool,
) -> RenderResult:
    result = _parse_csv(
        path,
        encoding=encoding,
        delimiter=delimiter,
        has_header=has_header,
        autoconvert_text_to_numbers=autoconvert_text_to_numbers,
    )
    with pyarrow.ipc.RecordBatchFileWriter(
            output_path.as_posix(), schema=result.table.schema) as writer:
        writer.write_table(result.table)

    metadata = infer_table_metadata(result.table)

    if len(metadata.columns) == 0:
        arrow_table = ArrowTable()
    else:
        arrow_table = ArrowTable(output_path, result.table, metadata)
    if result.warnings:
        # TODO when we support i18n, this will be even simpler....
        en_message = "\n".join([str(warning) for warning in result.warnings])
        errors = [RenderError(I18nMessage.TODO_i18n(en_message))]
    else:
        errors = []

    return RenderResult(arrow_table, errors)
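A minimal call sketch for parse_csv above, assuming tempfile_context yields pathlib.Path objects as it does in the tests elsewhere on this page; treating encoding=None and delimiter=None as "autodetect" is an assumption based on the Optional annotations, not something the source confirms.

# Hedged usage sketch -- not taken from the project's test suite.
with tempfile_context(suffix=".csv") as csv_path:
    with tempfile_context(suffix=".arrow") as output_path:
        csv_path.write_bytes(b"A,B\n1,2\n3,4")
        result = parse_csv(
            csv_path,
            output_path=output_path,
            encoding=None,  # assumption: None means "detect the encoding"
            delimiter=None,  # assumption: None means "sniff the delimiter"
            has_header=True,
            autoconvert_text_to_numbers=True,
        )
        assert result.errors == []  # no warnings => no RenderError entries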
Example #4
def arrow_table_context(
    table: Union[Dict[str, List[Any]], pyarrow.Table],
    columns: Optional[List[Column]] = None,
    dir: Optional[pathlib.Path] = None,
) -> ContextManager[ArrowTable]:
    """
    Yield an ArrowTable (whose `.path` is a file).

    Metadata is inferred. Number columns have format `{:,}`.
    """
    if isinstance(table, dict):
        table = pyarrow.table(table)

    if columns is None:
        columns = [
            _arrow_column_to_column(name, col)
            for name, col in zip(table.column_names, table.columns)
        ]
    metadata = TableMetadata(table.num_rows, columns)

    if metadata.columns:
        with arrow_file(table, dir=dir) as path:
            yield ArrowTable(path, table, metadata)
    else:
        yield ArrowTable(None, None, metadata)
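A brief usage sketch for arrow_table_context, assuming the function is wrapped with contextlib.contextmanager (its ContextManager return annotation and yield statements suggest so) and that the dict form lets pyarrow infer the schema:

# Hedged usage sketch.
with arrow_table_context({"A": [1, 2, 3], "B": ["x", "y", "z"]}) as table:
    # Metadata is inferred from the pyarrow schema, per the docstring.
    assert table.metadata.n_rows == 3
    assert table.path is not None  # backed by an on-disk Arrow file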
 def test_render_fetch_error(self):
     errors = [RenderError(I18nMessage("x", {"y": "z"}))]
     with tempfile_context() as empty_path:
         result = render_arrow(
             ArrowTable(),
             P(),
             "tab-x",
             FetchResult(empty_path, errors),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, ArrowTable())
     self.assertEqual(result.errors, errors)
 def render(self, params: Dict[str, Any],
            fetch_result: Optional[FetchResult]):
     with tempfile_context(prefix="output-",
                           suffix=".arrow") as output_path:
         errors = render(ArrowTable(),
                         params,
                         output_path,
                         fetch_result=fetch_result)
         arrow_table = ArrowTable.from_arrow_file_with_inferred_metadata(
             output_path)
         yield RenderResult(arrow_table,
                            [RenderError(I18nMessage(*e)) for e in errors])
 def test_render_error(self):
     path = self._file(b"A,B\nx,y", suffix=".json")
     result = upload.render_arrow(
         ArrowTable(),
         {
             "file": path,
             "has_header": True
         },
         "tab-x",
         None,
         self.output_path,
     )
     assert_arrow_table_equals(result.table, {})
     self.assertEqual(
         result.errors,
         [
             RenderError(
                 message=I18nMessage(
                     id="TODO_i18n",
                     args={
                         "text":
                         "JSON parse error at byte 0: Invalid value."
                     },
                 ),
                 quick_fixes=[],
             )
         ],
     )
 def test_render_fetch_error(self):
     fetch_errors = [RenderError(I18nMessage("x", {"y": "z"}))]
     with tempfile_context() as empty_path:
         with self.render(P(), FetchResult(empty_path,
                                           fetch_errors)) as result:
             assert_arrow_table_equals(result.table, ArrowTable())
             self.assertEqual(result.errors, fetch_errors)
Example #9
def open_cached_render_result(
        crr: CachedRenderResult) -> ContextManager[RenderResult]:
    """
    Yield a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk, so it
    doesn't require much physical RAM.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        yield RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )
        return

    with tempfile_context(prefix="cached-render-result") as arrow_path:
        # raise CorruptCacheError (deleting `arrow_path` in the process)
        result = load_cached_render_result(crr, arrow_path)

        yield result
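A hedged usage sketch of the context manager above; here crr stands in for a CachedRenderResult obtained elsewhere, and CorruptCacheError covers the three failure cases listed in the docstring.

# Hedged usage sketch.
try:
    with open_cached_render_result(crr) as render_result:
        # Backed by an mmapped Arrow file, so this stays cheap on RAM.
        n_rows = render_result.table.metadata.n_rows
        errors = render_result.errors
except CorruptCacheError:
    # Cached Parquet is corrupt or missing, or crr is stale -- re-render.
    pass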
 def test_render_deprecated_parquet(self):
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         result = render_arrow(
             ArrowTable(), P(), "tab-x", FetchResult(fetched_path), self.output_path
         )
     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, [])
 def test_render_deprecated_parquet_warning(self):
     errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         result = render_arrow(
             ArrowTable(),
             P(),
             "tab-x",
             FetchResult(fetched_path, errors=errors),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, errors)
Example #12
def parse_xlsx(path: Path, *, output_path: Path,
               has_header: bool) -> RenderResult:
    result = _parse_xlsx(path, header_rows=("0-1" if has_header else ""))
    with pyarrow.ipc.RecordBatchFileWriter(
            output_path.as_posix(), schema=result.table.schema) as writer:
        writer.write_table(result.table)

    metadata = infer_table_metadata(result.table)

    if len(metadata.columns) == 0:
        arrow_table = ArrowTable()
    else:
        arrow_table = ArrowTable(output_path, result.table, metadata)
    if result.warnings:
        # TODO when we support i18n, this will be even simpler....
        en_message = "\n".join([str(warning) for warning in result.warnings])
        errors = [RenderError(I18nMessage.TODO_i18n(en_message))]
    else:
        errors = []

    return RenderResult(arrow_table, errors)
    def render(
        self,
        compiled_module: CompiledModule,
        chroot_context: ChrootContext,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        """
        Run the module's `render_thrift()` function and return its result.

        Raise ModuleError if the module has a bug.
        """
        chroot_dir = chroot_context.chroot.root
        basedir_seen_by_module = Path("/") / basedir.relative_to(chroot_dir)
        request = ttypes.RenderRequest(
            str(basedir_seen_by_module),
            input_table.to_thrift(),
            params.to_thrift(),
            tab.to_thrift(),
            None if fetch_result is None else fetch_result.to_thrift(),
            output_filename,
        )
        try:
            with chroot_context.writable_file(basedir / output_filename):
                result = self._run_in_child(
                    chroot_dir=chroot_dir,
                    network_config=pyspawner.NetworkConfig(),  # TODO disallow networking
                    compiled_module=compiled_module,
                    timeout=self.render_timeout,
                    result=ttypes.RenderResult(),
                    function="render_thrift",
                    args=[request],
                )
        finally:
            chroot_context.clear_unowned_edits()

        if result.table.filename and result.table.filename != output_filename:
            raise ModuleExitedError(0, "Module wrote to wrong output file")

        try:
            # RenderResult.from_thrift() verifies all filenames passed by the
            # module are in the directory the module has access to. It assumes
            # the Arrow file (if there is one) is untrusted, so it can raise
            # ValidateError
            render_result = RenderResult.from_thrift(result, basedir)
        except ValidateError as err:
            raise ModuleExitedError(0, "Module produced invalid data: %s" % str(err))
        return render_result
 def test_render_no_file(self):
     result = upload.render_arrow(
         ArrowTable(),
         {
             "file": None,
             "has_header": True
         },
         "tab-x",
         None,
         self.output_path,
     )
     assert_arrow_table_equals(result.table, {})
     self.assertEqual(result.errors, [])
 def test_render_success(self):
     path = self._file(b"A,B\nx,y", suffix=".csv")
     result = upload.render_arrow(
         ArrowTable(),
         {
             "file": path,
             "has_header": True
         },
         "tab-x",
         None,
         self.output_path,
     )
     assert_arrow_table_equals(result.table, {"A": ["x"], "B": ["y"]})
     self.assertEqual(result.errors, [])
Example #16
def load_cached_render_result(crr: CachedRenderResult,
                              path: Path) -> RenderResult:
    """
    Return a RenderResult equivalent to the one passed to `cache_render_result()`.

    Raise CorruptCacheError if the cached data does not match `crr`. That can
    mean:

        * The cached Parquet file is corrupt
        * The cached Parquet file is missing
        * `crr` is stale -- the cached result is for a different delta. This
          could be detected by a `Workflow.cooperative_lock()`, too, should the
          caller want to distinguish this error from the others.

    The returned RenderResult is backed by an mmapped file on disk -- the one
    supplied as `path`. It doesn't require much physical RAM: the Linux kernel
    may page out data we aren't using.
    """
    if not crr.table_metadata.columns:
        # Zero-column tables aren't written to cache
        return RenderResult(
            ArrowTable.from_zero_column_metadata(
                TableMetadata(crr.table_metadata.n_rows, [])),
            crr.errors,
            crr.json,
        )

    # raises CorruptCacheError
    with downloaded_parquet_file(crr) as parquet_path:
        try:
            # raises ArrowIOError
            cjwparquet.convert_parquet_file_to_arrow_file(parquet_path, path)
        except pyarrow.ArrowIOError as err:
            raise CorruptCacheError from err
    # TODO handle validation errors => CorruptCacheError
    arrow_table = ArrowTable.from_trusted_file(path, crr.table_metadata)
    return RenderResult(arrow_table, crr.errors, crr.json)
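For comparison with Example #9, a sketch of calling this non-context-manager variant directly; the caller owns the Arrow file's lifetime, mirroring the pattern open_cached_render_result uses internally.

# Hedged usage sketch.
with tempfile_context(prefix="cached-render-result") as arrow_path:
    # May raise CorruptCacheError, as documented above.
    result = load_cached_render_result(crr, arrow_path)
    # result.table is only valid while arrow_path still exists on disk.
    n_rows = result.table.metadata.n_rows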
Example #17
def _render_deprecated_parquet(
    input_path: Path,
    errors: List[RenderError],
    output_path: Path,
    params: Dict[str, Any],
) -> RenderResult:
    parquet.convert_parquet_file_to_arrow_file(input_path, output_path)
    result = RenderResult(
        ArrowTable.from_arrow_file_with_inferred_metadata(output_path), errors)

    if result.table.metadata.n_rows > 0 and not params["has_header"]:
        pandas_result = ProcessResult.from_arrow(result)
        dataframe = moduleutils.turn_header_into_first_row(
            pandas_result.dataframe)
        return ProcessResult(dataframe).to_arrow(output_path)

    return result
 def test_render_json(self):
     with tempfile_context("fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", "application/json")],
             io.BytesIO(b'[{"A": "a"}]'),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     self.assertEqual(result.errors, [])
     assert_arrow_table_equals(result.table, {"A": ["a"]})
 def test_render_has_header_true(self):
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "https://blah"},
             "200 OK",
             [("content-type", "text/csv")],
             io.BytesIO(b"A,B\na,b"),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
     self.assertEqual(result.errors, [])
 def test_render_text_plain(self):
     # guess_mime_type_or_none() treats text/plain specially.
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.unknownext"},
             "200 OK",
             [("content-type", "text/plain")],
             io.BytesIO(b"A;B\na;b"),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     self.assertEqual(result.errors, [])
     assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
 def test_render_xlsx(self):
     with tempfile_context("fetch-") as http_path:
         with (TestDataPath / "example.xlsx").open("rb") as xlsx_f:
             httpfile.write(
                 http_path,
                 {"url": "http://example.com/hello"},
                 "200 OK",
                 [("content-type", XLSX_MIME_TYPE)],
                 xlsx_f,
             )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     self.assertEqual(result.errors, [])
     assert_arrow_table_equals(result.table, {"foo": [1, 2], "bar": [2, 3]})
Example #22
    def render(
        self,
        compiled_module: CompiledModule,
        basedir: Path,
        input_table: ArrowTable,
        params: Params,
        tab: Tab,
        fetch_result: Optional[FetchResult],
        output_filename: str,
    ) -> RenderResult:
        request = ttypes.RenderRequest(
            str(basedir),
            input_table.to_thrift(),
            params.to_thrift(),
            tab.to_thrift(),
            None if fetch_result is None else fetch_result.to_thrift(),
            output_filename,
        )
        with _chroot_dir_context(
            provide_paths=[basedir], extract_paths=[basedir / output_filename]
        ) as chroot:
            result = self._run_in_child(
                chroot=chroot,
                chroot_paths=[basedir] + DATA_PATHS + PARQUET_PATHS +
                NETWORKING_PATHS,  # TODO nix networking
                compiled_module=compiled_module,
                timeout=self.render_timeout,
                result=ttypes.RenderResult(),
                function="render_thrift",
                args=[request],
            )
            if result.table.filename and result.table.filename != output_filename:
                raise ModuleExitedError(0, "Module wrote to wrong output file")

        # RenderResult.from_thrift() verifies all filenames passed by the
        # module are in the directory the module has access to.
        render_result = RenderResult.from_thrift(result, basedir)
        if render_result.table.table is not None:
            validate(render_result.table.table, render_result.table.metadata)
        return render_result
 def test_render_csv_handle_nonstandard_mime_type(self):
     # Transform 'application/csv' into 'text/csv', etc.
     #
     # Sysadmins sometimes invent MIME types. We hard-code to rewrite fake
     # MIME types we've seen in the wild that seem unambiguous.
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/hello"},
             "200 OK",
             [("content-type", "application/x-csv")],
             io.BytesIO(b"A,B\na,b"),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, {"A": ["a"], "B": ["b"]})
     self.assertEqual(result.errors, [])
 def test_render_deprecated_parquet_has_header_false(self):
     # This behavior is totally awful, but we support it for backwards
     # compatibility.
     #
     # Back in the day, we parsed during fetch. But has_header can change
     # between fetch and render. We were lazy, so we made fetch() follow the
     # most-common path: has_header=True. Then, in render(), we would "undo"
     # the change if has_header=False. This was lossy. It took a lot of time
     # to figure it out. It was _never_ wise to code this. Now we need to
     # support these lossy, mangled files.
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         result = render_arrow(
             ArrowTable(),
             P(has_header=False),
             "tab-x",
             FetchResult(fetched_path),
             self.output_path,
         )
     assert_arrow_table_equals(
         result.table, {"0": ["A", "1", "2"], "1": ["B", "3", "4"]}
     )
     self.assertEqual(result.errors, [])
 def test_render_csv_use_url_ext_given_bad_content_type(self):
     # Use text/plain type and rely on filename detection, as
     # https://raw.githubusercontent.com/ does
     with tempfile_context(prefix="fetch-") as http_path:
         httpfile.write(
             http_path,
             {"url": "http://example.com/file.csv"},
             "200 OK",
             [("content-type", "text/plain")],
             # bytes will prove we used "csv" explicitly -- we didn't
             # take "text/plain" and decide to use a CSV sniffer to
             # find the delimiter.
             io.BytesIO(b"A;B\na;b"),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=True),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, {"A;B": ["a;b"]})
     self.assertEqual(result.errors, [])
 def test_render_has_header_false(self):
     with tempfile_context("http") as http_path:
         httpfile.write(
             http_path,
             {"url": "https://blah"},
             "200 OK",
             [("content-type", "text/csv")],
             io.BytesIO(b"1,2\n3,4"),
         )
         result = render_arrow(
             ArrowTable(),
             P(has_header=False),
             "tab-x",
             FetchResult(http_path),
             self.output_path,
         )
     assert_arrow_table_equals(
         result.table,
         {
             "Column 1": pyarrow.array([1, 3], pyarrow.int8()),
             "Column 2": pyarrow.array([2, 4], pyarrow.int8()),
         },
     )
     self.assertEqual(result.errors, [])
 def test_render_missing_fetch_result_returns_empty(self):
     result = render_arrow(ArrowTable(), P(), "tab-x", None,
                           self.output_path)
     assert_arrow_table_equals(result.table, {})
     self.assertEqual(result.errors, [])
 def test_render_no_file(self):
     with self.render(P(), None) as result:
         assert_arrow_table_equals(result.table, ArrowTable())
         self.assertEqual(result.errors, [])
 def test_render_no_file(self):
     result = render_arrow(ArrowTable(), P(), "tab-x", None,
                           self.output_path)
     assert_arrow_table_equals(result.table, ArrowTable())
     self.assertEqual(result.errors, [])
def render_arrow(params):
    with tempfile_context(suffix=".arrow") as output_path:
        return pastecsv.render_arrow(ArrowTable(), params, "tab-x", None,
                                     output_path)