Code Example #1
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """Render using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult, types.FetchResult] = fetch_pandas(
        params=__arrow_param_to_pandas_param(params),
        secrets=secrets,
        last_fetch_result=last_fetch_result,
        input_table_parquet_path=input_table_parquet_path,
        output_path=output_path,
    )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
            # Write the Parquet file while the temporary Arrow file still
            # exists; tempfile_context deletes it when this block exits.
            if hacky_result.table.path:
                cjwparquet.write(output_path, hacky_result.table.table)
            else:
                output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
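The hack above works because anything `to_arrow()` writes can be re-encoded as Parquet. A minimal round-trip sketch of that underlying step, assuming only the `cjwparquet.write()` / `cjwparquet.read()` pair exercised by the test examples below:

import tempfile
from pathlib import Path

import pyarrow as pa

import cjwparquet

# Sketch: encode an Arrow table as Parquet, then decode and compare.
table = pa.table({"A": ["a", "b"], "B": [1, 2]})
with tempfile.NamedTemporaryFile(suffix=".parquet") as tf:
    path = Path(tf.name)
    cjwparquet.write(path, table)
    assert cjwparquet.read(path).equals(table)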
Code Example #2
 def test_parquet_same_data_different_bytes(self):
     cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
     cjwparquet.write(self.new_path,
                      make_table(make_column("A", ["a"], dictionary=True)))
     self.assertTrue(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Code Example #3
 def test_slice_zero_rows(self):
     with tempfile_context() as path:
         # ensure at least 1 row group
         cjwparquet.write(
             path,
             pa.table({
                 "A": pa.array([], pa.string()),
                 "B": pa.DictionaryArray.from_arrays(
                     pa.array([], pa.int32()), pa.array([], pa.string())
                 ),
                 "C": pa.array([], pa.timestamp("ns")),
                 "D": pa.array([], pa.float64()),
             }),
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="csv",
                                           only_columns=range(4),
                                           only_rows=range(0)),
             "A,B,C,D",
         )
         self.assertEqual(
             cjwparquet.read_slice_as_text(path,
                                           format="json",
                                           only_columns=range(4),
                                           only_rows=range(0)),
             "[]",
         )
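The same keyword arguments slice non-empty tables, too. A hedged usage sketch (`tempfile_context` is the helper the test itself uses; treat its import as given):

import pyarrow as pa

import cjwparquet

with tempfile_context() as path:
    cjwparquet.write(path, pa.table({"A": ["x", "y", "z"], "B": [1, 2, 3]}))
    # First two rows of the first two columns, rendered as CSV text.
    text = cjwparquet.read_slice_as_text(
        path, format="csv", only_columns=range(2), only_rows=range(2)
    )
    # Following the header-row convention above, text == "A,B\nx,1\ny,2".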
Code Example #4
 def _test_read_write_table(self, table, expected=None):
     if isinstance(table, dict):
         table = pa.table(table)
     if expected is None:
         expected = table
     with tempfile_context(prefix="parquet-text") as temp_path:
         cjwparquet.write(temp_path, table)
         result = cjwparquet.read(temp_path)
     assert_arrow_table_equals(result, expected)
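A hypothetical caller of this helper, passing a plain dict for the helper to promote via `pa.table()`:

 def test_read_write_int64(self):
     self._test_read_write_table({"A": [1, 2, 3]})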
Code Example #5
File: test_versions.py Project: Sonatrix/cjworkbench
 def test_parquet_same_data_different_bytes(self):
     cjwparquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
     cjwparquet.write(
         self.new_path,
         arrow_table({
             "A": pyarrow.array(["a"]).dictionary_encode()
         }).table,
     )
     self.assertTrue(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Code Example #6
def cache_render_result(workflow: Workflow, wf_module: WfModule, delta_id: int,
                        result: RenderResult) -> None:
    """
    Save `result` for later viewing.

    Raise AssertionError if `delta_id` is not what we expect.

    Since this alters data, be sure to call it within a lock:

        with workflow.cooperative_lock():
            wf_module.refresh_from_db()  # may change delta_id
            cache_render_result(workflow, wf_module, delta_id, result)
    """
    assert delta_id == wf_module.last_relevant_delta_id
    assert result is not None

    json_bytes = json_encode(result.json).encode("utf-8")
    if not result.table.metadata.columns:
        if result.errors:
            status = "error"
        else:
            status = "unreachable"
    else:
        status = "ok"

    wf_module.cached_render_result_delta_id = delta_id
    wf_module.cached_render_result_errors = result.errors
    wf_module.cached_render_result_status = status
    wf_module.cached_render_result_json = json_bytes
    wf_module.cached_render_result_columns = result.table.metadata.columns
    wf_module.cached_render_result_nrows = result.table.metadata.n_rows

    # Now we get to the part where things can end up inconsistent. Try to
    # err on the side of not-caching when that happens.
    delete_parquet_files_for_wf_module(
        workflow.id, wf_module.id)  # makes old cache inconsistent
    wf_module.save(
        update_fields=WF_MODULE_FIELDS)  # makes new cache inconsistent
    if result.table.metadata.columns:  # only write non-zero-column tables
        with tempfile_context() as parquet_path:
            cjwparquet.write(parquet_path, result.table.table)
            minio.fput_file(BUCKET,
                            parquet_key(workflow.id, wf_module.id, delta_id),
                            parquet_path)  # makes new cache consistent
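Per the docstring, the call belongs inside the workflow's cooperative lock. A sketch of such a caller, built only from names the docstring and body already use (`save_result_for_module` itself is hypothetical):

def save_result_for_module(
    workflow: Workflow, wf_module: WfModule, result: RenderResult
) -> None:
    # Hold the lock so last_relevant_delta_id cannot change between the
    # refresh and the cache write.
    with workflow.cooperative_lock():
        wf_module.refresh_from_db()  # may change last_relevant_delta_id
        cache_render_result(
            workflow, wf_module, wf_module.last_relevant_delta_id, result
        )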
Code Example #7
@contextlib.contextmanager
def parquet_file(d):
    # Yield the path of a temporary Parquet file holding table `d`; the
    # file is deleted when the caller's with-block exits.
    arrow_table = pyarrow.table(d)
    with tempfile.NamedTemporaryFile(suffix=".parquet") as tf:
        path = Path(tf.name)
        cjwparquet.write(path, arrow_table)
        yield path
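Usage then reads as a with-statement; the temporary file disappears when the block exits (a sketch, reusing `cjwparquet.read()` from Code Example #4):

with parquet_file({"A": [1, 2, 3]}) as path:
    table = cjwparquet.read(path)
    assert table.column("A").to_pylist() == [1, 2, 3]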
Code Example #8
def call_fetch(fetch: Callable,
               request: ttypes.FetchRequest) -> ttypes.FetchResult:
    """Call `fetch()` and validate the result.

    Module code may contain errors. This function and `fetch()` should strive
    to raise developer-friendly errors in the case of bugs -- including
    unexpected input.
    """
    # thrift => pandas
    basedir = Path(request.basedir)
    params: Dict[str, Any] = thrift_json_object_to_pydict(request.params)
    output_path = basedir / request.output_filename

    spec = inspect.getfullargspec(fetch)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs

    if varkw or "secrets" in kwonlyargs:
        kwargs["secrets"] = thrift_json_object_to_pydict(request.secrets)
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "get_input_dataframe" in kwonlyargs:

        async def get_input_dataframe():
            if request.input_table_parquet_filename is None:
                return None
            else:
                return _parquet_to_pandas(basedir /
                                          request.input_table_parquet_filename)

        kwargs["get_input_dataframe"] = get_input_dataframe

    if varkw or "output_path" in kwonlyargs:
        kwargs["output_path"] = output_path

    result = fetch(params, **kwargs)
    if asyncio.iscoroutine(result):
        result = asyncio.run(result)

    if (
        isinstance(result, tuple)
        and len(result) == 2
        and isinstance(result[0], Path)
    ):
        errors = ptypes.coerce_RenderError_list(result[1])
    elif isinstance(result, Path):
        errors = []
    elif isinstance(result, list):
        errors = ptypes.coerce_RenderError_list(result)
    else:
        pandas_result = ptypes.ProcessResult.coerce(result)
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            if pandas_result.columns:
                hacky_result = pandas_result.to_arrow(arrow_path)
                table = load_trusted_arrow_file(arrow_path)
                cjwparquet.write(output_path, table)
                errors = hacky_result.errors
            else:
                output_path.write_bytes(b"")
                errors = pandas_result.errors

    return ttypes.FetchResult(
        filename=request.output_filename,
        errors=[arrow_render_error_to_thrift(e) for e in errors],
    )
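The kwargs-building step is a reusable pattern: `inspect.getfullargspec()` lets a dispatcher pass an optional argument only when the callee declares it (or accepts `**kwargs`). A self-contained sketch of just that pattern; `build_optional_kwargs` is illustrative, not part of cjworkbench:

import inspect
from typing import Any, Callable, Dict

def build_optional_kwargs(func: Callable, available: Dict[str, Any]) -> Dict[str, Any]:
    # Mirror the varkw / kwonlyargs checks in call_fetch() above.
    spec = inspect.getfullargspec(func)
    if spec.varkw:  # func takes **kwargs: it can accept everything
        return dict(available)
    return {k: v for k, v in available.items() if k in spec.kwonlyargs}

def fetch(params, *, secrets=None):  # toy module function
    return params, secrets

kwargs = build_optional_kwargs(fetch, {"secrets": {}, "settings": object()})
assert set(kwargs) == {"secrets"}  # "settings" is not declared, so it is dropped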
Code Example #9
File: test_versions.py Project: Sonatrix/cjworkbench
 def test_parquet_vs_non_parquet(self):
     cjwparquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
     self.new_path.write_bytes(b"12345")
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Code Example #10
File: test_versions.py Project: Sonatrix/cjworkbench
 def test_parquet_different(self):
     cjwparquet.write(self.old_path, arrow_table({"A": [1]}).table)
     cjwparquet.write(self.new_path, arrow_table({"A": [2]}).table)
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Code Example #11
 def test_parquet_vs_non_parquet(self):
     cjwparquet.write(self.old_path, make_table(make_column("A", ["a"])))
     self.new_path.write_bytes(b"12345")
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Code Example #12
 def test_parquet_different(self):
     cjwparquet.write(self.old_path, make_table(make_column("A", [1])))
     cjwparquet.write(self.new_path, make_table(make_column("A", [2])))
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))