Example #1
 def test_pydict_zero_rows(self):
     with tempfile_context() as path:
         # ensure at least 1 row group
         parquet.write(
             path,
             pyarrow.table({
                 "A":
                 pyarrow.array([], type=pyarrow.string()),
                 "B":
                 pyarrow.DictionaryArray.from_arrays(
                     pyarrow.array([], type=pyarrow.int32()),
                     pyarrow.array([], type=pyarrow.string()),
                 ),
                 "C":
                 pyarrow.array([], type=pyarrow.timestamp("ns")),
                 "D":
                 pyarrow.array([], type=pyarrow.float64()),
             }),
         )
         self.assertEqual(
             parquet.read_pydict(path, range(4), range(0)),
             {
                 "A": [],
                 "B": [],
                 "C": [],
                 "D": []
             },
         )
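
A minimal one-row companion sketch (not from the source): the `range` column/row arguments and the plain-list return shape are assumptions extrapolated from the zero-row test above.

 def test_pydict_one_row(self):
     with tempfile_context() as path:
         # hypothetical companion test: one string column, one row
         parquet.write(
             path,
             pyarrow.table({"A": pyarrow.array(["x"], type=pyarrow.string())}),
         )
         self.assertEqual(
             parquet.read_pydict(path, range(1), range(1)),
             {"A": ["x"]},
         )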
Example #2
 def _test_read_write_table(self, table, expected=None):
     table = arrow_table(table).table
     if expected is None:
         expected = table
     else:
         expected = arrow_table(expected).table
     parquet.write(self.temp_path, table)
     result = parquet.read(self.temp_path)
     assert_arrow_table_equals(result, expected)
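
A hypothetical caller of this helper (not in the source) would pass a plain dict and rely on the default `expected=None` branch, so the round trip must return the input unchanged:

 def test_read_write_int64(self):
     # hypothetical test built on the helper above
     self._test_read_write_table({"A": [1, 2, 3]})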
Example #3
 def test_parquet_same_data_different_bytes(self):
     parquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
     parquet.write(
         self.new_path,
         arrow_table({
             "A": pyarrow.array(["a"]).dictionary_encode()
         }).table,
     )
     self.assertTrue(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Example #4
def cache_render_result(workflow: Workflow, wf_module: WfModule, delta_id: int,
                        result: RenderResult) -> None:
    """
    Save `result` for later viewing.

    Raise AssertionError if `delta_id` is not what we expect.

    Since this alters data, be sure to call it within a lock:

        with workflow.cooperative_lock():
            wf_module.refresh_from_db()  # may change delta_id
            cache_render_result(workflow, wf_module, delta_id, result)
    """
    assert delta_id == wf_module.last_relevant_delta_id
    assert result is not None

    json_bytes = json_encode(result.json).encode("utf-8")
    if not result.table.metadata.columns:
        if result.errors:
            status = "error"
        else:
            status = "unreachable"
    else:
        status = "ok"

    wf_module.cached_render_result_delta_id = delta_id
    wf_module.cached_render_result_errors = result.errors
    wf_module.cached_render_result_error = ""  # DELETEME
    wf_module.cached_render_result_quick_fixes = []  # DELETEME
    wf_module.cached_render_result_status = status
    wf_module.cached_render_result_json = json_bytes
    wf_module.cached_render_result_columns = result.table.metadata.columns
    wf_module.cached_render_result_nrows = result.table.metadata.n_rows

    # Now we get to the part where things can end up inconsistent. Try to
    # err on the side of not-caching when that happens.
    delete_parquet_files_for_wf_module(
        workflow.id, wf_module.id)  # makes old cache inconsistent
    wf_module.save(
        update_fields=WF_MODULE_FIELDS)  # makes new cache inconsistent
    if result.table.metadata.columns:  # only write non-zero-column tables
        with tempfile_context() as parquet_path:
            parquet.write(parquet_path, result.table.table)
            minio.fput_file(BUCKET,
                            parquet_key(workflow.id, wf_module.id, delta_id),
                            parquet_path)  # makes new cache consistent
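
A caller sketch following the docstring's locking contract (the wrapper name and surrounding code are assumptions; `cooperative_lock`, `refresh_from_db` and `last_relevant_delta_id` come from the source above):

def save_render_result(workflow: Workflow, wf_module: WfModule,
                       result: RenderResult) -> None:
    # hypothetical wrapper: re-read wf_module under the lock so the delta id
    # is current, then cache against that delta id
    with workflow.cooperative_lock():
        wf_module.refresh_from_db()  # may change last_relevant_delta_id
        cache_render_result(workflow, wf_module,
                            wf_module.last_relevant_delta_id, result)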
Example #5
 def test_slice_zero_rows(self):
     with tempfile_context() as path:
         # ensure at least 1 row group
         parquet.write(
             path,
             pa.table({
                 "A":
                 pa.array([], pa.string()),
                 "B":
                 pa.DictionaryArray.from_arrays(pa.array([], pa.int32()),
                                                pa.array([], pa.string())),
                 "C":
                 pa.array([], pa.timestamp("ns")),
                 "D":
                 pa.array([], pa.float64()),
             }),
         )
         self.assertEqual(
             parquet.read_slice_as_text(path, "csv", range(4), range(0)),
             "A,B,C,D")
         self.assertEqual(
             parquet.read_slice_as_text(path, "json", range(4), range(0)),
             "[]")
Example #6
def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    """
    Fetch using `cjwkernel.types` data types.

    The result will be encoded as a Parquet file.

    Module authors are encouraged to replace this function, because the
    `fetch()` signature deals in dataframes instead of in raw data.
    """
    pandas_result: Union[ptypes.ProcessResult,
                         types.FetchResult] = fetch_pandas(
                             params=_arrow_param_to_pandas_param(params),
                             secrets=secrets,
                             last_fetch_result=last_fetch_result,
                             input_table_parquet_path=input_table_parquet_path,
                             output_path=output_path,
                         )
    if isinstance(pandas_result, ptypes.ProcessResult):
        pandas_result.truncate_in_place_if_too_big()
        # ProcessResult => FetchResult isn't a thing; but we can hack it using
        # ProcessResult => RenderResult => FetchResult.
        with tempfile_context(suffix=".arrow") as arrow_path:
            hacky_result = pandas_result.to_arrow(arrow_path)
        if hacky_result.table.path:
            parquet.write(output_path, hacky_result.table.table)
        else:
            output_path.write_bytes(b"")
        return types.FetchResult(output_path, hacky_result.errors)
    else:  # it's already a types.FetchResult
        return pandas_result
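
A minimal sketch of the kind of override the docstring invites: a module that builds an Arrow table directly and encodes it as Parquet itself. The parameter handling and column contents are illustrative assumptions, not part of the source.

def fetch_arrow(
    params: Dict[str, Any],
    secrets: Dict[str, Any],
    last_fetch_result: Optional[types.FetchResult],
    input_table_parquet_path: Optional[Path],
    output_path: Path,
) -> types.FetchResult:
    # hypothetical module-author override (assumes pyarrow is imported):
    # skip the pandas path entirely and encode an Arrow table as Parquet
    table = pyarrow.table({"url": [str(params.get("url", ""))]})
    parquet.write(output_path, table)
    return types.FetchResult(output_path, [])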
Example #7
 def test_parquet_vs_non_parquet(self):
     parquet.write(self.old_path, arrow_table({"A": ["a"]}).table)
     self.new_path.write_bytes(b"12345")
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))
Example #8
 def test_parquet_different(self):
     parquet.write(self.old_path, arrow_table({"A": [1]}).table)
     parquet.write(self.new_path, arrow_table({"A": [2]}).table)
     self.assertFalse(
         are_fetch_results_equal(FetchResult(self.old_path),
                                 FetchResult(self.new_path)))