Example #1
    def test_fetch_get_stored_dataframe_happy_path(self):
        async def fetch(params, *, get_stored_dataframe):
            df = await get_stored_dataframe()
            assert_frame_equal(df, pd.DataFrame({"A": [1]}))

        with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
            self._test_fetch(fetch, last_fetch_result=FetchResult(parquet_path, []))
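Note: every example on this page leans on a parquet_file test helper that is never shown. Below is a minimal sketch consistent with how the tests use it (a context manager that takes a dict of columns or a pyarrow.Table plus an optional dir=, writes a temporary Parquet file, and yields its path); the real helper may differ, for instance by yielding an open file object whose .name attribute the Thrift examples read:

    from contextlib import contextmanager
    from pathlib import Path
    import tempfile

    import pyarrow
    import pyarrow.parquet


    @contextmanager
    def parquet_file(table, dir=None):
        # Accept either {"col": values} dicts or a ready-made pyarrow.Table.
        if isinstance(table, dict):
            table = pyarrow.table(table)
        with tempfile.NamedTemporaryFile(suffix=".parquet", dir=dir) as tf:
            pyarrow.parquet.write_table(table, tf.name)
            yield Path(tf.name)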
Example #2
    def test_fetch_get_input_dataframe_happy_path(self):
        async def fetch(params, *, get_input_dataframe):
            df = await get_input_dataframe()
            assert_frame_equal(df, pd.DataFrame({"A": [1]}))

        with parquet_file({"A": [1]}, dir=self.basedir) as parquet_path:
            self._test_fetch(fetch, input_table_parquet_path=parquet_path)
Example #3
 def test_default_render_returns_fetch_result(self):
     # Functionality used by libraryofcongress
     with ExitStack() as ctx:
         input_arrow_table = ctx.enter_context(
             arrow_table_context({"A": [1]}, dir=self.basedir)
         )
         parquet_filename = Path(
             ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
         ).name
         out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
         thrift_result = module.render_thrift(
             ttypes.RenderRequest(
                 str(self.basedir),
                 input_arrow_table.to_thrift(),
                 Params({}).to_thrift(),
                 ttypes.Tab("tab-1", "Tab 1"),
                 ttypes.FetchResult(
                     parquet_filename,
                     [RenderError(I18nMessage.TODO_i18n("A warning")).to_thrift()],
                 ),
                 out_filename,
             )
         )
         result = RenderResult.from_thrift(thrift_result, self.basedir)
         assert_render_result_equals(
             result,
             RenderResult(
                 arrow_table({"A": [2]}),
                 [RenderError(I18nMessage.TODO_i18n("A warning"))],
             ),
         )
Example #4
 def test_slice_lots_of_types(self):
     dt1 = datetime(2019, 12, 18, 23, 33, 55, 123000)
     dt2 = datetime(2019, 12, 18)
     with parquet_file({
             "str": ["x", "y", None, ""],
             "cat": pa.array(["x", "y", None, ""]).dictionary_encode(),
             "dt": pa.array([dt1, None, dt2, None], pa.timestamp("ns")),
             "int32": [1, 2, None, 2**31],
             "float": [1.1, None, 3.3, 4.4],
     }) as path:
         self.assertEqual(
             parquet.read_slice_as_text(path, "csv", range(5), range(4)),
             "\n".join([
                 "str,cat,dt,int32,float",
                 "x,x,2019-12-18T23:33:55.123Z,1,1.1",
                 "y,y,,2,",
                 ",,2019-12-18,,3.3",
                 ",,,2147483648,4.4",
             ]),
         )
         self.assertEqual(
             parquet.read_slice_as_text(path, "json", range(5), range(4)),
             "".join([
                 "[",
                 '{"str":"x","cat":"x","dt":"2019-12-18T23:33:55.123Z","int32":1,"float":1.1},',
                 '{"str":"y","cat":"y","dt":null,"int32":2,"float":null},',
                 '{"str":null,"cat":null,"dt":"2019-12-18","int32":null,"float":3.3},',
                 '{"str":"","cat":"","dt":null,"int32":2147483648,"float":4.4}',
                 "]",
             ]),
         )
Example #5
    def test_fetch_result_happy_path(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
            fetch_error="maybe an error",
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])

        def render(*args, fetch_result, **kwargs):
            self.assertEqual(
                fetch_result.errors,
                [RenderError(I18nMessage.TODO_i18n("maybe an error"))],
            )
            assert_arrow_table_equals(
                pyarrow.parquet.read_table(str(fetch_result.path)), {"A": [1]})
            return RenderResult()

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    Path("/unused"),
                ))
Example #6
 def test_render_deprecated_parquet(self):
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         result = render_arrow(
             ArrowTable(), P(), "tab-x", FetchResult(fetched_path), self.output_path
         )
     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, [])
Example #7
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        minio.remove(so.bucket, so.key)

        def render(*args, fetch_result, **kwargs):
            self.assertIsNone(fetch_result)
            return RenderResult()

        with self._stub_module(render):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                ))
Example #8
 def test_pydict_zero_row_groups(self):
     table = pyarrow.Table.from_batches(
         [], schema=pyarrow.schema([("A", pyarrow.string())])
     )
     with parquet_file(table) as path:
         self.assertEqual(parquet.read_pydict(path, range(1), range(0)),
                          {"A": []})
Example #9
    def test_render_with_parquet_fetch_result(self):
        def render(*args, fetch_result):
            return fetch_result

        with parquet_file({"A": ["fetched"]}, dir=self.basedir) as pf:
            result = self._test_render(render, fetch_result=FetchResult(pf))
            assert_render_result_equals(
                result, RenderResult(arrow_table({"A": ["fetched"]})))
Example #10
 def test_render_deprecated_parquet(self):
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         with self.render(P(), FetchResult(fetched_path)) as result:
             assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
             self.assertEqual(result.errors, [])
Example #11
 def test_slice_ignore_missing_columns(self):
     with parquet_file({"A": [1]}) as path:
         self.assertEqual(
             parquet.read_slice_as_text(path, "csv", range(3), range(1)),
             "A\n1")
         self.assertEqual(
             parquet.read_slice_as_text(path, "json", range(3), range(1)),
             '[{"A":1}]',
         )
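Note: examples #4 and #11 (and #15 and #20 below) pin down the contract of parquet.read_slice_as_text: pick columns and rows by range, silently clamp ranges that overshoot the table, and serialize to CSV (header row, empty cell for null) or compact JSON. The following is a rough pyarrow-based sketch of that contract, not the project's implementation; it deliberately ignores the real code's datetime/float formatting and any streaming behavior:

    import json

    import pyarrow.parquet


    def read_slice_as_text_sketch(path, fmt, column_range, row_range):
        table = pyarrow.parquet.read_table(str(path))
        # Clamp out-of-range indexes, as test_slice_ignore_missing_columns expects.
        names = [table.column_names[i] for i in column_range if i < table.num_columns]
        rows = [r for r in row_range if r < table.num_rows]
        records = [
            {name: table.column(name)[r].as_py() for name in names} for r in rows
        ]
        if fmt == "csv":
            lines = [",".join(names)] + [
                ",".join("" if v is None else str(v) for v in rec.values())
                for rec in records
            ]
            return "\n".join(lines)
        return json.dumps(records, separators=(",", ":"))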
Example #12
 def test_render_deprecated_parquet_warning(self):
     errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         with self.render(P(), FetchResult(fetched_path, errors)) as result:
             assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
             self.assertEqual(result.errors, errors)
Example #13
 def test_load_selected_stored_object(self):
     workflow = Workflow.create_and_init()
     step = workflow.tabs.first().steps.create(
         order=0, slug="step-1", module_id_name="foodeleted"
     )
     with parquet_file({"A": [1]}) as path1:
         storedobjects.create_stored_object(workflow.id, step.id, path1)
     with parquet_file({"A": [2]}) as path2:
         so2 = storedobjects.create_stored_object(workflow.id, step.id, path2)
     with parquet_file({"A": [3]}) as path3:
         storedobjects.create_stored_object(workflow.id, step.id, path3)
     step.stored_data_version = so2.stored_at
     step.save(update_fields=["stored_data_version"])
     result = self.run_with_async_db(
         fetch.load_database_objects(workflow.id, step.id)
     )
     self.assertEqual(result[3], so2)
     self.assertEqual(result.stored_object, so2)
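Note: the last two assertions read the same value positionally (result[3]) and by attribute (result.stored_object), so fetch.load_database_objects presumably returns a NamedTuple whose fourth field is stored_object. A hypothetical shape consistent with that usage; every field name except stored_object is a guess:

    from typing import Any, NamedTuple, Optional


    class DatabaseObjects(NamedTuple):
        workflow: Any                 # guessed field
        step: Any                     # guessed field
        module_zipfile: Any           # guessed field
        stored_object: Optional[Any]  # index 3, as the test asserts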
Example #14
    def test_storage_limits(self, limit):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(order=0,
                                                            slug="step-1")

        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(workflow.id, wf_module,
                                   FetchResult(parquet_path), timezone.now()))
        limit.assert_called_with(wf_module)
Example #15
 def test_slice_zero_row_groups(self):
     table = pa.Table.from_batches([], schema=pa.schema([("A", pa.string())]))
     with parquet_file(table) as path:
         self.assertEqual(
             parquet.read_slice_as_text(path, "csv", range(1), range(0)),
             "A")
         self.assertEqual(
             parquet.read_slice_as_text(path, "json", range(1), range(0)),
             "[]")
Example #16
 def test_pydict_nan(self):
     with parquet_file(
         {"A": pyarrow.array([1.1, float("nan"), None], type=pyarrow.float64())}
     ) as path:
         result = parquet.read_pydict(path, range(1), range(3))
         self.assertEqual(result["A"][0], 1.1)
         self.assertTrue(math.isnan(result["A"][1]))
         self.assertTrue(math.isnan(result["A"][2]))
Example #17
    def test_fetch_result_happy_path(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
            fetch_errors=[
                RenderError(I18nMessage("foo", {}, "module")),
                RenderError(I18nMessage("bar", {"x": "y"}, "cjwmodule")),
            ],
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, step.id, path)
        step.stored_data_version = so.stored_at
        step.save(update_fields=["stored_data_version"])

        module_zipfile = create_module_zipfile(
            "x",
            spec_kwargs={"loads_data": True},
            python_code=textwrap.dedent(
                """
                import pyarrow as pa
                import pyarrow.parquet  # makes pa.parquet available below
                import pandas as pd
                from pandas.testing import assert_frame_equal
                from cjwkernel.types import RenderError, I18nMessage

                def render(table, params, *, fetch_result, **kwargs):
                    assert fetch_result.errors == [
                        RenderError(I18nMessage("foo", {}, "module")),
                        RenderError(I18nMessage("bar", {"x": "y"}, "cjwmodule")),
                    ]
                    fetch_dataframe = pa.parquet.read_table(
                        str(fetch_result.path)
                    ).to_pandas()
                    assert_frame_equal(fetch_dataframe, pd.DataFrame({"A": [1]}))
                    return pd.DataFrame()
                """
            ),
        )

        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                execute_step(
                    chroot_context=self.chroot_context,
                    workflow=workflow,
                    step=step,
                    module_zipfile=module_zipfile,
                    params={},
                    tab_name=tab.name,
                    input_path=self.empty_table_path,
                    input_table_columns=[],
                    tab_results={},
                    output_path=self.output_path,
                )
            )
Example #18
    def test_race_hard_deleted_wf_module(self):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(order=0,
                                                            slug="step-1")
        WfModule.objects.filter(id=wf_module.id).delete()

        # Don't crash
        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(workflow.id, wf_module,
                                   FetchResult(parquet_path), timezone.now()))
Example #19
    def test_render_with_parquet_fetch_result(self):
        def render(table, params, *, fetch_result):
            return fetch_result

        with ModuleTestEnv(render=render) as env:
            with parquet_file({"A": ["fetched"]}, dir=env.basedir) as pf:
                outcome = env.call_render(make_table(), {},
                                          fetch_result=FetchResult(pf))
                assert_arrow_table_equals(
                    outcome.read_table(),
                    make_table(make_column("A", ["fetched"])))
Example #20
 def test_slice_rows(self):
     with parquet_file({"A": [0, 1, 2, 3, 4, 5, 6, 7]}) as path:
         self.assertEqual(
             parquet.read_slice_as_text(path, "csv", range(1), range(2, 5)),
             "A\n2\n3\n4",
         )
         self.assertEqual(
             parquet.read_slice_as_text(path, "json", range(1), range(2, 5)),
             '[{"A":2},{"A":3},{"A":4}]',
         )
 def test_render_deprecated_parquet_warning(self):
     errors = [RenderError(I18nMessage.TODO_i18n("truncated table"))]
     with parquet_file({"A": [1, 2], "B": [3, 4]}) as fetched_path:
         result = render_arrow(
             ArrowTable(),
             P(),
             "tab-x",
             FetchResult(fetched_path, errors=errors),
             self.output_path,
         )
     assert_arrow_table_equals(result.table, {"A": [1, 2], "B": [3, 4]})
     self.assertEqual(result.errors, errors)
Example #22
    def test_storage_limits(self, limit):
        workflow = Workflow.create_and_init()
        step = workflow.tabs.first().steps.create(order=0, slug="step-1")

        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(
                    workflow.id,
                    step,
                    FetchResult(parquet_path),
                    datetime.datetime.now(),
                ))
        limit.assert_called_with(step=step)
Example #23
    def test_race_soft_deleted_wf_module(self):
        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(order=0,
                                                            slug="step-1",
                                                            is_deleted=True)
        workflow_id = workflow.id
        workflow.delete()

        # Don't crash
        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(workflow_id, wf_module,
                                   FetchResult(parquet_path), timezone.now()))
        self.assertEqual(wf_module.stored_objects.count(), 0)
Example #24
    def test_race_hard_deleted_step(self):
        workflow = Workflow.create_and_init()
        step = workflow.tabs.first().steps.create(order=0, slug="step-1")
        Step.objects.filter(id=step.id).delete()

        # Don't crash
        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(
                    workflow.id,
                    step,
                    FetchResult(parquet_path),
                    datetime.datetime.now(),
                ))
Example #25
 def test_default_render_returns_fetch_result(self):
     # Functionality used by libraryofcongress
     #
     # TODO nix this functionality.
     with ModuleTestEnv() as env:
         with parquet_file({"A": [2]}, dir=env.basedir) as parquet_path:
             outcome = env.call_render(
                 make_table(),
                 {},
                 fetch_result=FetchResult(
                     path=parquet_path,
                     errors=[FetchError(TODO_i18n("A warning"))]),
             )
         self.assertEqual(
             outcome.result,
             RenderResult([RenderError(TODO_i18n("A warning"))]))
         assert_arrow_table_equals(outcome.read_table(),
                                   make_table(make_column("A", [2])))
Example #26
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        step = tab.steps.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, step.id, path)
        step.stored_data_version = so.stored_at
        step.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        s3.remove(s3.StoredObjectsBucket, so.key)

        module_zipfile = create_module_zipfile(
            "x",
            spec_kwargs={"loads_data": True},
            python_code=textwrap.dedent(
                """
                import pandas as pd
                def render(table, params, *, fetch_result, **kwargs):
                    assert fetch_result is None
                    return pd.DataFrame()
                """
            ),
        )

        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                execute_step(
                    chroot_context=self.chroot_context,
                    workflow=workflow,
                    step=step,
                    module_zipfile=module_zipfile,
                    params={},
                    tab_name=tab.name,
                    input_path=self.empty_table_path,
                    input_table_columns=[],
                    tab_results={},
                    output_path=self.output_path,
                )
            )
Example #27
    def test_fetch_result_deleted_file_means_none(self):
        workflow = Workflow.create_and_init()
        tab = workflow.tabs.first()
        wf_module = tab.wf_modules.create(
            order=0,
            slug="step-1",
            module_id_name="x",
            last_relevant_delta_id=workflow.last_delta_id,
        )
        with parquet_file({"A": [1]}) as path:
            so = create_stored_object(workflow.id, wf_module.id, path)
        wf_module.stored_data_version = so.stored_at
        wf_module.save(update_fields=["stored_data_version"])
        # Now delete the file on S3 -- but leave the DB pointing to it.
        minio.remove(minio.StoredObjectsBucket, so.key)

        module_zipfile = create_module_zipfile(
            "x",
            python_code=textwrap.dedent("""
                import pandas as pd
                def render(table, params, *, fetch_result, **kwargs):
                    assert fetch_result is None
                    return pd.DataFrame()
                """),
        )

        with self.assertLogs(level=logging.INFO):
            self.run_with_async_db(
                execute_wfmodule(
                    self.chroot_context,
                    workflow,
                    wf_module,
                    module_zipfile,
                    {},
                    Tab(tab.slug, tab.name),
                    RenderResult(),
                    {},
                    self.output_path,
                ))
Example #28
 def test_default_render_returns_fetch_result(self):
     # Functionality used by libraryofcongress
     with ExitStack() as ctx:
         input_arrow_table = ctx.enter_context(
             arrow_table_context({"A": [1]}, dir=self.basedir)
         )
         parquet_filename = Path(
             ctx.enter_context(parquet_file({"A": [2]}, dir=self.basedir)).name
         ).name
         out_filename = ctx.enter_context(tempfile_context(dir=self.basedir)).name
         thrift_result = module.render_thrift(
             ttypes.RenderRequest(
                 str(self.basedir),
                 arrow_arrow_table_to_thrift(input_arrow_table),
                 {},  # params
                 ttypes.Tab("tab-1", "Tab 1"),
                 ttypes.FetchResult(
                     parquet_filename,
                     [
                         ttypes.RenderError(
                             ttypes.I18nMessage(
                                 "TODO_i18n",
                                 {
                                     "text": ttypes.I18nArgument(
                                         string_value="A warning"
                                     )
                                 },
                             ),
                             [],
                         )
                     ],
                 ),
                 out_filename,
             )
         )
         result = thrift_render_result_to_arrow(thrift_result, self.basedir)
         assert_render_result_equals(
             result,
             RenderResult(
                 arrow_table({"A": [2]}),
                 [RenderError(I18nMessage.TODO_i18n("A warning"))],
             ),
         )
Example #29
    def test_create_result(self, send_update):
        send_update.side_effect = async_noop

        workflow = Workflow.create_and_init()
        wf_module = workflow.tabs.first().wf_modules.create(
            order=0,
            slug="step-1",
            is_busy=True,
            fetch_errors=[RenderError(I18nMessage("foo", {}, "module"))],
        )
        now = timezone.datetime(2019, 10, 22, 12, 22, tzinfo=timezone.utc)

        with parquet_file({"A": [1], "B": ["x"]}) as parquet_path:
            self.run_with_async_db(
                save.create_result(
                    workflow.id, wf_module, FetchResult(parquet_path), now
                )
            )
        self.assertEqual(wf_module.stored_objects.count(), 1)

        self.assertEqual(wf_module.fetch_errors, [])
        self.assertEqual(wf_module.is_busy, False)
        self.assertEqual(wf_module.last_update_check, now)
        wf_module.refresh_from_db()
        self.assertEqual(wf_module.fetch_errors, [])
        self.assertEqual(wf_module.is_busy, False)
        self.assertEqual(wf_module.last_update_check, now)

        send_update.assert_called_with(
            workflow.id,
            clientside.Update(
                steps={
                    wf_module.id: clientside.StepUpdate(
                        is_busy=False, last_fetched_at=now
                    )
                }
            ),
        )

        workflow.refresh_from_db()
        self.assertIsInstance(workflow.last_delta, ChangeDataVersionCommand)
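Note: async_noop is not defined in this snippet; a plausible definition, assuming it only needs to satisfy the awaited send_update mock:

    async def async_noop(*args, **kwargs):
        """Awaitable stand-in for mocked coroutines such as send_update."""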
Example #30
 def test_pydict_lots_of_types(self):
     dt1 = datetime.now()
     dt2 = datetime.now()
     with parquet_file({
             "str": ["x", "y", None, "z"],
             "cat": pyarrow.array(["x", "y", None, "x"]).dictionary_encode(),
             "dt": [dt1, None, dt2, None],
             "int32": [1, 2, 3, 2**31],
             "float": [1.1, 2.2, 3.3, 4.4],
     }) as path:
         self.assertEqual(
             parquet.read_pydict(path, range(5), range(4)),
             {
                 "str": ["x", "y", None, "z"],
                 "cat": ["x", "y", None, "x"],
                 "dt": [dt1, None, dt2, None],
                 "int32": [1, 2, 3, 2**31],
                 "float": [1.1, 2.2, 3.3, 4.4],
             },
         )