def test_parse_renames_rename_too_long_columns():
    assert _parse_renames(
        {"A": "BBBBBBBBBB", "BBBBBBBBBB": "BBBBBBBBBB"},
        ["A", "BBBBBBBBBB"],
        settings=Settings(MAX_BYTES_PER_COLUMN_NAME=10),
    ) == (
        {"A": "BBBBBBBB 2"},
        [
            RenderError(
                cjwmodule_i18n_message(
                    "util.colnames.warnings.truncated",
                    {"n_columns": 1, "first_colname": "BBBBBBBB 2", "n_bytes": 10},
                )
            ),
            RenderError(
                cjwmodule_i18n_message(
                    "util.colnames.warnings.numbered",
                    {"n_columns": 1, "first_colname": "BBBBBBBB 2"},
                )
            ),
        ],
    )

def test_duplicate_copies_fresh_cache(self):
    # The cache's filename depends on workflow_id and step_id.
    # Duplicating it would need more complex code :).
    table = make_table(make_column("A", [1], format="${:,.2f}"))
    write_to_rendercache(
        self.workflow,
        self.step,
        1,
        table=table,
        errors=[RenderError(I18nMessage("X", {}, None))],
        json={"foo": "bar"},
    )

    workflow2 = Workflow.objects.create()
    tab2 = workflow2.tabs.create(position=0)
    dup = self.step.duplicate_into_new_workflow(tab2)

    dup_cached_result = dup.cached_render_result
    self.assertEqual(
        dup_cached_result,
        replace(
            self.step.cached_render_result,
            workflow_id=workflow2.id,
            step_id=dup.id,
            delta_id=0,
        ),
    )
    with open_cached_render_result(dup_cached_result) as result2:
        assert_arrow_table_equals(result2.table, table)
        self.assertEqual(result2.errors, [RenderError(I18nMessage("X", {}, None))])
        self.assertEqual(result2.json, {"foo": "bar"})

def test_assert_result_equals_ok():
    table1 = make_table(make_column("A", [1]))
    table2 = make_table(make_column("A", [1]))
    assert_result_equals(
        ArrowRenderResult(
            table1,
            errors=[RenderError(I18nMessage("foo", {}, "module"))],
            json={"foo": "bar"},
        ),
        ArrowRenderResult(
            table2,
            errors=[RenderError(I18nMessage("foo", {}, "module"))],
            json={"foo": "bar"},
        ),
    )

def test_group_date_prompt_all_is_well_when_date_column_present():
    assert_result_equals(
        render(
            make_table(
                make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
                make_column("B", [1]),
            ),
            P(
                groups=dict(
                    colnames=["A", "B"], group_dates=True, date_granularities={}
                ),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(
                make_column("A", [datetime.date(2021, 5, 10)], unit="week"),
                make_column("B", [1]),
                make_column("size", [1], format="{:,d}"),
            ),
            [
                RenderError(
                    i18n_message(
                        "group_dates.date_selected",
                        dict(columns=1, column0="A", unit0="week"),
                    )
                )
            ],
        ),
    )

def test_group_date_prompt_upgrade_timestamp_to_date():
    assert_result_equals(
        render(
            make_table(make_column("A", [datetime.datetime(2021, 5, 5)])),
            P(
                groups=dict(
                    colnames=["A"], group_dates=True, date_granularities={"A": "Y"}
                ),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(
                make_column("A", [datetime.datetime(2021, 1, 1)]),
                make_column("size", [1], format="{:,d}"),
            ),
            [
                RenderError(
                    i18n_message("group_dates.granularity_deprecated.need_dates"),
                    [
                        QuickFix(
                            i18n_message(
                                "group_dates.granularity_deprecated.quick_fix.convert_to_date"
                            ),
                            QuickFixAction.PrependStep(
                                "converttimestamptodate",
                                dict(colnames=["A"], unit="year"),
                            ),
                        )
                    ],
                )
            ],
        ),
    )

def test_ignore_non_date_timestamps():
    # Steps for the user to get here:
    # 1. Make a date column, 'A'
    # 2. Check "Group Dates". The column appears.
    # 3. Select column 'A', and select a date granularity for it
    # 4. Alter the input DataFrame such that 'A' is no longer datetime
    #
    # Expected results: you can't group it by date any more.
    assert_result_equals(
        render(
            make_table(
                make_column("A", [1]),  # "used to be a datetime"
                make_column(
                    "B", [datetime.datetime(2019, 1, 4)]
                ),  # so we don't need quickfix
            ),
            P(
                groups=dict(
                    colnames=["A"], group_dates=True, date_granularities={"A": "T"}
                ),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(make_column("A", [1]), make_column("size", [1], format="{:,d}")),
            [RenderError(i18n_message("group_dates.select_date_columns"))],
        ),
    )

def _parse_renames(
    renames: Dict[str, str], table_columns: List[str], *, settings: Settings
) -> Tuple[Dict[str, str], List[RenderError]]:
    """Convert `renames` into a valid mapping for `table_columns`, plus warnings.

    Ignore any renames to "". That column name is not allowed.

    Return a minimal and valid dict from old colname to new colname.

    `renames` is a dict mapping old colname to new colname. It may contain
    missing origin column names and it may duplicate destination column names.
    The logic to handle this: do _all_ the user's renames at once, and then
    queue extra renames for columns that end up with duplicate names. Those
    extra renames are handled left-to-right (the order of `table_columns`
    matters).
    """
    # "renames.get(c) or c" means:
    # * If renames[c] exists and is "", return c
    # * If renames[c] does not exist, return c
    # * If renames[c] exists and is _not_ "", return renames[c]
    nix_colnames = [c for c in table_columns if (renames.get(c) or c) != c]
    nix_colnames_set = frozenset(nix_colnames)
    existing_colnames = [c for c in table_columns if c not in nix_colnames_set]
    try_new_colnames = [renames[c] for c in table_columns if c in nix_colnames_set]
    new_colnames, errors = gen_unique_clean_colnames_and_warn(
        try_new_colnames, existing_names=existing_colnames, settings=settings
    )
    return (
        {k: v for k, v in zip(nix_colnames, new_colnames)},
        [RenderError(message) for message in errors],
    )

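# A worked sketch of `_parse_renames`, assuming default Settings; it mirrors
# the numbered-rename test cases in this section. Renaming "A" to an existing
# name "B" triggers a numbered rename, and renaming to "" is a no-op:
#
#   _parse_renames({"A": "B", "C": ""}, ["A", "B", "C"], settings=Settings())
#   # "C" -> "" is ignored; "B" already exists, so "A" becomes "B 2":
#   # => ({"A": "B 2"}, [RenderError(<util.colnames.warnings.numbered>)])
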
def test_assert_result_equals_check_errors():
    with pytest.raises(AssertionError, match=r"-\[Render.*\n\+\[\]"):
        assert_result_equals(
            ArrowRenderResult(make_table()),
            ArrowRenderResult(
                make_table(), errors=[RenderError(I18nMessage("foo", {}, "module"))]
            ),
        )

def call_render(
    module_spec: ModuleSpec, render: Callable, request: ttypes.RenderRequest
) -> ttypes.RenderResult:
    basedir = Path(request.basedir)
    input_path = basedir / request.input_filename
    table = load_trusted_arrow_file(input_path)
    dataframe = cjwpandasmodule.convert.arrow_table_to_pandas_dataframe(table)
    tab_outputs = {
        k: _thrift_tab_output_to_pandas(v, basedir)
        for k, v in request.tab_outputs.items()
    }
    params = _prepare_params(
        module_spec, thrift_json_object_to_pydict(request.params), basedir, tab_outputs
    )

    spec = inspect.getfullargspec(render)
    kwargs = {}
    varkw = bool(spec.varkw)  # if True, function accepts **kwargs
    kwonlyargs = spec.kwonlyargs

    if varkw or "fetch_result" in kwonlyargs:
        if request.fetch_result is None:
            fetch_result = None
        else:
            fetch_result_path = basedir / request.fetch_result.filename
            errors = [
                # Data comes in as FetchError and we return RenderError.
                RenderError(thrift_i18n_message_to_arrow(e.message))
                for e in request.fetch_result.errors
            ]
            if fetch_result_path.stat().st_size == 0 or cjwparquet.file_has_parquet_magic_number(
                fetch_result_path
            ):
                fetch_result = ptypes.ProcessResult(
                    dataframe=_parquet_to_pandas(fetch_result_path),
                    errors=errors,
                    # infer columns -- the fetch interface doesn't handle formats
                    # (TODO nix pandas_v0 fetching altogether by rewriting all modules)
                )
            else:
                # TODO nix pandas Fetch modules. (Do any use files, even?)
                fetch_result = types.FetchResult(path=fetch_result_path, errors=errors)
        kwargs["fetch_result"] = fetch_result
    if varkw or "settings" in kwonlyargs:
        kwargs["settings"] = settings
    if varkw or "tab_name" in kwonlyargs:
        kwargs["tab_name"] = request.tab_name
    if varkw or "input_columns" in kwonlyargs:
        kwargs["input_columns"] = arrow_schema_to_render_columns(table.schema)

    input_columns = read_columns(table, full=False)
    raw_result = render(dataframe, params, **kwargs)  # raise ValueError if invalid
    pandas_result = ptypes.ProcessResult.coerce(
        raw_result, try_fallback_columns=input_columns
    )
    pandas_result.truncate_in_place_if_too_big()

    arrow_result = pandas_result.to_arrow(basedir / request.output_filename)
    return arrow_render_result_to_thrift(arrow_result)

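# A minimal, self-contained sketch of the introspection `call_render` uses to
# decide which optional kwargs to pass. The render functions here are
# hypothetical stand-ins; only the `fetch_result` kwarg name matches the real
# interface:
import inspect

def render_plain(table, params):
    ...

def render_with_fetch(table, params, *, fetch_result):
    ...

def render_varkw(table, params, **kwargs):
    ...

for fn in (render_plain, render_with_fetch, render_varkw):
    spec = inspect.getfullargspec(fn)
    # pass fetch_result iff the function declares it (or swallows **kwargs)
    wants_fetch_result = bool(spec.varkw) or "fetch_result" in spec.kwonlyargs
    print(fn.__name__, wants_fetch_result)  # False, True, True
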
def test_delete_step(self):
    write_to_rendercache(
        self.workflow,
        self.step,
        1,
        table=make_table(make_column("A", [1])),
        errors=[RenderError(I18nMessage("X", {}, None), [])],
        json={"foo": "bar"},
    )
    parquet_key = crr_parquet_key(self.step.cached_render_result)
    self.step.delete()
    self.assertFalse(s3.exists(BUCKET, parquet_key))

def test_group_date_prompt_when_nothing_selected():
    assert_result_equals(
        render(
            make_table(make_column("A", [1])),
            P(
                groups=dict(colnames=[], group_dates=True, date_granularities={}),
                aggregations=[dict(operation="sum", colname="A", outname="sum")],
            ),
        ),
        ArrowRenderResult(
            make_table(make_column("sum", [1])),
            [RenderError(i18n_message("group_dates.select_date_columns"))],
        ),
    )

def test_group_dates_prompt_select_date_column():
    assert_result_equals(
        render(
            make_table(make_column("A", [1])),
            P(
                groups=dict(colnames=["A"], group_dates=True, date_granularities={}),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(make_column("A", [1]), make_column("size", [1], format="{:,d}")),
            errors=[RenderError(i18n_message("group_dates.select_date_columns"))],
        ),
    )

def test_group_date_prompt_convert_text_to_date():
    assert_result_equals(
        render(
            make_table(
                make_column("A", ["2021-05-05"]),
                make_column("B", ["2021-05-05"]),
            ),
            P(
                groups=dict(
                    colnames=["A", "B"], group_dates=True, date_granularities={}
                ),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(
                make_column("A", ["2021-05-05"]),
                make_column("B", ["2021-05-05"]),
                make_column("size", [1], format="{:,d}"),
            ),
            [
                RenderError(
                    i18n_message(
                        "group_dates.text_selected", dict(columns=2, column0="A")
                    ),
                    [
                        QuickFix(
                            i18n_message("group_dates.quick_fix.convert_text_to_date"),
                            QuickFixAction.PrependStep(
                                "converttexttodate", dict(colnames=["A", "B"])
                            ),
                        ),
                        QuickFix(
                            i18n_message(
                                "group_dates.quick_fix.convert_text_to_timestamp"
                            ),
                            QuickFixAction.PrependStep(
                                "convert-date", dict(colnames=["A", "B"])
                            ),
                        ),
                    ],
                )
            ],
        ),
    )

def _parse_custom_list(
    custom_list: str, table_columns: List[str], *, settings: Settings
) -> Tuple[Dict[str, str], List[i18n.I18nMessage]]:
    """Convert `custom_list` into a valid mapping for `table_columns`.

    Return a minimal and valid dict from old colname to new colname.

    Raise `RenderErrorException` if the user entered too many column names.

    `custom_list` is a textarea filled in by a user, separated by
    commas/newlines. (We prefer newlines, but if the user writes a
    comma-separated list we use commas.) The logic to handle this: do _all_
    the user's renames at once, and then queue extra renames for columns that
    end up with duplicate names. Those extra renames are handled
    left-to-right (the order of `table_columns` matters).
    """
    # Chomp trailing newline, in case the user enters "A,B,C\n".
    custom_list = custom_list.rstrip()

    # Split by newline (preferred) or comma (if the user wants that)
    if "\n" in custom_list:
        split_char = "\n"
    else:
        split_char = ","
    rename_list = [s.strip() for s in custom_list.split(split_char)]

    # Convert to dict
    try:
        renames = {table_columns[i]: s for i, s in enumerate(rename_list) if s}
    except IndexError:
        raise RenderErrorException(
            RenderError(
                i18n.trans(
                    "badParam.custom_list.wrongNumberOfNames",
                    "You supplied {n_names, plural, other {# column names} one {# column name}}, "
                    "but the table has {n_columns, plural, other {# columns} one {# column}}.",
                    {"n_names": len(rename_list), "n_columns": len(table_columns)},
                )
            )
        )

    # Use _parse_renames() logic to consider missing columns and uniquify
    return _parse_renames(renames, table_columns, settings=settings)

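# A quick sketch of the newline-vs-comma rule above (hypothetical inputs):
#
#   _parse_custom_list("X\nY", ["A", "B"], settings=Settings())
#   # has "\n" => split on newline => renames {"A": "X", "B": "Y"}
#
#   _parse_custom_list("X,Y", ["A", "B"], settings=Settings())
#   # no newline => split on comma => same renames
#
#   _parse_custom_list("X,,Z", ["A", "B", "C"], settings=Settings())
#   # empty entry is skipped => renames {"A": "X", "C": "Z"}
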
def render_arrow_v1(arrow_table, params, *, uploaded_files, **kwargs):
    if params["file"] is None:
        return ArrowRenderResult(pa.table({}))
    path = uploaded_files[params["file"]].path
    try:
        arrow_table, errors = _build_arrow_table(path, params["query_slug"])
    except (InvalidLz4File, sqlite3.DatabaseError):
        return ArrowRenderResult(
            pa.table({}),
            [
                RenderError(
                    i18n.trans(
                        "error.invalidFile", "Please upload a valid .sqlite3.lz4 file."
                    )
                )
            ],
        )
    return ArrowRenderResult(arrow_table, errors=errors)

def test_render_result(self):
    # we're testing it is serialized+deserialized correctly
    error = RenderError(
        message=I18nMessage("x", {"y": 1}, "module"),
        quick_fixes=[
            QuickFix(
                button_text=I18nMessage("z", {}, "module"),
                action=QuickFixAction.PrependStep("converttotext", {"a": "b"}),
            )
        ],
    )

    def render_arrow_v1(table, params, **kwargs):
        return ArrowRenderResult(make_table(make_column("A", ["x"])), [error])

    with ModuleTestEnv(render_arrow_v1=render_arrow_v1) as env:
        outcome = env.call_render(make_table(), {})
        self.assertEqual(outcome.result, RenderResult([error]))

def test_parse_renames_avoid_duplicates():
    assert _parse_renames(
        {"A": "B", "C": "B"}, ["A", "B", "C"], settings=Settings()
    ) == (
        {"A": "B 2", "C": "B 3"},
        [
            RenderError(
                cjwmodule_i18n_message(
                    id="util.colnames.warnings.numbered",
                    arguments={"n_columns": 2, "first_colname": "B 2"},
                )
            )
        ],
    )

def test_parse_renames_avoid_duplicates_without_original():
    assert _parse_renames({"A": "C", "B": "C"}, ["A", "B"], settings=Settings()) == (
        {"A": "C", "B": "C 2"},
        [
            RenderError(
                cjwmodule_i18n_message(
                    id="util.colnames.warnings.numbered",
                    arguments={"n_columns": 1, "first_colname": "C 2"},
                )
            )
        ],
    )

def test_render_rename_custom_list_too_many_columns_is_error():
    result = render(
        make_table(make_column("A", ["x"])),
        P(custom_list=True, list_string="X,Y"),
        settings=Settings(),
    )
    assert_result_equals(
        result,
        ArrowRenderResult(
            make_table(),
            [
                RenderError(
                    i18n_message(
                        "badParam.custom_list.wrongNumberOfNames",
                        {"n_names": 2, "n_columns": 1},
                    )
                )
            ],
        ),
    )

def test_group_date_prompt_upgrade_timestampmath():
    assert_result_equals(
        render(
            make_table(make_column("A", [datetime.datetime(2021, 5, 5, 1, 2, 3, 4)])),
            P(
                groups=dict(
                    colnames=["A"], group_dates=True, date_granularities={"A": "S"}
                ),
                aggregations=[dict(operation="size", colname="", outname="size")],
            ),
        ),
        ArrowRenderResult(
            make_table(
                make_column("A", [datetime.datetime(2021, 5, 5, 1, 2, 3)]),
                make_column("size", [1], format="{:,d}"),
            ),
            [
                RenderError(
                    i18n_message("group_dates.granularity_deprecated.need_rounding"),
                    [
                        QuickFix(
                            i18n_message(
                                "group_dates.granularity_deprecated.quick_fix.round_timestamps"
                            ),
                            QuickFixAction.PrependStep(
                                "timestampmath",
                                dict(
                                    colnames=["A"],
                                    operation="startof",
                                    roundunit="second",
                                ),
                            ),
                        )
                    ],
                )
            ],
        ),
    )

def test_startof_out_of_bounds():
    assert_result_equals(
        render(
            make_table(
                make_column(
                    "A",
                    [dt(1970, 1, 1), dt(1677, 9, 21, 0, 12, 43, 145500)],
                )
            ),
            P(operation="startof", colnames=["A"], roundunit="minute"),
        ),
        ArrowRenderResult(
            make_table(make_column("A", [dt(1970, 1, 1), None])),
            [
                RenderError(
                    i18n_message(
                        "warning.convertedOutOfBoundsToNull",
                        {"timestamp": "1677-09-21T00:12Z"},
                    )
                )
            ],
        ),
    )

def _render_startof(
    table: pa.Table, colnames: List[str], unit: str
) -> ArrowRenderResult:
    truncated = False
    for colname in colnames:
        i = table.column_names.index(colname)
        column_result = _startof(table.columns[i], unit)
        table = table.set_column(i, colname, column_result.column)
        if column_result.truncated:
            truncated = True
    if truncated:
        errors = [
            RenderError(
                trans(
                    "warning.convertedOutOfBoundsToNull",
                    "Converted timestamp {timestamp} to null because it is out of bounds.",
                    {"timestamp": _out_of_bounds_timestamp(unit)},
                )
            )
        ]
    else:
        errors = []
    return ArrowRenderResult(table, errors=errors)

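# A minimal pyarrow sketch of the replace-column pattern `_render_startof`
# uses: `Table.set_column` returns a new table, so the loop rebinds `table`
# on every iteration. Column names here are hypothetical.
import pyarrow as pa

table = pa.table({"A": [1, 2], "B": [3, 4]})
i = table.column_names.index("B")
table = table.set_column(i, "B", pa.array([30, 40]))
assert table.column("B").to_pylist() == [30, 40]  # "A" is untouched
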
def test_quickfix_convert_value_strings_to_numbers():
    assert_result_equals(
        render(
            make_table(
                make_column("A", [1, 1, 1]),
                make_column("B", ["a", "b", "a"]),
                make_column("C", ["a", "b", "a"]),
            ),
            P(
                groups=dict(colnames=["A"], group_dates=False, date_granularities={}),
                aggregations=[
                    dict(operation="mean", colname="B", outname="mean"),
                    dict(operation="sum", colname="C", outname="sum"),
                ],
            ),
        ),
        ArrowRenderResult(
            make_table(),
            [
                RenderError(
                    i18n_message(
                        "non_numeric_colnames.error",
                        {"n_columns": 2, "first_colname": "B"},
                    ),
                    quick_fixes=[
                        QuickFix(
                            i18n_message("non_numeric_colnames.quick_fix.text"),
                            QuickFixAction.PrependStep(
                                "converttexttonumber", {"colnames": ["B", "C"]}
                            ),
                        )
                    ],
                )
            ],
        ),
    )

def thrift_render_error_to_arrow(value: ttypes.RenderError) -> RenderError:
    return RenderError(
        thrift_i18n_message_to_arrow(value.message),
        [thrift_quick_fix_to_arrow(qf) for qf in value.quick_fixes],
    )

def _warn_if_using_deprecated_date_granularity(
    table: pa.Table, groups: List[Group]
) -> List[RenderError]:
    errors = []
    deprecated_need_upgrade_to_date: List[Group] = []
    deprecated_need_timestampmath: List[Group] = []
    for group in groups:
        if group.date_granularity is not None and pa.types.is_timestamp(
            table.schema.field(group.colname).type
        ):
            if group.date_granularity in {
                DateGranularity.DAY,
                DateGranularity.WEEK,
                DateGranularity.MONTH,
                DateGranularity.QUARTER,
                DateGranularity.YEAR,
            }:
                deprecated_need_upgrade_to_date.append(group)
            elif not _timestamp_is_rounded(
                table[group.colname], group.date_granularity
            ):
                deprecated_need_timestampmath.append(group)

    if deprecated_need_upgrade_to_date:
        errors.append(
            RenderError(
                i18n.trans(
                    "group_dates.granularity_deprecated.need_dates",
                    "The “Group Dates” feature has changed. Please click to upgrade from Timestamps to Dates. Workbench will force-upgrade in January 2022.",
                ),
                quick_fixes=[
                    QuickFix(
                        i18n.trans(
                            "group_dates.granularity_deprecated.quick_fix.convert_to_date",
                            "Upgrade",
                        ),
                        QuickFixAction.PrependStep(
                            "converttimestamptodate",
                            dict(
                                colnames=[group.colname],
                                unit=group.date_granularity.date_unit,
                            ),
                        ),
                    )
                    for group in deprecated_need_upgrade_to_date
                ],
            )
        )
    if deprecated_need_timestampmath:
        errors.append(
            RenderError(
                i18n.trans(
                    "group_dates.granularity_deprecated.need_rounding",
                    "The “Group Dates” feature has changed. Please click to upgrade to Timestamp Math. Workbench will force-upgrade in January 2022.",
                ),
                quick_fixes=[
                    QuickFix(
                        i18n.trans(
                            "group_dates.granularity_deprecated.quick_fix.round_timestamps",
                            "Upgrade",
                        ),
                        QuickFixAction.PrependStep(
                            "timestampmath",
                            dict(
                                colnames=[group.colname],
                                operation="startof",
                                roundunit=group.date_granularity.rounding_unit,
                            ),
                        ),
                    )
                    for group in deprecated_need_timestampmath
                ],
            )
        )
    return errors

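# Routing sketch for the deprecation warnings above, matching the tests in
# this section (granularity codes as the tests use them: "Y" = year,
# "S" = second, "T" = minute):
#
#   date_granularities={"A": "Y"} on a timestamp column
#       -> need_dates error + "converttimestamptodate" quick fix
#   date_granularities={"A": "S"} on a timestamp column not already rounded
#   to whole seconds
#       -> need_rounding error + "timestampmath" startof/second quick fix
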
def render_arrow_v1(
    table: pa.Table, params: Dict[str, Any], **kwargs
) -> ArrowRenderResult:
    colnames = table.column_names
    date_colnames = frozenset(
        colname for colname in colnames if pa.types.is_timestamp(table[colname].type)
    )
    groups = parse_groups(date_colnames=date_colnames, **params["groups"])
    aggregations = parse_aggregations(params["aggregations"])

    # HACK: set the same default aggregations as we do in our JavaScript component.
    if not aggregations:
        aggregations.append(
            Aggregation(Operation.SIZE, "", Operation.SIZE.default_outname(""))
        )

    # This is a "Group By" module so we need to support the obvious operation,
    # 'SELECT COUNT(*) FROM input'. The obvious way to display that is to select
    # "Count" and not select a Group By column.
    #
    # ... and unfortunately, that form setup -- no columns selected, one
    # "Count" aggregation selected -- is exactly what the user sees by default
    # after adding the module, before step 1 of the onboarding path.
    #
    # So we get a tough choice: either make "no aggregations" a no-op to give
    # us the ideal onboarding path, _OR_ make "no aggregations" default to
    # "count", to support the obvious operation. Pick one: complete+simple, or
    # onboarding-friendly.
    #
    # For now, we're onboarding-friendly and we don't allow SELECT COUNT(*).
    # When we solve https://www.pivotaltracker.com/story/show/163264164 we
    # should change to be complete+simple (because the onboarding will have
    # another answer). That's
    # https://www.pivotaltracker.com/story/show/164375318
    if not groups and aggregations == [
        Aggregation(Operation.SIZE, "", Operation.SIZE.default_outname(""))
    ]:
        return ArrowRenderResult(table)  # no-op: users haven't entered any params

    # Error out with a quickfix if aggregations need number and we're not number
    non_numeric_colnames = []
    for aggregation in aggregations:
        if aggregation.operation.needs_numeric_column():
            colname = aggregation.colname
            column = table[colname]
            if (
                not pa.types.is_integer(column.type)
                and not pa.types.is_floating(column.type)
            ) and colname not in non_numeric_colnames:
                non_numeric_colnames.append(colname)
    if non_numeric_colnames:
        return ArrowRenderResult(
            pa.table({}),
            errors=[
                RenderError(
                    i18n.trans(
                        "non_numeric_colnames.error",
                        "{n_columns, plural,"
                        ' one {Column "{first_colname}"}'
                        ' other {# columns (see "{first_colname}")}} '
                        "must be Numbers",
                        {
                            "n_columns": len(non_numeric_colnames),
                            "first_colname": non_numeric_colnames[0],
                        },
                    ),
                    quick_fixes=[
                        QuickFix(
                            i18n.trans(
                                "non_numeric_colnames.quick_fix.text", "Convert"
                            ),
                            action=QuickFixAction.PrependStep(
                                "converttexttonumber",
                                {"colnames": non_numeric_colnames},
                            ),
                        )
                    ],
                )
            ],
        )

    errors = _warn_if_using_deprecated_date_granularity(table, groups)
    if not errors and params["groups"]["group_dates"]:
        errors = [
            _generate_group_dates_help_warning(
                table.schema, frozenset(group.colname for group in groups)
            )
        ]

    result_table = groupby(table, groups, aggregations)
    return ArrowRenderResult(result_table, errors=errors)

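# A minimal pyarrow sketch of the numeric-column check above: text columns
# fail it and get routed to the "converttexttonumber" quick fix. Column names
# are hypothetical.
import pyarrow as pa

table = pa.table({"A": [1, 2], "B": ["a", "b"]})
for colname in table.column_names:
    column = table[colname]
    is_number = pa.types.is_integer(column.type) or pa.types.is_floating(column.type)
    print(colname, is_number)  # A True, B False
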
def _generate_group_dates_help_warning(
    schema: pa.Schema, colnames: FrozenSet[str]
) -> RenderError:
    timestamp_colnames = []
    text_colnames = []
    date_colnames_and_units = []
    for field in schema:
        if field.name not in colnames:
            continue
        if pa.types.is_date32(field.type):
            date_colnames_and_units.append(
                (field.name, field.metadata[b"unit"].decode("ascii"))
            )
        elif pa.types.is_timestamp(field.type):
            timestamp_colnames.append(field.name)
        elif pa.types.is_string(field.type) or pa.types.is_dictionary(field.type):
            text_colnames.append(field.name)

    if date_colnames_and_units:
        return RenderError(
            i18n.trans(
                "group_dates.date_selected",
                "“{column0}” is Date – {unit0, select, day {day} week {week} month {month} quarter {quarter} year {year} other {}}. Edit earlier steps or use “Convert date unit” to change units.",
                dict(
                    columns=len(date_colnames_and_units),
                    column0=date_colnames_and_units[0][0],
                    unit0=date_colnames_and_units[0][1],
                ),
            )
        )
    if timestamp_colnames:
        return RenderError(
            i18n.trans(
                "group_dates.timestamp_selected",
                "{columns, plural, offset:1 =1 {“{column0}” is Timestamp.}=2 {“{column0}” and one other column are Timestamp.}other {“{column0}” and # other columns are Timestamp.}}",
                dict(columns=len(timestamp_colnames), column0=timestamp_colnames[0]),
            ),
            [
                QuickFix(
                    i18n.trans(
                        "group_dates.quick_fix.convert_timestamp_to_date",
                        "Convert to Date",
                    ),
                    QuickFixAction.PrependStep(
                        "converttimestamptodate", dict(colnames=timestamp_colnames)
                    ),
                )
            ],
        )
    if text_colnames:
        return RenderError(
            i18n.trans(
                "group_dates.text_selected",
                "{columns, plural, offset:1 =1 {“{column0}” is Text.}=2 {“{column0}” and one other column are Text.}other {“{column0}” and # other columns are Text.}}",
                dict(columns=len(text_colnames), column0=text_colnames[0]),
            ),
            [
                QuickFix(
                    i18n.trans(
                        "group_dates.quick_fix.convert_text_to_date",
                        "Convert to Date",
                    ),
                    QuickFixAction.PrependStep(
                        "converttexttodate", dict(colnames=text_colnames)
                    ),
                ),
                QuickFix(
                    i18n.trans(
                        "group_dates.quick_fix.convert_text_to_timestamp",
                        "Convert to Timestamp first",
                    ),
                    QuickFixAction.PrependStep(
                        "convert-date", dict(colnames=text_colnames)
                    ),
                ),
            ],
        )
    return RenderError(
        i18n.trans("group_dates.select_date_columns", "Select a Date column.")
    )

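# A minimal pyarrow sketch of the date32 unit lookup above, assuming the date
# unit is stored in field metadata the way `make_column(..., unit="week")`
# stores it (pyarrow hands metadata back with bytes keys and values):
import pyarrow as pa

field = pa.field("A", pa.date32(), metadata={"unit": "week"})
assert pa.types.is_date32(field.type)
assert field.metadata[b"unit"].decode("ascii") == "week"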