def test_rename(pipeline_executor):
    """The rename step should rename the listed columns and leave the others untouched."""
    steps = [
        {'name': 'domain', 'domain': 'domain_a'},
        {'name': 'rename', 'toRename': [['colA', 'col_a'], ['colB', 'col_b']]},
    ]
    df, _ = pipeline_executor(Pipeline(steps=steps))
    expected = pd.DataFrame({
        'col_a': ['toto', 'tutu', 'tata'],
        'col_b': [1, 2, 3],
        'colC': [100, 50, 25],
    })
    assert_dataframes_equals(df, expected)
def test_simple_condition_strings():
    """An 'eq' condition on a string column should route rows to the quoted literal branches."""
    input_df = DataFrame({'a_str': ["test", "test", "autre chose"]})
    step_config = {
        'name': 'ifthenelse',
        'newColumn': 'test',
        'if': {'column': 'a_str', 'value': "test", 'operator': 'eq'},
        'then': '"foo"',
        'else': '"bar"',
    }
    result_df = IfthenelseStep(**step_config).execute(input_df)
    expected_df = DataFrame({
        'a_str': ["test", "test", "autre chose"],
        'test': ["foo", "foo", "bar"],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_concatenate():
    """Concatenating three columns with ' - ' should fill one new string column."""
    input_df = DataFrame({'NAME': ['foo', 'bar'], 'AGE': [42, 43], 'SCORE': [100, 200]})
    step = ConcatenateStep(
        name='concatenate',
        columns=['NAME', 'AGE', 'SCORE'],
        separator=' - ',
        new_column_name='newcol',
    )
    result_df = execute_concatenate(step, input_df)
    expected_df = DataFrame({
        'NAME': ['foo', 'bar'],
        'AGE': [42, 43],
        'SCORE': [100, 200],
        'newcol': ['foo - 42 - 100', 'bar - 43 - 200'],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_isnull():
    """The 'isnull' operator should match only the None entry of a boolean column."""
    input_df = DataFrame({'a_bool': [True, False, None]})
    config = {
        'name': 'ifthenelse',
        'if': {'column': 'a_bool', 'operator': 'isnull', 'value': None},
        'newColumn': 'test',
        'then': '1',
        'else': '0',
    }
    result_df = IfthenelseStep(**config).execute(input_df)
    # Only the third row (None) is null, hence test == 1 there.
    expected_df = DataFrame({'a_bool': [True, False, None], 'test': [0, 0, 1]})
    assert_dataframes_equals(result_df, expected_df)
def test_rollup(sample_df: DataFrame):
    """A rollup on CONTINENT > COUNTRY > CITY should emit one row per node of the
    hierarchy, with the summed VALUE and the default label/level/parent columns."""
    df_result = RollupStep(
        name='rollup',
        hierarchy=['CONTINENT', 'COUNTRY', 'CITY'],
        aggregations=[
            {'newcolumns': ['VALUE'], 'aggfunction': 'sum', 'columns': ['VALUE']},
        ],
    ).execute(sample_df)
    columns = ['CITY', 'COUNTRY', 'CONTINENT', 'label', 'level', 'parent', 'VALUE']
    # One row per hierarchy node: continents first, then countries, then cities.
    # Hierarchy columns above the row's own level stay None.
    expected_data = [
        [None, None, 'Europe', 'Europe', 'CONTINENT', None, 64],
        [None, None, 'North America', 'North America', 'CONTINENT', None, 112],
        [None, 'France', 'Europe', 'France', 'COUNTRY', 'Europe', 36],
        [None, 'Spain', 'Europe', 'Spain', 'COUNTRY', 'Europe', 28],
        [None, 'Canada', 'North America', 'Canada', 'COUNTRY', 'North America', 40],
        [None, 'USA', 'North America', 'USA', 'COUNTRY', 'North America', 72],
        ['Bordeaux', 'France', 'Europe', 'Bordeaux', 'CITY', 'France', 13],
        ['Paris', 'France', 'Europe', 'Paris', 'CITY', 'France', 23],
        ['Barcelona', 'Spain', 'Europe', 'Barcelona', 'CITY', 'Spain', 19],
        ['Madrid', 'Spain', 'Europe', 'Madrid', 'CITY', 'Spain', 9],
        ['Montreal', 'Canada', 'North America', 'Montreal', 'CITY', 'Canada', 20],
        ['Ottawa', 'Canada', 'North America', 'Ottawa', 'CITY', 'Canada', 20],
        ['Boston', 'USA', 'North America', 'Boston', 'CITY', 'USA', 27],
        ['New-York', 'USA', 'North America', 'New-York', 'CITY', 'USA', 45],
    ]
    expected_result = DataFrame(expected_data, columns=columns)
    assert_dataframes_equals(df_result, expected_result)
def test_unpivot_with_dropna_false(sample_df: DataFrame):
    """With dropna=False, unpivot should keep the rows whose VALUE is missing."""
    step = UnpivotStep(
        name='unpivot',
        keep=['COMPANY', 'COUNTRY'],
        unpivot=['NB_CLIENTS', 'REVENUES'],
        unpivot_column_name='KPI',
        value_column_name='VALUE',
        dropna=False,
    )
    result = execute_unpivot(step, sample_df, domain_retriever=None, execute_pipeline=None)
    expected_result = DataFrame({
        'COMPANY': (['Company 1'] * 2 + ['Company 2'] * 2) * 2,
        'COUNTRY': ['France'] * 4 + ['USA'] * 4,
        'KPI': ['NB_CLIENTS', 'REVENUES'] * 4,
        # The None entry is preserved because dropna is False.
        'VALUE': [7, 10, 2, None, 12, 6, 1, 3],
    })
    assert_dataframes_equals(
        result.sort_values(['COUNTRY', 'COMPANY', 'KPI']), expected_result
    )
def test_append_with_domain_name(
    sample_df: DataFrame,
    mock_domain_retriever: DomainRetriever,
    mock_execute_pipeline: PipelineExecutor,
):
    """It should accept a domain name instead of a complete pipeline"""
    step = AppendStep(name='append', pipelines=['miam'])
    df_result = step.execute(
        sample_df,
        domain_retriever=mock_domain_retriever,
        execute_pipeline=mock_execute_pipeline,
    )
    # Columns missing from one side of the append are filled with nulls.
    expected_result = DataFrame({
        'name': ['foo', 'bar', 'miam'],
        'age': [42, 43, None],
        'score': [100, 200, 999],
        'lambda': [None, None, 'p'],
    })
    assert_dataframes_equals(df_result, expected_result)
def test_missing_date(today):
    """addmissingdates should insert rows (with null values) for days missing in a gap.

    Days 1-9 and 12-19 after `today` are present; days 10 and 11 are the gap to fill.
    """
    dates = [today + timedelta(days=nb_day) for nb_day in list(range(1, 10)) + list(range(12, 20))]
    missing_dates = [today + timedelta(days=10), today + timedelta(days=11)]
    # One arbitrary value per existing date (was a comprehension over enumerate()
    # with an unused loop variable).
    values = list(range(len(dates)))
    df = pd.DataFrame(
        {
            'date': dates,
            'value': values,
        }
    )
    step = AddMissingDatesStep(
        name='addmissingdates', datesColumn='date', datesGranularity='day', groups=[]
    )
    result = step.execute(df)
    # The two missing dates should appear with a null value, in date order.
    expected_result = pd.concat(
        [df, pd.DataFrame({'date': missing_dates, 'value': [None, None]})]
    ).sort_values(by='date')
    assert_dataframes_equals(result, expected_result)
def test_with_original_granularity(sample_df):
    """With keepOriginalGranularity, every input row keeps its place and gets its
    group total in the new column."""
    step = AggregateStep(
        name='aggregate',
        keepOriginalGranularity=True,
        on=['Group'],
        aggregations=[
            Aggregation(aggfunction='sum', columns=['Value1'], newcolumns=['Total']),
        ],
    )
    df_result = step.execute(sample_df)
    expected = DataFrame({
        'Label': [f'Label {i}' for i in range(1, 7)],
        'Group': ['Group 1'] * 3 + ['Group 2'] * 3,
        'Value1': [13, 7, 20, 1, 10, 5],
        # 40 = 13 + 7 + 20 and 16 = 1 + 10 + 5.
        'Total': [40] * 3 + [16] * 3,
        'Value2': [10, 21, 4, 17, 12, 2],
    })
    assert_dataframes_equals(df_result, expected)
def test_keep_less_columns(sample_df):
    """Splitting on '-' with number_cols_to_keep=2 should keep only the first two parts."""
    step = SplitStep(name='split', column='Label', delimiter='-', number_cols_to_keep=2)
    result_df = step.execute(sample_df)
    labels = [
        'Label 1 - Groupe 1 - France',
        'Label 2 - Groupe 1 - Spain',
        'Label 3 - Groupe 1 - USA',
        'Label 4 - Groupe 2 - France',
        'Label 5 - Groupe 2 - Spain',
        'Label 6 - Groupe 2 - USA',
    ]
    expected_df = pd.DataFrame({
        'Label': labels,
        # The split keeps the whitespace around each chunk.
        'Label_1': [f'Label {i} ' for i in range(1, 7)],
        'Label_2': [' Groupe 1 '] * 3 + [' Groupe 2 '] * 3,
        'Values': [13, 7, 20, 1, 10, 5],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_then_should_support_formulas():
    """'then'/'else' values should be evaluated as formulas referencing other columns."""
    input_df = DataFrame({'a_bool': [True, False, True], 'a_number': [1, 2, 3]})
    config = {
        'name': 'ifthenelse',
        'newColumn': 'result',
        'if': {'column': 'a_bool', 'value': True, 'operator': 'eq'},
        'then': 'a_number',
        'else': 'a_number * -1',
    }
    result_df = IfthenelseStep(**config).execute(input_df)
    # Row 2 fails the condition, so its result is a_number * -1 == -2.
    expected_df = DataFrame({
        'a_bool': [True, False, True],
        'a_number': [1, 2, 3],
        'result': [1, -2, 3],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_duration(time_delta_parameters: Dict[str, int], duration_in: str, expected_result: float):
    """The duration between two dates should be computed in the requested unit.

    Parametrized: `time_delta_parameters` builds the timedelta between the two
    dates, `duration_in` selects the unit and `expected_result` is the expected
    numeric duration.
    """
    step = DurationStep(
        name='duration',
        newColumnName='DURATION',
        startDateColumn='START_DATE',
        endDateColumn='END_DATE',
        durationIn=duration_in,
    )
    now = datetime.now()
    delta = timedelta(**time_delta_parameters)
    sample_df = pd.DataFrame({'START_DATE': [now], 'END_DATE': [now + delta]})
    result_df = execute_duration(step, sample_df)
    # Use a distinct name for the expected frame: the original shadowed the
    # parametrized `expected_result` float with the DataFrame built from it.
    expected_df = pd.DataFrame({
        'START_DATE': [now],
        'END_DATE': [now + delta],
        'DURATION': [expected_result],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_or_logical_conditions(sample_df):
    """A filter with an 'or' of two conditions keeps rows matching either one."""
    condition = {
        'or': [
            {'column': 'colA', 'operator': 'eq', 'value': 'toto'},
            {'column': 'colC', 'operator': 'lt', 'value': 33},
        ]
    }
    step = FilterStep(name='filter', condition=condition)
    df_result = execute_filter(step, sample_df)
    expected = DataFrame({'colA': ['toto', 'tata'], 'colB': [1, 3], 'colC': [100, 25]})
    assert_dataframes_equals(df_result, expected)
def test_missing_date_with_groups_various_length(today):
    """addmissingdates with groups should fill the date gap independently in each
    group, even when the groups do not span the same number of dates."""
    # Months 1-4 and 8-9 exist; months 5, 6 and 7 are the gap.
    dates = [
        datetime.datetime(year=2020, month=nb_month, day=1)
        for nb_month in list(range(1, 5)) + list(range(8, 10))
    ]
    missing_dates = [datetime.datetime(year=2020, month=nb_month, day=1) for nb_month in [5, 6, 7]]
    # One arbitrary value per existing date (was a comprehension over enumerate()
    # with an unused loop variable).
    values = list(range(len(dates)))
    # 'USA' deliberately has one date fewer than 'France'.
    df = pd.DataFrame(
        {
            'date': dates + dates[0:-1],
            'country': ['France'] * len(dates) + ['USA'] * (len(dates) - 1),
            'value': values + values[0:-1],
        }
    )
    step = AddMissingDatesStep(
        name='addmissingdates', datesColumn='date', datesGranularity='month', groups=['country']
    )
    result = step.execute(df)
    # Both groups are missing the same three months inside their own range.
    expected_result = pd.concat(
        [
            df,
            pd.DataFrame(
                {
                    'country': cast(
                        List[Optional[Any]],
                        ['France'] * len(missing_dates) + ['USA'] * len(missing_dates),
                    ),
                    'date': missing_dates * 2,
                    'value': [None] * len(missing_dates) * 2,
                }
            ),
        ]
    ).sort_values(by=['country', 'date'])
    assert_dataframes_equals(result, expected_result)
def test_join_left(
    sample_df: DataFrame,
    mock_domain_retriever: DomainRetriever,
    mock_execute_pipeline: PipelineExecutor,
):
    """A left join keeps every left row and null-fills unmatched right columns."""
    step = JoinStep(
        name='join',
        right_pipeline=[{'name': 'domain', 'domain': 'buzz'}],
        on=[['NAME', 'name']],
        type='left',
    )
    df_result = execute_join(
        step,
        sample_df,
        domain_retriever=mock_domain_retriever,
        execute_pipeline=mock_execute_pipeline,
    )
    # 'foo' has no match on the right side, so name/score are null for it.
    expected_result = DataFrame({
        'NAME': ['foo', 'bar'],
        'name': [None, 'bar'],
        'AGE': [42, 43],
        'score': [None, 100],
    })
    assert_dataframes_equals(df_result, expected_result)
def test_cumsum_with_groups():
    """Cumulative sums should restart for each group defined by `groupby`."""
    months = ['2019-01', '2019-02', '2019-03', '2019-04', '2019-05', '2019-06']
    values = [2, 5, 3, 8, 9, 6, 10, 6, 6, 4, 8, 7]
    sample_df = DataFrame({
        'date': months * 2,
        'country': ['France'] * 6 + ['USA'] * 6,
        'value': values,
    })
    df_result = CumSumStep(
        name='cumsum',
        valueColumn='value',
        referenceColumn='date',
        groupby=['country'],
        newColumn='my_cumsum',
    ).execute(sample_df)
    expected_result = DataFrame({
        'date': months * 2,
        'country': ['France'] * 6 + ['USA'] * 6,
        'value': values,
        # The running total restarts at the France/USA boundary.
        'my_cumsum': [2, 7, 10, 18, 27, 33, 10, 16, 22, 26, 34, 41],
    })
    assert_dataframes_equals(df_result, expected_result.sort_values('date'))
def test_simple_with_aggregation():
    """Waterfall rows sharing the same label ('Paris' twice per year) should be
    aggregated before the per-label deltas are computed."""
    sample_df = pd.DataFrame({
        'city': ['Bordeaux', 'Boston', 'New-York', 'Paris', 'Paris'] * 2,
        'year': [2019] * 5 + [2018] * 5,
        'revenue': [135, 275, 115, 450, 10, 98, 245, 103, 385, 10],
    })
    step = WaterfallStep(
        name='waterfall',
        valueColumn='revenue',
        milestonesColumn='year',
        start=2018,
        end=2019,
        labelsColumn='city',
        sortBy='value',
        order='desc',
    )
    result_df = step.execute(sample_df)
    # 841 and 985 are the 2018/2019 totals; the middle rows are per-city deltas,
    # sorted by value descending.
    expected_df = pd.DataFrame({
        'LABEL_waterfall': ['2018', 'Paris', 'Bordeaux', 'Boston', 'New-York', '2019'],
        'TYPE_waterfall': [None, 'Parent', 'Parent', 'Parent', 'Parent', None],
        'revenue': [841, 65, 37, 30, 12, 985],
    })
    assert_dataframes_equals(result_df, expected_df)
def test_moving_average_with_groups():
    """Moving averages should be computed within each group independently."""
    df = DataFrame({
        'country': ['France'] * 6 + ['USA'] * 6,
        'date': [f'2018-01-0{i}' for i in range(1, 7)] * 2,
        'value': [75, 80, 82, 83, 80, 86, 69, 73, 73, 75, 70, 76],
    })
    df['date'] = pd.to_datetime(df['date'])
    step = MovingAverageStep(
        name='movingaverage',
        valueColumn='value',
        columnToSort='date',
        movingWindow=3,
        groups=['country'],
        newColumnName='rolling_average',
    )
    df_result = execute_moving_average(step, df)
    # The first (window - 1) rows of each group have no average.
    rolling = [None, None, 79, 81.6667, 81.6667, 83]
    rolling += [None, None, 71.6667, 73.6667, 72.6667, 73.6667]
    expected_result = df.assign(rolling_average=rolling)
    assert_dataframes_equals(df_result, expected_result)
def test_waterfall_bug_drill():
    """
    Tuple (label, parent) should be unique only among one "group by" sub-df.
    """
    # The same (label, parent) pairs appear in both 'grand parent' groups on purpose:
    # a previous bug de-duplicated them across groups.
    base_df = pd.DataFrame({
        'grand parent': ['Food', 'Vegetarian', 'Fruits'] * 2,
        'parent': ['Vegetarian', 'Fruits', 'Berries'] * 2,
        'label': ['Fruits', 'Berries', 'Blueberries'] * 2,
        'variable': ['A'] * 3 + ['B'] * 3,
        'value': [1, 2, 3, 11, 12, 13],
    })
    step = WaterfallStep(
        name='waterfall',
        valueColumn='value',
        milestonesColumn='variable',
        start='A',
        end='B',
        labelsColumn='label',
        parentsColumn='parent',
        groupby=['grand parent'],
        sortBy='label',
        order='asc',
    )
    result = execute_waterfall(step, base_df)
    # Expected layout: the three start ('A') rows, six delta rows (each +10),
    # then the three end ('B') rows.
    assert_dataframes_equals(
        result,
        pd.DataFrame({
            'grand parent': [
                'Food', 'Vegetarian', 'Fruits', 'Vegetarian', 'Fruits', 'Fruits',
                'Food', 'Vegetarian', 'Food', 'Food', 'Vegetarian', 'Fruits',
            ],
            'LABEL_waterfall': ['A'] * 3
            + ['Berries', 'Berries', 'Blueberries', 'Fruits', 'Fruits', 'Vegetarian']
            + ['B'] * 3,
            'value': [1, 2, 3] + [10] * 6 + [11, 12, 13],
            'GROUP_waterfall': ['A'] * 3
            + ['Fruits', 'Berries', 'Berries', 'Vegetarian', 'Fruits', 'Vegetarian']
            + ['B'] * 3,
            'TYPE_waterfall': [
                None, None, None,
                'child', 'parent', 'child', 'child', 'parent', 'parent',
                None, None, None,
            ],
        }),
    )
def test_complex_rollup(sample_df: DataFrame):
    """A rollup combining two aggregations, an extra groupby column (YEAR) and
    custom label/level/parent column names should yield one row per hierarchy
    node per year."""
    # Add a constant column so the 'sum' aggregation also yields row counts.
    sample_df = sample_df.assign(COUNT=1)
    step = RollupStep(
        name='rollup',
        hierarchy=['CONTINENT', 'COUNTRY', 'CITY'],
        aggregations=[
            {
                'newcolumns': ['VALUE-sum', 'COUNT'],
                'aggfunction': 'sum',
                'columns': ['VALUE', 'COUNT'],
            },
            {'newcolumns': ['VALUE-avg'], 'aggfunction': 'avg', 'columns': ['VALUE']},
        ],
        groupby=['YEAR'],
        labelCol='MY_LABEL',
        levelCol='MY_LEVEL',
        parentLabelCol='MY_PARENT',
    )
    df_result = execute_rollup(step, sample_df)
    columns = [
        'CITY', 'COUNTRY', 'CONTINENT', 'YEAR',
        'MY_LABEL', 'MY_LEVEL', 'MY_PARENT',
        'VALUE-sum', 'VALUE-avg', 'COUNT',
    ]
    # Continent rows, then country rows, then city rows — each repeated per YEAR.
    expected_data = [
        [None, None, 'Europe', 2018, 'Europe', 'CONTINENT', None, 26, 6.5, 4],
        [None, None, 'North America', 2018, 'North America', 'CONTINENT', None, 50, 12.5, 4],
        [None, None, 'Europe', 2019, 'Europe', 'CONTINENT', None, 38, 9.5, 4],
        [None, None, 'North America', 2019, 'North America', 'CONTINENT', None, 62, 15.5, 4],
        [None, 'France', 'Europe', 2018, 'France', 'COUNTRY', 'Europe', 15, 7.5, 2],
        [None, 'Spain', 'Europe', 2018, 'Spain', 'COUNTRY', 'Europe', 11, 5.5, 2],
        [None, 'Canada', 'North America', 2018, 'Canada', 'COUNTRY', 'North America', 17, 8.5, 2],
        [None, 'USA', 'North America', 2018, 'USA', 'COUNTRY', 'North America', 33, 16.5, 2],
        [None, 'France', 'Europe', 2019, 'France', 'COUNTRY', 'Europe', 21, 10.5, 2],
        [None, 'Spain', 'Europe', 2019, 'Spain', 'COUNTRY', 'Europe', 17, 8.5, 2],
        [None, 'Canada', 'North America', 2019, 'Canada', 'COUNTRY', 'North America', 23, 11.5, 2],
        [None, 'USA', 'North America', 2019, 'USA', 'COUNTRY', 'North America', 39, 19.5, 2],
        ['Bordeaux', 'France', 'Europe', 2018, 'Bordeaux', 'CITY', 'France', 5, 5, 1],
        ['Paris', 'France', 'Europe', 2018, 'Paris', 'CITY', 'France', 10, 10, 1],
        ['Barcelona', 'Spain', 'Europe', 2018, 'Barcelona', 'CITY', 'Spain', 8, 8, 1],
        ['Madrid', 'Spain', 'Europe', 2018, 'Madrid', 'CITY', 'Spain', 3, 3, 1],
        ['Montreal', 'Canada', 'North America', 2018, 'Montreal', 'CITY', 'Canada', 10, 10, 1],
        ['Ottawa', 'Canada', 'North America', 2018, 'Ottawa', 'CITY', 'Canada', 7, 7, 1],
        ['Boston', 'USA', 'North America', 2018, 'Boston', 'CITY', 'USA', 12, 12, 1],
        ['New-York', 'USA', 'North America', 2018, 'New-York', 'CITY', 'USA', 21, 21, 1],
        ['Bordeaux', 'France', 'Europe', 2019, 'Bordeaux', 'CITY', 'France', 8, 8, 1],
        ['Paris', 'France', 'Europe', 2019, 'Paris', 'CITY', 'France', 13, 13, 1],
        ['Barcelona', 'Spain', 'Europe', 2019, 'Barcelona', 'CITY', 'Spain', 11, 11, 1],
        ['Madrid', 'Spain', 'Europe', 2019, 'Madrid', 'CITY', 'Spain', 6, 6, 1],
        ['Montreal', 'Canada', 'North America', 2019, 'Montreal', 'CITY', 'Canada', 10, 10, 1],
        ['Ottawa', 'Canada', 'North America', 2019, 'Ottawa', 'CITY', 'Canada', 13, 13, 1],
        ['Boston', 'USA', 'North America', 2019, 'Boston', 'CITY', 'USA', 15, 15, 1],
        ['New-York', 'USA', 'North America', 2019, 'New-York', 'CITY', 'USA', 24, 24, 1],
    ]
    expected_result = DataFrame(expected_data, columns=columns)
    assert_dataframes_equals(df_result, expected_result)
def test_date_extract_(sample_df: DataFrame):
    """Extracting all 30 supported date parts at once should populate one new
    column per requested part, with nulls propagated from the null input date."""
    step = DateExtractStep(
        name='dateextract',
        column='date',
        dateInfo=[
            'year', 'month', 'day', 'week', 'quarter', 'dayOfWeek', 'dayOfYear',
            'isoYear', 'isoWeek', 'isoDayOfWeek', 'firstDayOfYear', 'firstDayOfMonth',
            'firstDayOfWeek', 'firstDayOfQuarter', 'firstDayOfIsoWeek', 'previousDay',
            'firstDayOfPreviousYear', 'firstDayOfPreviousMonth', 'firstDayOfPreviousWeek',
            'firstDayOfPreviousQuarter', 'firstDayOfPreviousIsoWeek', 'previousYear',
            'previousMonth', 'previousWeek', 'previousQuarter', 'previousIsoWeek',
            'hour', 'minutes', 'seconds', 'milliseconds',
        ],
        # One output column name per dateInfo entry, in the same order.
        newColumns=[
            'date_year', 'date_month', 'date_day', 'date_week', 'date_quarter',
            'date_dayOfWeek', 'date_dayOfYear', 'date_isoYear', 'date_isoWeek',
            'date_isoDayOfWeek', 'date_firstDayOfYear', 'date_firstDayOfMonth',
            'date_firstDayOfWeek', 'date_firstDayOfQuarter', 'date_firstDayOfIsoWeek',
            'date_previousDay', 'date_firstDayOfPreviousYear', 'date_firstDayOfPreviousMonth',
            'date_firstDayOfPreviousWeek', 'date_firstDayOfPreviousQuarter',
            'date_firstDayOfPreviousIsoWeek', 'date_previousYear', 'date_previousMonth',
            'date_previousWeek', 'date_previousQuarter', 'date_previousIsoWeek',
            'date_hour', 'date_minutes', 'date_seconds', 'date_milliseconds',
        ],
    )
    df_result = execute_date_extract(step, sample_df)
    expected_result = DataFrame(
        {
            'date': to_datetime(
                [
                    '2021-03-29T00:00:00.000Z',
                    '2020-12-13T00:00:00.000Z',
                    '2020-07-29T00:00:00.000Z',
                    '2019-04-09T01:02:03.004Z',
                    '2017-01-02T00:00:00.000Z',
                    '2016-01-01T00:00:00.000Z',
                    None,
                ]
            ),
            'date_year': [2021, 2020, 2020, 2019, 2017, 2016, None],
            'date_month': [3, 12, 7, 4, 1, 1, None],
            'date_day': [29, 13, 29, 9, 2, 1, None],
            'date_week': [13, 50, 30, 14, 1, 0, None],
            'date_quarter': [1, 4, 3, 2, 1, 1, None],
            'date_dayOfWeek': [2, 1, 4, 3, 2, 6, None],
            'date_dayOfYear': [88, 348, 211, 99, 2, 1, None],
            # ISO fields can differ from the calendar ones around year boundaries
            # (e.g. 2016-01-01 belongs to ISO year 2015, week 53).
            'date_isoYear': [2021, 2020, 2020, 2019, 2017, 2015, None],
            'date_isoWeek': [13, 50, 31, 15, 1, 53, None],
            'date_isoDayOfWeek': [1, 7, 3, 2, 1, 5, None],
            'date_firstDayOfYear': to_datetime(
                [
                    "2021-01-01T00:00:00.000Z",
                    "2020-01-01T00:00:00.000Z",
                    "2020-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfMonth': to_datetime(
                [
                    "2021-03-01T00:00:00.000Z",
                    "2020-12-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            # 'firstDayOfWeek' uses Sunday-based weeks; 'firstDayOfIsoWeek' Monday-based.
            'date_firstDayOfWeek': to_datetime(
                [
                    "2021-03-28T00:00:00.000Z",
                    "2020-12-13T00:00:00.000Z",
                    "2020-07-26T00:00:00.000Z",
                    "2019-04-07T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2015-12-27T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfQuarter': to_datetime(
                [
                    "2021-01-01T00:00:00.000Z",
                    "2020-10-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfIsoWeek': to_datetime(
                [
                    "2021-03-29T00:00:00.000Z",
                    "2020-12-07T00:00:00.000Z",
                    "2020-07-27T00:00:00.000Z",
                    "2019-04-08T00:00:00.000Z",
                    "2017-01-02T00:00:00.000Z",
                    "2015-12-28T00:00:00.000Z",
                    None,
                ]
            ),
            'date_previousDay': to_datetime(
                [
                    "2021-03-28T00:00:00.000Z",
                    "2020-12-12T00:00:00.000Z",
                    "2020-07-28T00:00:00.000Z",
                    "2019-04-08T00:00:00.000Z",
                    "2017-01-01T00:00:00.000Z",
                    "2015-12-31T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousYear': to_datetime(
                [
                    "2020-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2018-01-01T00:00:00.000Z",
                    "2016-01-01T00:00:00.000Z",
                    "2015-01-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousMonth': to_datetime(
                [
                    "2021-02-01T00:00:00.000Z",
                    "2020-11-01T00:00:00.000Z",
                    "2020-06-01T00:00:00.000Z",
                    "2019-03-01T00:00:00.000Z",
                    "2016-12-01T00:00:00.000Z",
                    "2015-12-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousWeek': to_datetime(
                [
                    "2021-03-21T00:00:00.000Z",
                    "2020-12-06T00:00:00.000Z",
                    "2020-07-19T00:00:00.000Z",
                    "2019-03-31T00:00:00.000Z",
                    "2016-12-25T00:00:00.000Z",
                    "2015-12-20T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousQuarter': to_datetime(
                [
                    "2020-10-01T00:00:00.000Z",
                    "2020-07-01T00:00:00.000Z",
                    "2020-04-01T00:00:00.000Z",
                    "2019-01-01T00:00:00.000Z",
                    "2016-10-01T00:00:00.000Z",
                    "2015-10-01T00:00:00.000Z",
                    None,
                ]
            ),
            'date_firstDayOfPreviousIsoWeek': to_datetime(
                [
                    "2021-03-22T00:00:00.000Z",
                    "2020-11-30T00:00:00.000Z",
                    "2020-07-20T00:00:00.000Z",
                    "2019-04-01T00:00:00.000Z",
                    "2016-12-26T00:00:00.000Z",
                    "2015-12-21T00:00:00.000Z",
                    None,
                ]
            ),
            'date_previousYear': [2020, 2019, 2019, 2018, 2016, 2015, None],
            'date_previousMonth': [2, 11, 6, 3, 12, 12, None],
            'date_previousQuarter': [4, 3, 2, 1, 4, 4, None],
            'date_previousWeek': [12, 49, 29, 13, 52, 51, None],
            'date_previousIsoWeek': [12, 49, 30, 14, 52, 52, None],
            'date_hour': [0, 0, 0, 1, 0, 0, None],
            'date_minutes': [0, 0, 0, 2, 0, 0, None],
            'date_seconds': [0, 0, 0, 3, 0, 0, None],
            'date_milliseconds': [0, 0, 0, 4, 0, 0, None],
        }
    )
    assert_dataframes_equals(df_result, expected_result)
    # Ensure there are no unsigned int types in result:
    assert UInt32Dtype() not in list(df_result.dtypes)
def test_duplicate():
    """Duplicating a column should create an identical copy under the new name."""
    input_df = DataFrame({'x': [100, 200]})
    step = DuplicateStep(name='duplicate', column='x', new_column_name='y')
    df_result = execute_duplicate(step, input_df)
    assert_dataframes_equals(df_result, DataFrame({'x': [100, 200], 'y': [100, 200]}))
def test_convert_to_text(sample_df: DataFrame):
    """Converting to text should stringify every value, including None."""
    step = ConvertStep(name='convert', columns=['value'], data_type='text')
    df_result = execute_convert(step, sample_df)
    expected = DataFrame({'value': ['41', '42', '43.5', '43.6', 'None', 'meh']})
    assert_dataframes_equals(df_result, expected)
def test_convert_to_integer(sample_df: DataFrame):
    """Converting to integer should truncate floats and null non-convertible values."""
    step = ConvertStep(name='convert', columns=['value'], data_type='integer')
    df_result = execute_convert(step, sample_df)
    expected = DataFrame({'value': [41, 42, 43, 43, None, None]})
    assert_dataframes_equals(df_result, expected)
def test_convert_to_float(sample_df: DataFrame):
    """Converting to float should parse numbers and null non-convertible values."""
    step = ConvertStep(name='convert', columns=['value'], data_type='float')
    df_result = execute_convert(step, sample_df)
    expected = DataFrame({'value': [41.0, 42.0, 43.5, 43.6, None, None]})
    assert_dataframes_equals(df_result, expected)