Ejemplo n.º 1
0
    def test_create_nested_transform_pipeline_with_variable_subset(self):
        """Combine a subset-restricted transformation pipeline with source three.

        Source 'two' is wrapped in a transformation pipeline limited to the
        variables of its first two columns, then combined with source 'three'
        using ``CombineOptions(rows=False)``. The resulting frame and the
        order of pipeline operations are both checked.
        """
        # Source 'two', fed through a transformation pipeline on two variables.
        self.create_csv_for_2()
        columns_two = self.create_indexed_columns_for_2()
        # Three-way unpacking is kept so a wrong column count still fails here.
        col_a, col_b, _ = columns_two
        source_two = self.create_source(
            df=None,
            location=self.csv_path2,
            columns=columns_two,
            name='two',
        )
        transform_pipeline = self.create_transformation_pipeline(
            source=source_two,
            subset=[col_a.variable, col_b.variable],
        )

        # Source 'three', used directly.
        self.create_csv_for_3()
        columns_three = self.create_indexed_columns_for_3()
        source_three = self.create_source(
            df=None,
            location=self.csv_path3,
            columns=columns_three,
            name='three',
        )

        combine_pipeline = self.create_combine_pipeline(
            data_sources=[transform_pipeline, source_three],
            combine_options_list=[CombineOptions(rows=False)],
        )
        combine_pipeline.execute()

        assert_frame_equal(combine_pipeline.df, self.expect_combined_cols_2_3)
        self.assert_ordered_pipeline_operations(
            combine_pipeline, [transform_pipeline, combine_pipeline])
Ejemplo n.º 2
0
    def test_create_nested_transform_pipeline(self):
        """Executing the nested combine without a variable subset must fail.

        Source 'two' is wrapped in a transformation pipeline (no subset) and
        combined with source 'three' using ``CombineOptions(rows=False)``.
        ``execute`` is expected to raise ``ValueError`` with a message about
        overlapping indices.
        """
        self.create_csv_for_2()
        ds2_cols = self.create_indexed_columns_for_2()
        ds2 = self.create_source(df=None,
                                 location=self.csv_path2,
                                 columns=ds2_cols,
                                 name='two')
        dtp = self.create_transformation_pipeline(source=ds2)

        self.create_csv_for_3()
        ds3_cols = self.create_indexed_columns_for_3()
        ds3 = self.create_source(df=None,
                                 location=self.csv_path3,
                                 columns=ds3_cols,
                                 name='three')

        co = CombineOptions(rows=False)
        dp2 = self.create_combine_pipeline(data_sources=[dtp, ds3],
                                           combine_options_list=[co])

        with self.assertRaises(ValueError) as cm:
            dp2.execute()

        # The message check lives outside the ``with`` block: statements that
        # follow the raising call inside the block are never executed, so the
        # assertion would otherwise be silently skipped.
        exc = cm.exception
        assert 'can only combine columns of data sources with overlapping indices. Column c has' in str(
            exc)
Ejemplo n.º 3
0
    def test_create_and_run_combine_cols_pipeline_with_indices(self):
        """Run the default indexed combine pipeline with ``rows=False``.

        Checks the resulting frame against the expected merge and that every
        pipeline operation is linked to the pipeline.
        """
        column_combine = CombineOptions(rows=False)
        pipeline = self.create_combine_pipeline(
            indexed=True,
            combine_options_list=[column_combine],
        )

        pipeline.execute()

        assert_frame_equal(pipeline.df, self.expect_merged_1_2_c_index)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
Ejemplo n.º 4
0
    def test_create_and_run_combine_cols_pipeline_from_sources(self):
        """Combining the default non-indexed sources with ``rows=False`` must fail.

        ``execute`` is expected to raise ``ValueError`` whose message mentions
        that something exists in multiple data sources.
        """
        co = CombineOptions(rows=False)
        dp = self.create_combine_pipeline(combine_options_list=[co])

        with self.assertRaises(ValueError) as cm:
            dp.execute()

        # Checked after the ``with`` block; inside it, code following the
        # raising call is unreachable and the assertion would never run.
        exc = cm.exception
        assert 'exists in multiple data sources' in str(exc)
Ejemplo n.º 5
0
    def test_create_and_run_combine_rows_drop_rows_pipeline_from_sources(self):
        """Run the default combine pipeline with ``row_duplicate_vars`` set.

        The third variable returned by ``create_variables`` is passed as the
        only entry of ``row_duplicate_vars``; the resulting frame and the
        pipeline link of every operation are then verified.
        """
        # Only the third variable is needed; the unpacking still enforces
        # that exactly three values come back.
        _, _, var_c = self.create_variables()
        drop_options = CombineOptions(row_duplicate_vars=[var_c])
        pipeline = self.create_combine_pipeline(
            combine_options_list=[drop_options])

        pipeline.execute()

        assert_frame_equal(pipeline.df,
                           self.expect_combined_rows_1_2_row_drop_c)
        self.assert_all_pipeline_operations_have_pipeline(pipeline)
Ejemplo n.º 6
0
    def test_raises_error_for_mismatching_data_sources_merge_options(self):
        """A pipeline with too few combine options must fail on execute.

        The pipeline is built with ``include_indices=(0, 1, 2)`` but only a
        single ``CombineOptions``; ``execute`` is expected to raise
        ``ValueError`` stating that there must be one fewer combine options
        than data sources.
        """
        co = CombineOptions()
        dp = self.create_combine_pipeline(include_indices=(0, 1, 2),
                                          combine_options_list=[co])

        with self.assertRaises(ValueError) as cm:
            dp.execute()

        # Checked after the ``with`` block; inside it, code following the
        # raising call is unreachable and the assertion would never run.
        exc = cm.exception
        assert 'must have one fewer combine options than data sources' in str(
            exc)
Ejemplo n.º 7
0
    def create_combine_pipeline(
            self,
            include_indices: Sequence[int] = (0, 1),
            data_sources: Optional[Sequence[DataSource]] = None,
            combine_options_list: Optional[Sequence[CombineOptions]] = None,
            indexed: bool = False,
            all_option_config: Optional[Dict[str, Any]] = None,
            last_option_config: Optional[Dict[str, Any]] = None,
            pipeline_kwargs: Optional[Dict[str, Any]] = None,
            create_csv: bool = True):
        """Build (without executing) a ``DataCombinationPipeline`` for tests.

        Args:
            include_indices: Positions (0, 1, 2 for sources 'one', 'two',
                'three') of the sources to keep when they are built here.
                Not consulted when ``data_sources`` is passed.
            data_sources: Ready-made sources to combine; used as given.
            combine_options_list: Options handed to the pipeline. When
                omitted, one ``CombineOptions`` built from
                ``all_option_config`` is repeated once per combine step
                (number of selected sources minus one).
            indexed: Pick the ``create_indexed_columns*`` helpers instead of
                the ``create_columns*`` helpers when building sources.
            all_option_config: Keyword arguments for the default
                ``CombineOptions``. Defaults to ``result_kwargs`` holding the
                de-duplicated columns of the selected ``DataSource`` objects.
            last_option_config: Attributes set on the last entry of the
                options list. Defaults to ``out_path=self.csv_path_output``.
            pipeline_kwargs: Extra keyword arguments for
                ``DataCombinationPipeline``.
            create_csv: Whether to call the three CSV-creating helpers before
                building the sources (only relevant when sources are built
                here).

        Returns:
            The constructed ``DataCombinationPipeline``.
        """
        # The column factories are looked up up front, even when the caller
        # supplies its own data sources.
        if indexed:
            make_cols_1 = self.create_indexed_columns
            make_cols_2 = self.create_indexed_columns_for_2
            make_cols_3 = self.create_indexed_columns_for_3
        else:
            make_cols_1 = self.create_columns
            make_cols_2 = self.create_columns_for_2
            make_cols_3 = self.create_columns_for_3

        if data_sources is not None:
            selected_data_sources = data_sources
        else:
            if create_csv:
                self.create_csv()
                self.create_csv_for_2()
                self.create_csv_for_3()

            # (name, column factory, attribute holding the CSV location);
            # source 'one' is created without an explicit location.
            source_specs = (
                ('one', make_cols_1, None),
                ('two', make_cols_2, 'csv_path2'),
                ('three', make_cols_3, 'csv_path3'),
            )
            built_sources = []
            for label, make_cols, path_attr in source_specs:
                cols = make_cols()
                if path_attr is None:
                    source = self.create_source(df=None,
                                                columns=cols,
                                                name=label)
                else:
                    source = self.create_source(
                        df=None,
                        location=getattr(self, path_attr),
                        columns=cols,
                        name=label)
                built_sources.append(source)

            selected_data_sources = [
                source for position, source in enumerate(built_sources)
                if position in include_indices
            ]

        if all_option_config is None:
            # Union of columns in first-seen order; membership is tested on a
            # list so columns are compared by equality, not by hash.
            unique_cols = []
            for source in selected_data_sources:
                if not isinstance(source, DataSource):
                    continue
                if source.columns is None:
                    continue
                for col in source.columns:
                    if col not in unique_cols:
                        unique_cols.append(col)
            all_option_config = {'result_kwargs': {'columns': unique_cols}}
        if last_option_config is None:
            last_option_config = {'out_path': self.csv_path_output}
        if pipeline_kwargs is None:
            pipeline_kwargs = {}

        if combine_options_list is None:
            # NOTE(review): the same CombineOptions instance fills every slot,
            # so the attributes applied to the "last" entry below are visible
            # through all entries of this default list.
            shared_options = CombineOptions(**all_option_config)
            combine_options_list = (
                [shared_options] * (len(selected_data_sources) - 1))

        # NOTE(review): an empty options list makes the [-1] lookup fail as
        # soon as last_option_config has at least one entry.
        for attr_name, attr_value in last_option_config.items():
            setattr(combine_options_list[-1], attr_name, attr_value)

        return DataCombinationPipeline(selected_data_sources,
                                       combine_options_list,
                                       **pipeline_kwargs)