def test_1_1_permutations(self, tmp_path, engine):
    """Round-trip a Dataset with a flat header and flat index through
    Excel under every header/index on-off permutation."""
    dset = mp.Dataset(pd.DataFrame(data, columns=reg_columns, index=reg_index))

    # One file per (header, index) combination.
    for fname, write_header, write_index in [
        ("dset_1_0.xlsx", True, False),
        ("dset_0_1.xlsx", False, True),
        ("dset_1_1.xlsx", True, True),
        ("dset_0_0.xlsx", False, False),
    ]:
        dset.to_excel(
            tmp_path / fname, engine=engine, header=write_header, index=write_index
        )

    # 1 header, 1 index: parses back unchanged.
    parsed = mp.read_excel(tmp_path / "dset_1_1.xlsx")
    assert_frame_equal(dset, parsed)

    # 1 header, 0 index: restore the index before comparing.
    parsed = mp.read_excel(tmp_path / "dset_1_0.xlsx")
    parsed.index = reg_index
    assert_frame_equal(dset, parsed)

    # 0 header, 1 index: if no header, pandas inserts a default index
    # name of 0, so clear it; restore the columns as well.
    parsed = mp.read_excel(tmp_path / "dset_0_1.xlsx")
    parsed.index.name = None
    parsed.columns = reg_columns
    assert_frame_equal(dset, parsed)

    # 0 header, 0 index: restore both index and columns.
    parsed = mp.read_excel(tmp_path / "dset_0_0.xlsx")
    parsed.index = reg_index
    parsed.columns = reg_columns
    assert_frame_equal(dset, parsed)
def test_2_1(self, tmp_path, engine):
    """A Dataset with MultiIndex columns and a flat index survives a
    full Excel round trip (header and index both written)."""
    expected = mp.Dataset(pd.DataFrame(data, columns=mi_columns, index=reg_index))
    target = tmp_path / "dset_2_1.xlsx"

    expected.to_excel(target, engine=engine, header=True, index=True)

    # 2 header rows, 1 index column: should parse back unchanged.
    round_tripped = mp.read_excel(target)
    assert_frame_equal(expected, round_tripped)
def test_1_2(self, tmp_path, engine):
    """A Dataset with flat columns and a MultiIndex index survives a
    full Excel round trip (header and index both written)."""
    expected = mp.Dataset(pd.DataFrame(data, columns=reg_columns, index=mi_index))
    target = tmp_path / "dset_1_2.xlsx"

    expected.to_excel(target, engine=engine, header=True, index=True)

    # 1 header row, 2 index columns: should parse back unchanged.
    round_tripped = mp.read_excel(target)
    assert_frame_equal(expected, round_tripped)
def test_basiclist(tmp_path):
    """A single Dataset, and a BasicList of two Datasets, both survive an
    Excel round trip through MACPieExcelWriter/MACPieExcelFile."""
    # A Dataset created without a name is written under the default sheet
    # name "NO_NAME".
    unnamed = mp.Dataset({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
    unnamed.to_excel(tmp_path / "dset1.xlsx")

    with mp.MACPieExcelFile(tmp_path / "dset1.xlsx") as reader:
        unnamed_round_trip = mp.read_excel(reader, sheet_name="NO_NAME")
    assert unnamed.equals(unnamed_round_trip)

    named = mp.Dataset(
        {"A": [1, 2, 3], "albert": [4, 5, 6], "C": [7, 8, 9]},
        id_col_name="albert",
        name="renee",
        tags=["a", "b"],
    )

    collection = mp.BasicList([unnamed, named])
    with mp.MACPieExcelWriter(tmp_path / "basic_list.xlsx") as writer:
        collection.to_excel(writer)

    with mp.MACPieExcelFile(tmp_path / "basic_list.xlsx") as reader:
        collection_round_trip = mp.read_excel(reader, as_collection=True)

    # Both members come back, in order, with their data intact.
    assert len(collection_round_trip) == 2
    assert collection_round_trip[0].equals(unnamed)
    assert collection_round_trip[1].equals(named)
def test_display_name_generator():
    """display_name should reflect the active display_name_generator.

    The default generator joins the Dataset name with its tags; swapping in
    MergeableAnchoredList's generator drops the tags.
    """
    dset = mp.Dataset(
        {"A": [1, 2, 3], "albert": [4, 5, 6], "C": [7, 8, 9]},
        id_col_name="albert",
        name="renee",
        tags=["a", "b"],
    )

    # BUG FIX: these were bare `==` comparisons (no-op expression
    # statements), so the test could never fail; they must be assertions.
    assert dset.display_name == "renee_a_b"

    dset.display_name_generator = mp.MergeableAnchoredList.dataset_display_name_generator
    assert dset.display_name == "renee"
def group_by_keep_one(
    dset: mp.Dataset, keep: str = "all", drop_duplicates: bool = False
) -> mp.Dataset:
    """Given a :class:`Dataset` object, group on the :attr:`Dataset.id2_col_name`
    column and keep only the earliest or latest row in each group as determined
    by the date in the :attr:`Dataset.date_col_name` column.

    This is the :class:`Dataset` analog of :func:`macpie.pandas.group_by_keep_one`.

    :param dset: the :class:`Dataset` to operate on. It is not modified;
        a new :class:`Dataset` holding the result is returned.
    :param keep: specify which row of each group to keep

        ``all``
            keep all rows

        ``earliest``
            in each group, keep only the earliest (i.e. oldest) row

        ``latest``
            in each group, keep only the latest (i.e. most recent) row

    :param drop_duplicates: if ``True``, then if more than one row is determined
        to be earliest or latest in each group, drop all duplicates except the
        first occurrence. ``dset``'s ``id_col_name`` will be used for
        identifying duplicates
    :return: a new :class:`Dataset` containing the kept rows
    """
    # BUG FIX (docs): the return annotation said ``-> None`` and the docstring
    # claimed the input's ``df`` attribute was updated in place, but the code
    # has always returned a brand-new Dataset built from the operator result.
    result_df = mp.pandas.operators.group_by_keep_one.group_by_keep_one(
        df=dset,
        group_by_col=dset.id2_col_name,
        date_col_name=dset.date_col_name,
        keep=keep,
        id_col_name=dset.id_col_name,
        drop_duplicates=drop_duplicates,
    )
    return mp.Dataset(data=result_df)
def test_2_2_permutations(self, tmp_path, engine):
    """Round-trip a Dataset with MultiIndex columns AND a MultiIndex index
    through Excel under every header/index permutation, in both the default
    (merged-cells) layout and the legacy merge_cells=False layout."""
    dset = mp.Dataset(pd.DataFrame(data, columns=mi_columns, index=mi_index))

    permutations = [
        ("2_2", True, True),
        ("2_0", True, False),
        ("0_2", False, True),
        ("0_0", False, False),
    ]

    # --- default layout (merged cells) ---
    for tag, write_header, write_index in permutations:
        dset.to_excel(
            tmp_path / f"dset_{tag}.xlsx",
            engine=engine,
            header=write_header,
            index=write_index,
        )

    # 2 header rows, 2 index columns: parses back unchanged.
    parsed = mp.read_excel(tmp_path / "dset_2_2.xlsx")
    assert_frame_equal(dset, parsed)

    # 2 header rows, 0 index columns: restore the index.
    parsed = mp.read_excel(tmp_path / "dset_2_0.xlsx")
    parsed.index = mi_index
    assert_frame_equal(dset, parsed)

    # 0 header rows, 2 index columns: restore the index names and columns.
    parsed = mp.read_excel(tmp_path / "dset_0_2.xlsx")
    parsed.index.names = mi_index.names
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)

    # 0 header rows, 0 index columns: restore both.
    parsed = mp.read_excel(tmp_path / "dset_0_0.xlsx")
    parsed.index = mi_index
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)

    # --- legacy layout (merge_cells=False) ---
    for tag, write_header, write_index in permutations:
        dset.to_excel(
            tmp_path / f"dset_{tag}_no_merge.xlsx",
            engine=engine,
            merge_cells=False,
            header=write_header,
            index=write_index,
        )

    # 2 header rows, 2 index columns: columns need restoring here.
    parsed = mp.read_excel(tmp_path / "dset_2_2_no_merge.xlsx")
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)

    # 2 header rows, 0 index columns.
    parsed = mp.read_excel(tmp_path / "dset_2_0_no_merge.xlsx")
    parsed.index = mi_index
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)

    # 0 header rows, 2 index columns.
    parsed = mp.read_excel(tmp_path / "dset_0_2_no_merge.xlsx")
    parsed.index.names = mi_index.names
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)

    # 0 header rows, 0 index columns.
    parsed = mp.read_excel(tmp_path / "dset_0_0_no_merge.xlsx")
    parsed.index = mi_index
    parsed.columns = mi_columns
    assert_frame_equal(dset, parsed)
# Directory containing this test module; not referenced in the visible
# chunk — presumably used by fixtures/tests elsewhere in the file.
current_dir = Path(__file__).parent.absolute()

# Shared fixture data: three rows mixing ints and date strings; the "ids"
# and "date"/"date2" columns feed the Dataset id/date metadata below.
data = [
    [1, 4, 7, "1/1/2001", 1, "1/2/2001"],
    [2, 5, 8, "2/2/2002", 2, "2/3/2003"],
    [3, 6, 9, "3/3/2003", 3, "3/4/2003"],
]

# Flat (regular) columns/index and the Dataset built from them.
reg_columns = ["col1", "col2", "col3", "date", "ids", "date2"]
reg_index = [1, 2, 3]
reg_df = pd.DataFrame(data, index=reg_index, columns=reg_columns)
reg_dset = mp.Dataset(reg_df, id_col_name="ids", date_col_name="date")

# MultiIndex variants of the same data: one extra "level" on the columns,
# and 2-tuple row labels on the index.
mi_columns = pd.MultiIndex.from_product([["level"], reg_columns])
mi_index = pd.MultiIndex.from_tuples([("a", 1), ("b", 2), ("c", 3)])
mi_df = pd.DataFrame(data=data, index=mi_index, columns=mi_columns)
mi_dset = mp.Dataset(
    mi_df, id_col_name=("level", "ids"), date_col_name=("level", "date"), name="mi_test_name"
)


# Every test in this class runs once per supported Excel engine.
@pytest.mark.parametrize("engine", ["openpyxl", "xlsxwriter"])
class TestPandasExcel:
    def test_dfs(self, tmp_path, engine):
csvstream = io.StringIO(csvdata) df = pd.read_csv(csvstream) csvstream.close() return df start = timer() df_orig = load_csv() end = timer() print(f"load csv to df: {end - start} sec") with tempfile.TemporaryDirectory() as tmpdirname: tmpdirname = pathlib.Path(tmpdirname) dset = mp.Dataset(data=df_orig.copy()) start = timer() dset.to_excel(tmpdirname / "dset_xlsxwriter.xlsx", engine="mp_xlsxwriter") end = timer() print(f"dset xlsxwriter: {end - start} sec") dset = mp.Dataset(data=df_orig.copy()) start = timer() dset.to_excel(tmpdirname / "dset_openpyxl.xlsx", engine="mp_openpyxl") end = timer() print(f"dset openpyxl: {end - start} sec") df = df_orig.copy() start = timer() df.to_excel(tmpdirname / "df_xlsxwriter.xlsx", engine="xlsxwriter") end = timer()
def date_proximity(
    left: mp.Dataset,
    right: mp.Dataset,
    get: str = "all",
    when: str = "earlier_or_later",
    days: int = 90,
    dropna: bool = False,
    drop_duplicates: bool = False,
    duplicates_indicator: bool = False,
    merge_suffixes=get_option("operators.binary.column_suffixes"),
    prepend_level_name: bool = True,
) -> mp.Dataset:
    """Links data across two :class:`Dataset` objects by date proximity,
    first joining them on their :attr:`Dataset.id2_col_name`.

    Specifically, a `left` Dataset contains a timepoint anchor, and a `right`
    Dataset is linked to the `left` by retrieving all rows that match on
    :attr:`Dataset.id2_col_name`, and whose :attr:`Dataset.date_col_name`
    fields are within a certain time range of each other.

    This is the :class:`Dataset` analog of :func:`macpie.pandas.date_proximity`.

    :param left: the :class:`Dataset` containing the timepoint anchor
    :param right: the :class:`Dataset` to link. Neither input is modified;
        a new :class:`Dataset` holding the linked result is returned.
    :param get: which rows of the right :class:`Dataset` to link in reference
        to the timepoint anchor:

        ``all``
            keep all rows

        ``closest``
            get only the closest row that is within ``days`` days of the
            right DataFrame timepoint anchor

    :param when: which rows of the right Dataset to link in temporal relation
        to the timepoint anchor

        ``earlier``
            get only rows that are earlier than the timepoint anchor

        ``later``
            get only rows that are later (more recent) than the timepoint anchor

        ``earlier_or_later``
            get rows that are earlier or later than the timepoint anchor

    :param days: the time range measured in days
    :param dropna: whether to exclude rows that did not find any match
    :param drop_duplicates: if ``True``, drop all duplicate matched rows
        except the first occurrence
    :param duplicates_indicator: if True, adds a boolean column to the output
        Dataset called "_mp_duplicates" (True if duplicate, false if not).
        The column can be given a different name by providing a string argument.
    :param merge_suffixes: A length-2 sequence where the first element is
        suffix to add to the left Dataset columns, and second element is
        suffix to add to the right Dataset columns.
    :param prepend_level_name: if ``True`` (default), each side's Dataset
        ``name`` is passed as a column level prefix to the underlying
        operator, and the returned Dataset's id/date/id2 column names are
        prefixed accordingly.
    :return: a new :class:`Dataset` with the linked result, carrying
        ``right``'s name and (possibly level-prefixed) id/date/id2 columns.
    """
    # BUG FIX (docs): the return annotation said ``-> None`` and the docstring
    # claimed the right Dataset's ``df`` attribute was updated in place, but
    # the code has always returned a brand-new Dataset. Also fixed the
    # "lter" -> "later" typo in the ``when`` documentation.

    # (None, None) tells the operator to leave column labels flat/unchanged.
    if prepend_level_name:
        prepend_levels = (left.name, right.name)
    else:
        prepend_levels = (None, None)

    result_df = mp.pandas.operators.date_proximity.date_proximity(
        left,
        right,
        id_left_on=left.id2_col_name,
        id_right_on=right.id2_col_name,
        date_left_on=left.date_col_name,
        date_right_on=right.date_col_name,
        get=get,
        when=when,
        days=days,
        left_link_id=left.id_col_name,
        dropna=dropna,
        drop_duplicates=drop_duplicates,
        duplicates_indicator=duplicates_indicator,
        merge="partial",
        merge_suffixes=merge_suffixes,
        prepend_levels=prepend_levels,
    )

    # When levels were prepended, the right-hand metadata columns now live
    # under the (right.name, ...) level, so recompute their names to match.
    if prepend_level_name:
        new_id_col_name = (right.name, right.id_col_name)
        new_date_col_name = (right.name, right.date_col_name)
        new_id2_col_name = (right.name, right.id2_col_name)
    else:
        new_id_col_name = right.id_col_name
        new_date_col_name = right.date_col_name
        new_id2_col_name = right.id2_col_name

    return mp.Dataset(
        result_df,
        id_col_name=new_id_col_name,
        date_col_name=new_date_col_name,
        id2_col_name=new_id2_col_name,
        name=right.name,
    )