Example #1
    def test_1_1_permutations(self, tmp_path, engine):
        df_1_1 = pd.DataFrame(data, columns=reg_columns, index=reg_index)

        dset_1_1 = mp.Dataset(df_1_1)

        dset_1_1.to_excel(tmp_path / "dset_1_0.xlsx", engine=engine, header=True, index=False)
        dset_1_1.to_excel(tmp_path / "dset_0_1.xlsx", engine=engine, header=False, index=True)
        dset_1_1.to_excel(tmp_path / "dset_1_1.xlsx", engine=engine, header=True, index=True)
        dset_1_1.to_excel(tmp_path / "dset_0_0.xlsx", engine=engine, header=False, index=False)

        # 1_1: test 1 header 1 index
        dset_1_1_parsed = mp.read_excel(tmp_path / "dset_1_1.xlsx")
        assert_frame_equal(dset_1_1, dset_1_1_parsed)

        # 1_0: test 1 header 0 index
        dset_1_1_parsed = mp.read_excel(tmp_path / "dset_1_0.xlsx")
        dset_1_1_parsed.index = reg_index
        assert_frame_equal(dset_1_1, dset_1_1_parsed)

        # 0_1: test 0 header 1 index
        dset_1_1_parsed = mp.read_excel(tmp_path / "dset_0_1.xlsx")
        dset_1_1_parsed.index.name = None  # if no header, pandas inserts a default index name of 0
        dset_1_1_parsed.columns = reg_columns
        assert_frame_equal(dset_1_1, dset_1_1_parsed)

        # 0_0: test 0 header 0 index
        dset_1_1_parsed = mp.read_excel(tmp_path / "dset_0_0.xlsx")
        dset_1_1_parsed.index = reg_index
        dset_1_1_parsed.columns = reg_columns
        assert_frame_equal(dset_1_1, dset_1_1_parsed)
Example #2
    def test_2_1(self, tmp_path, engine):
        df_2_1 = pd.DataFrame(data, columns=mi_columns, index=reg_index)
        dset_2_1 = mp.Dataset(df_2_1)
        dset_2_1.to_excel(tmp_path / "dset_2_1.xlsx", engine=engine, header=True, index=True)

        # 2_1: test 2 header 1 index
        dset_2_1_parsed = mp.read_excel(tmp_path / "dset_2_1.xlsx")
        assert_frame_equal(dset_2_1, dset_2_1_parsed)
Example #3
    def test_1_2(self, tmp_path, engine):
        df_1_2 = pd.DataFrame(data, columns=reg_columns, index=mi_index)
        dset_1_2 = mp.Dataset(df_1_2)
        dset_1_2.to_excel(tmp_path / "dset_1_2.xlsx", engine=engine, header=True, index=True)

        # 1_2: test 1 header 2 index
        dset_1_2_parsed = mp.read_excel(tmp_path / "dset_1_2.xlsx")
        assert_frame_equal(dset_1_2, dset_1_2_parsed)
Example #4
def test_basiclist(tmp_path):
    dset1 = mp.Dataset({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})

    dset1.to_excel(tmp_path / "dset1.xlsx")

    with mp.MACPieExcelFile(tmp_path / "dset1.xlsx") as reader:
        dset1_from_file = mp.read_excel(reader, sheet_name="NO_NAME")
    # dset1_from_file = mp.read_excel(tmp_path / "dset1.xlsx", sheet_name="NO_NAME")

    assert dset1.equals(dset1_from_file)

    dset2 = mp.Dataset(
        {
            "A": [1, 2, 3],
            "albert": [4, 5, 6],
            "C": [7, 8, 9]
        },
        id_col_name="albert",
        name="renee",
        tags=["a", "b"],
    )

    basic_list = mp.BasicList([dset1, dset2])

    with mp.MACPieExcelWriter(tmp_path / "basic_list.xlsx") as writer:
        basic_list.to_excel(writer)

    with mp.MACPieExcelFile(tmp_path / "basic_list.xlsx") as reader:
        basic_list_from_file = mp.read_excel(reader, as_collection=True)
    # basic_list_from_file = mp.read_excel(tmp_path / "basic_list.xlsx", as_collection=True)

    assert len(basic_list_from_file) == 2

    assert basic_list_from_file[0].equals(dset1)

    assert basic_list_from_file[1].equals(dset2)
Example #5
def test_display_name_generator():
    dset = mp.Dataset(
        {
            "A": [1, 2, 3],
            "albert": [4, 5, 6],
            "C": [7, 8, 9]
        },
        id_col_name="albert",
        name="renee",
        tags=["a", "b"],
    )

    assert dset.display_name == "renee_a_b"
    dset.display_name_generator = mp.MergeableAnchoredList.dataset_display_name_generator
    assert dset.display_name == "renee"
Example #6
def group_by_keep_one(dset: mp.Dataset,
                      keep: str = "all",
                      drop_duplicates: bool = False) -> mp.Dataset:
    """Given a :class:`Dataset` object, group on the :attr:`Dataset.id2_col_name` column
    and keep only the earliest or latest row in each group as determined by the date
    in the :attr:`Dataset.date_col_name` column.

    This is the :class:`Dataset` analog of :func:`macpie.pandas.group_by_keep_one`.

    :param dset: the :class:`Dataset` to operate on
    :param keep: specify which row of each group to keep

        ``all``
             keep all rows

        ``earliest``
             in each group, keep only the earliest (i.e. oldest) row

        ``latest``
             in each group, keep only the latest (i.e. most recent) row

    :param drop_duplicates: if ``True``, then if more than one row is determined to be
                            earliest or latest in each group, drop all duplicates
                            except the first occurrence. ``dset``'s ``id_col_name`` will
                            be used for identifying duplicates
    :return: a new :class:`Dataset` containing the result of this operation
    """
    result_df = mp.pandas.operators.group_by_keep_one.group_by_keep_one(
        df=dset,
        group_by_col=dset.id2_col_name,
        date_col_name=dset.date_col_name,
        keep=keep,
        id_col_name=dset.id_col_name,
        drop_duplicates=drop_duplicates,
    )

    return mp.Dataset(data=result_df)
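Below is a minimal usage sketch of the function above. The column names ("pidn", "dcdate", "instrid") and the id2_col_name constructor keyword are illustrative assumptions and are not taken from this listing.

import pandas as pd

import macpie as mp

# Hypothetical visit data; all column names here are assumptions for illustration.
visits = mp.Dataset(
    pd.DataFrame(
        {
            "pidn": [1, 1, 2],
            "dcdate": pd.to_datetime(["1/1/2001", "2/1/2001", "3/3/2003"]),
            "instrid": [10, 11, 12],
        }
    ),
    id_col_name="instrid",
    date_col_name="dcdate",
    id2_col_name="pidn",  # assumed keyword mirroring the Dataset.id2_col_name attribute
)

# Keep only the most recent row within each "pidn" group.
latest_visits = group_by_keep_one(visits, keep="latest")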
Example #7
    def test_2_2_permutations(self, tmp_path, engine):
        df_2_2 = pd.DataFrame(data, columns=mi_columns, index=mi_index)

        dset_2_2 = mp.Dataset(df_2_2)

        dset_2_2.to_excel(tmp_path / "dset_2_2.xlsx", engine=engine, header=True, index=True)
        dset_2_2.to_excel(tmp_path / "dset_2_0.xlsx", engine=engine, header=True, index=False)
        dset_2_2.to_excel(tmp_path / "dset_0_2.xlsx", engine=engine, header=False, index=True)
        dset_2_2.to_excel(tmp_path / "dset_0_0.xlsx", engine=engine, header=False, index=False)

        # 2_2: test 2 header 2 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_2_2.xlsx")
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 2_0: test 2 header 0 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_2_0.xlsx")
        dset_2_2_parsed.index = mi_index
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 0_2: test 0 header 2 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_0_2.xlsx")
        dset_2_2_parsed.index.names = mi_index.names
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 0_0: test 0 header 0 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_0_0.xlsx")
        dset_2_2_parsed.index = mi_index
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # test legacy format of merge_cells=False
        dset_2_2.to_excel(
            tmp_path / "dset_2_2_no_merge.xlsx",
            engine=engine,
            merge_cells=False,
            header=True,
            index=True,
        )
        dset_2_2.to_excel(
            tmp_path / "dset_2_0_no_merge.xlsx",
            engine=engine,
            merge_cells=False,
            header=True,
            index=False,
        )
        dset_2_2.to_excel(
            tmp_path / "dset_0_2_no_merge.xlsx",
            engine=engine,
            merge_cells=False,
            header=False,
            index=True,
        )
        dset_2_2.to_excel(
            tmp_path / "dset_0_0_no_merge.xlsx",
            engine=engine,
            merge_cells=False,
            header=False,
            index=False,
        )

        # 2_2: test 2 header 2 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_2_2_no_merge.xlsx")
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 2_0: test 2 header 0 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_2_0_no_merge.xlsx")
        dset_2_2_parsed.index = mi_index
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 0_2: test 0 header 2 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_0_2_no_merge.xlsx")
        dset_2_2_parsed.index.names = mi_index.names
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)

        # 0_0: test 0 header 0 index
        dset_2_2_parsed = mp.read_excel(tmp_path / "dset_0_0_no_merge.xlsx")
        dset_2_2_parsed.index = mi_index
        dset_2_2_parsed.columns = mi_columns
        assert_frame_equal(dset_2_2, dset_2_2_parsed)
Example #8
current_dir = Path(__file__).parent.absolute()


data = [
    [1, 4, 7, "1/1/2001", 1, "1/2/2001"],
    [2, 5, 8, "2/2/2002", 2, "2/3/2003"],
    [3, 6, 9, "3/3/2003", 3, "3/4/2003"],
]

reg_columns = ["col1", "col2", "col3", "date", "ids", "date2"]

reg_index = [1, 2, 3]

reg_df = pd.DataFrame(data, index=reg_index, columns=reg_columns)

reg_dset = mp.Dataset(reg_df, id_col_name="ids", date_col_name="date")

mi_columns = pd.MultiIndex.from_product([["level"], reg_columns])

mi_index = pd.MultiIndex.from_tuples([("a", 1), ("b", 2), ("c", 3)])

mi_df = pd.DataFrame(data=data, index=mi_index, columns=mi_columns)

mi_dset = mp.Dataset(
    mi_df, id_col_name=("level", "ids"), date_col_name=("level", "date"), name="mi_test_name"
)


@pytest.mark.parametrize("engine", ["openpyxl", "xlsxwriter"])
class TestPandasExcel:
    def test_dfs(self, tmp_path, engine):
        ...  # test body truncated in this listing
Example #9
def load_csv():
    # csvdata: a CSV string assumed to be defined earlier in the original script
    csvstream = io.StringIO(csvdata)
    df = pd.read_csv(csvstream)
    csvstream.close()
    return df


start = timer()
df_orig = load_csv()
end = timer()
print(f"load csv to df: {end - start} sec")


with tempfile.TemporaryDirectory() as tmpdirname:
    tmpdirname = pathlib.Path(tmpdirname)

    dset = mp.Dataset(data=df_orig.copy())
    start = timer()
    dset.to_excel(tmpdirname / "dset_xlsxwriter.xlsx", engine="mp_xlsxwriter")
    end = timer()
    print(f"dset xlsxwriter: {end - start} sec")

    dset = mp.Dataset(data=df_orig.copy())
    start = timer()
    dset.to_excel(tmpdirname / "dset_openpyxl.xlsx", engine="mp_openpyxl")
    end = timer()
    print(f"dset openpyxl: {end - start} sec")

    df = df_orig.copy()
    start = timer()
    df.to_excel(tmpdirname / "df_xlsxwriter.xlsx", engine="xlsxwriter")
    end = timer()
    print(f"df xlsxwriter: {end - start} sec")
Example #10
def date_proximity(
    left: mp.Dataset,
    right: mp.Dataset,
    get: str = "all",
    when: str = "earlier_or_later",
    days: int = 90,
    dropna: bool = False,
    drop_duplicates: bool = False,
    duplicates_indicator: bool = False,
    merge_suffixes=get_option("operators.binary.column_suffixes"),
    prepend_level_name: bool = True,
) -> mp.Dataset:
    """Links data across two :class:`Dataset` objects by date proximity, first joining
    them on their :attr:`Dataset.id2_col_name`.

    Specifically, a `left` Dataset contains a timepoint anchor, and a `right` Dataset
    is linked to the `left` by retrieving all rows that match on :attr:`Dataset.id2_col_name`, and
    whose :attr:`Dataset.date_col_name` fields are within a certain time range of each other.

    This is the :class:`Dataset` analog of :func:`macpie.pandas.date_proximity`.

    :param left: the :class:`Dataset` containing the timepoint anchor
    :param right: the :class:`Dataset` to link
    :param get: which rows of the right :class:`Dataset` to link in reference to the
                timepoint anchor:

        ``all``
             keep all rows

        ``closest``
             get only the closest row that is within ``days`` days of the
             left :class:`Dataset` timepoint anchor

    :param when: which rows of the right Dataset to link in temporal relation
                 to the timepoint anchor

        ``earlier``
             get only rows that are earlier than the timepoint anchor

        ``later``
             get only rows that are later (more recent) than the timepoint anchor

        ``earlier_or_later``
             get rows that are earlier or later than the timepoint anchor

    :param days: the time range measured in days
    :param dropna: whether to exclude rows that did not find any match
    :param drop_duplicates: whether to drop duplicate rows from the result, keeping only
                            the first occurrence
    :param duplicates_indicator: if ``True``, adds a boolean column to the output Dataset called
                                 "_mp_duplicates" (``True`` if duplicate, ``False`` if not). The column
                                 can be given a different name by providing a string argument.
    :param merge_suffixes: a length-2 sequence where the first element is the
                           suffix to add to the left Dataset columns, and the
                           second element is the suffix to add to the right Dataset columns
    :param prepend_level_name: if ``True``, prepend each Dataset's ``name`` as the top level
                               of the resulting column labels
    :return: a new :class:`Dataset` containing the linked result
    """

    if prepend_level_name:
        prepend_levels = (left.name, right.name)
    else:
        prepend_levels = (None, None)

    result_df = mp.pandas.operators.date_proximity.date_proximity(
        left,
        right,
        id_left_on=left.id2_col_name,
        id_right_on=right.id2_col_name,
        date_left_on=left.date_col_name,
        date_right_on=right.date_col_name,
        get=get,
        when=when,
        days=days,
        left_link_id=left.id_col_name,
        dropna=dropna,
        drop_duplicates=drop_duplicates,
        duplicates_indicator=duplicates_indicator,
        merge="partial",
        merge_suffixes=merge_suffixes,
        prepend_levels=prepend_levels,
    )

    if prepend_level_name:
        new_id_col_name = (right.name, right.id_col_name)
        new_date_col_name = (right.name, right.date_col_name)
        new_id2_col_name = (right.name, right.id2_col_name)
    else:
        new_id_col_name = right.id_col_name
        new_date_col_name = right.date_col_name
        new_id2_col_name = right.id2_col_name

    return mp.Dataset(
        result_df,
        id_col_name=new_id_col_name,
        date_col_name=new_date_col_name,
        id2_col_name=new_id2_col_name,
        name=right.name,
    )
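Below is a minimal usage sketch of date_proximity, assuming two Datasets that share a subject-id column; all column names, Dataset names, and the id2_col_name constructor keyword are illustrative assumptions and are not taken from this listing.

import pandas as pd

import macpie as mp

# Hypothetical "anchor" Dataset holding the timepoint anchor rows.
anchor = mp.Dataset(
    pd.DataFrame(
        {
            "pidn": [1, 2],
            "dcdate": pd.to_datetime(["1/1/2001", "2/2/2002"]),
            "instrid": [100, 101],
        }
    ),
    id_col_name="instrid",
    date_col_name="dcdate",
    id2_col_name="pidn",  # assumed keyword mirroring the Dataset.id2_col_name attribute
    name="anchor",
)

# Hypothetical "labs" Dataset to be linked against the anchor.
labs = mp.Dataset(
    pd.DataFrame(
        {
            "pidn": [1, 1, 2],
            "dcdate": pd.to_datetime(["1/15/2001", "6/1/2001", "2/10/2002"]),
            "instrid": [200, 201, 202],
        }
    ),
    id_col_name="instrid",
    date_col_name="dcdate",
    id2_col_name="pidn",
    name="labs",
)

# For each anchor row, keep only the closest "labs" row within 90 days.
linked = date_proximity(anchor, labs, get="closest", when="earlier_or_later", days=90)

Because prepend_level_name defaults to True, the column labels of the returned Dataset are prefixed with each source Dataset's name, as suggested by the prepend_levels handling in the function body above.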