Esempio n. 1
0
def test_keep_latest_csv():
    """Check ``group_by_keep_one`` with ``keep="latest"`` against a CSV fixture.

    Rows loaded from ``instr1_primaryall.csv`` are grouped on ``pidn`` using
    ``dcdate`` as the date column, and the outcome is compared with the
    contents of ``instr1_primarylatest.csv``.
    """
    all_rows = file_to_dataframe(data_dir / "instr1_primaryall.csv")

    latest_rows = all_rows.mac.group_by_keep_one(
        group_by_col="pidn",
        date_col_name="dcdate",
        keep="latest",
    )

    latest_expected = file_to_dataframe(data_dir / "instr1_primarylatest.csv")

    assert_dfs_equal(
        latest_rows,
        latest_expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
Esempio n. 2
0
def test_keepone(cli_keepone_big):
    """Run ``group_by_keep_one`` as a graph node operation and check its result.

    A ``LavaDataset`` loaded from ``instr1_primaryall.csv`` is added as a node
    whose operation keeps the ``"earliest"`` row per group; after executing
    the graph, the node's ``operation_result`` is compared with the dataframe
    loaded from ``cli_keepone_big``.

    Args:
        cli_keepone_big: Passed to ``file_to_dataframe`` to load the expected
            result (presumably a pytest fixture yielding a file path --
            confirm against the conftest).
    """
    graph = ExecutableGraph()

    primary = LavaDataset.from_file(data_dir / "instr1_primaryall.csv")

    # Bind the dataset-specific column names up front so the graph can invoke
    # the operation later.
    keep_earliest = partial(
        group_by_keep_one,
        group_by_col=primary.id2_col_name,
        date_col_name=primary.date_col_name,
        keep="earliest",
        drop_duplicates=False,
    )
    graph.add_node(primary, operation=keep_earliest)

    graph.execute()

    # Only one node was added, so the first entry carries the result of interest.
    first_node = graph.get_all_node_data("operation")[0]
    actual = first_node["operation_result"]

    expected = file_to_dataframe(cli_keepone_big)

    assert_dfs_equal(
        actual,
        expected,
        cols_ignore_pat=cols_ignore_pat,
        output_dir=output_dir,
    )
Esempio n. 3
0
def test_keep_earliest_csv():
    """Check ``group_by_keep_one`` with ``keep="earliest"`` against a CSV fixture.

    Besides matching ``instr1_primaryearliest.csv``, the result must contain
    the column named by the ``column.system.duplicates`` option.
    """
    all_rows = file_to_dataframe(data_dir / "instr1_primaryall.csv")

    earliest_rows = all_rows.mac.group_by_keep_one(
        group_by_col="pidn",
        date_col_name="dcdate",
        keep="earliest",
    )

    # The configured duplicates column has to be present in the output.
    duplicates_col = get_option("column.system.duplicates")
    assert duplicates_col in earliest_rows.columns

    earliest_expected = file_to_dataframe(
        data_dir / "instr1_primaryearliest.csv"
    )

    assert_dfs_equal(
        earliest_rows,
        earliest_expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
Esempio n. 4
0
def test_dupes():
    """Check a full date-proximity merge with ``duplicates_indicator=True``.

    ``primary.xlsx`` is linked to ``secondary.xlsx`` on ``pidn``/``dcdate``
    and the outcome is compared with ``dupes_expected_result.xlsx``.
    """
    left_df = file_to_dataframe(current_dir / "primary.xlsx")
    right_df = file_to_dataframe(current_dir / "secondary.xlsx")

    link_options = dict(
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
        duplicates_indicator=True,
    )
    actual = left_df.mac.date_proximity(right_df, **link_options)

    # To dump the actual output for inspection:
    # actual.to_excel(current_dir / "dupes_result.xlsx", index=False)
    expected = file_to_dataframe(current_dir / "dupes_expected_result.xlsx")
    assert_dfs_equal(
        actual,
        expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
Esempio n. 5
0
def test_merge_partial():
    """Check a ``merge='partial'`` date-proximity link and its equivalent spellings.

    The link is first compared with ``merge_partial_expected_result.xlsx``.
    The same link is then requested with ``id_left_on``/``id_right_on`` and
    with ``date_left_on``/``date_right_on`` in place of ``id_on``/``date_on``;
    both must equal the first result.
    """
    # Options common to all three calls below.
    shared_options = dict(
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='partial',
    )

    baseline = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_on='dcdate',
        **shared_options,
    )

    # To dump the actual output for inspection:
    # baseline.to_excel(current_dir / "merge_partial_result.xlsx", index=False)
    expected = file_to_dataframe(
        current_dir / "merge_partial_expected_result.xlsx"
    )
    assert_dfs_equal(
        baseline,
        expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )

    # Same link, with the id column given separately for each side.
    via_split_id = primary.mac.date_proximity(
        secondary,
        id_left_on='pidn',
        id_right_on='pidn',
        date_on='dcdate',
        **shared_options,
    )

    # Same link, with the date column given separately for each side.
    via_split_date = primary.mac.date_proximity(
        secondary,
        id_on='pidn',
        date_left_on='dcdate',
        date_right_on='dcdate',
        **shared_options,
    )

    assert baseline.equals(via_split_id)
    assert baseline.equals(via_split_date)
Esempio n. 6
0
def test_filter_by_id():
    """Check ``filter_by_id`` on ``basic.xlsx`` for a bad and a usable id list."""
    basic_df = file_to_dataframe(current_dir / "basic.xlsx")

    # An id list containing a non-numeric string must be rejected.
    invalid_ids = [1, 2, "hello"]
    with pytest.raises(ValueError):
        basic_df.mac.filter_by_id("pidn", invalid_ids)

    # A numeric string ("4") is accepted alongside integers.
    # NOTE(review): three ids are requested but four rows are asserted --
    # presumably the fixture holds more than one row for some id; confirm.
    valid_ids = [2, 3, "4"]
    filtered = basic_df.mac.filter_by_id("pidn", valid_ids)
    # To dump the actual output for inspection:
    # filtered.to_excel(Path("tests/pandas/operators/filter_by_id/result.xlsx"), index=False)

    assert filtered.mac.row_count() == 4
Esempio n. 7
0
def test_merge_full():
    """Check a ``merge='full'`` date-proximity link against the stored workbook.

    ``primary`` is linked to ``secondary`` on ``pidn``/``dcdate`` and the
    outcome is compared with ``merge_full_expected_result.xlsx``.
    """
    link_options = dict(
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='full',
    )
    actual = primary.mac.date_proximity(secondary, **link_options)

    # To dump the actual output for inspection:
    # actual.to_excel(current_dir / "merge_full_result.xlsx", index=False)
    expected = file_to_dataframe(
        current_dir / "merge_full_expected_result.xlsx"
    )
    assert_dfs_equal(
        actual,
        expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
def test_left_link_id_blank_merge_partial():
    """Check a ``merge='partial'`` link when ``left_link_id`` is not supplied.

    The outcome is compared with
    ``left_link_id_blank_merge_partial_expected_result.xlsx``.
    """
    link_options = dict(
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        merge='partial',
    )
    actual = primary.mac.date_proximity(secondary, **link_options)

    # To dump the actual output for inspection:
    # actual.to_excel(current_dir / "left_link_id_blank_merge_partial_result.xlsx", index=False)
    expected_path = (
        current_dir / "left_link_id_blank_merge_partial_expected_result.xlsx"
    )
    expected = file_to_dataframe(expected_path)

    assert_dfs_equal(
        actual,
        expected,
        cols_ignore=cols_ignore,
        output_dir=output_dir,
    )
Esempio n. 9
0
def test_secondary_instr1():
    """Link ``primary`` to ``instr1_all.csv`` and compare with ``INSTR1_linked``.

    The link uses ``get='closest'``, ``when='earlier_or_later'``, ``days=90``
    and ``duplicates_indicator=True``; the expected frame is read from the
    module-level ``dfs_dict``.
    """
    instr1_df = file_to_dataframe(data_dir / "instr1_all.csv")

    link_options = dict(
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        duplicates_indicator=True,
    )
    actual = primary.mac.date_proximity(instr1_df, **link_options)

    # To dump the actual output for inspection:
    # actual.to_excel(current_dir / "instr1_result.xlsx", index=False)

    expected = dfs_dict['INSTR1_linked']
    assert_dfs_equal(
        actual,
        expected,
        cols_ignore=cols_ignore,
        cols_ignore_pat=cols_ignore_pat,
    )
Esempio n. 10
0
from pathlib import Path

from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal


# Directory that contains this test module; the workbooks below are read from it.
current_dir = Path(__file__).parent.absolute()

# Passed as ``output_dir`` to ``assert_dfs_equal`` by the tests in this module.
# Swap in ``current_dir`` while debugging:
# output_dir = current_dir
output_dir = None

# Input frames shared by the tests in this module.
primary, secondary = (
    file_to_dataframe(current_dir / workbook)
    for workbook in ("primary.xlsx", "secondary.xlsx")
)

# Passed as ``cols_ignore`` to ``assert_dfs_equal``; empty here.
cols_ignore = list()


def test_merge_partial():
    """Run a ``merge='partial'`` date-proximity link of ``primary`` to ``secondary``.

    NOTE(review): the visible body stops after the call and never inspects
    the result -- presumably a comparison against an expected workbook is
    missing; confirm.
    """
    link_options = dict(
        id_on='pidn',
        date_on='dcdate',
        get='closest',
        when='earlier_or_later',
        days=90,
        left_link_id='instrid',
        merge='partial',
    )
    merge_partial_result = primary.mac.date_proximity(secondary, **link_options)
Esempio n. 11
0
def test_instr1():
    """Check ``date_proximity`` for every ``get``/``when`` pairing at 90 days.

    ``instr1.xlsx`` supplies the ``primary`` sheet plus one expected-result
    sheet per scenario. Each scenario links ``primary`` to ``instr1_all.csv``
    on ``pidn``/``dcdate`` and compares the outcome with its sheet.
    """
    # (expected-result sheet, ``get`` argument, ``when`` argument)
    scenarios = (
        ('closest_earlier_or_later_90', 'closest', 'earlier_or_later'),
        ('closest_later_90', 'closest', 'later'),
        ('closest_earlier_90', 'closest', 'earlier'),
        ('all_earlier_or_later_90', 'all', 'earlier_or_later'),
        ('all_later_90', 'all', 'later'),
        ('all_earlier_90', 'all', 'earlier'),
    )

    # Read the primary sheet and every expected-result sheet in one call.
    sheets_to_read = ['primary'] + [scenario[0] for scenario in scenarios]
    dfs_dict = pd.read_excel(data_dir / "instr1.xlsx",
                             sheet_name=sheets_to_read)

    primary = dfs_dict['primary']
    secondary = file_to_dataframe(data_dir / "instr1_all.csv")

    for sheet, get_mode, when_mode in scenarios:
        actual = primary.mac.date_proximity(
            secondary,
            id_on='pidn',
            date_on='dcdate',
            get=get_mode,
            when=when_mode,
            days=90,
        )
        # To dump a scenario's output for inspection:
        # actual.to_excel(current_dir / (sheet + "_result.xlsx"), index=False)
        assert_dfs_equal(
            actual,
            dfs_dict[sheet],
            cols_ignore=cols_ignore,
            cols_ignore_pat=cols_ignore_pat,
            output_dir=output_dir,
        )
Esempio n. 12
0
from pathlib import Path

import pandas as pd

from macpie.pandas import file_to_dataframe
from macpie.testing import assert_dfs_equal

# Fixture directory for this module.
# NOTE(review): the path is relative, so ``resolve()`` anchors it to the
# working directory -- presumably the tests are run from the repository root;
# confirm.
current_dir = Path("tests/pandas/operators/merge/basic/").resolve()

# Swap in ``current_dir`` while debugging:
# output_dir = current_dir
output_dir = None

primary = file_to_dataframe(current_dir / "small.xlsx")

# Load the three named sheets of the same workbook in a single call.
dfs_dict = pd.read_excel(
    current_dir / "small.xlsx",
    sheet_name=[
        'small_anchor',
        'instr2_all_linked',
        'instr3_all_linked',
    ],
)

# Expose each sheet under its own module-level name.
small_anchor, instr2_all_linked, instr3_all_linked = (
    dfs_dict['small_anchor'],
    dfs_dict['instr2_all_linked'],
    dfs_dict['instr3_all_linked'],
)


def test_add_suffixes_false():
    result = small_anchor.mac.merge(
        instr2_all_linked,
        left_on=['pidn', 'dcdate', 'instrid'],
        right_on=['pidn_x', 'dcdate_x', 'instrid_x'],
        merge_suffixes=('_a', '_b'),
        add_suffixes=False).mac.merge(
            instr3_all_linked,