Beispiel #1
0
def test_init_one_path_one_df():
    """Check Comparer attributes when given one DataFrame and one Path.

    dataset1 is an empty DataFrame and dataset2 is a ``pathlib.Path``; the
    test expects ``df1``/``path2`` to be populated and ``path1``/``df2`` to
    be falsy.
    """
    second_location = Path("home", "path", "to", "data2")
    subject = Comparer(dataset1=pd.DataFrame(), dataset2=second_location)

    # dataset1 was passed as a frame, so no path is expected for it.
    assert not subject.path1  # None
    assert isinstance(subject.path2, Path)

    # dataset2 was passed as a path, so no frame is expected for it.
    assert isinstance(subject.df1, pd.DataFrame)
    assert not subject.df2  # None
Beispiel #2
0
def test_init_both_strings():
    """Check Comparer attributes when both datasets are plain path strings.

    Both ``path1`` and ``path2`` are expected to stay ``str`` objects, and
    both ``df1`` and ``df2`` are expected to be falsy.
    """
    first_str = "this/is/a/path/string"
    second_str = "path/string/to/second/dataset"
    subject = Comparer(dataset1=first_str, dataset2=second_str)

    for path_attr in ("path1", "path2"):
        assert isinstance(getattr(subject, path_attr), str)
    for frame_attr in ("df1", "df2"):
        assert not getattr(subject, frame_attr)  # None
Beispiel #3
0
def test_init_both_os_paths():
    """Check Comparer attributes when both datasets come from ``os.path.join``.

    ``os.path.join`` returns ``str``, so both ``path1`` and ``path2`` are
    expected to be strings while ``df1`` and ``df2`` are expected to be falsy.
    """
    # Each join receives a single component and therefore yields it as-is.
    first_location, second_location = (
        os.path.join("data1/path"),
        os.path.join("data2/path"),
    )
    subject = Comparer(dataset1=first_location, dataset2=second_location)

    for path_attr in ("path1", "path2"):
        assert isinstance(getattr(subject, path_attr), str)
    for frame_attr in ("df1", "df2"):
        assert not getattr(subject, frame_attr)  # None
Beispiel #4
0
def test_init_both_paths():
    """Check Comparer attributes when both datasets are ``pathlib.Path`` objects.

    Both ``path1`` and ``path2`` are expected to remain ``Path`` instances,
    and both ``df1`` and ``df2`` are expected to be falsy.
    """
    base_dir = Path("home", "path", "to")
    subject = Comparer(dataset1=base_dir / "data1", dataset2=base_dir / "data2")

    assert isinstance(subject.path1, Path)
    assert isinstance(subject.path2, Path)
    assert not subject.df1  # None
    assert not subject.df2  # None
Beispiel #5
0
def many_to_many_all_match_obj():
    """Return a Comparer whose two datasets are the same many-to-many CSV.

    The CSV lives under ``juxta/tests/data/no-mismatches/many-to-many``
    relative to this file's directory. ``set_dataframes`` is called with
    ``intake_dt`` and ``exit_dt`` as the ``parse_dates`` columns.
    """
    here = Path(__file__).resolve().parent
    csv_file = here.joinpath(
        "juxta",
        "tests",
        "data",
        "no-mismatches",
        "many-to-many",
        "many_to_many-A1.csv",
    )
    # The same file is used for both sides of the comparison.
    result = Comparer(dataset1=csv_file, dataset2=csv_file)
    result.set_dataframes(parse_dates=["intake_dt", "exit_dt"])
    return result
Beispiel #6
0
def shuffled_one_to_one_unmatchable_obj():
    """Return a Comparer over the one-to-one "unmatchable" CSV pair.

    The two CSV files (``one_to_one-A1.csv`` and the shuffled
    ``one_to_one-A2-shuffled.csv``) live under
    ``juxta/tests/data/unmatchable/one-to-one`` relative to this file's
    directory. ``set_dataframes`` is called with ``intake_dt`` and
    ``exit_dt`` as the ``parse_dates`` columns.
    """
    fixture_dir = Path(__file__).resolve().parent.joinpath(
        "juxta", "tests", "data", "unmatchable", "one-to-one"
    )
    result = Comparer(
        dataset1=fixture_dir / "one_to_one-A1.csv",
        dataset2=fixture_dir / "one_to_one-A2-shuffled.csv",
    )
    result.set_dataframes(parse_dates=["intake_dt", "exit_dt"])
    return result
Beispiel #7
0
def main():
    """Command-line entry point: compare two CSV datasets and save the result.

    Positional arguments ``path1`` and ``path2`` are parsed into
    ``pathlib.Path`` objects. The optional settings (columns to keep, mapper,
    group-on and compare-on columns) come either from ``juxta.config`` when
    ``--use-config`` is given, or from the matching command-line options.

    The result is written next to ``path1``: ``results.xlsx`` when
    ``--save-as-excel`` is set, otherwise ``result.csv``.

    Raises:
        SystemExit: with a message (written to stderr, exit status 1) when
            the required options are missing and ``--use-config`` is not set;
            argparse also raises it on invalid command-line usage.
    """
    parser = argparse.ArgumentParser(
        description="Compare two datasets by passing paths to their CSV files")
    parser.add_argument("path1", type=Path, help="path/to/dataset1.csv file")
    parser.add_argument("path2", type=Path, help="path/to/dataset2.csv file")
    parser.add_argument(
        "--use-config",
        action="store_true",
        help=
        "setting this flag will use the config file for setting the other optional args",
    )
    parser.add_argument(
        "--save-as-excel",
        action="store_true",
        help=
        "Set this to save result as excel. Otherwise, will save as CSV file.",
    )
    parser.add_argument("--ds1-keep-columns", nargs="+")
    parser.add_argument("--ds2-keep-columns", nargs="+")
    parser.add_argument("--mapper", nargs="+")
    parser.add_argument("--group-on", nargs="+")
    parser.add_argument("--compare-on", nargs="+")

    args = parser.parse_args()

    if args.use_config:
        # Imported lazily so the config module is only needed when requested.
        from juxta import config

        print(" ---> filling out optional args with config file")
        ds1_columns = config.DS1_KEEP_COLUMNS
        ds2_columns = config.DS2_KEEP_COLUMNS
        mapper = config.MAPPER
        group_on = config.GROUP_ON
        compare_on = config.COMPARE_ON

    else:
        ds1_columns = args.ds1_keep_columns
        ds2_columns = args.ds2_keep_columns
        mapper = args.mapper
        group_on = args.group_on
        compare_on = args.compare_on

        # Passing a string to sys.exit() prints it to stderr and exits with
        # status 1, so shells and scripts can detect the failure.
        if not ds1_columns:  # option omitted -> None
            sys.exit(
                "COULD NOT RUN: at least --ds1-keep-columns must be set if not using config file"
            )

        if (not mapper) or (not group_on) or (not compare_on):
            sys.exit(
                "COULD NOT RUN: --mapper, --group-on, and --compare-on must all be set if not using config file"
            )

    prepper = Prepper(dataset1=args.path1, dataset2=args.path2)
    prepper.set_dataframes().keep_relevant_columns(
        ds1_columns=ds1_columns,
        ds2_columns=ds2_columns).map_to_ds1_column_names(mapper=mapper)

    comparer = Comparer(dataset1=prepper.df1, dataset2=prepper.df2)
    comparer.compare_dataframes(group_on=group_on, compare_on=compare_on)

    # Results are saved in the same directory as the first dataset.
    output_dir = args.path1.parent
    if args.save_as_excel:
        comparer.results_to_excel(output_dir / "results.xlsx")
    else:
        comparer.results_to_csv(output_dir / "result.csv")
Beispiel #8
0
def test_init_both_dfs():
    """Check Comparer attributes when both datasets are DataFrames.

    Both ``path1`` and ``path2`` are expected to be falsy, and both ``df1``
    and ``df2`` are expected to be ``pandas.DataFrame`` instances.
    """
    left_frame = pd.DataFrame()
    right_frame = pd.DataFrame()
    subject = Comparer(dataset1=left_frame, dataset2=right_frame)

    for path_attr in ("path1", "path2"):
        assert not getattr(subject, path_attr)  # None
    for frame_attr in ("df1", "df2"):
        assert isinstance(getattr(subject, frame_attr), pd.DataFrame)
Beispiel #9
0
def test_init_one_string_one_df():
    """Check Comparer attributes when given one path string and one DataFrame.

    dataset1 is a path string and dataset2 is an empty DataFrame; the test
    expects ``path1`` to be a ``str``, ``df2`` to be a DataFrame, and
    ``path2``/``df1`` to be falsy.
    """
    comparer = Comparer(dataset1="this/is/a/path/string", dataset2=pd.DataFrame())
    assert isinstance(comparer.path1, str)
    assert not comparer.path2  # None
    assert not comparer.df1  # None
    # This check was previously a bare expression and could never fail.
    assert isinstance(comparer.df2, pd.DataFrame)