Beispiel #1
0
def test_merge_omd():
    """Exercise ``Merge`` with ``init=OMD`` under the default and 'update_extend' ops."""
    dicts = [{'a': 'A'}, {'a': 'aleph'}]

    # Default op: the later value for 'a' is the only one left in the result.
    overwritten = glom(dicts, Merge(init=OMD))
    assert overwritten == OMD({'a': 'aleph'})

    # 'update_extend' op: both values for 'a' are retained, in input order.
    extend_spec = Merge(init=OMD, op='update_extend')
    accumulated = glom(dicts, extend_spec)
    assert accumulated == OMD([('a', 'A'), ('a', 'aleph')])
Beispiel #2
0
def test_merge():
    """Cover the basic ``Merge`` behaviour and its constructor validation."""
    dicts = [{'a': 'A'}, {'b': 'B'}]
    combined = {'a': 'A', 'b': 'B'}

    # The default spec and an explicit dict.update op give the same result.
    assert glom(dicts, Merge()) == combined
    assert glom(dicts, Merge(op=dict.update)) == combined

    rejected_kwargs = (
        {'init': list},            # has no .update()
        {'op': 'update_extend'},   # doesn't work on base dict, the default init
    )
    for kwargs in rejected_kwargs:
        with pytest.raises(ValueError):
            Merge(**kwargs)
def fuzzy_match(
    df: pd.DataFrame,
    source_column: Union[str, int],
    target: Iterable[str],
    scorer: Callable = textdistance.jaro_winkler.distance,
    key: Callable = lambda x: x,
    maximize: bool = False,
    debug: bool = False,
):
    """Replace values of ``df[source_column]`` with their best fuzzy match in ``target``.

    Values present in both the column and ``target`` are left out of the
    matching step; only the two set differences are handed to
    ``fuzzy_matching_best``.

    Args:
        df: Frame holding the column to clean up.
        source_column: Label of the column whose values are matched.
        target: Reference values to match against. May be any iterable,
            including a one-shot iterator; it is consumed exactly once.
        scorer: Forwarded unchanged to ``fuzzy_matching_best``.
        key: Forwarded unchanged to ``fuzzy_matching_best``.
        maximize: Forwarded unchanged to ``fuzzy_matching_best``.
        debug: When true, keep the original column and add the matched value
            and a "distance" column instead of overwriting the column.

    Returns:
        With ``debug=False``: a new frame where ``source_column`` has been
        passed through ``Series.replace`` with the source -> target mapping.
        With ``debug=True``: a new frame with the extra columns
        ``"<source_column>_match_from_target"`` and ``"distance"``, indexed by
        ``[source_column, "<source_column>_match_from_target", "distance"]``.
    """
    # Materialize ``target`` once. Calling set(target) twice would exhaust a
    # one-shot iterator on the first call and leave the second set empty,
    # silently producing no match candidates.
    target_values = set(target)
    source_values = set(df[source_column].astype("str").unique())

    # Only values without an exact counterpart need fuzzy matching.
    source_diff = list(source_values - target_values)
    target_diff = list(target_values - source_values)

    matches = fuzzy_matching_best(
        source=source_diff,
        target=target_diff,
        key=key,
        scorer=scorer,
        maximize=maximize,
    )

    # Both specs fold the per-match records into a single dict keyed by the
    # record's "source" entry.
    # NOTE(review): assumes each match exposes "source", "target" and
    # "distance" — confirm against fuzzy_matching_best.
    replacements_spec = Merge([{T["source"]: "target"}])
    replacements_dict = glom(matches, replacements_spec)

    distances_spec = ([{T["source"]: "distance"}], Merge())
    distances_dict = glom(matches, distances_spec)

    if not debug:
        return df.assign(
            **{source_column: df[source_column].replace(replacements_dict)}
        )

    debug_col_name = f"{source_column}_match_from_target"
    matched_values = df[source_column].replace(replacements_dict)
    # NOTE(review): the regex pass looks intended to turn values that received
    # no distance (still non-numeric text) into 0 — confirm.
    distance_values = (
        df[source_column]
        .replace(distances_dict)
        .replace(r"\D+", 0, regex=True)
    )
    return df.assign(
        **{
            debug_col_name: matched_values,
            "distance": distance_values,
        }
    ).set_index([source_column, debug_col_name, "distance"])
Beispiel #4
0
def test_merge():
    """Check that a default ``Merge()`` spec folds a list of dicts into one dict."""

    target = [{'a': 'A'}, {'b': 'B'}]

    assert glom(target, Merge()) == {'a': 'A', 'b': 'B'}