def test_merge_omd():
    """Check Merge results when the accumulator type is OMD instead of dict."""
    dicts = [{'a': 'A'}, {'a': 'aleph'}]

    # With the default op, the expected result holds only the last value for 'a'.
    overwritten = glom(dicts, Merge(init=OMD))
    assert overwritten == OMD({'a': 'aleph'})

    # With op='update_extend', the expected result keeps both ('a', ...) pairs.
    extended = glom(dicts, Merge(init=OMD, op='update_extend'))
    assert extended == OMD([('a', 'A'), ('a', 'aleph')])
def test_merge():
    """Check Merge on plain dicts, plus the constructor's argument validation."""
    # NOTE(review): another ``test_merge`` is defined later in this source; if
    # both live in the same module the later one shadows this one -- confirm.
    dicts = [{'a': 'A'}, {'b': 'B'}]
    expected = {'a': 'A', 'b': 'B'}

    merged_default = glom(dicts, Merge())
    assert merged_default == expected

    merged_explicit_op = glom(dicts, Merge(op=dict.update))
    assert merged_explicit_op == expected

    invalid_kwargs = (
        {'init': list},  # has no .update()
        {'op': 'update_extend'},  # doesn't work on base dict, the default init
    )
    for kwargs in invalid_kwargs:
        with pytest.raises(ValueError):
            Merge(**kwargs)
def fuzzy_match(
    df: pd.DataFrame,
    source_column: Union[str, int],
    target: Iterable[str],
    scorer: Callable = textdistance.jaro_winkler.distance,
    key: Callable = lambda x: x,
    maximize: bool = False,
    debug: bool = False,
) -> pd.DataFrame:
    """Replace values of ``source_column`` with their fuzzy match from ``target``.

    Only values that do not already appear on the other side take part in the
    matching: source values missing from ``target`` are matched against target
    values missing from the source column.

    Args:
        df: Frame holding the column to clean up. It is not modified; a new
            frame is returned.
        source_column: Label of the column whose values are matched. The
            values are compared as strings.
        target: Reference values to match against. Any iterable is accepted,
            including one-shot iterators.
        scorer: Forwarded unchanged to ``fuzzy_matching_best``.
        key: Forwarded unchanged to ``fuzzy_matching_best``.
        maximize: Forwarded unchanged to ``fuzzy_matching_best``.
        debug: When true, keep ``source_column`` untouched and expose the
            match and its distance alongside it instead.

    Returns:
        With ``debug`` false, ``df`` with ``source_column`` values replaced by
        their matches. With ``debug`` true, ``df`` with two extra columns,
        ``"<source_column>_match_from_target"`` and ``"distance"``, indexed by
        ``[source_column, "<source_column>_match_from_target", "distance"]``.
    """
    # ``target`` may be a one-shot iterator, so materialise it exactly once;
    # iterating it a second time would yield nothing and leave no candidates.
    target_values = set(target)
    source_values = set(df[source_column].astype("str").unique())

    unmatched_source = list(source_values - target_values)
    unused_target = list(target_values - source_values)

    matches = fuzzy_matching_best(
        source=unmatched_source,
        target=unused_target,
        key=key,
        scorer=scorer,
        maximize=maximize,
    )

    # Each match is indexed by "source", "target" and "distance"; fold the
    # per-match one-entry dicts into a single lookup keyed by source value.
    replacements = glom(matches, Merge([{T["source"]: "target"}]))

    column = df[source_column]
    if not debug:
        return df.assign(**{source_column: column.replace(replacements)})

    distances = glom(matches, Merge([{T["source"]: "distance"}]))
    debug_col_name = f"{source_column}_match_from_target"

    # Labels without an entry in ``distances`` survive the first replace
    # unchanged; the regex replace then swaps string cells containing a
    # non-digit character for 0.
    # NOTE(review): a purely numeric label would not be caught by the pattern
    # -- confirm this is acceptable for the data this is used on.
    distance_column = column.replace(distances).replace(r"\D+", 0, regex=True)

    annotated = df.assign(
        **{
            debug_col_name: column.replace(replacements),
            "distance": distance_column,
        }
    )
    return annotated.set_index([source_column, debug_col_name, "distance"])
def test_merge():
    """Check that Merge() with default arguments combines a list of dicts."""
    # NOTE(review): an earlier ``test_merge`` exists in this source; if both
    # live in the same module this definition shadows it -- confirm.
    dicts = [{'a': 'A'}, {'b': 'B'}]

    merged = glom(dicts, Merge())

    assert merged == {'a': 'A', 'b': 'B'}