Esempio n. 1
0
def _find_best_matching_rows(strings, right, right_on, na_ratio, two_na_ratio,
                             case_sensitivity, score_name, num_threads,
                             similarity_function, weights, num_results, echo):
    """
    Score every row of ``right`` against ``strings`` and keep the best ones.

    The score of a row is whatever ``_get_similarity_between_strings_and_row``
    returns for it; it is stored in a new ``score_name`` column of a copy of
    ``right`` (the caller's frame is not modified). The copy is then sorted by
    that column in descending order and truncated to ``num_results`` rows.

    :param strings: values to compare each row with (forwarded as-is)
    :param right: DataFrame whose rows are scored
    :param right_on: columns of ``right`` used for the comparison (forwarded)
    :param na_ratio: forwarded to the similarity helper
    :param two_na_ratio: forwarded to the similarity helper
    :param case_sensitivity: forwarded to the similarity helper
    :param score_name: name of the column that receives the scores
    :param num_threads: 1 scores sequentially through ``ProgressBar.apply``;
        any other value is handed to ``Parallel`` as ``n_jobs``
    :param similarity_function: forwarded to the similarity helper
    :param weights: forwarded to the similarity helper
    :param num_results: number of top-scoring rows to return
    :param echo: progress reporting setting handed to ``ProgressBar``
    :rtype: DataFrame
    """
    candidates = right.copy()

    # Everything the similarity helper needs except the row itself.
    scoring_kwargs = dict(
        strings=strings,
        right_on=right_on,
        na_ratio=na_ratio,
        two_na_ratio=two_na_ratio,
        case_sensitivity=case_sensitivity,
        similarity_function=similarity_function,
        weights=weights,
    )

    if num_threads == 1:

        def score_row(row):
            return _get_similarity_between_strings_and_row(row=row,
                                                           **scoring_kwargs)

        candidates[score_name] = ProgressBar.apply(data=candidates,
                                                   function=score_row,
                                                   echo=echo)

    else:
        # Thread-based workers that share memory with the caller.
        runner = Parallel(n_jobs=num_threads,
                          backend='threading',
                          require='sharedmem')
        row_count = len(candidates)
        bar = ProgressBar(total=row_count + 1, echo=echo)
        tasks = (
            delayed(_get_similarity_between_strings_and_row)(row=row,
                                                             **scoring_kwargs)
            for _, row in iterate(candidates.iterrows(), progress_bar=bar)
        )
        candidates[score_name] = runner(tasks)
        # Push the bar to its declared total once all rows are scored.
        bar.show(amount=row_count + 1)

    ranked = candidates.sort_values(by=score_name, ascending=False)
    return ranked.iloc[:num_results]
Esempio n. 2
0
def fuzzy_left_merge(left,
                     right,
                     left_on=None,
                     right_on=None,
                     on=None,
                     suffixes=('_x', '_y'),
                     score_name='match_ratio',
                     na_ratio=0.5,
                     two_na_ratio=0.75,
                     similarity_function=None,
                     weights=None,
                     case_sensitivity=0.5,
                     num_results=1,
                     num_threads=-1,
                     echo=1):
    """
    Left-merge ``left`` with the rows of ``right`` selected by ``_match_rows``.

    Each row of a copy of ``left`` is tagged with a temporary ``fuzzy_id`` and
    handed to ``_match_rows`` together with a copy of ``right``. The per-row
    results are concatenated and joined back onto ``left`` on ``fuzzy_id``
    (``how='left'``); the temporary column is dropped from the result.
    Neither input frame is modified.

    NOTE: a column literally named ``fuzzy_id`` in ``left`` is overwritten by
    the temporary id and is therefore missing from the result.

    :param left: left-hand frame; every one of its rows is kept
    :type left: DataFrame
    :param right: frame searched for matches
    :type right: DataFrame
    :param left_on: key column(s) of ``left``; defaults to ``on``
    :type left_on: list[str] or str or NoneType
    :param right_on: key column(s) of ``right``; defaults to ``on``
    :type right_on: list[str] or str or NoneType
    :param on: key column(s) present in both frames; defaults to the columns
        the two frames have in common
    :type on: list[str] or str or NoneType
    :param suffixes: suffixes passed to ``DataFrame.merge`` for overlapping
        column names
    :param score_name: column name forwarded to ``_match_rows`` for the match
        score; must not already be a column of either frame
    :type score_name: str
    :param na_ratio: forwarded unchanged to ``_match_rows``
    :param two_na_ratio: forwarded unchanged to ``_match_rows``
    :param similarity_function: forwarded unchanged to ``_match_rows``
    :type similarity_function: callable
    :param weights: forwarded unchanged to ``_match_rows``
    :param case_sensitivity: forwarded unchanged to ``_match_rows``
    :type case_sensitivity: float
    :param num_results: forwarded unchanged to ``_match_rows``
    :type num_results: int
    :param num_threads: 1 processes the rows sequentially; any other value is
        used as ``n_jobs`` of a thread-based ``Parallel`` runner
    :type num_threads: int
    :param echo: progress reporting setting; ``echo - 1`` is passed down to
        ``_match_rows``
    :type echo: int or bool or ProgressBar
    :raises ValueError: if ``score_name`` is already a column of ``left`` or
        ``right``
    :raises KeyError: if a key column is missing from its frame
    :rtype: DataFrame
    """
    if score_name in left.columns or score_name in right.columns:
        raise ValueError('use a score_name different from column names.')

    data1 = left.copy()
    data2 = right.copy()

    if on is None:
        # ``columns & columns`` is not a set intersection on current pandas
        # (the operator form of Index set operations was deprecated and then
        # changed meaning); ask for the intersection explicitly.
        on = data1.columns.intersection(data2.columns)
    elif isinstance(on, str):
        on = [on]

    # A bare string names ONE column. Without wrapping it, the membership
    # checks below would iterate over its characters and report each
    # character as a missing column.
    if left_on is None:
        left_on = on
    elif isinstance(left_on, str):
        left_on = [left_on]

    if right_on is None:
        right_on = on
    elif isinstance(right_on, str):
        right_on = [right_on]

    missing_left = [col for col in left_on if col not in data1.columns]
    if missing_left:
        raise KeyError(f'missing columns on left: {missing_left}')
    missing_right = [col for col in right_on if col not in data2.columns]
    if missing_right:
        raise KeyError(f'missing columns on right: {missing_right}')

    # Temporary join key linking every left row to its match results.
    data1['fuzzy_id'] = range(len(data1))

    # Arguments shared by every ``_match_rows`` call. ``echo - 1`` is left out
    # on purpose: it is evaluated per call, exactly as before.
    match_kwargs = dict(right=data2,
                        left_on=left_on,
                        right_on=right_on,
                        na_ratio=na_ratio,
                        two_na_ratio=two_na_ratio,
                        case_sensitivity=case_sensitivity,
                        score_name=score_name,
                        num_results=num_results,
                        similarity_function=similarity_function,
                        weights=weights,
                        num_threads=1)

    if num_threads == 1:
        results = ProgressBar.apply(
            data=data1,
            echo=echo,
            function=lambda row: _match_rows(row=row,
                                             echo=echo - 1,
                                             **match_kwargs))

    else:
        # Rows are spread over threads here, so each ``_match_rows`` call is
        # told to run single-threaded (``num_threads=1`` above).
        parallel = Parallel(n_jobs=num_threads,
                            backend='threading',
                            require='sharedmem')
        progress_bar = ProgressBar(total=len(data1) + 1, echo=echo)

        results = parallel(
            delayed(_match_rows)(row=row, echo=echo - 1, **match_kwargs)
            for index, row in iterate(data1.iterrows(),
                                      progress_bar=progress_bar))
        progress_bar.show(amount=len(data1) + 1)

    matches = concat(results).reset_index(drop=True)

    return data1.merge(right=matches,
                       on='fuzzy_id',
                       how='left',
                       suffixes=suffixes).drop(columns='fuzzy_id')