Exemple #1
0
def test_output():
    """Check a custom ``output`` column selection.

    With ``output=['1*', '2.died', 'degree']`` the run is expected to return
    the columns name, born, died and degree, and the one matching row.
    """
    left_headers = ['name', 'born']
    left_rows = [
        ['William Shakespeare', '1564'],
        ['Christopher Marlowe', '1583'],
    ]
    right_headers = ['person', 'died']
    right_rows = [
        ['Anne Hathaway', '1623'],
        ['William Shakespeare', '1616'],
    ]
    options = {
        'fields1': ['name'],
        'fields2': ['person'],
        'output': ['1*', '2.died', 'degree'],
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'born', 'died', 'degree']
    assert matches == [['William Shakespeare', '1564', '1616', '1.0']]
Exemple #2
0
def test_fields():
    """Match on one explicitly chosen field per side.

    Only ``name`` and ``person`` are compared, and the expected output holds
    just those two columns for the single row present in both datasets.
    """
    left_headers = ['name', 'born']
    left_rows = [
        ['William Shakespeare', '1564'],
        ['Christopher Marlowe', '1564'],
    ]
    right_headers = ['person', 'hometown']
    right_rows = [
        ['William Shakespeare', 'Stratford-upon-Avon'],
        ['Anne Hathaway', 'Stratford-upon-Avon'],
    ]
    options = {'fields1': ['name'], 'fields2': ['person']}
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'person']
    assert matches == [['William Shakespeare', 'William Shakespeare']]
Exemple #3
0
def test_multiple_ignores3():
    """Combine ignore_nonlatin, ignore_nonalpha and ignore_order_letters.

    'E M Forster' is expected to pair with the accented, spaced-out and
    reordered 'F ó r s t e r, ÉM'; no other row should match.
    """
    left_headers = ['name']
    left_rows = [['E M Forster'], ['J D Salinger']]
    right_headers = ['person']
    right_rows = [['H a r p e r, Lee'], ['F ó r s t e r, ÉM']]
    options = {
        'ignore_nonlatin': True,
        'ignore_nonalpha': True,
        'ignore_order_letters': True,
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'person']
    assert matches == [['E M Forster', 'F ó r s t e r, ÉM']]
Exemple #4
0
def test_multiple_ignores2():
    """Combine ignore_nonlatin, ignore_nonalpha and ignore_order_words.

    'Mary Árden' is expected to pair with 'Arden, Mary' despite the accent,
    the comma and the swapped word order.
    """
    left_headers = ['name']
    left_rows = [['John Shakespeare'], ['Mary Árden']]
    right_headers = ['person']
    right_rows = [['Arden, Mary'], ['Hathaway, Anne']]
    options = {
        'ignore_nonlatin': True,
        'ignore_nonalpha': True,
        'ignore_order_words': True,
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'person']
    assert matches == [['Mary Árden', 'Arden, Mary']]
Exemple #5
0
def test_ignore_custom():
    """Strip caller-supplied tokens before comparing.

    With ONE/TWO/THREE/FOUR listed in ``ignore_custom``, the two
    'Christopher Marlowe' rows are expected to match although their
    prefixes differ; the original, unstripped values are returned.
    """
    left_headers = ['name']
    left_rows = [['ONE Anne Hathaway'], ['TWO Christopher Marlowe']]
    right_headers = ['person']
    right_rows = [['THREE Christopher Marlowe'], ['FOUR William Shakespeare']]
    ignored_tokens = ['ONE', 'TWO', 'THREE', 'FOUR']
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_custom=ignored_tokens)
    assert columns == ['name', 'person']
    expected = [['TWO Christopher Marlowe', 'THREE Christopher Marlowe']]
    assert matches == expected
Exemple #6
0
def test_ignore_nonalpha():
    """Ignore punctuation when comparing.

    Trailing '.'/'!' and the hyphen in 'Anne-Hathaway' should not prevent a
    match; results are expected in the order of the first dataset.
    """
    left_headers = ['name']
    left_rows = [
        ['William Shakespeare'],
        ['Anne-Hathaway'],
        ['Christopher Marlowe'],
    ]
    right_headers = ['person']
    right_rows = [['Anne Hathaway!'], ['William Shakespeare.']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_nonalpha=True)
    assert columns == ['name', 'person']
    assert matches == [
        ['William Shakespeare', 'William Shakespeare.'],
        ['Anne-Hathaway', 'Anne Hathaway!'],
    ]
Exemple #7
0
def test_ordering():
    """Field lists pair up by position, not by column order in the data.

    The second dataset stores its columns as (birth, person), but
    ``fields2`` lists them as person, birth; the output is expected to
    follow the order given in the field lists.
    """
    left_headers = ['name', 'born']
    left_rows = [
        ['William Shakespeare', '1564'],
        ['Christopher Marlowe', '1564'],
    ]
    right_headers = ['birth', 'person']
    right_rows = [
        ['1564', 'William Shakespeare'],
        ['1556', 'Anne Hathaway'],
    ]
    options = {
        'fields1': ['name', 'born'],
        'fields2': ['person', 'birth'],
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'born', 'person', 'birth']
    expected_row = ['William Shakespeare', '1564', 'William Shakespeare', '1564']
    assert matches == [expected_row]
Exemple #8
0
def test_multiple_ignores4():
    """Combine ignore_nonlatin with ignore_titles.

    'Prof. William Shakespeare' is expected to pair with the accented
    'Pröf William Shakespeare'.
    """
    left_headers = ['name']
    left_rows = [['Prof. William Shakespeare'], ['Ms Anne Hathaway']]
    right_headers = ['person']
    right_rows = [['Pröf William Shakespeare'], ['Christopher Marlowe']]
    options = {'ignore_nonlatin': True, 'ignore_titles': True}
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'person']
    expected = [['Prof. William Shakespeare', 'Pröf William Shakespeare']]
    assert matches == expected
Exemple #9
0
def test_ignore_titles():
    """Ignore name titles when comparing.

    'Ms. Anne Hathaway' and 'Mrs. Anne Hathaway' differ only in their title
    and are expected to be the single match.
    """
    left_headers = ['name']
    left_rows = [['Ms. Anne Hathaway'], ['Mr. William Shakespeare']]
    right_headers = ['person']
    right_rows = [['Mr. Christopher Marlowe'], ['Mrs. Anne Hathaway']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_titles=True)
    assert columns == ['name', 'person']
    assert matches == [['Ms. Anne Hathaway', 'Mrs. Anne Hathaway']]
Exemple #10
0
def test_fuzzy_metaphone():
    """Match with the 'metaphone' method.

    Only 'Anne Hathaway' / 'Ann Athawei' is expected to match;
    'Will Sheikhspere' should not pair with 'William Shakespeare' here.
    """
    left_headers = ['name']
    left_rows = [['William Shakespeare'], ['Anne Hathaway']]
    right_headers = ['person']
    right_rows = [['Ann Athawei'], ['Will Sheikhspere']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        methods=['metaphone'])
    assert columns == ['name', 'person']
    assert matches == [['Anne Hathaway', 'Ann Athawei']]
Exemple #11
0
def test_same_headers():
    """Both datasets use the same header name.

    The returned keys are expected to repeat the header ('name', 'name')
    rather than collapse it into one column.
    """
    left_headers = ['name']
    left_rows = [['Anne Hathaway'], ['Christopher Marlowe']]
    right_headers = ['name']
    right_rows = [['William Shakespeare'], ['Christopher Marlowe']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers)
    assert columns == ['name', 'name']
    assert matches == [['Christopher Marlowe', 'Christopher Marlowe']]
Exemple #12
0
def test_fuzzy_levenshtein_fields():
    """Levenshtein matching across two fields, with the degree reported.

    Name and address are compared against person and location; the single
    expected match carries a degree of '0.8157894736842105'.
    """
    left_headers = ['name', 'address', 'born']
    left_rows = [
        ['William Shakespeare', 'Henley Street', '1564'],
        ['Christopher Marlowe', 'Corpus Christi', '1564'],
    ]
    right_headers = ['birth', 'person', 'location']
    right_rows = [
        ['1564', 'Will Sheikhspere', 'Henley Street'],
        ['1556', 'Anne Hathaway', 'Cottage Lane'],
    ]
    options = {
        'fields1': ['name', 'address'],
        'fields2': ['person', 'location'],
        'methods': ['levenshtein'],
        'output': ['1.name', '1.address', '2.person', '2.location', 'degree'],
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'address', 'person', 'location', 'degree']
    expected_row = [
        'William Shakespeare',
        'Henley Street',
        'Will Sheikhspere',
        'Henley Street',
        '0.8157894736842105',
    ]
    assert matches == [expected_row]
Exemple #13
0
def test_fields():
    """Run without explicit field lists on two-column datasets.

    The Marlowe rows differ in their second column ('1583' vs 'unknown'),
    so only the Shakespeare row is expected, with all four columns returned.
    """
    left_headers = ['name', 'born']
    left_rows = [
        ['William Shakespeare', '1564'],
        ['Christopher Marlowe', '1583'],
    ]
    right_headers = ['person', 'birth']
    right_rows = [
        ['Christopher Marlowe', 'unknown'],
        ['William Shakespeare', '1564'],
    ]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers)
    assert columns == ['name', 'born', 'person', 'birth']
    expected_row = ['William Shakespeare', '1564', 'William Shakespeare', '1564']
    assert matches == [expected_row]
Exemple #14
0
def test_ignore_nonlatin():
    """Compare accented characters as their unaccented form.

    'Gabriel García Márquez' is expected to match 'Gabriel Garcia Marquez';
    the original spellings are returned unchanged.
    """
    left_headers = ['name']
    left_rows = [['Charlotte Brontë'], ['Gabriel García Márquez']]
    right_headers = ['person']
    right_rows = [['Gabriel Garcia Marquez'], ['Leo Tolstoy']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_nonlatin=True)
    assert columns == ['name', 'person']
    assert matches == [['Gabriel García Márquez', 'Gabriel Garcia Marquez']]
Exemple #15
0
def test_multiple_ignores1():
    """Combine ignore_case, ignore_nonlatin, ignore_nonalpha and ignore_order_words.

    Upper-case 'SURNAME, FORENAME' entries are expected to match the
    mixed-case 'Forename Surname' entries, in the first dataset's order.
    """
    left_headers = ['name']
    left_rows = [['William Shakespeare'], ['Charlotte Brontë']]
    right_headers = ['person']
    right_rows = [['BRONTE, CHARLOTTE'], ['SHAKESPEARE, WILLIAM']]
    options = {
        'ignore_case': True,
        'ignore_nonlatin': True,
        'ignore_nonalpha': True,
        'ignore_order_words': True,
    }
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers, **options)
    assert columns == ['name', 'person']
    assert matches == [
        ['William Shakespeare', 'SHAKESPEARE, WILLIAM'],
        ['Charlotte Brontë', 'BRONTE, CHARLOTTE'],
    ]
Exemple #16
0
def test_ignore_case():
    """Compare case-insensitively.

    'Christopher Marlowe' is expected to match 'christopher marlowe'; each
    side keeps its original casing in the output.
    """
    left_headers = ['name']
    left_rows = [['Anne Hathaway'], ['Christopher Marlowe']]
    right_headers = ['person']
    right_rows = [['william shakespeare'], ['christopher marlowe']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_case=True)
    assert columns == ['name', 'person']
    assert matches == [['Christopher Marlowe', 'christopher marlowe']]
Exemple #17
0
def test_join_right_outer():
    """Right-outer join keeps unmatched rows from the second dataset.

    'Anne Hathaway' has no counterpart in the first dataset and is expected
    to appear with an empty string in the ``name`` column.
    """
    left_headers = ['name']
    left_rows = [['William Shakespeare'], ['Christopher Marlowe']]
    right_headers = ['person']
    right_rows = [['Anne Hathaway'], ['William Shakespeare']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        join='right-outer')
    assert columns == ['name', 'person']
    assert matches == [
        ['William Shakespeare', 'William Shakespeare'],
        ['', 'Anne Hathaway'],
    ]
Exemple #18
0
def test_fuzzy_jaro():
    """Match with the 'jaro' method.

    Both approximate spellings in the second dataset are expected to pair
    with their counterpart, returned in the first dataset's order.
    """
    left_headers = ['name']
    left_rows = [['William Shakespeare'], ['Christopher Marlowe']]
    right_headers = ['person']
    right_rows = [['Chris Barlow'], ['Willy Shake-Spear']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        methods=['jaro'])
    assert columns == ['name', 'person']
    assert matches == [
        ['William Shakespeare', 'Willy Shake-Spear'],
        ['Christopher Marlowe', 'Chris Barlow'],
    ]
Exemple #19
0
def test_ignore_order_words():
    """Ignore the order of words when comparing.

    'William Shakespeare' is expected to match 'Shakespeare William', next
    to the already identical 'Anne Hathaway' pair.
    """
    left_headers = ['name']
    left_rows = [['William Shakespeare'], ['Anne Hathaway']]
    right_headers = ['person']
    right_rows = [['Anne Hathaway'], ['Shakespeare William']]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers,
        ignore_order_words=True)
    assert columns == ['name', 'person']
    assert matches == [
        ['William Shakespeare', 'Shakespeare William'],
        ['Anne Hathaway', 'Anne Hathaway'],
    ]
Exemple #20
0
def test_fuzzy_levenshtein():
    """Match with the 'levenshtein' method on a single field.

    Both misspelt names in the second dataset are expected to pair with
    their counterpart, returned in the first dataset's order.
    """
    headers1 = ['name']
    data1 = [
        ['William Shakespeare'],
        ['Anne Hathaway']
    ]
    headers2 = ['person']
    data2 = [
        ['Ann Athawei'],
        ['Will Sheikhspere']
    ]
    # The matcher is selected through ``methods=[...]``, as in the other
    # fuzzy tests (metaphone, jaro, levenshtein on several fields); this test
    # previously used a different keyword, ``algorithm=``.
    # NOTE(review): confirm against the installed csvmatch version.
    results, keys = csvmatch.run(data1, headers1, data2, headers2, methods=['levenshtein'])
    assert keys == ['name', 'person']
    assert results == [
        ['William Shakespeare', 'Will Sheikhspere'],
        ['Anne Hathaway', 'Ann Athawei']
    ]
Exemple #21
0
def main():
    """Command-line entry point: read two inputs, match them, print the result.

    Warnings are routed through ``logging`` and shown as their bare message.
    Any failure ends the process via ``sys.exit`` with the exception as the
    exit message, so the user sees a one-line error instead of a traceback.
    """
    logging.captureWarnings(True)
    logging.basicConfig(level=logging.WARN, format='%(message)s')
    # Reduce every warning to its message text (no file/line/category prefix).
    warnings.formatwarning = lambda e, *args: str(e)
    sys.stderr.write('Starting up...\n')
    try:
        file1, file2, args = arguments()
        data1, headers1 = read(*file1)
        data2, headers2 = read(*file2)
        results, keys = csvmatch.run(data1,
                                     headers1,
                                     data2,
                                     headers2,
                                     ticker=ticker,
                                     **args)
        # NOTE(review): `format` is called with (results, keys), so it is
        # presumably a module-level helper shadowing the builtin -- confirm.
        formatted = format(results, keys)
        print(formatted)
    except SystemExit:
        # An exit already requested inside the try block (for example by
        # arguments()) must keep its own status. Passing it through
        # sys.exit() again would turn a clean status 0 into status 1 and
        # print the old code to stderr.
        raise
    except BaseException as e:
        # Deliberately broad: this also covers KeyboardInterrupt, so the
        # user gets a short message rather than a stack trace.
        sys.exit(e)
Exemple #22
0
def test_multiple():
    """Duplicate rows produce one result per matching pair.

    Two 'Anne Hathaway' rows on the left against one on the right, and one
    'Christopher Marlowe' on the left against two on the right, are each
    expected to yield two result rows.
    """
    left_headers = ['name']
    left_rows = [
        ['Anne Hathaway'],
        ['Anne Hathaway'],
        ['Christopher Marlowe'],
    ]
    right_headers = ['person']
    right_rows = [
        ['Anne Hathaway'],
        ['Christopher Marlowe'],
        ['Christopher Marlowe'],
    ]
    matches, columns = csvmatch.run(
        left_rows, left_headers, right_rows, right_headers)
    assert columns == ['name', 'person']
    hathaway_pair = ['Anne Hathaway', 'Anne Hathaway']
    marlowe_pair = ['Christopher Marlowe', 'Christopher Marlowe']
    assert matches == [
        hathaway_pair,
        hathaway_pair,
        marlowe_pair,
        marlowe_pair,
    ]
def fuzzy_merge(df1,
                df2,
                on=None,
                left_on=None,
                right_on=None,
                keep=None,
                keep_left='all',
                keep_right='all',
                method='exact',
                threshold=0.6,
                keep_degree=False,
                **kwargs):
    """Fuzzy matching between two DataFrames.

    Both frames are converted to lists of rows plus header lists and handed
    to ``csvmatch.run``; the rows it returns are wrapped in a new DataFrame.

    Parameters
    ----------
    df1 : DataFrame
        Left DataFrame.
    df2 : DataFrame
        Right DataFrame, to merge with the left one.
    on : str or list
        Column names to compare. These must be found in both DataFrames.
        When given, overrides left_on and right_on.
    left_on : str or list
        Column names to compare in the left DataFrame.
    right_on : str or list
        Column names to compare in the right DataFrame.
    keep : str { 'all', 'match' }
        Overrides keep_left and keep_right.
    keep_left : str or list, default 'all'
        List of columns to preserve from the left DataFrame.
        If 'all', preserve all columns.
        If 'match', preserve the left_on (matching) columns.
        If any other string, just keeps that one column.
    keep_right : str or list, default 'all'
        List of columns to preserve from the right DataFrame.
        If 'all', preserve all columns.
        If 'match', preserve the right_on (matching) columns.
        If any other string, just keeps that one column.
    method : str or list, default 'exact'
        Matching algorithm. Multiple algorithms can be specified, which
        will apply to each field respectively.

        * exact: exact matches
        * levenshtein: string distance metric
        * jaro: string distance metric
        * metaphone: phonetic matching algorithm
        * bilenko: prompts for matches

    threshold : float or list, default 0.6
        The threshold for a fuzzy match as a number between 0 and 1.
        Multiple numbers will be applied to each field respectively.
    keep_degree : bool, default False
        Keep the record of match confidence in a separate column named
        'degree'.
    **kwargs
        Forwarded unchanged to ``csvmatch.run``. An ``output`` entry, if
        present, replaces the column selection derived from the keep_*
        arguments. Other options include:

        ignore_case : bool, default False
            Ignore case (default is case-sensitive)
        ignore_nonalpha : bool, default False
            Ignore non-alphanumeric characters
        ignore_nonlatin : bool, default False
            Ignore characters from non-latin alphabets.
            Accented characters are compared to their unaccented equivalent
        ignore_order_words : bool, default False
            Ignore the order words are given in
        ignore_order_letters : bool, default False
            Ignore the order the letters are given in, regardless of word
            order
        ignore_titles : bool, default False
            Ignore a predefined list of name titles (such as Mr, Ms, etc)
        join : { 'inner', 'left-outer', 'right-outer', 'full-outer' }

    Returns
    -------
    pd.DataFrame
        A DataFrame of matching rows.
    """
    data1 = df1.values.tolist()
    headers1 = list(df1.columns)

    data2 = df2.values.tolist()
    headers2 = list(df2.columns)

    if not isinstance(threshold, list):
        threshold = [threshold]

    if not isinstance(method, list):
        method = [method]

    if on:
        left_on = on
        right_on = on

    # Wrap a single column name in a list, but leave an unspecified side as
    # None: wrapping it would send the bogus field list [None] downstream.
    if left_on is not None and not isinstance(left_on, list):
        left_on = [left_on]

    if right_on is not None and not isinstance(right_on, list):
        right_on = [right_on]

    if keep:
        keep_left = keep
        keep_right = keep

    if keep_left == 'all':
        keep_left = headers1
    if keep_right == 'all':
        keep_right = headers2

    if keep_left == 'match':
        keep_left = left_on
    if keep_right == 'match':
        keep_right = right_on

    if isinstance(keep_left, str):
        keep_left = [keep_left]

    if isinstance(keep_right, str):
        keep_right = [keep_right]

    # Column selectors use the '<dataset number>.<column>' form. Fall back
    # to the match columns, then to every column of that frame, when
    # nothing was selected.
    output = ['1.' + col for col in (keep_left or left_on or headers1)]
    output.extend('2.' + col for col in (keep_right or right_on or headers2))

    if keep_degree:
        output.append('degree')

    # An explicit output=[...] from the caller wins over the derived one.
    output = kwargs.pop('output', output)

    # Only pass field lists that were actually specified, so csvmatch can
    # apply its own default when the caller gave no match columns.
    field_args = {}
    if left_on is not None:
        field_args['fields1'] = left_on
    if right_on is not None:
        field_args['fields2'] = right_on

    results, keys = csvmatch.run(data1,
                                 headers1,
                                 data2,
                                 headers2,
                                 thresholds=threshold,
                                 output=output,
                                 methods=method,
                                 **field_args,
                                 **kwargs)

    return pd.DataFrame(results, columns=keys)
Exemple #24
0
def fuzzy_merge(
    df1,
    df2,
    on=None,
    left_on=None,
    right_on=None,
    keep=None,
    keep_left='all',
    keep_right='all',
    method='exact',
    threshold=0.6,
    training_file=None,
    **kwargs
):
    """Fuzzy matching between two DataFrames.

    Both frames are converted to lists of rows plus header lists and handed
    to ``csvmatch.run``; the rows it returns are wrapped in a new DataFrame.

    Parameters
    ----------
    df1 : DataFrame
        Left DataFrame.
    df2 : DataFrame
        Right DataFrame, to merge with the left one.
    on : str or list
        Column names to compare. These must be found in both DataFrames.
        When given, overrides left_on and right_on.
    left_on : str or list
        Column names to compare in the left DataFrame.
    right_on : str or list
        Column names to compare in the right DataFrame.
    keep : str { 'all', 'match' }
        Overrides keep_left and keep_right.
    keep_left : str or list, default 'all'
        List of columns to preserve from the left DataFrame.
        If 'all', preserve all columns.
        If 'match', preserve the left_on (matching) columns.
        If any other string, just keeps that one column.
    keep_right : str or list, default 'all'
        List of columns to preserve from the right DataFrame.
        If 'all', preserve all columns.
        If 'match', preserve the right_on (matching) columns.
        If any other string, just keeps that one column.
    method : str or list, default 'exact'
        Matching algorithm. Multiple algorithms can be specified, which
        will apply to each field respectively.

        * exact: exact matches
        * levenshtein: string distance metric
        * jaro: string distance metric
        * metaphone: phonetic matching algorithm
        * bilenko: prompts for matches

    threshold : float or list, default 0.6
        The threshold for a fuzzy match as a number between 0 and 1.
        Multiple numbers will be applied to each field respectively.
    training_file : optional, default None
        Forwarded to ``csvmatch.run`` as ``training_file``.
        NOTE(review): presumably a file of pre-labelled training matches
        for the 'bilenko' method -- confirm against csvmatch.
    **kwargs
        Forwarded unchanged to ``csvmatch.run``. An ``output`` entry, if
        present, replaces the column selection derived from the keep_*
        arguments. Other options include:

        ignore_case : bool, default False
            Ignore case (default is case-sensitive)
        ignore_nonalpha : bool, default False
            Ignore non-alphanumeric characters
        ignore_nonlatin : bool, default False
            Ignore characters from non-latin alphabets.
            Accented characters are compared to their unaccented equivalent
        ignore_order_words : bool, default False
            Ignore the order words are given in
        ignore_order_letters : bool, default False
            Ignore the order the letters are given in, regardless of word
            order
        ignore_titles : bool, default False
            Ignore a predefined list of name titles (such as Mr, Ms, etc)
        join : { 'inner', 'left-outer', 'right-outer', 'full-outer' }

    Returns
    -------
    pd.DataFrame
        A DataFrame of matching rows.
    """
    # csvmatch works on plain lists, so the DataFrames are flattened into
    # rows and header names here.
    data1 = df1.values.tolist()
    headers1 = list(df1.columns)

    data2 = df2.values.tolist()
    headers2 = list(df2.columns)

    # TODO: support a *list* of training files
    # ([trainingmatches1.csv, trainingmatches2.csv, ...]), either as a
    # positional or as a keyword argument. prepare_training() takes a
    # single file:
    #
    #     deduper.prepare_training(data_d, 150000, .5)
    #
    #     with open(training_file, 'rb') as f:
    #         deduper.prepare_training(data_d, training_file=f)
    #         uncertainPairs()
    #
    # so several match files would have to be collated into one big match
    # file, either here or outside this package.
    # TODO: the collation also needs a way to deal with multiple pairs --
    # would every permutation be listed, e.g. (INTEL/INTC),
    # (INTEL/INTC CORP), (INTC CORP/INTC)? Needs looking up.

    if not isinstance(threshold, list):
        threshold = [threshold]

    if not isinstance(method, list):
        method = [method]

    if on:
        left_on = on
        right_on = on

    # Wrap a single column name in a list, but leave an unspecified side as
    # None: wrapping it would send the bogus field list [None] downstream.
    if left_on is not None and not isinstance(left_on, list):
        left_on = [left_on]

    if right_on is not None and not isinstance(right_on, list):
        right_on = [right_on]

    if keep:
        keep_left = keep
        keep_right = keep

    if keep_left == 'all':
        keep_left = headers1
    if keep_right == 'all':
        keep_right = headers2

    if keep_left == 'match':
        keep_left = left_on
    if keep_right == 'match':
        keep_right = right_on

    if isinstance(keep_left, str):
        keep_left = [keep_left]

    if isinstance(keep_right, str):
        keep_right = [keep_right]

    # Column selectors use the '<dataset number>.<column>' form. Fall back
    # to the match columns, then to every column of that frame, when
    # nothing was selected.
    output = ['1.' + col for col in (keep_left or left_on or headers1)]
    output.extend('2.' + col for col in (keep_right or right_on or headers2))

    # kwargs.pop() returns the caller's 'output' entry when there is one and
    # the list built above otherwise; popping also keeps 'output' from being
    # passed twice in the call below.
    output = kwargs.pop('output', output)

    # Only pass field lists that were actually specified, so csvmatch can
    # apply its own default when the caller gave no match columns.
    field_args = {}
    if left_on is not None:
        field_args['fields1'] = left_on
    if right_on is not None:
        field_args['fields2'] = right_on

    # Whatever remains in kwargs is passed straight through to csvmatch.
    results, keys = csvmatch.run(
        data1,
        headers1,
        data2,
        headers2,
        thresholds=threshold,
        output=output,
        methods=method,
        # Forward the caller's value; it used to be hard-coded to None,
        # which silently discarded the argument.
        training_file=training_file,
        **field_args,
        **kwargs)

    return pd.DataFrame(results, columns=keys)