def test_fuzzy():
    """Test the fuzzy modelling pipeline end to end on a simulated dataframe.

    The pipeline predicts the unknown ranking of string values using a
    training dataset in which the rankings are known for other string values.
    Corpora for each ranking are compiled and the unknown values are then
    compared against these in order to predict the most likely ranking.

    The simulated dataframe is constructed so that the prediction is expected
    to be 100% accurate: the single unknown value ('brickz') should be ranked
    like 'brick'/'bricks'. The dataframe is followed through each function of
    the pipeline (build_corpus -> fuzzy_scan -> fuzzy_predict) and the shape
    and content of every intermediate output is asserted.
    """
    df_sim = pd.DataFrame({
        'piggy': pd.Series(['straw', 'straws', 'stick', 'sticks',
                            'brick', 'bricks', 'brickz']),
        'piggy_rank': [1, 1, 2, 2, 3, 3, np.nan],
        'piggy_rank_og': [1, 1, 2, 2, 3, 3, 3],
        'train': [1, 1, 1, 1, 1, 1, 0],
    })
    # the rank levels expected in the simulated df
    sim_rank_list = [1, 2, 3]
    rank_dictionary = {'natural': 1, 'rudimentary': 2, 'finished': 3}
    rank_keys = list(rank_dictionary.keys())

    # build a corpus based on the simulated dataset
    str_list, idk_strings = fz.build_corpus(df_sim, 'piggy', 'piggy_rank',
                                            sim_rank_list)

    # only one row ('brickz') has an unknown rank
    assert len(idk_strings) == 1

    # find distribution of scores for each string
    distrib = fz.fuzzy_scan(idk_strings, str_list)

    # the length of the output df should be equal to the length of the
    # longest corpus
    assert len(distrib) == len(max(str_list, key=len)), \
        "the output distribution df is not the correct length"

    # the output df should have a number of columns equal to the number of
    # input rank categories + 1
    assert len(distrib.columns) == len(sim_rank_list) + 1, \
        "the output distribution df is not the correct width"

    # the output df should have a column called word that contains only the
    # values in idk_strings
    assert np.isin(distrib['word'].unique(), idk_strings).all(), \
        "the output distribution df contains unexpected words"

    # predict class based on probability of exceeding similarity cutoff of 75
    preds = fz.fuzzy_predict(distrib, rank_keys, 'word', 75, rank_dictionary)

    # the length of the prediction df should be equal to the length of the
    # unknown words corpus
    assert len(preds) == len(idk_strings), \
        "the output prediction df is not the correct length"

    # the prediction df should have a number of columns equal to the number
    # of input rank categories + 1
    assert len(preds.columns) == len(sim_rank_list) + 1, \
        "the output prediction df is not the correct width"

    # the prediction df should contain a column called "pred"
    assert "pred" in preds.columns, "prediction column not being generated"

    # merge results back on the test data (train == 0) to validate
    out = df_sim[df_sim['train'] == 0]
    out = pd.merge(out,
                   preds,
                   left_on='piggy',
                   right_on='word',
                   how='left')

    # assert that the prediction was accurate, as expected
    assert np.allclose(out['piggy_rank_og'], out['pred'])
def fuzzy_semantic_cv(cv_list,
                      base_var,
                      rank_dictionary,
                      subset=None,
                      threshold=.5):
    """Run the semantic similarity scan over each cross-validation dataframe.

    For every df in cv_list, the corpora of known strings (one per rank) and
    the unknown strings are built with ``fz.build_corpus``, each known corpus
    is de-duplicated, and the unknown strings are scored against the corpora
    with ``sem.semantic_similarity_scan``.

    :param cv_list: A list of pandas df, each containing a different
        cross-validation run.
    :param base_var: A string naming the variable whose string values are
        analyzed; the rank column is expected to be ``base_var + '_rank'``.
    :param rank_dictionary: A dictionary whose values are the rank levels
        passed to ``fz.build_corpus``.
    :param subset: Optional indexer applied to the unknown strings
        (``idk_strings[subset]``) to allow for faster testing.
    :param threshold: Currently unused by this function; kept for interface
        compatibility.

    :return: cv_distrib: A list of len=len(cv_list) containing the output of
        the semantic similarity scan for each cv run.
    :return: cv_preds, cv_results, cv_df: Empty lists. No prediction step is
        implemented in this function yet; they are returned so the result
        keeps a four-element shape.
    """
    # import packages
    import numpy as np

    # import custom modules
    import semantic.semantic as sem
    import model.fuzzy as fz

    # setup objects
    rank_var = base_var + '_rank'
    rank_values = list(rank_dictionary.values())

    # create lists to store loop outputs; only cv_distrib is populated, the
    # others exist so that the return statement does not raise a NameError
    cv_distrib = []
    cv_preds = []
    cv_results = []
    cv_df = []

    # loop over each cross validation:
    for i, cv_run in enumerate(cv_list):

        print('working on cv loop #', i)
        df = cv_run.copy()  # work on a copy of the current cv df

        # build corpus of known and unknown strings
        str_list, idk_strings = fz.build_corpus(df, base_var, rank_var,
                                                rank_values)

        # de-duplicate every known corpus (one per rank) rather than
        # assuming there are exactly three ranks
        str_list_unique = [np.unique(corpus) for corpus in str_list]

        # subset the unknown strings to allow for faster testing
        if subset is not None:
            idk_strings = idk_strings[subset]

        # find distribution of scores for each string
        distrib = sem.semantic_similarity_scan(idk_strings, str_list_unique)

        # append results to prep for next loop
        cv_distrib.append(distrib)

    return cv_distrib, cv_preds, cv_results, cv_df
def test_build_corpus():
    """Check that build_corpus only returns strings found in the source df.

    For every string variable in STR_VARS the corpora of known and unknown
    words are built from df_clean. The test then confirms that each unknown
    word is present in the column that was passed in, and that a random
    sample of the known words for each rank is present in that column once
    the df is subset to the rows of the matching rank class.
    """

    import numpy as np

    for str_var in STR_VARS:

        rank_col = str_var + "_rank"
        known_corpora, unknown_words = fz.build_corpus(df_clean, str_var,
                                                       rank_col, RANK_LIST)

        # every unknown string must exist in the input column
        for unknown in unknown_words:
            found_in_column = unknown in df_clean[str_var].unique()
            assert found_in_column

        # every known string must exist in the input column after the df has
        # been subset to the rank its corpus belongs to
        for position, rank in enumerate(RANK_LIST):
            # only pull 5 random strings per rank, for speed purposes
            sampled_words = np.random.choice(known_corpora[position], 5)
            for known in sampled_words:
                rows_in_rank = df_clean[df_clean[rank_col] == rank]
                found_in_rank = known in rows_in_rank[str_var].unique()
                assert found_in_rank
# Example #4
# 0
class RowCountException(Exception):
    """Raised when merging predictions onto the test rows changes the rowcount."""


def fuzzy_cv(cv_list,
             base_var,
             rank_dictionary,
             subset=None,
             threshold=75,
             jupyter=False):
    """This is the master function for this module. It is used to loop over our list of randomly sampled
    cross-validation dfs and run the fuzzy prediction pipeline on them in order to return results and accuracy metrics
    for each. It reads in our custom fuzzy module in order to use its functions in sequence on each cv run
    (build_corpus -> fuzzy_scan -> fuzzy_predict), then merges the predictions back onto the rows where
    ``train == 0`` and tabulates how often the prediction matches the original rank.

    :param cv_list: This is a list of pandas df, each containing a different cross-validation run.
    :param base_var: This is a string indicating the variable you want to analyze the string values of and predict rank
    :param rank_dictionary: This is a dictionary that can be used to map the str names of the ranks back to ordinal vals
    :param subset: This is an optional parameter that can be used to subset our list of unknown words for testing
    :param threshold: This is the similarity score threshold, above which we think implies sufficient semantic meaning
        in word similarity to accurately predict the words quality ranking.
    :param jupyter: This is a boolean that tells us if we are running in a jupyter nb. If so, we use a different tqdm
        progress bar.

    :return: cv_distrib: This is a list of len=len(cv_list), containing pandas dfs that have the distributions of scores
        for each unknown word
    :return: cv_preds: This is a list of len=len(cv_list), containing pandas dfs that have the prediction for each word
        based on the distributions of scores
    :return: cv_results: This is a list of len=len(cv_list), containing pandas crosstabs that indicate the accuracy
        score result for each cv run
    :return: cv_df: This is a list of len=len(cv_list), containing pandas dfs in which each of the unknown words has a
        prediction column added based on the fuzzy model process

    :raises RowCountException: If merging the predictions onto the test rows adds or drops rows.
    """

    # import packages
    import sys

    import numpy as np
    import pandas as pd

    if jupyter:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    # import custom modules; only extend sys.path once so that repeated calls
    # do not keep appending the same entry
    module_dir = '../hp_classify'
    if module_dir not in sys.path:
        sys.path.append(module_dir)
    import model.fuzzy as fz

    # setup objects
    rank_var = base_var + '_rank'
    og_var = rank_var + '_og'

    # TODO validate syntax
    rank_values = list(rank_dictionary.values())
    rank_keys = list(rank_dictionary.keys())

    # initialize lists to store loop vals
    cv_distrib = []
    cv_preds = []
    cv_results = []
    cv_df = []

    # loop over each cross validation:
    for i, cv_run in enumerate(tqdm(cv_list, desc="cv loop")):

        print('working on cv loop #', i)
        df = cv_run.copy()  # work on a copy of the current cv df

        # build corpus of known and unknown strings
        str_list, idk_strings = fz.build_corpus(df, base_var, rank_var,
                                                rank_values)

        # subset the unknown strings to allow for faster testing; an identity
        # check is used because `subset != None` is ambiguous for array-like
        # indexers
        if subset is not None:
            idk_strings = idk_strings[subset]

        # find distribution of scores for each string
        distrib = fz.fuzzy_scan(idk_strings, str_list, jupyter=jupyter)

        # TODO, output plots of distribution for analysis

        # predict class based on probability of exceeding similarity cutoff
        preds = fz.fuzzy_predict(distrib, rank_keys, 'word', threshold,
                                 rank_dictionary)

        # merge results back on the test data (train == 0) to validate
        test_rows = df[df['train'] == 0]
        out = pd.merge(test_rows,
                       preds,
                       left_on=base_var,
                       right_on='word',
                       how='left')

        # verify that rows have neither been added nor lost by merging on
        # predictions
        if len(test_rows) != len(out):
            raise RowCountException(
                "Rowcount was modified by merge, output df is no longer representative"
            )

        # calculate success rate and tabulate, ignoring rows that did not
        # receive a prediction
        out['success'] = np.where(out[og_var] == out['pred'], 1, 0)
        predicted = out[out['pred'].notna()]
        success_rate = pd.crosstab(predicted['success'], columns='count')

        # append results to prep for next loop
        cv_distrib.append(distrib)
        cv_preds.append(preds)
        cv_results.append(success_rate)
        cv_df.append(out)

    return (cv_distrib, cv_preds, cv_results, cv_df)