def test_fuzzy():
    """This function tests the series of functions that are used to predict the unknown ranking of string values
    using a training dataset in which the rankings are known for other string values. Corpora for each ranking
    are compiled and the unknown values are then compared against these in order to predict the most likely
    ranking.

    This functionality is tested by constructing a simulated dataframe in which we expect the predictions to be
    100% accurate. We follow this dataframe through each function in the fuzzy modelling pipeline and assert
    that all behaviour is as expected.
    """
    df_sim = pd.DataFrame({'piggy': pd.Series(['straw', 'straws', 'stick', 'sticks', 'brick', 'bricks', 'brickz']),
                           'piggy_rank': [1, 1, 2, 2, 3, 3, np.nan],
                           'piggy_rank_og': [1, 1, 2, 2, 3, 3, 3],
                           'train': [1, 1, 1, 1, 1, 1, 0]})
    sim_rank_list = [1, 2, 3]  # save a list with the expected rank levels in your simulated df

    rank_dictionary = {'natural': 1, 'rudimentary': 2, 'finished': 3}
    rank_values = list(rank_dictionary.values())
    rank_keys = list(rank_dictionary.keys())

    # build a corpus based on the simulated dataset
    str_list, idk_strings = fz.build_corpus(df_sim, 'piggy', 'piggy_rank', sim_rank_list)
    assert len(idk_strings) == 1

    # find distribution of scores for each string
    distrib = fz.fuzzy_scan(idk_strings, str_list)

    # the length of the output df should be equal to the length of the longest corpus
    assert len(distrib) == len(max(str_list, key=len)), "the output distribution df is not the correct length"
    # the output df should have a number of columns equal to the number of input rank categories + 1
    assert len(distrib.columns) == len(sim_rank_list) + 1, "the output distribution df is not the correct width"
    # the output df should have a column called word that contains only the values in idk_strings
    assert set(distrib.word.unique()) <= set(idk_strings)

    # predict class based on probability of exceeding similarity cutoff of 75
    preds = fz.fuzzy_predict(distrib, rank_keys, 'word', 75, rank_dictionary)

    # the length of the prediction df should be equal to the length of the unknown words corpus
    assert len(preds) == len(idk_strings), "the output prediction df is not the correct length"
    # the prediction df should have a number of columns equal to the number of input rank categories + 1
    assert len(preds.columns) == len(sim_rank_list) + 1, "the output prediction df is not the correct width"
    # the prediction df should contain a column called "pred"
    assert "pred" in preds.columns, "prediction column not being generated"

    # merge results back on the test data to validate
    out = df_sim[df_sim['train'] == 0]
    out = pd.merge(out, preds, left_on='piggy', right_on='word', how='left')

    # assert that the prediction was accurate, as expected
    assert np.allclose(out['piggy_rank_og'], out['pred'])
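
# Illustrative sketch (not part of the test suite): why the simulated prediction should be exact. The lone
# unknown string 'brickz' scores far higher against the rank-3 corpus ('brick', 'bricks') than against the
# rank-1 and rank-2 corpora, so a cutoff of 75 separates the classes cleanly. The scorer below is plain
# fuzzywuzzy; whether fz.fuzzy_scan uses this exact scorer is an assumption.
def _demo_sim_scores():
    from fuzzywuzzy import fuzz

    unknown = 'brickz'
    corpora = {1: ['straw', 'straws'], 2: ['stick', 'sticks'], 3: ['brick', 'bricks']}
    for rank, words in corpora.items():
        # rank-3 scores land above the 75 cutoff, ranks 1-2 land well below it
        scores = [fuzz.ratio(unknown, w) for w in words]
        print(rank, scores)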
def fuzzy_semantic_cv(cv_list, base_var, rank_dictionary, subset=None, threshold=.5):
    """Loop over the list of cross-validation dfs and compute semantic similarity score distributions for the
    unknown strings in each run. Analogous to fuzzy_cv, but uses the semantic similarity scan in place of fuzzy
    string matching. Note that threshold is currently unused in this function.
    """
    # import packages
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import pandas as pd
    import numpy as np

    # import custom modules
    import semantic.semantic as sem
    import model.fuzzy as fz

    # setup objects
    rank_var = base_var + '_rank'
    og_var = rank_var + '_og'  # TODO validate syntax
    rank_values = list(rank_dictionary.values())
    rank_keys = list(rank_dictionary.keys())

    # create a list to store loop outputs
    cv_distrib = []

    # loop over each cross validation:
    for i in range(len(cv_list)):
        print('working on cv loop #', i)
        df = cv_list[i].copy()  # subset the cv list to the current df

        # build corpus of known and unknown strings
        str_list, idk_strings = fz.build_corpus(df, base_var, rank_var, rank_values)

        # deduplicate each rank corpus (one list per rank)
        str_list_unique = []
        for x in range(len(str_list)):
            str_list_unique.append(np.unique(str_list[x]))

        # subset the unknown strings to allow for faster testing
        if subset is not None:
            idk_strings = idk_strings[subset]

        # find distribution of scores for each string
        distrib = sem.semantic_similarity_scan(idk_strings, str_list_unique)

        # append results to prep for next loop
        cv_distrib.append(distrib)

    return cv_distrib
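
# Minimal usage sketch for fuzzy_semantic_cv. The base variable name 'floor' is hypothetical, and passing
# subset as a slice assumes idk_strings supports slicing (it is indexed directly with subset above); cv_list
# is whatever list of cross-validation dfs the project builds upstream.
def _example_fuzzy_semantic_cv(cv_list):
    rank_dict = {'natural': 1, 'rudimentary': 2, 'finished': 3}
    # score only the first 50 unknown words in each cv run to keep the example fast
    cv_distrib = fuzzy_semantic_cv(cv_list, 'floor', rank_dict, subset=slice(0, 50))
    print(cv_distrib[0].head())  # score distributions for the unknown words in the first cv run
    return cv_distrib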
def test_build_corpus():
    """This function tests a function that is used to build corpora of known and unknown words from a df that
    contains columns with string value descriptions.

    The testing is done to confirm that the resulting corpora are built entirely from words that are present in
    the pandas df column that was passed in, and furthermore in the rows that result when subsetting by the rank
    class that they are supposed to be a part of.
    """
    import numpy as np

    for x in STR_VARS:
        rank_var = x + "_rank"
        str_list, idk_strings = fz.build_corpus(df_clean, x, rank_var, RANK_LIST)

        # verify that each of the unknown strings exists in the appropriate column of the input pandas df
        for y in idk_strings:
            assert y in df_clean[x].unique()

        # verify that each of the known strings exists in the appropriate column of the input pandas df
        # note that here we subset the pandas df to the correct rank before testing the column values
        for rank_num, rank in enumerate(RANK_LIST):
            for z in np.random.choice(str_list[rank_num], 5):  # only pull 5 random strings for speed purposes
                assert z in df_clean[df_clean[rank_var] == rank][x].unique()
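
# Illustrative sketch (not the project's actual implementation): based on how the tests above consume it,
# fz.build_corpus is assumed to return one corpus of known strings per rank plus the strings whose rank is
# missing. The hypothetical re-implementation below exists only to make that contract explicit.
def _sketch_build_corpus(df, str_var, rank_var, rank_list):
    import pandas as pd

    # one list of known strings per rank value, in the same order as rank_list
    str_list = [df.loc[df[rank_var] == rank, str_var].tolist() for rank in rank_list]
    # strings whose rank is missing are the ones the pipeline later tries to predict
    idk_strings = df.loc[pd.isnull(df[rank_var]), str_var].unique().tolist()
    return str_list, idk_strings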
def fuzzy_cv(cv_list, base_var, rank_dictionary, subset=None, threshold=75, jupyter=False):
    """This is the master function for this module. It is used to loop over our list of randomly sampled
    cross-validation dfs and run the fuzzy prediction pipeline on them in order to return results and accuracy
    metrics for each. It reads in our custom fuzzy module in order to use its functions in sequence on each cv run.

    :param cv_list: This is a list of pandas dfs, each containing a different cross-validation run.
    :param base_var: This is a string indicating the variable whose string values you want to analyze in order to
        predict rank.
    :param rank_dictionary: This is a dictionary that can be used to map the str names of the ranks back to
        ordinal vals.
    :param subset: This is an optional parameter that can be used to subset our list of unknown words for testing.
    :param threshold: This is the similarity score threshold, above which we think word similarity carries enough
        semantic meaning to accurately predict the word's quality ranking.
    :param jupyter: This is a boolean that tells us if we are running in a jupyter nb. If so, we use a different
        tqdm progress bar.

    :return: cv_distrib: This is a list of len=len(cv_list), containing pandas dfs that have the distributions of
        scores for each unknown word.
    :return: cv_preds: This is a list of len=len(cv_list), containing pandas dfs that have the prediction for each
        word based on the distributions of scores.
    :return: cv_results: This is a list of len=len(cv_list), containing pandas crosstabs that indicate the accuracy
        score result for each cv run.
    :return: cv_df: This is a list of len=len(cv_list), containing pandas dfs in which each of the unknown words
        has a prediction column added based on the fuzzy model process.
    """
    # import packages
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    import pandas as pd
    import numpy as np

    if jupyter:
        from tqdm import tqdm_notebook as tqdm
    else:
        from tqdm import tqdm

    # import custom modules
    import sys
    sys.path.append('../hp_classify')
    import model.fuzzy as fz

    # setup objects
    rank_var = base_var + '_rank'
    og_var = rank_var + '_og'  # TODO validate syntax
    rank_values = list(rank_dictionary.values())
    rank_keys = list(rank_dictionary.keys())

    # initialize lists to store loop vals
    cv_distrib = []
    cv_preds = []
    cv_results = []
    cv_df = []

    # loop over each cross validation:
    for i in tqdm(range(len(cv_list)), desc="cv loop"):
        print('working on cv loop #', i)
        df = cv_list[i].copy()  # subset the cv list to the current df

        # build corpus of known and unknown strings
        str_list, idk_strings = fz.build_corpus(df, base_var, rank_var, rank_values)

        # subset the unknown strings to allow for faster testing
        if subset is not None:
            idk_strings = idk_strings[subset]

        # find distribution of scores for each string
        distrib = fz.fuzzy_scan(idk_strings, str_list, jupyter=jupyter)
        # TODO: output plots of distribution for analysis

        # predict class based on probability of exceeding similarity cutoff
        preds = fz.fuzzy_predict(distrib, rank_keys, 'word', threshold, rank_dictionary)

        # merge results back on the test data to validate
        test = df[df['train'] == 0]
        out = pd.merge(test, preds, left_on=base_var, right_on='word', how='left')

        # verify that rows have neither been added nor lost by merging on predictions
        if len(test) != len(out):
            class RowCountException(Exception):
                """Custom exception class.

                This exception is raised when the rowcount is not as expected.
                """
                pass

            raise RowCountException("Rowcount was modified by merge, output df is no longer representative")

        # calculate success rate and tabulate
        out['success'] = np.where(out[og_var] == out['pred'], 1, 0)
        success_rate = pd.crosstab(out[~pd.isnull(out['pred'])]['success'], columns='count')

        # append results to prep for next loop
        cv_distrib.append(distrib)
        cv_preds.append(preds)
        cv_results.append(success_rate)
        cv_df.append(out)

    return cv_distrib, cv_preds, cv_results, cv_df