def fuzzy_match(self, left_df: pd.DataFrame, right_df: pd.DataFrame) -> pd.DataFrame:
    # Run fuzzy_merge once per string-similarity method and stack the
    # results, tagging every row with the threshold it was matched at.
    return pd.concat(
        fuzzy_merge(
            left_df,
            right_df,
            left_on=self.left_on,
            right_on=self.right_on,
            threshold=self._threshold,
            ignore_case=True,
            ignore_nonalpha=True,
            method=method,
        ).assign(similarity_index=self._threshold)
        for method in ["jaro"]
    )
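# --- Usage sketch (illustrative, not from the original source) ---
# fuzzy_match only needs an object carrying left_on, right_on, and
# _threshold, so a SimpleNamespace is enough to exercise it here. The column
# name 'name' and the 0.9 threshold are made-up examples; assumes pandas (pd)
# and fuzzy_merge (from fuzzy_pandas) are imported as above.
from types import SimpleNamespace

cfg = SimpleNamespace(left_on='name', right_on='name', _threshold=0.9)
left = pd.DataFrame({'name': ['Jon Smith', 'Cathy Jones']})
right = pd.DataFrame({'name': ['John Smith', 'Kathy Jones']})
matched = fuzzy_match(cfg, left, right)  # one stacked block per method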
def run_stuff(df_input, df_names):
    """Attempt to determine a user's gender by comparing their username to
    traditionally gendered names. Utilizes Levenshtein distance.

    Assumes two pandas DataFrames with columns "author_login" and "name".
    The threshold can be varied as needed; I empirically determined 0.72 to
    be the most accurate.

    Args:
        df_input: A DataFrame with unknown usernames to check.
        df_names: A DataFrame with traditionally gendered names, i.e. only
            names traditionally associated with men or women.

    Returns:
        A pandas DataFrame consisting of the names that match up.
    """
    results = fpd.fuzzy_merge(df_input, df_names,
                              left_on="author_login",
                              right_on="name",
                              method="levenshtein",
                              threshold=0.72,
                              keep="match")
    return results
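# --- Usage sketch (illustrative, not from the original source) ---
# The logins and names below are invented; assumes pandas (pd) and
# fuzzy_pandas (fpd) are imported. With keep="match", fuzzy_pandas keeps the
# matched columns, so expect author_login/name pairs such as 'sarah' ~ 'sarah'.
df_users = pd.DataFrame({'author_login': ['mikej', 'sarah', 'xx_dev_xx']})
df_gendered = pd.DataFrame({'name': ['mike', 'sarah', 'john']})
gender_guesses = run_stuff(df_users, df_gendered)
print(gender_guesses)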
import pandas as pd
import fuzzy_pandas as fpd
import geopandas as gpd
import re

clean_shp = pd.read_csv(
    "/Users/hopecj/projects/gerryspam/NJ/dat/cleanprec_shp.csv")
clean_elec = pd.read_csv(
    "/Users/hopecj/projects/gerryspam/NJ/dat/cleanprec_elec.csv")

# merge by digits and first precinct word
results_exact = fpd.fuzzy_merge(clean_shp, clean_elec,
                                left_on='shp_loc_prec_code',
                                right_on='elec_loc_prec_code',
                                ignore_case=True,
                                keep='all',
                                join='full-outer',
                                method='exact')
print("Found", results_exact.shape)
results_exact.head(5)

out = results_exact[[
    'year', 'STATEFP', 'COUNTYFP', 'NAMELSAD', 'G16DPRS', 'G16RPRS',
    'G16DHOR', 'G16RHOR', 'precinct',
    'shp_loc_prec_code', 'elec_loc_prec_code',
    'shp_loc_prec', 'elec_loc_prec', 'countynam'
]]
out.to_csv("/Users/hopecj/projects/gerryspam/NJ/dat/NJ16_merging.csv")
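# --- Follow-up sketch (illustrative, not from the original source) ---
# With join='full-outer', rows that found no exact partner carry NaN in the
# other side's key column, so unmatched rows on each side can be counted
# like this. The unmatched_* names are new.
unmatched_shp = results_exact[results_exact['elec_loc_prec_code'].isna()]
unmatched_elec = results_exact[results_exact['shp_loc_prec_code'].isna()]
print("shapefile precincts without an election match:", len(unmatched_shp))
print("election precincts without a shapefile match:", len(unmatched_elec))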
payload["Government Incidents Reported"] = len(government_incident_numbers) payload["Difference bwt Citizens 1 & Gov"] = government_master.shape[0] - people_master.shape[0] payload["Citizens 1 Incidents Missing Reports"] = community_crime_map_missing_incidents payload["Government Missing Incident Numbers"] = len(government_missing_incidents) payload["Citizens 2 Incidents Reported"] = len(spotcrime_incidents) government_master["date1"] = government_master["date1"].astype(str) spotcrime_master["date"] = spotcrime_master["date"].astype(str) # Fuzzy Matching matches = fpd.fuzzy_merge(government_master, spotcrime_master, left_on=['incident_address', 'lat', 'lon', 'date'], right_on=['address', 'lat', 'lon', 'date'], ignore_case=True, method='levenshtein', # method='bilenko', threshold=0.40) missing = list(set(spotcrime_master['cdid'].to_list()) - set(matches['cdid'].to_list())) payload["Citizens 2 Incidents Missing Reports"] = len(missing) # HOMELESS_STATISTICS = {} # DRUG_STATISTICS = {} # # victim_addresses = government_master['comphaddress'].to_list() # drugs = government_master['objattack'].to_list() # dates_of_occurence = government_master['date1'].to_list() # # for i in range(0, len(victim_addresses)):
for contexts_item in all_papers_dict['contexts']:
    # print("contexts_item['text'] : ")
    # print(contexts_item['text'])
    # if papers_answers_row['answer'] in contexts_item['text']:
    #     print(contexts_item['text'])

    # record each context's text plus a "first:last" sentence-id range
    sentences_df.loc[len(sentences_df)] = [
        contexts_item['text'],
        contexts_item['sentences'][0]['sentence_id'] + ':' +
        contexts_item['sentences'][-1]['sentence_id']
    ]
# print("sentences_df : ")
# print(sentences_df)

# fuzzy match answers back to sentence text to get the closest match
matched_results = fuzzy_pandas.fuzzy_merge(papers_answers_row_df,
                                           sentences_df,
                                           left_on=['answer'],
                                           right_on=['text'],
                                           keep='all',  # papers_answers_row_df.columns.to_list(),
                                           # keep_left='all',  # papers_answers_row_df.columns.to_list(),
                                           # keep_right='all',  # sentences_df.columns.to_list(),
                                           method='levenshtein',
                                           # left_id_col='rank',
                                           # right_id_col='sentence_ids',
                                           ignore_case=True)
# print("matched_results : ")
# print(matched_results)
# matched_results.to_csv('C:/Dev/EPIC-QA/test.csv')

if len(matched_results.index) > 0:
    papers_answers_sentences_list.append(matched_results)
# print('len(papers_answers_sentences_list): ' + str(len(papers_answers_sentences_list)))

print('Writing output files ...')
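# --- Follow-up sketch (illustrative, not from the original source) ---
# One plausible shape for the "Writing output files ..." step: stack the
# per-answer match frames and save them. The output filename is hypothetical;
# assumes pandas (pd) is imported.
if papers_answers_sentences_list:
    all_matches = pd.concat(papers_answers_sentences_list, ignore_index=True)
    all_matches.to_csv('papers_answers_sentences.csv', index=False)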
str) + '_' + partnership['prec_word1']

#############
############# MERGING
#############

# LEVENSHTEIN - STRING DISTANCE METRIC
out = partnership.merge(elec_16,
                        left_on='shp_loc_prec',
                        right_on='elec_loc_prec',
                        how='outer')
results = fpd.fuzzy_merge(partnership, elec_16,
                          left_on='shp_loc_prec',
                          right_on='elec_loc_prec',
                          ignore_case=True,
                          keep='match',
                          method='levenshtein',
                          threshold=0.85)
print("Found", results.shape)
results.head(5)
len(results)
len(elec_16)
frac_matched = len(results) / len(elec_16)
print("fraction matched =", frac_matched * 100)

# bilenko - prompts for matches
results_bilenko = fpd.fuzzy_merge(partnership, elec_16,
                                  left_on='shp_loc_prec',
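# --- Completion sketch (illustrative, not from the original source) ---
# A plausible complete form of the bilenko call cut off above, mirroring the
# Levenshtein call; the original may have used different arguments. Per the
# comment above, the 'bilenko' method prompts interactively for candidate
# matches rather than taking a threshold.
results_bilenko = fpd.fuzzy_merge(partnership, elec_16,
                                  left_on='shp_loc_prec',
                                  right_on='elec_loc_prec',
                                  ignore_case=True,
                                  keep='match',
                                  method='bilenko')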
P['Score'] = P['Score'].replace(np.nan, 0)
# P = P[P['Round'] <= 2]
P['id'] = "R:" + P['Round'].astype(str) + "_P:" + P['Name']
P['pl_pos_id'] = P['id'] + '_' + P['Position']

# %%
# Set score to zero for injured players
url = "https://raw.githubusercontent.com/conradbez/afl_injuries/main/injuries.csv"
s = requests.get(url).content
Injuries = pd.read_csv(io.StringIO(s.decode('utf-8')), index_col=0)
P = fpd.fuzzy_merge(P, Injuries,
                    left_on='Name',
                    right_on='PLAYER',
                    join='left-outer',
                    method='levenshtein')
P.loc[(P['DATE_BACK'] > P['Date'])
      & (~P['DATE_BACK'].isna())
      & (~P['Date'].isna()), 'Score'] = 0
P = P.drop(Injuries.columns, axis=1)

# %%
player_constraints = {}
prob = LpProblem("aflProblem", LpMaximize)
player_constraints = LpVariable.dicts("Player Constraints",
                                      P['pl_pos_id'].unique(), 0, 1,
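# --- Completion sketch (illustrative, not from the original source) ---
# A typical way a truncated LpVariable.dicts call like the one above is
# finished: binary 0/1 selection variables plus a score-maximising objective.
# The cat='Integer' argument and the objective below are assumptions, not the
# original author's code; assumes lpSum is imported from pulp alongside
# LpProblem.
choices = LpVariable.dicts("Player Constraints",
                           P['pl_pos_id'].unique(), 0, 1, cat='Integer')
scores = P.groupby('pl_pos_id')['Score'].max().to_dict()
prob += lpSum(choices[i] * scores[i] for i in choices)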