Esempio n. 1
0
 def fuzzy_match(self, left_df: pd.DataFrame, right_df: pd.DataFrame):
     """Fuzzy-merge ``left_df`` with ``right_df`` and return the combined rows.

     One ``fuzzy_merge`` pass is run per method in the method list (currently
     only ``"jaro"``), joining on ``self.left_on`` / ``self.right_on`` with
     ``self._threshold``, ignoring case and non-alphabetic characters.

     Every merged frame gets a ``similarity_index`` column. Note that it is
     filled with the configured threshold, not with a per-row score.

     Args:
         left_df: Left-hand frame passed to ``fuzzy_merge``.
         right_df: Right-hand frame passed to ``fuzzy_merge``.

     Returns:
         The ``pd.concat`` of the per-method merge results.
     """
     merged_frames = []
     for method in ["jaro"]:
         merged = fuzzy_merge(
             left_df,
             right_df,
             left_on=self.left_on,
             right_on=self.right_on,
             threshold=self._threshold,
             ignore_case=True,
             ignore_nonalpha=True,
             method=method,
         )
         merged_frames.append(merged.assign(similarity_index=self._threshold))
     return pd.concat(merged_frames)
Esempio n. 2
0
def run_stuff(df_input, df_names):
    """
    Fuzzy-match usernames against a table of traditionally gendered names.

    The "author_login" column of ``df_input`` is compared with the "name"
    column of ``df_names`` using the Levenshtein method. The 0.72 threshold
    was chosen empirically by the original author as the most accurate value
    and can be varied as needed.

    Args:
        df_input: A DataFrame with unknown usernames to check (column "author_login").
        df_names: A DataFrame with traditionally gendered names, i.e. only names
            traditionally associated with men or women (column "name").

    Returns:
        The DataFrame produced by ``fpd.fuzzy_merge`` with ``keep="match"``.
    """
    merge_options = {
        "left_on": "author_login",
        "right_on": "name",
        "method": "levenshtein",
        "threshold": 0.72,
        "keep": "match",
    }
    return fpd.fuzzy_merge(df_input, df_names, **merge_options)
Esempio n. 3
0
import pandas as pd
import fuzzy_pandas as fpd
import geopandas as gpd
import re

# Load the two cleaned precinct tables (shapefile side and election side).
shp_csv_path = "/Users/hopecj/projects/gerryspam/NJ/dat/cleanprec_shp.csv"
elec_csv_path = "/Users/hopecj/projects/gerryspam/NJ/dat/cleanprec_elec.csv"
clean_shp = pd.read_csv(shp_csv_path)
clean_elec = pd.read_csv(elec_csv_path)

# Exact, case-insensitive full outer join on the location/precinct code
# columns (per the original note: digits plus the first precinct word).
results_exact = fpd.fuzzy_merge(
    clean_shp,
    clean_elec,
    method='exact',
    join='full-outer',
    keep='all',
    ignore_case=True,
    left_on='shp_loc_prec_code',
    right_on='elec_loc_prec_code',
)

print("Found", results_exact.shape)
results_exact.head(5)

# Columns written to the merge-review CSV, in output order.
# NOTE(review): 'precinct' is listed twice, as in the original selection;
# confirm whether the duplicate is intended.
output_columns = [
    'year', 'STATEFP', 'COUNTYFP', 'NAMELSAD',
    'G16DPRS', 'G16RPRS', 'G16DHOR', 'G16RHOR',
    'precinct', 'precinct',
    'shp_loc_prec_code', 'elec_loc_prec_code',
    'shp_loc_prec', 'elec_loc_prec',
    'countynam',
]
out = results_exact[output_columns]

out.to_csv("/Users/hopecj/projects/gerryspam/NJ/dat/NJ16_merging.csv")
Esempio n. 4
0
    # Fill the report payload with summary counts for each data source.
    payload["Government Incidents Reported"] = len(government_incident_numbers)
    # Row-count difference between the government table and the people table.
    payload["Difference bwt Citizens 1 & Gov"] = government_master.shape[0] - people_master.shape[0]
    payload["Citizens 1 Incidents Missing Reports"] = community_crime_map_missing_incidents
    payload["Government Missing Incident Numbers"] = len(government_missing_incidents)
    payload["Citizens 2 Incidents Reported"] = len(spotcrime_incidents)

    # Cast the date columns to strings before the fuzzy merge below.
    # NOTE(review): 'date1' is converted on government_master here, but the
    # merge keys on 'date' for both frames -- confirm government_master also
    # has a 'date' column, or whether 'date1' was meant in left_on.
    government_master["date1"] = government_master["date1"].astype(str)
    spotcrime_master["date"] = spotcrime_master["date"].astype(str)


    # Fuzzy-match government rows to spotcrime rows on address, coordinates
    # and date (case-insensitive, Levenshtein, threshold 0.40).

    matches = fpd.fuzzy_merge(government_master, spotcrime_master,
                              left_on=['incident_address', 'lat', 'lon', 'date'],
                              right_on=['address', 'lat', 'lon', 'date'],
                              ignore_case=True,
                              method='levenshtein',
                              # method='bilenko',
                              threshold=0.40)

    # 'cdid' values present in spotcrime_master but absent from the merged
    # result; only their count is stored in the payload.
    missing = list(set(spotcrime_master['cdid'].to_list()) - set(matches['cdid'].to_list()))
    payload["Citizens 2 Incidents Missing Reports"] = len(missing)

    # HOMELESS_STATISTICS = {}
    # DRUG_STATISTICS = {}
    #
    # victim_addresses = government_master['comphaddress'].to_list()
    # drugs = government_master['objattack'].to_list()
    # dates_of_occurence = government_master['date1'].to_list()
    #
    # for i in range(0, len(victim_addresses)):
        # Append one row per context to sentences_df: the context text plus a
        # "first_id:last_id" string built from the sentence_id of the
        # context's first and last sentences.
        for contexts_item in all_papers_dict['contexts']:
            # print("contexts_item['text'] : " )
            # print( contexts_item['text'])
            # if papers_answers_row['answer'] in contexts_item['text']:
            #     print(contexts_item['text'])
            sentences_df.loc[len(sentences_df)] = [ contexts_item['text'], contexts_item['sentences'][0]['sentence_id'] + ':' + contexts_item['sentences'][len(contexts_item['sentences'])-1]['sentence_id'] ]
    # print("sentences_df : " )
    # print(sentences_df)

    # Fuzzy-match each 'answer' against the context 'text' column
    # (case-insensitive, Levenshtein), keeping all columns from both frames.
    matched_results = fuzzy_pandas.fuzzy_merge(papers_answers_row_df,
                                            sentences_df,
                                            left_on = ['answer'],
                                            right_on = ['text'],
                                            keep = 'all', # papers_answers_row_df.columns.to_list(),
                                            # keep_left = 'all', # papers_answers_row_df.columns.to_list(),
                                            # keep_right = 'all', # sentences_df.columns.to_list(),
                                            method = 'levenshtein',
                                            # left_id_col='rank',
                                            # right_id_col='sentence_ids',
                                            ignore_case=True)


    # print("matched_results : " )
    # print(matched_results)
    # matched_results.to_csv('C:/Dev/EPIC-QA/test.csv' )
    # Collect only non-empty match frames.
    if len(matched_results.index) > 0:
        papers_answers_sentences_list.append(matched_results)
    # print ('len(papers_answers_sentences_list): ' + str(len(papers_answers_sentences_list)))

# Progress message printed at module level, after the indented block above.
print ('Writing output files ...')
Esempio n. 6
0
    str) + '_' + partnership['prec_word1']

#############
############# MERGING
#############

# Both joins below pair the shapefile precinct label with the election one.
join_keys = {'left_on': 'shp_loc_prec', 'right_on': 'elec_loc_prec'}

# Exact-key outer join via pandas, kept in `out`.
out = partnership.merge(elec_16, how='outer', **join_keys)

# Levenshtein string-distance match: case-insensitive, threshold 0.85,
# keep='match'.
results = fpd.fuzzy_merge(
    partnership,
    elec_16,
    method='levenshtein',
    threshold=0.85,
    ignore_case=True,
    keep='match',
    **join_keys,
)

print("Found", results.shape)
results.head(5)
len(results)
len(elec_16)
# Ratio of fuzzy-merge result rows to election rows, printed as a percentage.
frac_mached = len(results) / len(elec_16)
print("fraction matched =", frac_mached * 100)

# bilenko - prompts for matches
results_bilenko = fpd.fuzzy_merge(partnership,
                                  elec_16,
                                  left_on='shp_loc_prec',
Esempio n. 7
0
# Treat missing scores as zero.
P['Score'] = P['Score'].replace(np.nan, 0)
# P = P[P['Round']<=2]
# Build a per-round player id, then a per-round, per-position id from it.
P['id'] = "R:" + P['Round'].astype(str) + "_P:" + P['Name']
P['pl_pos_id'] = (P['id'] + '_' + P['Position'])

# %%
# Set score to zero for injured players

url = "https://raw.githubusercontent.com/conradbez/afl_injuries/main/injuries.csv"
# Bound the request and fail on HTTP errors, so that a hung connection or an
# error page is never parsed as if it were the injuries CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
s = response.content
Injuries = pd.read_csv(io.StringIO(s.decode('utf-8')), index_col=0)

# Left-outer Levenshtein join of P['Name'] against Injuries['PLAYER'], so
# every row of P is kept whether or not an injury record matches.
P = fpd.fuzzy_merge(P,
                    Injuries,
                    left_on='Name',
                    right_on='PLAYER',
                    join='left-outer',
                    method='levenshtein')

# Zero the score where both dates are present and the return date is later
# than the row's date.
still_injured = (
    (P['DATE_BACK'] > P['Date'])
    & P['DATE_BACK'].notna()
    & P['Date'].notna()
)
P.loc[still_injured, 'Score'] = 0
# Drop the columns that came from Injuries again.
P = P.drop(columns=Injuries.columns)

# %%
player_contraints = {}
prob = LpProblem("aflProblem", LpMaximize)

player_contraints = LpVariable.dicts("Player Contraints",
                                     P['pl_pos_id'].unique(),
                                     0,
                                     1,