def fuzzy_match_songs_dataset(): try: conn = database_conn() year = 1922 while (year < 2019): df1 = pd.read_sql( 'SELECT artist, song, year FROM songs_dataset WHERE year = {year} ORDER BY artist' .format(year=year), con=conn) df2 = pd.read_sql( 'SELECT artist, song, year FROM lyrics_kaggle WHERE year = {year} AND genre is not null AND genre <> "Not Available" ORDER BY artist' .format(year=year), con=conn) # Columns to match on from df_left DF1 SONGS_DATASET left_on = ["artist", "song", "year"] # Columns to match on from df_right DF2 LYRICS_KAGGLE right_on = ["artist", "song", "year"] #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on) start_time = time.time() try: df = fuzzymatcher.link_table(df1, df2, left_on, right_on) print("--- %s seconds ---" % (time.time() - start_time)) mtch = df.loc[df['match_rank'] == 1] mtch.to_sql(con=sqlalchemy_engine(), name='fuzzy_match_SD_LK', if_exists='append') print("Year: {yr} - DB [OK]".format(yr=year)) except Exception as e: print("Problem with year [{yr}] - [{e}]".format(yr=year, e=str(e))) year = year + 1 except Exception as e: print("Exception occurred \n" + str(e))
def test_data_100(self): dg = DataGetterCartesian() m = Matcher(data_getter = dg) df_left = pd.read_csv("tests/data/left_3.csv") df_right = pd.read_csv("tests/data/right_3.csv") on = ["first_name", "surname", "dob", "city"] m.add_data(df_left, df_right, on, on) start = timer() m.match_all() lt = m.get_formatted_link_table() end = timer() time_taken = end - start cartesian_perc = link_table_percentage_correct(lt) lt2 = link_table(df_left, df_right, on, on) sqlite_perc = link_table_percentage_correct(lt2) this_record = {} this_record["datetime"] = datetime.datetime.now().isoformat() this_record["commit_hash"] = get_commit_hash() this_record["datagetter_cartesian"] = cartesian_perc this_record["datagetter_sqlite"] = sqlite_perc this_record["test_type"] = "left_3" this_record["time_taken"] = time_taken with open("tests/datagetter_performance.txt", "a") as myfile: myfile.writelines(json.dumps(this_record) + "\n")
def fuzzy_match_attempt_two(): try: conn = database_conn() year = 1957 while (year < 2019): df1 = pd.read_sql( 'SELECT id, artist_kaggle, song_kaggle, year_kaggle FROM song_artist_universe WHERE id_lyrics_kaggle IS NULL and year_kaggle = {year} ORDER BY artist_kaggle' .format(year=year), con=conn) df2 = pd.read_sql( 'SELECT id, artist, song, year FROM lyrics_kaggle WHERE year = {year} ORDER BY artist' .format(year=year), con=conn) # Columns to match on from df_left left_on = ["artist_kaggle", "song_kaggle", "year_kaggle"] # Columns to match on from df_right right_on = ["artist", "song", "year"] #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on) start_time = time.time() try: df = fuzzymatcher.link_table(df1, df2, left_on, right_on) print("--- %s seconds ---" % (time.time() - start_time)) mtch = df.loc[df['match_score'] >= 0.38] mtch.to_sql(con=sqlalchemy_engine(), name='fuzzy_matches', if_exists='append') print("Year: {yr} - DB [OK]".format(yr=year)) except Exception as e: print("Problem with year [{yr}] - [{e}]".format(yr=year, e=str(e))) year = year + 1 except Exception as e: print("Exception occurred \n" + str(e))
def test_nulls_no_errors(self): """ """ df_left = pd.read_csv("tests/data/left_token_escape.csv") df_right = pd.read_csv("tests/data/right_token_escape.csv") # Columns to match on from df_left left_on = ["fname", "mname", "lname"] # Columns to match on from df_right right_on = ["name", "middlename", "surname"] on = [ "first_name", "surname", ] flj = link_table(df_left, df_right, left_on, right_on, left_id_col="id", right_id_col="id")
def test_some_colnames_match(self): """ Adding two numbers should give the correct answer """ left = pd.read_csv("tests/data/left_1.csv") left = left.rename(columns = {"fname": "name"}) right = pd.read_csv("tests/data/right_1.csv") left_on = ["name", "mname", "lname", "dob"] right_on = ["name", "middlename", "surname", "date"] df = link_table(left, right, left_on, right_on) expected_columns = ['__id_left', '__id_right', 'match_score', 'match_rank', 'name_left', 'name_right', 'mname', 'middlename', 'lname', 'surname', 'dob', 'date'] actual_columns = list(df.columns) self.assertEqual(expected_columns, actual_columns)
def test_all_colnames_match_with_id(self): """ Adding two numbers should give the correct answer """ left = pd.read_csv("tests/data/left_2.csv") right = pd.read_csv("tests/data/right_2.csv") left_on = ["fname", "mname", "lname", "dob"] right_on = ["fname", "mname", "lname", "dob"] df = link_table(left, right, left_on, right_on, left_id_col="id", right_id_col="id") expected_columns = ['id_left', 'id_right', 'match_score', 'match_rank', 'fname_left', 'fname_right', 'mname_left', 'mname_right', 'lname_left', 'lname_right', 'dob_left', 'dob_right'] actual_columns = list(df.columns) self.assertEqual(expected_columns, actual_columns)
def dataset(self): return fm.link_table(self.df1, self.df2, self.arr1, self.arr2, left_id_col='id', right_id_col='id')
def fuzzy_match_result(entry, neighbours, __COUNT_ERROR, __NAMES_ERROR, tolerance=3): """ Fuzzy match on final 5 neighbours using Levenshtein distance to get exact match. Input: -------- entry: Pandas series entry of which merchant from Liven we trying to match. neighbours: Entries of neighbours we are trying to fuzzy match on. Output: -------- result: Entry array of joined entry and closest neighbour. __COUNT_ERROR: Track of error. __NAMES_ERROR: Names of merchants. """ try: # Turn series/array into dataframe to make it easier to join. temp_df = pd.DataFrame( np.array(entry).reshape(1, 6), columns=["name", "Suburb", "Lat", "Long", "City", "Id"]) # Fuzzy matching join. matches_df = link_table( temp_df, neighbours, ["name", "Suburb", "City", "Id"], [ "name", "rating", "num_ratings", "cuisines", "price_two", "contact", "url", "branches url", "num branches", "facility", "timetable" ]) if matches_df.match_score[0] is None: # When score is pretty low, so do not match. __COUNT_ERROR += 1 __NAMES_ERROR.append( (temp_df.name, temp_df.Suburb, "Fuzzy Error - NO match")) return -1, __COUNT_ERROR, __NAMES_ERROR if matches_df.match_score[0] > 0.5: __COUNT_ERROR += 1 __NAMES_ERROR.append( (temp_df.name, temp_df.Suburb, "Fuzzy Error - BAD match")) return -1, __COUNT_ERROR, __NAMES_ERROR # Select columns from join. matches_df = matches_df[[ "name_left", "Suburb", "Id", "name_right", "url" ]] # Rename columns. matches_df.columns = ["Name", "Suburb", "Id", "Zomato Name", "url"] if len(matches_df) > tolerance: raise Exception("Issue with matching.") return matches_df, __COUNT_ERROR, __NAMES_ERROR except: return -1, __COUNT_ERROR, __NAMES_ERROR
def test_nulls_no_errors(self): """ Adding two numbers should give the correct answer """ df_left = pd.read_csv("tests/data/left_5_nas.csv") df_right = pd.read_csv("tests/data/right_5_nas.csv") on = ["first_name", "surname", "dob", "city"] flj = link_table(df_left, df_right, on, on)
def fuzzy_match_attempt_one(): try: conn = database_conn() df1 = pd.read_sql( 'SELECT Clave, id FROM artist_song_billboard WHERE id_lyrics IS NULL ORDER BY Clave', con=conn) df2 = pd.read_sql( 'SELECT A.Clave, A.index id_letra FROM lyrics_kaggle A ORDER BY A.Clave', con=conn) # Columns to match on from df_left left_on = ["Clave"] # Columns to match on from df_right right_on = ["Clave"] #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on) df3 = fuzzymatcher.link_table(df1, df2, left_on, right_on) df3.head(10) except Exception as e: print("Exception occurred \n" + str(e))