Ejemplo n.º 1
0
def fuzzy_match_songs_dataset():
    try:
        conn = database_conn()
        year = 1922
        while (year < 2019):
            df1 = pd.read_sql(
                'SELECT artist, song, year FROM songs_dataset WHERE year = {year} ORDER BY artist'
                .format(year=year),
                con=conn)
            df2 = pd.read_sql(
                'SELECT artist, song, year FROM lyrics_kaggle WHERE year = {year} AND  genre is not null AND genre <> "Not Available"  ORDER BY artist'
                .format(year=year),
                con=conn)
            # Columns to match on from df_left DF1 SONGS_DATASET
            left_on = ["artist", "song", "year"]
            # Columns to match on from df_right DF2 LYRICS_KAGGLE
            right_on = ["artist", "song", "year"]
            #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on)
            start_time = time.time()
            try:
                df = fuzzymatcher.link_table(df1, df2, left_on, right_on)
                print("--- %s seconds ---" % (time.time() - start_time))
                mtch = df.loc[df['match_rank'] == 1]
                mtch.to_sql(con=sqlalchemy_engine(),
                            name='fuzzy_match_SD_LK',
                            if_exists='append')
                print("Year: {yr} - DB [OK]".format(yr=year))
            except Exception as e:
                print("Problem with year [{yr}] - [{e}]".format(yr=year,
                                                                e=str(e)))
            year = year + 1
    except Exception as e:
        print("Exception occurred \n" + str(e))
Ejemplo n.º 2
0
    def test_data_100(self):
        dg = DataGetterCartesian()
        m = Matcher(data_getter = dg)

        df_left = pd.read_csv("tests/data/left_3.csv")
        df_right = pd.read_csv("tests/data/right_3.csv")

        on = ["first_name", "surname", "dob", "city"]

        m.add_data(df_left, df_right, on, on)

        start = timer()
        m.match_all()
        lt = m.get_formatted_link_table()
        end = timer()
        time_taken = end - start

        cartesian_perc = link_table_percentage_correct(lt)

        lt2 = link_table(df_left, df_right, on, on)
        sqlite_perc = link_table_percentage_correct(lt2)

        this_record = {}
        this_record["datetime"] = datetime.datetime.now().isoformat()
        this_record["commit_hash"] = get_commit_hash()
        this_record["datagetter_cartesian"] = cartesian_perc
        this_record["datagetter_sqlite"] = sqlite_perc
        this_record["test_type"] = "left_3"
        this_record["time_taken"] = time_taken

        with open("tests/datagetter_performance.txt", "a") as myfile:
            myfile.writelines(json.dumps(this_record) + "\n")
Ejemplo n.º 3
0
def fuzzy_match_attempt_two():
    try:
        conn = database_conn()
        year = 1957
        while (year < 2019):
            df1 = pd.read_sql(
                'SELECT id, artist_kaggle, song_kaggle, year_kaggle FROM song_artist_universe WHERE id_lyrics_kaggle IS NULL and year_kaggle = {year} ORDER BY artist_kaggle'
                .format(year=year),
                con=conn)
            df2 = pd.read_sql(
                'SELECT id, artist, song, year FROM lyrics_kaggle WHERE year = {year} ORDER BY artist'
                .format(year=year),
                con=conn)
            # Columns to match on from df_left
            left_on = ["artist_kaggle", "song_kaggle", "year_kaggle"]
            # Columns to match on from df_right
            right_on = ["artist", "song", "year"]
            #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on)
            start_time = time.time()
            try:
                df = fuzzymatcher.link_table(df1, df2, left_on, right_on)
                print("--- %s seconds ---" % (time.time() - start_time))
                mtch = df.loc[df['match_score'] >= 0.38]
                mtch.to_sql(con=sqlalchemy_engine(),
                            name='fuzzy_matches',
                            if_exists='append')
                print("Year: {yr} - DB [OK]".format(yr=year))
            except Exception as e:
                print("Problem with year [{yr}] - [{e}]".format(yr=year,
                                                                e=str(e)))
            year = year + 1
    except Exception as e:
        print("Exception occurred \n" + str(e))
Ejemplo n.º 4
0
    def test_nulls_no_errors(self):
        """

        """

        df_left = pd.read_csv("tests/data/left_token_escape.csv")
        df_right = pd.read_csv("tests/data/right_token_escape.csv")

        # Columns to match on from df_left
        left_on = ["fname", "mname", "lname"]

        # Columns to match on from df_right
        right_on = ["name", "middlename", "surname"]

        on = [
            "first_name",
            "surname",
        ]

        flj = link_table(df_left,
                         df_right,
                         left_on,
                         right_on,
                         left_id_col="id",
                         right_id_col="id")
Ejemplo n.º 5
0
    def test_some_colnames_match(self):
        """
        Adding two numbers should give the correct answer
        """
        left = pd.read_csv("tests/data/left_1.csv")
        left = left.rename(columns = {"fname": "name"})
        right = pd.read_csv("tests/data/right_1.csv")
        left_on = ["name", "mname", "lname",  "dob"]
        right_on = ["name", "middlename", "surname", "date"]

        df = link_table(left, right, left_on, right_on)

        expected_columns = ['__id_left',
                            '__id_right',
                            'match_score',
                            'match_rank',
                            'name_left',
                            'name_right',
                            'mname',
                            'middlename',
                            'lname',
                            'surname',
                            'dob',
                            'date']

        actual_columns = list(df.columns)
        self.assertEqual(expected_columns, actual_columns)
Ejemplo n.º 6
0
    def test_all_colnames_match_with_id(self):
        """
        Adding two numbers should give the correct answer
        """
        left = pd.read_csv("tests/data/left_2.csv")
        right = pd.read_csv("tests/data/right_2.csv")
        left_on = ["fname", "mname", "lname",  "dob"]
        right_on = ["fname", "mname", "lname",  "dob"]

        df = link_table(left, right, left_on, right_on, left_id_col="id", right_id_col="id")

        expected_columns = ['id_left',
                    'id_right',
                    'match_score',
                    'match_rank',
                    'fname_left',
                    'fname_right',
                    'mname_left',
                    'mname_right',
                    'lname_left',
                    'lname_right',
                    'dob_left',
                    'dob_right']

        actual_columns = list(df.columns)
        self.assertEqual(expected_columns, actual_columns)
Ejemplo n.º 7
0
 def dataset(self):
     return fm.link_table(self.df1,
                          self.df2,
                          self.arr1,
                          self.arr2,
                          left_id_col='id',
                          right_id_col='id')
Ejemplo n.º 8
0
def fuzzy_match_result(entry,
                       neighbours,
                       __COUNT_ERROR,
                       __NAMES_ERROR,
                       tolerance=3):
    """
    Fuzzy match on final 5 neighbours using Levenshtein distance to get exact match.
    
    Input:
    --------
    entry: Pandas series entry of which merchant from Liven we trying to match.
    neighbours: Entries of neighbours we are trying to fuzzy match on.
    
    Output:
    --------
    result: Entry array of joined entry and closest neighbour.
    __COUNT_ERROR: Track of error.
    __NAMES_ERROR: Names of merchants.
    """
    try:
        # Turn series/array into dataframe to make it easier to join.
        temp_df = pd.DataFrame(
            np.array(entry).reshape(1, 6),
            columns=["name", "Suburb", "Lat", "Long", "City", "Id"])
        # Fuzzy matching join.

        matches_df = link_table(
            temp_df, neighbours, ["name", "Suburb", "City", "Id"], [
                "name", "rating", "num_ratings", "cuisines", "price_two",
                "contact", "url", "branches url", "num branches", "facility",
                "timetable"
            ])

        if matches_df.match_score[0] is None:
            # When score is pretty low, so do not match.
            __COUNT_ERROR += 1
            __NAMES_ERROR.append(
                (temp_df.name, temp_df.Suburb, "Fuzzy Error - NO match"))
            return -1, __COUNT_ERROR, __NAMES_ERROR

        if matches_df.match_score[0] > 0.5:
            __COUNT_ERROR += 1
            __NAMES_ERROR.append(
                (temp_df.name, temp_df.Suburb, "Fuzzy Error - BAD match"))
            return -1, __COUNT_ERROR, __NAMES_ERROR

        # Select columns from join.
        matches_df = matches_df[[
            "name_left", "Suburb", "Id", "name_right", "url"
        ]]
        # Rename columns.

        matches_df.columns = ["Name", "Suburb", "Id", "Zomato Name", "url"]
        if len(matches_df) > tolerance:
            raise Exception("Issue with matching.")
        return matches_df, __COUNT_ERROR, __NAMES_ERROR

    except:
        return -1, __COUNT_ERROR, __NAMES_ERROR
Ejemplo n.º 9
0
    def test_nulls_no_errors(self):
        """
        Adding two numbers should give the correct answer
        """
        df_left = pd.read_csv("tests/data/left_5_nas.csv")
        df_right = pd.read_csv("tests/data/right_5_nas.csv")

        on = ["first_name", "surname", "dob", "city"]

        flj = link_table(df_left, df_right, on, on)
Ejemplo n.º 10
0
def fuzzy_match_attempt_one():
    try:
        conn = database_conn()
        df1 = pd.read_sql(
            'SELECT Clave, id FROM artist_song_billboard WHERE id_lyrics IS NULL ORDER BY Clave',
            con=conn)
        df2 = pd.read_sql(
            'SELECT A.Clave, A.index id_letra FROM lyrics_kaggle A ORDER BY A.Clave',
            con=conn)
        # Columns to match on from df_left
        left_on = ["Clave"]
        # Columns to match on from df_right
        right_on = ["Clave"]
        #df3 = fuzzymatcher.fuzzy_left_join(df1,df2,left_on,right_on)
        df3 = fuzzymatcher.link_table(df1, df2, left_on, right_on)
        df3.head(10)
    except Exception as e:
        print("Exception occurred \n" + str(e))