def compare_titles_for_artist(self, artist, title_to_compare): ''' Compares the string similarity of any song associated to an artist and an unknown title for this artist. The goal here is to be able to match different spellings of the same song. If the similarity score is above the threshold set, it returns the track instance of the matching artist song we already know. Otherwise it returns 'No match'. ''' for artist_track in self.artist_tracks_titles[artist]: title_similarity_for_artist = Utility.compute_similarity_score( title_to_compare, artist_track) # value observed to bring consistently a match between similar songs if title_similarity_for_artist > 0.625: #we fetch the track instance associated with the close match title_artist = Utility.concat_title_artist( artist_track, artist) track_instance = self.track_instance_dict[title_artist] return track_instance return 'No match'
def test_concat_title_artist(self): title = 'Title' artist = 'Artist ' result = Utility.concat_title_artist(title, artist) self.assertEqual(result, 'Title && Artist')
def process_likes_dislikes_df(self, likes_dislikes_df): ''' This function goes through each row of the likes_dislikes dataframe, updating track instances as they appear. This dataframe contains a small proportion of all the tracks ever listened to, and/or in the library. As a result, we only want to update existing tracks, and not create new ones. The logic works as follows, knowing that we do this for each row of the dataframe: - we loop through all the track instances we created so far, and see if any of their identifier matches the id of the row we are looking at - if we find a match, we update the track with the rating, appearance, and if we didn't already have the associated title, we add it to the list of titles of that track - else: - if the track is in the dictionary of track instances, we update the existing track's rating and appearance - otherwise, we have two options: - either we know the artist and we can find a similar title in the artist dict, and in this case we update the existing track - or we do not know this artist, or we do not find a close match of title for this artist and in this case we add it to the tracks we could not match and we ignored ''' for index, row in likes_dislikes_df.iterrows(): #we want to look only at rows where the name of the song is available if str(row['Title']) != 'nan': title = row['Title'] if str(row['Artist']) != 'nan': artist = row['Artist'] else: artist = 'No Artist' else: self.items_not_matched['likes_dislikes'].append(index) continue title_artist = Utility.concat_title_artist(title, artist) # first we check using the Item Reference as an id found_match = False for title_name in self.track_instance_dict.keys(): track_instance = self.track_instance_dict[title_name] if row['Item Reference'] in track_instance.apple_music_id: track_instance.add_appearance({ 'source': 'likes_dislikes', 'df_index': index }) track_instance.set_rating(row['Preference']) if not track_instance.has_title_name(row['Title']): track_instance.add_title(row['Title']) self.track_instance_dict[title_artist] = track_instance if row['Title'] not in self.artist_tracks_titles[ artist]: self.artist_tracks_titles[artist].append(title) found_match = True break if found_match is False: #we check if we already saw this track (using title and artist names) if title_artist in self.track_instance_dict.keys(): track_instance = self.track_instance_dict[title_artist] track_instance.add_appearance({ 'source': 'likes_dislikes', 'df_index': index }) track_instance.set_rating(row['Preference']) else: # if we had no match with title and artist, we look for similarity in the title for the artist if artist in self.artist_tracks_titles.keys(): titles_comparison_result = self.compare_titles_for_artist( artist, title) if titles_comparison_result == 'No match': #we add the item to the items_not_matched self.items_not_matched['likes_dislikes'].append( index) else: track_instance = titles_comparison_result if not track_instance.has_title_name(title): track_instance.add_title(title) track_instance.add_appearance({ 'source': 'likes_dislikes', 'df_index': index }) track_instance.set_rating(row['Preference']) self.track_instance_dict[ title_artist] = track_instance self.artist_tracks_titles[artist].append(title) else: #we add the item to the items_not_matched, #we choose not to add it to the Track instances as the amount of information is little #and our reference really is the play activity! self.items_not_matched['likes_dislikes'].append(index) continue
def process_play_df(self, play_activity_df): ''' This function goes through each row of the play activity dataframe, creating and updating track instances as they appear. As this is the dataframe we are able to get the most information from, we want to create new instances whenever we are not facing unknown songs (NaN as a title).The approach is very similar to the one used for the library tracks. The logic works as follows, knowing that we do this for each row of the dataframe: - if the track is in the dictionary of track instances, we update the existing track using update_track_from_play_activity - else, we have two options : - either we know this artist and we can find a similar title in the artist dict, and in this case we update the existing track using update_track_from_play_activity - or we do not know this artist, or we do not find a close match of title for this artist and in this case we create a new track instance using instantiate_track and then update_track_from_play_activity ''' for index, row in play_activity_df.iterrows(): #we want to look only at rows where the name of the song is available if str(row['Title']) != 'nan': title = row['Title'] if str(row['Artist']) != 'nan': artist = row['Artist'] else: artist = 'No Artist' else: self.items_not_matched['play_activity'].append(index) continue #we check if we already saw this track (using title and artist names) title_artist = Utility.concat_title_artist(title, artist) if title_artist in self.track_instance_dict.keys(): track_instance = self.track_instance_dict[title_artist] self.update_track_instance('play_activity_df', track_instance, index, row) else: # if we had no match with title and artist, we look for similarity in the title for the artist if artist in self.artist_tracks_titles.keys(): titles_comparison_result = self.compare_titles_for_artist( artist, title) if titles_comparison_result == 'No match': #we instantiate the Track object track_instance = Track(self.increment) track_instance.instantiate_track(title, artist) self.update_track_instance('play_activity_df', track_instance, index, row) #we update the dictionary that keeps track of our instances, titles of artists, and increment self.track_instance_dict[title_artist] = track_instance self.artist_tracks_titles[artist].append(title) self.increment += 1 else: track_instance = titles_comparison_result if not track_instance.has_title_name(title): track_instance.add_title(title) track_instance.add_appearance({ 'source': 'play_activity', 'df_index': index }) #we also track the match in the track_instances and artist dicts self.track_instance_dict[title_artist] = track_instance self.artist_tracks_titles[artist].append(title) # else we know we never saw this track because the artist is unknown else: #we update the artist/track names dictionnary self.artist_tracks_titles[artist] = [] self.artist_tracks_titles[artist].append(title) #we instantiate the Track object track_instance = Track(self.increment) track_instance.instantiate_track(title, artist) self.update_track_instance('play_activity_df', track_instance, index, row) #we update the dictionary that keeps track of our instances, and increment self.track_instance_dict[title_artist] = track_instance self.increment += 1
def process_library_tracks_df(self, library_df): ''' This function goes through each row of the library tracks dataframe, creating and updating track instances as they appear. As this is the first dataframe we go through, we want to create new instances whenever we are not facing unknown songs (NaN as a title) The logic works as follows, knowing that we do this for each row of the dataframe: - we look only at rows with a title different than NaN, and we set the artist to 'No Artist' if the artist is also Nan - if the track is not in the dictionary of track instances, it means that we never saw the combination title/artist of this row. So two options here: - either we know this artist and we can find a similar title in the artist dict, and in this case we update the existing track using update_track_from_library - or we do not know this artist, or we do not find a close match of title for this artist and in this case we create a new track instance using instantiate_track and then update_track_from_library - else, we update the existing track using update_track_from_library ''' for index, row in library_df.iterrows(): if str(row['Title']) != 'nan': title = row['Title'] if str(row['Artist']) != 'nan': artist = row['Artist'] else: artist = 'No Artist' title_artist = Utility.concat_title_artist(title, artist) if title_artist not in self.track_instance_dict.keys(): if artist in self.artist_tracks_titles.keys(): titles_comparison_result = self.compare_titles_for_artist( artist, title) if titles_comparison_result == 'No match': #we instantiate the Track object track_instance = Track(self.increment) track_instance.instantiate_track(title, artist) self.update_track_instance('library_tracks_df', track_instance, index, row) self.track_instance_dict[ title_artist] = track_instance self.increment += 1 else: track_instance = titles_comparison_result if not track_instance.has_title_name(title): track_instance.add_title(title) self.update_track_instance('library_tracks_df', track_instance, index, row) self.track_instance_dict[ title_artist] = track_instance self.artist_tracks_titles[artist].append(title) else: #there was no close match, and the song was never seen, so we instantiate a new Track track_instance = Track(self.increment) track_instance.instantiate_track(title, artist) self.update_track_instance('library_tracks_df', track_instance, index, row) self.track_instance_dict[title_artist] = track_instance self.increment += 1 else: track_instance = self.track_instance_dict[title_artist] self.update_track_instance('library_tracks_df', track_instance, index, row) #we update the artist/track names dictionnary if artist not in self.artist_tracks_titles: self.artist_tracks_titles[artist] = [] if title not in self.artist_tracks_titles[artist]: self.artist_tracks_titles[artist].append(title) else: self.items_not_matched['library_tracks'].append(index)