Example #1
0
    def parse_input_df(source_files):
        '''
            Validates the structure of the source_files input.

            The input must contain exactly five entries (one per expected dataframe)
            and pass Utility.validate_input_df_files. When it does, a dictionary of
            the five dataframes is returned. Otherwise no further parsing/processing
            is possible, so a warning is printed early and an empty dictionary is
            returned.
        '''
        expected_keys = ('likes_dislikes_df', 'play_activity_df',
                         'identifier_infos_df', 'library_tracks_df',
                         'library_activity_df')
        bad_input_message = (
            'Please ensure that the input_df object has the following structure...\n'
            '{  "identifier_infos_df" : identifier_infos_df, "library_tracks_df" : library_tracks_df,\n'
            '"library_activity_df" : library_activity_df, "likes_dislikes_df" : likes_dislikes_df, "play_activity_df" : play_activity_df    }\n'
            '...And that the values in this dictionary are pandas dataframes. Returned object is empty.')

        dataframes = {}
        # short-circuit: the validation helper only runs when all five entries exist
        if len(source_files) == 5 and Utility.validate_input_df_files(source_files):
            for key in expected_keys:
                dataframes[key] = source_files[key]
        else:
            print('WARNING:\n {0}'.format(bad_input_message))
        return dataframes
 def setUpClass(cls):
     '''Build the shared fixtures once: parse the test archive and run the whole processing pipeline.'''
     # layout of the dataframe files inside the test archive
     target_files = {
         'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
         'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
         'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
         'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
         'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
     }
     cls.input_df = Utility.get_df_from_archive('apple_music_analyser/tests/test_df.zip', target_files)
     cls.parser = Parser(cls.input_df)
     # expose each parsed dataframe as a class attribute
     for df_name in ('likes_dislikes_df', 'play_activity_df', 'identifier_infos_df',
                     'library_tracks_df', 'library_activity_df'):
         setattr(cls, df_name, getattr(cls.parser, df_name))
     # process the parsed dataframes
     cls.process = ProcessTracks()
     cls.process.process_library_tracks_df(cls.library_tracks_df)
     cls.process.process_identifier_df(cls.identifier_infos_df)
     cls.process.process_play_df(cls.play_activity_df)
     cls.process.process_likes_dislikes_df(cls.likes_dislikes_df)
     # extract the objects the tests inspect from the process instance
     cls.track_instance_dict = cls.process.track_instance_dict
     cls.artist_tracks_titles = cls.process.artist_tracks_titles
     cls.genres_list = cls.process.genres_list
     cls.items_not_matched = cls.process.items_not_matched
 def test_get_df_from_archive_with_target(self):
     '''
       We test the case where the structure inside the archive is provided as an argument.
       This function relies on external package (ZipFile), well covered by tests.
     '''
     target_files = {
         'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
         'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
         'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
         'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
         'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
     }
     result = Utility.get_df_from_archive('apple_music_analyser/tests/test_df.zip', target_files)
     expected_keys = ['identifier_infos_df', 'library_tracks_df', 'library_activity_df',
                      'likes_dislikes_df', 'play_activity_df']
     self.assertTrue(isinstance(result, dict))
     self.assertEqual(len(result), 5)
     self.assertEqual(list(result.keys()), expected_keys)
     # every value of the returned dict must be a pandas dataframe
     for extracted_df in result.values():
         self.assertTrue(isinstance(extracted_df, pd.DataFrame))
 def test_parse_date_time_column(self):
     '''
         We only check that a dict is returned, as the values come from another function
         tested separately (cf. test_extract_time_info_from_datetime).
     '''
     frame = pd.DataFrame(pd.Series('2020-01-01'), columns=['Timestamp'])
     parsed = Utility.parse_date_time_column(frame, 'Timestamp')
     self.assertEqual(type(parsed), dict)
 def test_get_df_from_archive_bad_archive(self):
     '''
       We test the case where the path is wrong.
       This function relies on external package (ZipFile), well covered by tests.
     '''
     # a None archive path must yield an empty dict rather than raising
     result = Utility.get_df_from_archive(None)
     self.assertEqual(result, {})
 def test_extract_time_info_from_datetime(self):
     datetime_serie = pd.to_datetime(pd.Series('2020-01-01'))
     extracted_series = Utility.extract_time_info_from_datetime(datetime_serie)
     # returned order: year, month, day of month, day of week, hour of day
     expected_values = [2020, 1, 1, 'Wednesday', 0]
     for serie, expected in zip(extracted_series, expected_values):
         self.assertEqual(serie.values[0], expected)
 def setUp(self):
     '''Parse the test archive and build the visualization dataframe under test.'''
     target_files = {
         'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
         'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
         'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
         'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
         'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
     }
     archive_path = 'apple_music_analyser/tests/test_df.zip'
     self.input_df = Utility.get_df_from_archive(archive_path, target_files)
     self.df_visualization = VisualizationDataframe(self.input_df)
Example #8
0
    def parse_library_activity_df(library_activity_df):
        '''
            Parses the library activity dataframe.

            Adds time columns derived from the 'Transaction Date' timestamp (year,
            month, day of the month, ...) plus two agent columns: what performed the
            action ('Transaction Agent') and the device model it ran on
            ('Transaction Agent Model').
        '''
        parsed_df = library_activity_df.copy()

        # derive the time related columns from the transaction timestamp
        datetime_series = Utility.parse_date_time_column(parsed_df, 'Transaction Date')
        Utility.add_time_related_columns(parsed_df, datetime_series, col_name_prefix='Transaction ')

        # the agent is the first '/'-separated token of the UserAgent string;
        # raw agent names are mapped to friendlier device names
        raw_agent = parsed_df['UserAgent'].str.split('/').str.get(0)
        parsed_df['Transaction Agent'] = raw_agent.replace({'itunescloudd': 'iPhone', 'iTunes': 'Macintosh'})

        # model: parsed out of the UserAgent for iPhone rows, constant for Macintosh rows
        iphone_rows = parsed_df['Transaction Agent'] == 'iPhone'
        parsed_df['Transaction Agent Model'] = (
            parsed_df.loc[iphone_rows, 'UserAgent'].str.split('/').str.get(3).str.split(',').str.get(0))
        parsed_df.loc[parsed_df['Transaction Agent'] == 'Macintosh', 'Transaction Agent Model'] = 'Macintosh'

        return parsed_df
 def test_add_time_related_columns(self):
     frame = pd.DataFrame(pd.Series('2020-01-01'), columns=['Timestamp'])
     datetime_series = Utility.parse_date_time_column(frame, 'Timestamp')
     Utility.add_time_related_columns(frame, datetime_series,
                                      col_name_prefix='pref_', col_name_suffix='_suff')
     # one new column per time component, each name wrapped with the prefix/suffix
     expected_output = pd.DataFrame.from_dict({
         'Timestamp': ['2020-01-01'],
         'pref_date time_suff': ['2020-01-01'],
         'pref_Year_suff': [2020],
         'pref_Month_suff': [1],
         'pref_DOM_suff': [1],
         'pref_DOW_suff': ['Wednesday'],
         'pref_HOD_suff': [0]
     })
     self.assertEqual(frame.shape, expected_output.shape)
     self.assertEqual(frame.columns.tolist(), expected_output.columns.tolist())
 def compare_titles_for_artist(self, artist, title_to_compare):
     '''
         Looks among the titles already known for *artist* for one whose spelling is
         close to *title_to_compare*, so that different spellings of the same song
         can be matched.

         Returns the Track instance registered for the first known title whose
         similarity score exceeds the threshold, or the string 'No match' otherwise.
     '''
     # value observed to bring consistently a match between similar songs
     similarity_threshold = 0.625
     for known_title in self.artist_tracks_titles[artist]:
         similarity = Utility.compute_similarity_score(title_to_compare, known_title)
         if similarity > similarity_threshold:
             # fetch the track instance associated with the close match
             matching_key = Utility.concat_title_artist(known_title, artist)
             return self.track_instance_dict[matching_key]
     return 'No match'
 def test_convert_to_local_time(self):
     serie = pd.date_range('2020-01-01', periods=9, freq='H')
     timezone_serie = pd.Series([3600, -7200, 0, 3600000, -7200000, 60, -120, 1, -2])
     converted = Utility.convert_to_local_time(serie, timezone_serie)
     result = [str(timestamp) for timestamp in converted.tolist()]
     expected = [
         '2020-01-01 01:00:00', '2019-12-31 23:00:00', '2020-01-01 02:00:00',
         '2020-01-01 04:00:00', '2020-01-01 02:00:00', '2020-01-01 06:00:00',
         '2020-01-01 04:00:00', '2020-01-01 08:00:00', '2020-01-01 06:00:00'
     ]
     self.assertEqual(result, expected)
 def setUp(self):
     '''Parse the test archive, then prepare a ProcessTracks instance and a Track to exercise.'''
     target_files = {
         'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
         'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
         'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
         'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
         'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
     }
     self.input_df = Utility.get_df_from_archive('apple_music_analyser/tests/test_df.zip', target_files)
     self.parser = Parser(self.input_df)
     # shortcuts to the parsed dataframes
     for df_name in ('likes_dislikes_df', 'play_activity_df', 'identifier_infos_df',
                     'library_tracks_df', 'library_activity_df'):
         setattr(self, df_name, getattr(self.parser, df_name))
     self.process = ProcessTracks()
     self.track_instance = Track(self.process.increment)
Example #13
0
 def test_init_Parser(self):
     target_files = {
         'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
         'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
         'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
         'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
         'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
     }
     input_df = Utility.get_df_from_archive(
         'apple_music_analyser/tests/test_df.zip', target_files)
     # (row delta, column delta) the parser is expected to apply to each input dataframe
     expected_deltas = {
         'likes_dislikes_df': (0, 2),
         'play_activity_df': (-1, -14),
         'identifier_infos_df': (0, 0),
         'library_tracks_df': (0, -34),
         'library_activity_df': (0, 8),
     }
     # capture the input shapes before the parser runs
     input_shapes = {name: input_df[name].shape for name in expected_deltas}
     result = Parser(input_df)
     for df_name, (row_delta, col_delta) in expected_deltas.items():
         parsed_df = getattr(result, df_name)
         input_rows, input_cols = input_shapes[df_name]
         self.assertTrue(isinstance(parsed_df, pd.DataFrame))
         self.assertEqual(parsed_df.shape,
                          (input_rows + row_delta, input_cols + col_delta))
Example #14
0
from apple_music_analyser.VisualizationDataframe import VisualizationDataframe
from apple_music_analyser.Utility import Utility

# LOAD A PICKLE
###########################################################################################################################

# we assume you have an instance of the visualization dataframe class saved in the same folder under the name 'viz_df_instance.pkl'
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')

# SAVE A PICKLE
###########################################################################################################################

# get the input file - see starter_code.py for more details

# path to the test archive shipped with the package, so this example runs as is
path_to_archive = '../apple_music_analyser/tests/test_df.zip'
# maps each expected input name to the location of its file inside the archive
target_files = {
    'identifier_infos_path':
    'test_df/Apple Music Activity/Identifier Information.json.zip',
    'library_tracks_path':
    'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
    'library_activity_path':
    'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
    'likes_dislikes_path':
    'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
    'play_activity_path':
    'test_df/Apple Music Activity/Apple Music Play Activity.csv'
}
input_df = Utility.get_df_from_archive(path_to_archive, target_files)

# create an instance of the visualization dataframe class
# NOTE(review): the actual call saving this instance to a pickle is not shown in this chunk
viz_df_instance = VisualizationDataframe(input_df)
    def process_play_df(self, play_activity_df):
        '''
            This function goes through each row of the play activity dataframe, creating and updating
            track instances as they appear.
            As this is the dataframe we are able to get the most information from, we want to create
            new instances whenever we are not facing unknown songs (NaN as a title).The approach is
            very similar to the one used for the library tracks.
            
            The logic works as follows, knowing that we do this for each row of the dataframe:
                - if the track is in the dictionary of track instances, we update the existing
                track using update_track_from_play_activity
                - else, we have two options :
                    - either we know this artist and we can find a similar title in the artist dict,
                    and in this case we update the existing track using update_track_from_play_activity
                    - or we do not know this artist, or we do not find a close match of title for this
                    artist and in this case we create a new track instance using instantiate_track and
                    then update_track_from_play_activity
        '''
        for index, row in play_activity_df.iterrows():
            #we want to look only at rows where the name of the song is available
            # (a NaN cell stringifies to 'nan'); rows without a title cannot be matched
            if str(row['Title']) != 'nan':
                title = row['Title']
                if str(row['Artist']) != 'nan':
                    artist = row['Artist']
                else:
                    # placeholder so that title/artist keys can still be built
                    artist = 'No Artist'
            else:
                self.items_not_matched['play_activity'].append(index)
                continue

            #we check if we already saw this track (using title and artist names)
            title_artist = Utility.concat_title_artist(title, artist)
            if title_artist in self.track_instance_dict.keys():
                track_instance = self.track_instance_dict[title_artist]
                self.update_track_instance('play_activity_df', track_instance,
                                           index, row)

            else:
                # if we had no match with title and artist, we look for similarity in the title for the artist
                if artist in self.artist_tracks_titles.keys():
                    titles_comparison_result = self.compare_titles_for_artist(
                        artist, title)
                    if titles_comparison_result == 'No match':
                        #we instantiate the Track object with the current increment as its id
                        track_instance = Track(self.increment)
                        track_instance.instantiate_track(title, artist)
                        self.update_track_instance('play_activity_df',
                                                   track_instance, index, row)
                        #we update the dictionary that keeps track of our instances, titles of artists, and increment
                        self.track_instance_dict[title_artist] = track_instance
                        self.artist_tracks_titles[artist].append(title)
                        self.increment += 1

                    else:
                        # compare_titles_for_artist returned the matching Track instance
                        track_instance = titles_comparison_result
                        # register this spelling of the title on the matched track
                        if not track_instance.has_title_name(title):
                            track_instance.add_title(title)
                        track_instance.add_appearance({
                            'source': 'play_activity',
                            'df_index': index
                        })
                        #we also track the match in the track_instances and artist dicts
                        self.track_instance_dict[title_artist] = track_instance
                        self.artist_tracks_titles[artist].append(title)

                # else we know we never saw this track because the artist is unknown
                else:
                    #we register the artist in the artist/track names dictionary
                    self.artist_tracks_titles[artist] = []
                    self.artist_tracks_titles[artist].append(title)

                    #we instantiate the Track object
                    track_instance = Track(self.increment)
                    track_instance.instantiate_track(title, artist)
                    self.update_track_instance('play_activity_df',
                                               track_instance, index, row)

                    #we update the dictionary that keeps track of our instances, and increment
                    self.track_instance_dict[title_artist] = track_instance
                    self.increment += 1
 def test_compute_ratio_songs(self):
     # 4, 2 and 4 occurrences out of 10 values -> 40%, 20%, 40%
     sample = pd.Series([1, 1, 1, 1, 2, 2, 3, 3, 3, 3])
     ratios = Utility.compute_ratio_songs(sample)
     self.assertEqual(ratios.tolist(), [40.0, 40.0, 20.0])
    def process_likes_dislikes_df(self, likes_dislikes_df):
        '''
            This function goes through each row of the likes_dislikes dataframe, updating
            track instances as they appear.
            This dataframe contains a small proportion of all the tracks ever listened to, and/or in
            the library. As a result, we only want to update existing tracks, and not create new ones.
            The logic works as follows, knowing that we do this for each row of the dataframe:
                - we loop through all the track instances we created so far, and see if any of their
                identifiers matches the id ('Item Reference') of the row we are looking at
                - if we find a match, we update the track with the rating and appearance, and if we
                didn't already have the associated title, we add it to the list of titles of that track
                - else:
                    - if the track is in the dictionary of track instances, we update the existing
                    track's rating and appearance
                    - otherwise, we have two options:
                        - either we know the artist and we can find a similar title in the artist
                        dict, and in this case we update the existing track
                        - or we do not know this artist, or we do not find a close match of title
                        for this artist, and in this case we record the row as not matched
        '''
        for index, row in likes_dislikes_df.iterrows():
            #we want to look only at rows where the name of the song is available
            if str(row['Title']) != 'nan':
                title = row['Title']
                if str(row['Artist']) != 'nan':
                    artist = row['Artist']
                else:
                    artist = 'No Artist'
            else:
                self.items_not_matched['likes_dislikes'].append(index)
                continue

            title_artist = Utility.concat_title_artist(title, artist)

            # first we check using the Item Reference as an id
            found_match = False
            for track_instance in self.track_instance_dict.values():
                if row['Item Reference'] in track_instance.apple_music_id:
                    track_instance.add_appearance({
                        'source': 'likes_dislikes',
                        'df_index': index
                    })
                    track_instance.set_rating(row['Preference'])
                    if not track_instance.has_title_name(title):
                        track_instance.add_title(title)
                        self.track_instance_dict[title_artist] = track_instance
                        # BUG FIX: the artist of this row may never have been seen
                        # before (the id matched a track registered under another
                        # artist spelling), so indexing self.artist_tracks_titles
                        # directly could raise a KeyError; setdefault registers the
                        # artist on the fly instead.
                        known_titles = self.artist_tracks_titles.setdefault(artist, [])
                        if title not in known_titles:
                            known_titles.append(title)
                    found_match = True
                    break

            if found_match:
                continue

            #we check if we already saw this track (using title and artist names)
            if title_artist in self.track_instance_dict:
                track_instance = self.track_instance_dict[title_artist]
                track_instance.add_appearance({
                    'source': 'likes_dislikes',
                    'df_index': index
                })
                track_instance.set_rating(row['Preference'])

            # if we had no match with title and artist, we look for similarity in the title for the artist
            elif artist in self.artist_tracks_titles:
                titles_comparison_result = self.compare_titles_for_artist(
                    artist, title)
                if titles_comparison_result == 'No match':
                    #we add the item to the items_not_matched
                    self.items_not_matched['likes_dislikes'].append(index)
                else:
                    track_instance = titles_comparison_result
                    if not track_instance.has_title_name(title):
                        track_instance.add_title(title)
                    track_instance.add_appearance({
                        'source': 'likes_dislikes',
                        'df_index': index
                    })
                    track_instance.set_rating(row['Preference'])
                    #we also track the match in the track_instances and artist dicts
                    self.track_instance_dict[title_artist] = track_instance
                    self.artist_tracks_titles[artist].append(title)

            else:
                #we add the item to the items_not_matched;
                #we choose not to add it to the Track instances as the amount of information is little
                #and our reference really is the play activity!
                self.items_not_matched['likes_dislikes'].append(index)
Example #18
0
from apple_music_analyser.Utility import Utility
from apple_music_analyser.VisualizationDataframe import VisualizationDataframe

# Path of the archive to analyse. BUG FIX: it must be assigned BEFORE the CASE 1
# call below -- it was previously only defined in CASE 2, so running CASE 1 as
# written failed with a NameError. We point to the test archive so this example
# can be run as is.
path_to_archive = '../apple_music_analyser/tests/test_df.zip'

# CASE 1 - you pass the archive provided by Apple
###########################################################################################################################

# get the input files - with a structure like the one of the archive Apple provides
input_df = Utility.get_df_from_archive(path_to_archive)

# create an instance of the visualization dataframe class
viz_df_instance = VisualizationDataframe(input_df)

# CASE 2 - you want to pass the files in an archive with a custom structure
###########################################################################################################################

# get the input files - from an archive with a custom structure
# you can run this code as is, we use the files used for testing
# target_files maps each expected input name to the location of its file inside the archive
target_files = {
    'identifier_infos_path':
    'test_df/Apple Music Activity/Identifier Information.json.zip',
    'library_tracks_path':
    'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
    'library_activity_path':
    'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
    'likes_dislikes_path':
    'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
    'play_activity_path':
    'test_df/Apple Music Activity/Apple Music Play Activity.csv'
}
 def test_concat_title_artist(self):
     # the trailing whitespace of the artist name must not survive concatenation
     concatenated = Utility.concat_title_artist('Title', 'Artist ')
     self.assertEqual(concatenated, 'Title && Artist')
from apple_music_analyser.Utility import Utility
from apple_music_analyser.Query import QueryFactory
from apple_music_analyser.DataVisualization import RankingListVisualization

# GET THE INPUT - Loading a pickle file
###########################################################################################################################
# see starter_code.py and save_load.py for more details

# assumes 'viz_df_instance.pkl' exists in the working directory (see save_load.py)
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')
df_viz = viz_df_instance.get_df_viz()

# BUILD THE RANKING LIST WITHOUT QUERY
###########################################################################################################################

# construct a count dict, keyed per year
# possible to replace 'Genres' by: 'Title', 'Artist', 'Track_origin'
count_dict_genres = viz_df_instance.track_summary_objects.build_ranking_dict_per_year(
    df_viz, 'Genres')

# build the ranking list, limited to 5 items per year
ranking_genres = RankingListVisualization(count_dict_genres, 5)

# get the ranked_dict, printed on the console
ranking_genres.get_ranked_dict(print_output=True)

# BUILD THE RANKING LIST WITH QUERY
###########################################################################################################################

# define the query parameters
# NOTE(review): the params_dict example below is truncated in this chunk
# params_dict = {
#     'year':list,
Example #21
0
from apple_music_analyser.Utility import Utility
from apple_music_analyser.Query import QueryFactory
from apple_music_analyser.DataVisualization import BarChartVisualization


# GET THE INPUT - Loading a pickle file
###########################################################################################################################
# see starter_code.py and save_load.py for more details

# NOTE(review): assumes 'viz_df_instance.pkl' exists in the working directory.
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')
df_viz = viz_df_instance.get_df_viz()
# note that you can filter this df_viz using queries (see query.py for examples!)


# BUILD A BAR CHART FOR DAY OF THE WEEK WITH RATIO OF SONGS
###########################################################################################################################

# create the BarChart instance
# one trace per listening year; drop NaN years before sorting
years_to_plot = sorted(df_viz['Play_Year'].dropna().unique())
bar_chart = BarChartVisualization(df_viz)
bar_chart.hover_unit = '%'  # unit suffix for hover tooltips (values below are ratios)

# for each year, build the x and y series, computing the percentage of songs, and generate the traces
# x: distinct days of the week seen that year; y: share of plays per day of week
for year in years_to_plot:
    x_serie = df_viz[df_viz['Play_Year']==year]['Play_DOW'].unique()
    y_serie = Utility.compute_ratio_songs(df_viz[df_viz['Play_Year']==year]['Play_DOW'])
    bar_chart.render_bar_chart(x_serie, y_serie, str(year))

# edit the layout of the xaxis and show the rendered plot
xaxis=dict(categoryorder='array',
            tickangle = -45,
    def process_library_tracks_df(self, library_df):
        '''
            Goes through each row of the library tracks dataframe, creating and updating
            track instances as they appear.
            As this is the first dataframe we go through, we want to create new instances
            whenever we are not facing unknown songs (NaN as a title).

            The logic works as follows, for each row of the dataframe:
                - we look only at rows with a title different than NaN, and we set the
                artist to 'No Artist' if the artist is also NaN
                - if the combination title/artist is already known, we update the existing
                track using update_track_instance
                - otherwise, if we know this artist and can find a similar title in the
                artist dict, we reuse that track (adding the new title alias to it);
                if not, we create a brand-new Track instance
                - rows with a NaN title are recorded in items_not_matched

            Args:
                library_df: 'Apple Music Library Tracks' dataframe, with at least
                    'Title' and 'Artist' columns.
        '''
        def _register_new_track(title, artist, title_artist, index, row):
            # Create a brand-new Track, fill it from this library row, and register
            # it under its title/artist key; self.increment is the unique track id.
            track_instance = Track(self.increment)
            track_instance.instantiate_track(title, artist)
            self.update_track_instance('library_tracks_df', track_instance,
                                       index, row)
            self.track_instance_dict[title_artist] = track_instance
            self.increment += 1

        for index, row in library_df.iterrows():
            if str(row['Title']) == 'nan':
                # unknown song: remember the row index for later reporting
                self.items_not_matched['library_tracks'].append(index)
                continue

            title = row['Title']
            artist = row['Artist'] if str(row['Artist']) != 'nan' else 'No Artist'
            title_artist = Utility.concat_title_artist(title, artist)

            if title_artist in self.track_instance_dict:
                # combination already known: just update the existing track
                track_instance = self.track_instance_dict[title_artist]
                self.update_track_instance('library_tracks_df', track_instance,
                                           index, row)
            elif artist in self.artist_tracks_titles:
                # known artist: look for a close title match among its titles
                titles_comparison_result = self.compare_titles_for_artist(
                    artist, title)
                if titles_comparison_result == 'No match':
                    _register_new_track(title, artist, title_artist, index, row)
                else:
                    # close match found: reuse that track, recording this title
                    # as an alias of it
                    track_instance = titles_comparison_result
                    if not track_instance.has_title_name(title):
                        track_instance.add_title(title)
                    self.update_track_instance('library_tracks_df',
                                               track_instance, index, row)
                    self.track_instance_dict[title_artist] = track_instance
                    self.artist_tracks_titles[artist].append(title)
            else:
                # artist never seen before: the song is necessarily new
                _register_new_track(title, artist, title_artist, index, row)

            # keep the artist -> known titles dictionary in sync
            if artist not in self.artist_tracks_titles:
                self.artist_tracks_titles[artist] = []
            if title not in self.artist_tracks_titles[artist]:
                self.artist_tracks_titles[artist].append(title)
Example #23
0
    def parse_play_activity_df(play_activity_df, convert_to_local_time = True, drop_columns=True):
        '''
            Method in charge of parsing the play activity dataframe. The parsing is performed in multiple steps:
            1. Rename the columns containing song title and artist
            2. Time columns: first obtain a timestamp column without missing values, using Event Start Timestamp and Event End Timestamp
            3. Time columns: add time columns from the timestamp column (year, month, day of the month,...), with or without conversion
            to local time (args)
            4. Remove outlier rows (Apple Music service started in 2015, so we drop rows with a year before 2015)
            5. Add a column with a flag for partial vs complete listening of a given track
            6. Add a column with a simplified 'origin' of the song, i.e. how it was found (search, suggestion, library,...)
            7. Add a column with a calculation of the listening duration in minutes
            8. Remove outliers of listening duration (any play longer than 90 minutes is replaced
            by the duration of the media)
            9. Drop unused columns (args)

            Args:
                play_activity_df: raw 'Apple Music Play Activity' dataframe.
                convert_to_local_time (bool): whether timestamps are shifted using the
                    'UTC Offset In Seconds' column. Defaults to True.
                drop_columns (bool): whether the intermediate/unused columns are dropped
                    from the returned dataframe. Defaults to True.

            Returns:
                The parsed dataframe (the input dataframe is not modified).
        '''
        # columns no longer needed once parsing is complete (dropped in step 9)
        columns_to_drop = [
        'Apple Id Number', 'Apple Music Subscription', 'Build Version', 'Client IP Address',
        'Content Specific Type', 'Device Identifier', 'Event Reason Hint Type', 'Activity date time',
        'End Position In Milliseconds', 'Event Received Timestamp', 'Media Type', 'Metrics Bucket Id', 
        'Metrics Client Id','Original Title', 'Source Type', 'Start Position In Milliseconds',
        'Store Country Name', 'Milliseconds Since Play', 'Event End Timestamp', 'Event Start Timestamp',
        'UTC Offset In Seconds','Play Duration Milliseconds', 'Media Duration In Milliseconds', 'Feature Name'
        ]
        # Rename columns for merges later; work on a copy so the caller's df is untouched
        parsed_df = play_activity_df.copy()
        parsed_df.rename(columns={'Content Name':'Title', 'Artist Name':'Artist'}, inplace=True)
        
        # Add time related columns: prefer the start timestamp, fall back to the end
        # timestamp when the start is missing (plain assignment instead of the
        # deprecated chained inplace fillna)
        parsed_df['Activity date time'] = pd.to_datetime(parsed_df['Event Start Timestamp'])
        parsed_df['Activity date time'] = parsed_df['Activity date time'].fillna(
            pd.to_datetime(parsed_df['Event End Timestamp']))
        if convert_to_local_time is True:
            parsed_df['Activity date time'] = Utility.convert_to_local_time(parsed_df['Activity date time'], parsed_df['UTC Offset In Seconds'])
        parsed_datetime_series = Utility.parse_date_time_column(parsed_df, 'Activity date time')
        Utility.add_time_related_columns(parsed_df, parsed_datetime_series, col_name_prefix='Play ')

        # We remove year outliers (Apple Music started in 2015, whatever is reported before is a mistake)
        parsed_df = parsed_df.drop(parsed_df[parsed_df['Play Year']< 2015].index)

        # Add partial listening column 
        play_duration = parsed_df['Play Duration Milliseconds']
        media_duration = parsed_df['Media Duration In Milliseconds']
        Parser.set_partial_listening(parsed_df, parsed_df['End Reason Type'], play_duration, media_duration)

        # Add track origin column
        parsed_df['Track origin'] = parsed_df['Feature Name'].apply(Parser.get_track_origin)

        # Add play duration column
        activity_start = pd.to_datetime(parsed_df['Event Start Timestamp'])
        activity_end = pd.to_datetime(parsed_df['Event End Timestamp'])
        played_completely = parsed_df['Played completely']
        Parser.compute_play_duration(parsed_df, activity_start, activity_end, played_completely, play_duration, media_duration)

        # we remove outliers from this play duration column, saying that if a value is above 1h30
        # (90 minutes), we drop it, and replace it by the duration of the media
        Parser.remove_play_duration_outliers(parsed_df, parsed_df['Play duration in minutes'], media_duration, 90)

        # we can then remove the columns we do not need anymore
        # (errors='ignore' so columns already absent do not raise)
        if drop_columns:
            parsed_df = parsed_df.drop(columns_to_drop, axis=1, errors='ignore')

        return parsed_df