def parse_input_df(source_files):
    '''Validates the format of the input source_files and builds the dataframes dictionary.

    If the provided input doesn't contain the required files, or if their format is
    incorrect, no further parsing/processing is possible - so it is important to report
    the problem early in the process! In that case a warning message is printed and an
    empty dictionary is returned. Otherwise the returned dictionary is populated with
    the five input dataframes.
    '''
    error_message_bad_input = ("Please ensure that the input_df object has the following structure...\n"
        '{ "identifier_infos_df" : identifier_infos_df, "library_tracks_df" : library_tracks_df,\n'
        '"library_activity_df" : library_activity_df, "likes_dislikes_df" : likes_dislikes_df, "play_activity_df" : play_activity_df }\n'
        '...And that the values in this dictionary are pandas dataframes. Returned object is empty.')
    dataframes = {}
    # the five dataframes we expect, in the order they are exposed to callers
    expected_keys = ('likes_dislikes_df', 'play_activity_df', 'identifier_infos_df',
                     'library_tracks_df', 'library_activity_df')
    # validation is only attempted when exactly five entries are provided
    if len(source_files) == 5 and Utility.validate_input_df_files(source_files):
        for key in expected_keys:
            dataframes[key] = source_files[key]
    else:
        print('WARNING:\n {0}'.format(error_message_bad_input))
    return dataframes
def setUpClass(cls):
    '''Builds the shared fixtures once for the whole test case.

    Loads the test dataframes from the test archive, parses them, runs the full
    processing pipeline, and exposes the resulting objects as class attributes
    so individual tests can read them without re-processing.
    '''
    # we use the test df
    archive_path = 'apple_music_analyser/tests/test_df.zip'
    target_files = {
        'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
        'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
        'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
        'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
        'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
    }
    cls.input_df = Utility.get_df_from_archive(archive_path, target_files)
    cls.parser = Parser(cls.input_df)
    # expose each parsed dataframe as a class attribute
    for df_name in ('likes_dislikes_df', 'play_activity_df', 'identifier_infos_df',
                    'library_tracks_df', 'library_activity_df'):
        setattr(cls, df_name, getattr(cls.parser, df_name))
    # we process the df
    cls.process = ProcessTracks()
    cls.process.process_library_tracks_df(cls.library_tracks_df)
    cls.process.process_identifier_df(cls.identifier_infos_df)
    cls.process.process_play_df(cls.play_activity_df)
    cls.process.process_likes_dislikes_df(cls.likes_dislikes_df)
    # we extract the useful objects from the process instance
    cls.track_instance_dict = cls.process.track_instance_dict
    cls.artist_tracks_titles = cls.process.artist_tracks_titles
    cls.genres_list = cls.process.genres_list
    cls.items_not_matched = cls.process.items_not_matched
def test_get_df_from_archive_with_target(self):
    '''Archive extraction when the structure inside the archive is provided as an argument.

    This function relies on an external package (ZipFile), well covered by its own
    tests; here we only validate the shape and typing of the returned mapping.
    '''
    target_files = {
        'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
        'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
        'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
        'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
        'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
    }
    result = Utility.get_df_from_archive('apple_music_analyser/tests/test_df.zip', target_files)
    self.assertTrue(isinstance(result, dict))
    self.assertEqual(len(result), 5)
    expected_keys = ['identifier_infos_df', 'library_tracks_df', 'library_activity_df',
                     'likes_dislikes_df', 'play_activity_df']
    self.assertEqual(list(result.keys()), expected_keys)
    for extracted_df in result.values():
        self.assertTrue(isinstance(extracted_df, pd.DataFrame))
def test_parse_date_time_column(self):
    '''We only test that a dict is returned: the values inside it come from another
    function tested separately (cf. test_extract_time_info_from_datetime).'''
    frame = pd.DataFrame(pd.Series('2020-01-01'), columns=['Timestamp'])
    parsed = Utility.parse_date_time_column(frame, 'Timestamp')
    self.assertEqual(type(parsed), dict)
def test_get_df_from_archive_bad_archive(self):
    '''A wrong archive path (None) must yield an empty dictionary.

    This function relies on an external package (ZipFile), well covered by its own tests.
    '''
    result = Utility.get_df_from_archive(None)
    self.assertEqual(result, {})
def test_extract_time_info_from_datetime(self):
    '''2020-01-01 00:00 must decompose into year/month/day-of-month/day-of-week/hour-of-day.'''
    datetime_serie = pd.to_datetime(pd.Series('2020-01-01'))
    extracted_series = Utility.extract_time_info_from_datetime(datetime_serie)
    # order of the returned tuple: year, month, DOM, DOW, HOD
    for extracted, expected in zip(extracted_series, (2020, 1, 1, 'Wednesday', 0)):
        self.assertEqual(extracted.values[0], expected)
def setUp(self):
    '''Builds a fresh VisualizationDataframe from the test archive before each test.'''
    archive_path = 'apple_music_analyser/tests/test_df.zip'
    # structure of the test archive used throughout the test suite
    target_files = {
        'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
        'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
        'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
        'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
        'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
    }
    self.input_df = Utility.get_df_from_archive(archive_path, target_files)
    self.df_visualization = VisualizationDataframe(self.input_df)
def parse_library_activity_df(library_activity_df):
    '''
    Method in charge of parsing the library activity dataframe.
    It is responsible for adding time columns from the timestamp column
    (year, month, day of the month,...), as well as agent columns
    (what performed the action, what model).

    Params:
        library_activity_df (pd.DataFrame): raw library activity dataframe;
            must contain 'Transaction Date' and 'UserAgent' columns.

    Returns:
        pd.DataFrame: a copy of the input enriched with 'Transaction ...' time
        columns plus 'Transaction Agent' and 'Transaction Agent Model' columns.
    '''
    # work on a copy so the caller's dataframe is left untouched
    parsed_df = library_activity_df.copy()
    # parse time related column
    parsed_datetime_series = Utility.parse_date_time_column(parsed_df, 'Transaction Date')
    Utility.add_time_related_columns(parsed_df, parsed_datetime_series, col_name_prefix='Transaction ')
    # parse action agent column
    # the first '/'-separated token of UserAgent identifies what performed the action
    parsed_df['Transaction Agent'] = parsed_df['UserAgent'].str.split('/').str.get(0)
    # map the raw agent identifiers to human-friendly device names
    parsed_df.replace({'Transaction Agent' : { 'itunescloudd' : 'iPhone', 'iTunes' : 'Macintosh'}}, inplace=True)
    # for iPhone rows the device model is the 4th '/'-token, before the first comma
    # (non-iPhone rows get NaN here, the Macintosh ones are filled on the next line)
    parsed_df['Transaction Agent Model'] = parsed_df[parsed_df['Transaction Agent'] == 'iPhone']['UserAgent'].str.split('/').str.get(3).str.split(',').str.get(0)
    # Macintosh transactions all get the same constant model label
    parsed_df.loc[parsed_df['Transaction Agent'].eq('Macintosh'), 'Transaction Agent Model'] = 'Macintosh'
    return parsed_df
def test_add_time_related_columns(self):
    '''The helper must append the prefixed/suffixed time columns to the frame.

    Only shape and column names are compared: the cell values are produced by
    functions covered by their own tests.
    '''
    df = pd.DataFrame(pd.Series('2020-01-01'), columns=['Timestamp'])
    datetime_series = Utility.parse_date_time_column(df, 'Timestamp')
    Utility.add_time_related_columns(df, datetime_series, col_name_prefix='pref_', col_name_suffix='_suff')
    # expected layout after the call: original column plus six derived ones
    expected_output = pd.DataFrame.from_dict({
        'Timestamp': ['2020-01-01'],
        'pref_date time_suff': ['2020-01-01'],
        'pref_Year_suff': [2020],
        'pref_Month_suff': [1],
        'pref_DOM_suff': [1],
        'pref_DOW_suff': ['Wednesday'],
        'pref_HOD_suff': [0]
    })
    self.assertEqual(df.shape, expected_output.shape)
    self.assertEqual(df.columns.tolist(), expected_output.columns.tolist())
def compare_titles_for_artist(self, artist, title_to_compare, similarity_threshold=0.625):
    '''
    Compares the string similarity of any song associated to an artist and an
    unknown title for this artist. The goal here is to be able to match different
    spellings of the same song.

    Params:
        artist (str): artist whose known titles are compared; assumed to be a key
            of self.artist_tracks_titles (a missing key would raise KeyError).
        title_to_compare (str): candidate title we try to match.
        similarity_threshold (float): minimum similarity score for a match.
            Defaults to 0.625, a value observed to bring consistently a match
            between similar songs.

    Returns:
        The track instance of the matching artist song we already know, or the
        string 'No match' when no known title is similar enough.
    '''
    for artist_track in self.artist_tracks_titles[artist]:
        title_similarity_for_artist = Utility.compute_similarity_score(
            title_to_compare, artist_track)
        if title_similarity_for_artist > similarity_threshold:
            # we fetch the track instance associated with the close match
            title_artist = Utility.concat_title_artist(artist_track, artist)
            return self.track_instance_dict[title_artist]
    return 'No match'
def test_convert_to_local_time(self):
    '''Each offset (seconds, milliseconds, minutes or hours) must shift its
    timestamp to the expected local time.'''
    serie = pd.date_range('2020-01-01', periods=9, freq='H')
    timezone_serie = pd.Series([3600, -7200, 0, 3600000, -7200000, 60, -120, 1, -2])
    converted = Utility.convert_to_local_time(serie, timezone_serie).tolist()
    expected = [
        '2020-01-01 01:00:00', '2019-12-31 23:00:00', '2020-01-01 02:00:00',
        '2020-01-01 04:00:00', '2020-01-01 02:00:00', '2020-01-01 06:00:00',
        '2020-01-01 04:00:00', '2020-01-01 08:00:00', '2020-01-01 06:00:00'
    ]
    self.assertEqual([str(value) for value in converted], expected)
def setUp(self):
    '''Parses the test archive and builds a fresh ProcessTracks and Track before each test.'''
    archive_path = 'apple_music_analyser/tests/test_df.zip'
    target_files = {
        'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
        'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
        'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
        'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
        'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
    }
    self.input_df = Utility.get_df_from_archive(archive_path, target_files)
    self.parser = Parser(self.input_df)
    # expose each parsed dataframe as an attribute of the test case
    for df_name in ('likes_dislikes_df', 'play_activity_df', 'identifier_infos_df',
                    'library_tracks_df', 'library_activity_df'):
        setattr(self, df_name, getattr(self.parser, df_name))
    self.process = ProcessTracks()
    self.track_instance = Track(self.process.increment)
def test_init_Parser(self):
    '''Parser must expose the five dataframes, each with its shape transformed as
    expected by the parsing steps (columns added/dropped, outlier rows removed).'''
    target_files = {
        'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
        'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
        'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
        'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
        'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
    }
    input_df = Utility.get_df_from_archive(
        'apple_music_analyser/tests/test_df.zip', target_files)
    # (row delta, column delta) the parser applies to each input dataframe
    expected_deltas = {
        'likes_dislikes_df': (0, 2),
        'play_activity_df': (-1, -14),
        'identifier_infos_df': (0, 0),
        'library_tracks_df': (0, -34),
        'library_activity_df': (0, 8),
    }
    # capture input shapes before constructing the Parser
    input_shapes = {name: input_df[name].shape for name in expected_deltas}
    result = Parser(input_df)
    for df_name, (row_delta, col_delta) in expected_deltas.items():
        parsed_df = getattr(result, df_name)
        rows, cols = input_shapes[df_name]
        self.assertTrue(isinstance(parsed_df, pd.DataFrame))
        self.assertEqual(parsed_df.shape, (rows + row_delta, cols + col_delta))
# Example script: persisting and restoring a VisualizationDataframe instance
# with the pickle helpers exposed by Utility.
from apple_music_analyser.VisualizationDataframe import VisualizationDataframe
from apple_music_analyser.Utility import Utility

# LOAD A PICKLE
###########################################################################################################################
# we assume you have an instance of the visualization dataframe class saved in the
# same folder under the name 'viz_df_instance.pkl'
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')

# SAVE A PICKLE
###########################################################################################################################
# get the input file - see starter_code.py for more details
path_to_archive = '../apple_music_analyser/tests/test_df.zip'
target_files = {
    'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
    'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
    'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
    'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
    'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
}
input_df = Utility.get_df_from_archive(path_to_archive, target_files)
# create an instance of the visualization dataframe class
# NOTE(review): no call to a save/pickle helper is visible in this chunk — the
# actual save step presumably follows below; confirm against the full script.
viz_df_instance = VisualizationDataframe(input_df)
def process_play_df(self, play_activity_df):
    '''
    This function goes through each row of the play activity dataframe, creating and
    updating track instances as they appear.
    As this is the dataframe we are able to get the most information from, we want to
    create new instances whenever we are not facing unknown songs (NaN as a title).
    The approach is very similar to the one used for the library tracks.
    The logic works as follows, knowing that we do this for each row of the dataframe:
        - if the track is in the dictionary of track instances, we update the existing
          track using update_track_from_play_activity
        - else, we have two options:
            - either we know this artist and we can find a similar title in the artist
              dict, and in this case we update the existing track using
              update_track_from_play_activity
            - or we do not know this artist, or we do not find a close match of title
              for this artist and in this case we create a new track instance using
              instantiate_track and then update_track_from_play_activity

    Params:
        play_activity_df (pd.DataFrame): parsed play activity dataframe; rows are
            expected to expose 'Title' and 'Artist' columns.

    Side effects:
        Mutates self.track_instance_dict, self.artist_tracks_titles,
        self.items_not_matched and self.increment.
    '''
    for index, row in play_activity_df.iterrows():
        #we want to look only at rows where the name of the song is available
        if str(row['Title']) != 'nan':
            title = row['Title']
            if str(row['Artist']) != 'nan':
                artist = row['Artist']
            else:
                # missing artist gets a sentinel name so the title/artist key stays usable
                artist = 'No Artist'
        else:
            # rows without a title cannot be matched at all
            self.items_not_matched['play_activity'].append(index)
            continue
        #we check if we already saw this track (using title and artist names)
        title_artist = Utility.concat_title_artist(title, artist)
        if title_artist in self.track_instance_dict.keys():
            track_instance = self.track_instance_dict[title_artist]
            self.update_track_instance('play_activity_df', track_instance, index, row)
        else:
            # if we had no match with title and artist, we look for similarity in the title for the artist
            if artist in self.artist_tracks_titles.keys():
                titles_comparison_result = self.compare_titles_for_artist(artist, title)
                if titles_comparison_result == 'No match':
                    #we instantiate the Track object
                    track_instance = Track(self.increment)
                    track_instance.instantiate_track(title, artist)
                    self.update_track_instance('play_activity_df', track_instance, index, row)
                    #we update the dictionary that keeps track of our instances, titles of artists, and increment
                    self.track_instance_dict[title_artist] = track_instance
                    self.artist_tracks_titles[artist].append(title)
                    self.increment += 1
                else:
                    # a similar spelling of an already-known title: reuse that track instance
                    track_instance = titles_comparison_result
                    if not track_instance.has_title_name(title):
                        track_instance.add_title(title)
                    track_instance.add_appearance({'source': 'play_activity', 'df_index': index})
                    #we also track the match in the track_instances and artist dicts
                    # NOTE(review): title is appended unconditionally here, so
                    # artist_tracks_titles may accumulate duplicates — confirm intended.
                    self.track_instance_dict[title_artist] = track_instance
                    self.artist_tracks_titles[artist].append(title)
            # else we know we never saw this track because the artist is unknown
            else:
                #we update the artist/track names dictionnary
                self.artist_tracks_titles[artist] = []
                self.artist_tracks_titles[artist].append(title)
                #we instantiate the Track object
                track_instance = Track(self.increment)
                track_instance.instantiate_track(title, artist)
                self.update_track_instance('play_activity_df', track_instance, index, row)
                #we update the dictionary that keeps track of our instances, and increment
                self.track_instance_dict[title_artist] = track_instance
                self.increment += 1
def test_compute_ratio_songs(self):
    '''Ten values split 4/2/4 must yield the percentages 40/40/20.'''
    serie = pd.Series([1, 1, 1, 1, 2, 2, 3, 3, 3, 3])
    ratios = Utility.compute_ratio_songs(serie).tolist()
    self.assertEqual(ratios, [40.0, 40.0, 20.0])
def process_likes_dislikes_df(self, likes_dislikes_df):
    '''
    This function goes through each row of the likes_dislikes dataframe, updating
    track instances as they appear.
    This dataframe contains a small proportion of all the tracks ever listened to,
    and/or in the library. As a result, we only want to update existing tracks, and
    not create new ones.
    The logic works as follows, knowing that we do this for each row of the dataframe:
        - we loop through all the track instances we created so far, and see if any of
          their identifier matches the id of the row we are looking at
        - if we find a match, we update the track with the rating, appearance, and if
          we didn't already have the associated title, we add it to the list of titles
          of that track
        - else:
            - if the track is in the dictionary of track instances, we update the
              existing track's rating and appearance
            - otherwise, we have two options:
                - either we know the artist and we can find a similar title in the
                  artist dict, and in this case we update the existing track
                - or we do not know this artist, or we do not find a close match of
                  title for this artist and in this case we add it to the tracks we
                  could not match and we ignored

    Params:
        likes_dislikes_df (pd.DataFrame): parsed likes/dislikes dataframe; rows expose
            'Title', 'Artist', 'Item Reference' and 'Preference' columns.

    Side effects:
        Mutates self.track_instance_dict, self.artist_tracks_titles and
        self.items_not_matched.
    '''
    for index, row in likes_dislikes_df.iterrows():
        #we want to look only at rows where the name of the song is available
        if str(row['Title']) != 'nan':
            title = row['Title']
            if str(row['Artist']) != 'nan':
                artist = row['Artist']
            else:
                artist = 'No Artist'
        else:
            self.items_not_matched['likes_dislikes'].append(index)
            continue
        title_artist = Utility.concat_title_artist(title, artist)
        # first we check using the Item Reference as an id
        found_match = False
        for title_name in self.track_instance_dict.keys():
            track_instance = self.track_instance_dict[title_name]
            if row['Item Reference'] in track_instance.apple_music_id:
                track_instance.add_appearance({'source': 'likes_dislikes', 'df_index': index})
                track_instance.set_rating(row['Preference'])
                if not track_instance.has_title_name(row['Title']):
                    track_instance.add_title(row['Title'])
                # alias the row's title/artist key to the matched instance
                self.track_instance_dict[title_artist] = track_instance
                # NOTE(review): artist may be absent from artist_tracks_titles at this
                # point, which would raise KeyError — confirm whether the id-match can
                # occur for an artist never seen in the other dataframes.
                if row['Title'] not in self.artist_tracks_titles[artist]:
                    self.artist_tracks_titles[artist].append(title)
                found_match = True
                break
        if found_match is False:
            #we check if we already saw this track (using title and artist names)
            if title_artist in self.track_instance_dict.keys():
                track_instance = self.track_instance_dict[title_artist]
                track_instance.add_appearance({'source': 'likes_dislikes', 'df_index': index})
                track_instance.set_rating(row['Preference'])
            else:
                # if we had no match with title and artist, we look for similarity in the title for the artist
                if artist in self.artist_tracks_titles.keys():
                    titles_comparison_result = self.compare_titles_for_artist(artist, title)
                    if titles_comparison_result == 'No match':
                        #we add the item to the items_not_matched
                        self.items_not_matched['likes_dislikes'].append(index)
                    else:
                        # a similar spelling of a known title: update that instance
                        track_instance = titles_comparison_result
                        if not track_instance.has_title_name(title):
                            track_instance.add_title(title)
                        track_instance.add_appearance({'source': 'likes_dislikes', 'df_index': index})
                        track_instance.set_rating(row['Preference'])
                        self.track_instance_dict[title_artist] = track_instance
                        self.artist_tracks_titles[artist].append(title)
                else:
                    #we add the item to the items_not_matched,
                    #we choose not to add it to the Track instances as the amount of information is little
                    #and our reference really is the play activity!
                    self.items_not_matched['likes_dislikes'].append(index)
                    continue
# Example script: the two ways of feeding an archive to VisualizationDataframe.
from apple_music_analyser.Utility import Utility
from apple_music_analyser.VisualizationDataframe import VisualizationDataframe

# CASE 1 - you pass the archive provided by Apple
###########################################################################################################################
# get the input files - with a structure like the one of the archive Apple provides
# NOTE(review): path_to_archive is not defined before this line in this script —
# it is only assigned under CASE 2 below. Presumably the user is expected to set
# it to their own archive path before running CASE 1; confirm.
input_df = Utility.get_df_from_archive(path_to_archive)
# create an instance of the visualization dataframe class
viz_df_instance = VisualizationDataframe(input_df)

# CASE 2 - you want to pass the files in an archive with a custom structure
###########################################################################################################################
# get the input files - from an archive with a custom structure
# you can run this code as is, we use the files used for testing
path_to_archive = '../apple_music_analyser/tests/test_df.zip'
target_files = {
    'identifier_infos_path': 'test_df/Apple Music Activity/Identifier Information.json.zip',
    'library_tracks_path': 'test_df/Apple Music Activity/Apple Music Library Tracks.json.zip',
    'library_activity_path': 'test_df/Apple Music Activity/Apple Music Library Activity.json.zip',
    'likes_dislikes_path': 'test_df/Apple Music Activity/Apple Music Likes and Dislikes.csv',
    'play_activity_path': 'test_df/Apple Music Activity/Apple Music Play Activity.csv'
}
def test_concat_title_artist(self):
    '''Concatenation must join with " && " and drop the trailing whitespace of the artist.'''
    result = Utility.concat_title_artist('Title', 'Artist ')
    self.assertEqual(result, 'Title && Artist')
# Example script: building ranked per-year dictionaries from a saved
# VisualizationDataframe instance.
from apple_music_analyser.Utility import Utility
# NOTE(review): QueryFactory is not used in the visible part of this script —
# presumably used in the query section below the cut; confirm.
from apple_music_analyser.Query import QueryFactory
from apple_music_analyser.DataVisualization import RankingListVisualization

# GET THE INPUT - Loading a pickle file
###########################################################################################################################
# see starter_code.py and save_load.py for more details
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')
df_viz = viz_df_instance.get_df_viz()

# BUILD THE RANKING LIST WITHOUT QUERY
###########################################################################################################################
# construct a count dict
# possible to replace 'Genres' by: 'Title', 'Artist', 'Track_origin'
count_dict_genres = viz_df_instance.track_summary_objects.build_ranking_dict_per_year(df_viz, 'Genres')
# build the ranking list, limited to 5 items per year
ranking_genres = RankingListVisualization(count_dict_genres, 5)
# get the ranked_dict, printed on the console
ranking_genres.get_ranked_dict(print_output=True)

# BUILD THE RANKING LIST WITH QUERY
###########################################################################################################################
# define the query parameters
# params_dict = {
#     'year':list,
# Example script: a per-year bar chart of the ratio of songs per day of the week.
from apple_music_analyser.Utility import Utility
# NOTE(review): QueryFactory is not used in the visible part of this script —
# presumably used further below; confirm.
from apple_music_analyser.Query import QueryFactory
from apple_music_analyser.DataVisualization import BarChartVisualization

# GET THE INPUT - Loading a pickle file
###########################################################################################################################
# see starter_code.py and save_load.py for more details
viz_df_instance = Utility.load_from_pickle('viz_df_instance.pkl')
df_viz = viz_df_instance.get_df_viz()
# note that you can filter this df_viz using queries (see query.py for examples!)

# BUILD A BAR CHART FOR DAY OF THE WEEK WITH RATIO OF SONGS
###########################################################################################################################
# create the BarChart instance
years_to_plot = sorted(df_viz['Play_Year'].dropna().unique())
bar_chart = BarChartVisualization(df_viz)
bar_chart.hover_unit = '%'

# for each year, build the x and y series, computing the percentage of songs, and generate the traces
for year in years_to_plot:
    x_serie = df_viz[df_viz['Play_Year']==year]['Play_DOW'].unique()
    y_serie = Utility.compute_ratio_songs(df_viz[df_viz['Play_Year']==year]['Play_DOW'])
    bar_chart.render_bar_chart(x_serie, y_serie, str(year))

# edit the layout of the xaxis and show the rendered plot
# NOTE(review): the statement below is truncated in this chunk (the dict/call is
# never closed) — the remainder presumably follows past the visible text.
xaxis=dict(categoryorder='array', tickangle = -45,
def process_library_tracks_df(self, library_df):
    '''
    This function goes through each row of the library tracks dataframe, creating and
    updating track instances as they appear.
    As this is the first dataframe we go through, we want to create new instances
    whenever we are not facing unknown songs (NaN as a title).
    The logic works as follows, knowing that we do this for each row of the dataframe:
        - we look only at rows with a title different than NaN, and we set the artist
          to 'No Artist' if the artist is also NaN
        - if the track is not in the dictionary of track instances, it means that we
          never saw the combination title/artist of this row. So two options here:
            - either we know this artist and we can find a similar title in the artist
              dict, and in this case we update the existing track using
              update_track_from_library
            - or we do not know this artist, or we do not find a close match of title
              for this artist and in this case we create a new track instance using
              instantiate_track and then update_track_from_library
        - else, we update the existing track using update_track_from_library

    Params:
        library_df (pd.DataFrame): parsed library tracks dataframe; rows expose
            'Title' and 'Artist' columns.

    Side effects:
        Mutates self.track_instance_dict, self.artist_tracks_titles,
        self.items_not_matched and self.increment.
    '''
    for index, row in library_df.iterrows():
        if str(row['Title']) != 'nan':
            title = row['Title']
            if str(row['Artist']) != 'nan':
                artist = row['Artist']
            else:
                # missing artist gets a sentinel name so the title/artist key stays usable
                artist = 'No Artist'
            title_artist = Utility.concat_title_artist(title, artist)
            if title_artist not in self.track_instance_dict.keys():
                if artist in self.artist_tracks_titles.keys():
                    titles_comparison_result = self.compare_titles_for_artist(artist, title)
                    if titles_comparison_result == 'No match':
                        #we instantiate the Track object
                        track_instance = Track(self.increment)
                        track_instance.instantiate_track(title, artist)
                        self.update_track_instance('library_tracks_df', track_instance, index, row)
                        self.track_instance_dict[title_artist] = track_instance
                        self.increment += 1
                    else:
                        # a similar spelling of a known title: reuse that instance
                        track_instance = titles_comparison_result
                        if not track_instance.has_title_name(title):
                            track_instance.add_title(title)
                        self.update_track_instance('library_tracks_df', track_instance, index, row)
                        self.track_instance_dict[title_artist] = track_instance
                        self.artist_tracks_titles[artist].append(title)
                else:
                    #there was no close match, and the song was never seen, so we instantiate a new Track
                    track_instance = Track(self.increment)
                    track_instance.instantiate_track(title, artist)
                    self.update_track_instance('library_tracks_df', track_instance, index, row)
                    self.track_instance_dict[title_artist] = track_instance
                    self.increment += 1
            else:
                track_instance = self.track_instance_dict[title_artist]
                self.update_track_instance('library_tracks_df', track_instance, index, row)
            #we update the artist/track names dictionnary
            if artist not in self.artist_tracks_titles:
                self.artist_tracks_titles[artist] = []
            if title not in self.artist_tracks_titles[artist]:
                self.artist_tracks_titles[artist].append(title)
        else:
            # rows without a title cannot be matched at all
            self.items_not_matched['library_tracks'].append(index)
def parse_play_activity_df(play_activity_df, convert_to_local_time = True, drop_columns=True):
    '''
    Method in charge of parsing the play activity dataframe.
    The parsing is performed in multiple steps:
        1. Rename the columns containing song title and artist
        2. Time columns: first obtain a timestamp column without missing values,
           using Event Start Timestamp and Event End Timestamp
        3. Time columns: add time columns from the timestamp column (year, month,
           day of the month,...), with or without conversion to local time (args)
        4. Remove outlier rows (Apple Music service started in 2015, so we drop
           rows with a year before 2015)
        5. Add a column with a flag for partial vs complete listening of a given track
        6. Add a column with a simplified 'origin' of the song, i.e. how it was
           found (search, suggestion, library,...)
        7. Add a column with a calculation of the listening duration in minutes
        8. Remove outliers of listening duration (99th percentile)
        9. Drop unused columns (args)

    Params:
        play_activity_df (pd.DataFrame): raw play activity dataframe.
        convert_to_local_time (bool): when True, shift the timestamp by the row's
            'UTC Offset In Seconds' before extracting time columns.
        drop_columns (bool): when True, drop the working/raw columns listed in
            columns_to_drop from the returned dataframe.

    Returns:
        pd.DataFrame: a parsed copy of the input.
    '''
    # raw/intermediate columns removed at the end when drop_columns is True;
    # note this includes the 'Activity date time' column computed below, which is
    # only needed to derive the 'Play ...' time columns
    columns_to_drop = [
        'Apple Id Number', 'Apple Music Subscription', 'Build Version', 'Client IP Address',
        'Content Specific Type', 'Device Identifier', 'Event Reason Hint Type',
        'Activity date time', 'End Position In Milliseconds',
        'Event Received Timestamp', 'Media Type', 'Metrics Bucket Id', 'Metrics Client Id',
        'Original Title', 'Source Type', 'Start Position In Milliseconds',
        'Store Country Name', 'Milliseconds Since Play', 'Event End Timestamp',
        'Event Start Timestamp', 'UTC Offset In Seconds', 'Play Duration Milliseconds',
        'Media Duration In Milliseconds', 'Feature Name'
    ]
    # Rename columns for merges later
    parsed_df = play_activity_df.copy()
    parsed_df.rename(columns={'Content Name':'Title', 'Artist Name':'Artist'}, inplace=True)
    # Add time related columns
    # prefer the start timestamp, fall back to the end timestamp when missing
    parsed_df['Activity date time'] = pd.to_datetime(parsed_df['Event Start Timestamp'])
    parsed_df['Activity date time'].fillna(pd.to_datetime(parsed_df['Event End Timestamp']), inplace=True)
    if convert_to_local_time is True:
        parsed_df['Activity date time'] = Utility.convert_to_local_time(parsed_df['Activity date time'], parsed_df['UTC Offset In Seconds'])
    parsed_datetime_series = Utility.parse_date_time_column(parsed_df, 'Activity date time')
    Utility.add_time_related_columns(parsed_df, parsed_datetime_series, col_name_prefix='Play ')
    # We remove year outliers (Apple Music started in 2015, whatever is reported before is a mistake)
    parsed_df = parsed_df.drop(parsed_df[parsed_df['Play Year']< 2015].index)
    # Add partial listening column
    play_duration = parsed_df['Play Duration Milliseconds']
    media_duration = parsed_df['Media Duration In Milliseconds']
    Parser.set_partial_listening(parsed_df, parsed_df['End Reason Type'], play_duration, media_duration)
    # Add track origin column
    parsed_df['Track origin'] = parsed_df['Feature Name'].apply(Parser.get_track_origin)
    # Add play duration column
    activity_start = pd.to_datetime(parsed_df['Event Start Timestamp'])
    activity_end = pd.to_datetime(parsed_df['Event End Timestamp'])
    played_completely = parsed_df['Played completely']
    Parser.compute_play_duration(parsed_df, activity_start, activity_end, played_completely, play_duration, media_duration)
    # we remove outliers from this play duration column, saying that if a value if above 1h30,
    # we drop it, and replace it by the duration of the media
    Parser.remove_play_duration_outliers(parsed_df, parsed_df['Play duration in minutes'], media_duration, 90)
    #we can then remove the columns we do not need anymore!
    # errors='ignore' so already-missing columns do not raise
    if drop_columns:
        parsed_df = parsed_df.drop(columns_to_drop, axis=1, errors='ignore')
    return parsed_df