def test_convert_to_local_time(self):
    '''
        Check Utility.convert_to_local_time against offsets of mixed magnitude.

        The expected values treat 3600, 3600000, 60 and 1 as a one-hour forward
        shift, and -7200, -7200000, -120 and -2 as a two-hour backward shift
        (presumably seconds, milliseconds, minutes and hours respectively).
    '''
    # Nine consecutive hourly timestamps, starting at 2020-01-01 00:00:00
    timestamps = pd.date_range('2020-01-01', periods=9, freq='H')
    # One offset per timestamp
    offsets = pd.Series(
        [3600, -7200, 0, 3600000, -7200000, 60, -120, 1, -2])

    expected = [
        '2020-01-01 01:00:00',
        '2019-12-31 23:00:00',
        '2020-01-01 02:00:00',
        '2020-01-01 04:00:00',
        '2020-01-01 02:00:00',
        '2020-01-01 06:00:00',
        '2020-01-01 04:00:00',
        '2020-01-01 08:00:00',
        '2020-01-01 06:00:00',
    ]

    converted = Utility.convert_to_local_time(timestamps, offsets).tolist()
    # Compare on the string form of each converted value
    self.assertEqual(list(map(str, converted)), expected)
Example #2
0
    def parse_play_activity_df(play_activity_df, convert_to_local_time=True, drop_columns=True):
        '''
            Parse the play activity dataframe and return a new, enriched dataframe.
            The input dataframe is copied first and is not modified.

            The parsing is performed in multiple steps:
            1. Rename the columns containing song title and artist
               ('Content Name' -> 'Title', 'Artist Name' -> 'Artist')
            2. Time columns: build an 'Activity date time' column from 'Event Start Timestamp',
               falling back to 'Event End Timestamp' where the start is missing
            3. Time columns: add time columns prefixed with 'Play ' from that timestamp column,
               with or without conversion to local time (args)
            4. Remove outlier rows (Apple Music service started in 2015, so we drop rows with a
               'Play Year' before 2015)
            5. Add the partial vs complete listening information (Parser.set_partial_listening)
            6. Add a 'Track origin' column derived from 'Feature Name' (Parser.get_track_origin)
            7. Add the listening duration (Parser.compute_play_duration)
            8. Handle outliers of listening duration, using a fixed threshold of 90 minutes
               (Parser.remove_play_duration_outliers)
            9. Drop unused columns (args)

            Args:
                play_activity_df: dataframe of play activity; the columns read here are
                    'Content Name', 'Artist Name', 'Event Start Timestamp', 'Event End Timestamp',
                    'UTC Offset In Seconds', 'Play Duration Milliseconds',
                    'Media Duration In Milliseconds', 'End Reason Type' and 'Feature Name'.
                convert_to_local_time: when truthy, 'Activity date time' is passed through
                    Utility.convert_to_local_time together with 'UTC Offset In Seconds'.
                drop_columns: when truthy, the raw columns listed in columns_to_drop are removed
                    from the result (columns that are absent are ignored).

            Returns:
                The parsed dataframe.
        '''

        columns_to_drop = [
            'Apple Id Number', 'Apple Music Subscription', 'Build Version', 'Client IP Address',
            'Content Specific Type', 'Device Identifier', 'Event Reason Hint Type', 'Activity date time',
            'End Position In Milliseconds', 'Event Received Timestamp', 'Media Type', 'Metrics Bucket Id',
            'Metrics Client Id', 'Original Title', 'Source Type', 'Start Position In Milliseconds',
            'Store Country Name', 'Milliseconds Since Play', 'Event End Timestamp', 'Event Start Timestamp',
            'UTC Offset In Seconds', 'Play Duration Milliseconds', 'Media Duration In Milliseconds', 'Feature Name'
        ]

        # Work on a copy so the caller's dataframe is left untouched, and rename columns for merges later
        parsed_df = play_activity_df.copy()
        parsed_df.rename(columns={'Content Name': 'Title', 'Artist Name': 'Artist'}, inplace=True)

        # Add time related columns
        parsed_df['Activity date time'] = pd.to_datetime(parsed_df['Event Start Timestamp'])
        # Assign the filled column back instead of calling fillna(inplace=True) on the column
        # selection: the chained in-place form does not update parsed_df under pandas Copy-on-Write
        parsed_df['Activity date time'] = parsed_df['Activity date time'].fillna(
            pd.to_datetime(parsed_df['Event End Timestamp']))
        if convert_to_local_time:
            parsed_df['Activity date time'] = Utility.convert_to_local_time(
                parsed_df['Activity date time'], parsed_df['UTC Offset In Seconds'])
        parsed_datetime_series = Utility.parse_date_time_column(parsed_df, 'Activity date time')
        Utility.add_time_related_columns(parsed_df, parsed_datetime_series, col_name_prefix='Play ')

        # We remove year outliers (Apple Music started in 2015, whatever is reported before is a mistake)
        parsed_df = parsed_df.drop(parsed_df[parsed_df['Play Year'] < 2015].index)

        # Add partial listening column
        # (the duration series are taken after the row removal above, so they match the remaining rows)
        play_duration = parsed_df['Play Duration Milliseconds']
        media_duration = parsed_df['Media Duration In Milliseconds']
        Parser.set_partial_listening(parsed_df, parsed_df['End Reason Type'], play_duration, media_duration)

        # Add track origin column
        parsed_df['Track origin'] = parsed_df['Feature Name'].apply(Parser.get_track_origin)

        # Add play duration column
        # 'Played completely' is read here, so it is presumably added by Parser.set_partial_listening above
        activity_start = pd.to_datetime(parsed_df['Event Start Timestamp'])
        activity_end = pd.to_datetime(parsed_df['Event End Timestamp'])
        played_completely = parsed_df['Played completely']
        Parser.compute_play_duration(parsed_df, activity_start, activity_end, played_completely, play_duration, media_duration)

        # We remove outliers from this play duration column: if a value is above 1h30 (90 minutes),
        # we drop it, and replace it by the duration of the media
        Parser.remove_play_duration_outliers(parsed_df, parsed_df['Play duration in minutes'], media_duration, 90)

        # We can then remove the columns we do not need anymore
        if drop_columns:
            parsed_df = parsed_df.drop(columns_to_drop, axis=1, errors='ignore')

        return parsed_df