Example #1
    def strip(self, number_of_days=None, min_log_days=None):

        if self.__stripped__:
            log('Already stripped this Appevents object!', lvl=1)
            return self

        log('Stripping object to longest uninterrupted sequence.')
        # Get longest uninterrupted sequence
        tqdm.pandas(desc="Finding longest uninterrupted sequence.", position=0, leave=True)
        self.__data__ = self.__data__.groupby('id').progress_apply(lambda df: longest_uninterrupted(df=df)).reset_index(
            drop=True)

        # Cut off head and tail
        tqdm.pandas(desc="Cutting off head and tail.", position=0, leave=True)
        self.__data__ = self.__data__.groupby('id').progress_apply(lambda df: remove_first_and_last(df=df)).reset_index(
            drop=True)

        # If a number of days is set
        if number_of_days:
            self.select_n_first_days(n=number_of_days, inplace=True)

        # If a minimum number of log days is set
        if min_log_days:
            self.impose_min_days(n=min_log_days, inplace=True)

        # Remember that we did this
        self.__stripped__ = True

        return self
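
A minimal usage sketch for the method above, assuming ae is an already constructed Appevents-style object (construction of such an object is shown in Example #6); the keyword arguments are the ones defined in the signature:

ae = ae.strip(number_of_days=14,  # presumably keeps only the first 14 logged days per user
              min_log_days=7)     # presumably drops users with fewer than 7 logged days
# A second call is a no-op: the __stripped__ flag guards against stripping twice.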
Example #2
    def get_applications(self, by: str = 'events') -> dict:
        """
        Return applications and their frequency.

        :param by: 'events' (count appevents per application) or 'duration' (sum durations per application)
        :return: pandas Series sorted in descending order, or an empty dict for an unknown metric
        """
        if by == 'events':
            return self.__data__.application.value_counts()
        elif by == 'duration':
            return self.__data__.groupby('application').duration.sum().sort_values(ascending=False)
        else:
            log("Cannot get applications according to that metric. Choose 'events' or 'duration'.", lvl=1)
            return {}
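
A short usage sketch for get_applications, assuming ae is an instance of the class this method belongs to:

by_events = ae.get_applications(by='events')       # appevent counts per application
by_duration = ae.get_applications(by='duration')   # total usage duration per application
print(by_events.head(10))                          # ten most frequently used applications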
Example #3
def add_category(df: pd.DataFrame,
                 scrape=False,
                 overwrite=False) -> pd.DataFrame:
    """
    Take a data frame and annotate rows with category field, based on application name.

    :param df:data frame (appevents or notifications)
    :param scrape: scrape Play Store for new info (set to True if no meta data is found)
    :return: Annotated data frame
    """

    # Load app meta data
    try:
        meta = dict(
            np.load(join(hlp.CACHE_DIR, 'app_meta.npy'),
                    allow_pickle=True).item())
    except Exception as e:
        log('No app meta data found. Scraping Play store.', lvl=1)
        scrape = True
        meta = {}

    # Check if data frame has an application field
    if 'application' not in df:
        raise Exception('Cannot find <application> column in data frame!')

    # Scrape the Play Store if requested
    if scrape:
        applications = list(df.application.unique())

        meta, _ = scrape_play_store(app_names=applications,
                                    cache=meta,
                                    overwrite=overwrite)

    # Add category field to row
    def adding_category_row(app: str):

        if app in meta and meta[app].get('genre'):
            try:
                return meta[app]['genre'].lower()
            except Exception:
                print(f"Exception for {app}")
                return meta[app]['genre']
        else:
            return 'unknown'

    tqdm.pandas(desc="Adding category", position=0, leave=True)
    df['category'] = df.application.progress_apply(adding_category_row)

    return df
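
A hedged usage sketch for add_category; the data frame variable and calls below are illustrative. It assumes df already contains an 'application' column, as required by the check above:

df = add_category(df, scrape=False)                    # use the cached app_meta.npy only
# df = add_category(df, scrape=True, overwrite=True)   # re-scrape the Play Store and refresh the cache
print(df.category.value_counts().head())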
Example #4
    def load(cls, path: str, file_type='infer', sep=',', decimal='.'):
        """
        Construct Notifications object from path

        :param path: path to the file
        :param file_type: file extension ('csv', 'parquet', 'pickle'/'pkl'), or 'infer' to deduce it from the path
        :param sep: separator for csv files
        :param decimal: decimal for csv files
        :return: Notifications object
        """

        # Load data frame, depending on file type
        if file_type == 'infer':

            # Get extension
            file_type = path.split('.')[-1]

            # Only allow the following extensions
            if file_type not in ['csv', 'pickle', 'pkl', 'parquet']:
                raise Exception("ERROR: Could not infer file type!")

            log("Recognized file type as <{type}>.".format(type=file_type),
                lvl=3)

        # CSV
        if file_type == 'csv':
            data = pd.read_csv(
                filepath_or_buffer=path,
                # usecols=,
                sep=sep,
                decimal=decimal,
                error_bad_lines=False)  # skip malformed lines (use on_bad_lines='skip' in pandas >= 1.3)

        # Pickle
        elif file_type == 'pickle' or file_type == 'pkl':
            data = pd.read_pickle(path=path)

        # Parquet
        elif file_type == 'parquet':
            data = pd.read_parquet(path=path, engine='auto')

        # Unknown
        else:
            raise Exception(
                "ERROR: You want me to read what now? Invalid file type! ")

        return cls(data=data)
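
A minimal call sketch for the class method above; the file paths are hypothetical. With the default file_type='infer', the format is deduced from the extension as shown in the code:

notifications = Notifications.load('data/notifications.csv')                             # inferred as csv
# notifications = Notifications.load('data/notifications.parquet', file_type='parquet')  # explicit type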
Example #5
    def get_categories(self, by: str = 'events') -> dict:
        """
        Returns categories and their frequency
        """

        # Add categories if not present
        if 'category' not in self.__data__.columns:
            log('Data not annotated with categories yet. Fixing...', lvl=1)
            self.add_category()

        if by == 'events':
            return self.__data__.category.value_counts()
        elif by == 'duration':
            return self.__data__.groupby('category').duration.sum().sort_values(ascending=False)
        else:
            log("Cannot get categories according to that metric. Choose 'events' or 'duration'.", lvl=1)
            return {}
Example #6
    def __init__(self, data: pd.DataFrame = None, add_categories=False, add_date_annotation=False,
                 get_session_sequences=False, strip=False):

        # Drop 'Unnamed' columns
        for col in data.columns:

            if col.startswith('Unnamed'):
                data.drop(labels=[col], axis=1, inplace=True)

        # Set dtypes #
        ##############

        # Set datetimes
        try:
            data.startTime = data.startTime.astype('datetime64[ns]')
        except Exception as e:
            log(f'Could not convert startTime column to datetime format: {e}')
        try:
            data.endTime = data.endTime.astype('datetime64[ns]')
        except Exception as e:
            log(f'Could not convert endTime column to datetime format: {e}')

        # Downcast battery column
        try:
            data.battery = data.battery.astype('uint8')
        except Exception as e:
            log(f'Could not convert battery column to uint8 format: {e}')

        # Factorize ids
        # data.id = data.id.astype('category')

        # Factorize apps
        # data.application = data.application.astype('category')

        # Factorize sessions
        # data.session = data.session.astype('category')

        # Sort data frame
        data.sort_values(by=['id', 'startTime'], inplace=True)

        # Set data attribute
        self.__data__ = data

        # Keep track of stripping
        self.__stripped__ = False

        # Add date columns
        self.__data__ = hlp.add_dates(df=self.__data__, index='appevents')
        data.startDate = data.startDate.astype('datetime64[D]')
        data.endDate = data.endDate.astype('datetime64[D]')

        # Add duration columns
        self.__data__ = hlp.add_duration(df=self.__data__)

        # Add categories on request
        if add_categories:
            self.add_category()

        # Add date annotations on request
        if add_date_annotation:
            self.add_date_type()

        # Strip on request
        if strip:
            self.strip(number_of_days=14)

        # Initialize attributes
        self.__session_sequences__ = self.get_session_sequences() if get_session_sequences else None
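
A construction sketch for the initializer above, assuming the class is the Appevents class mentioned in Example #1 and that df holds the expected columns (id, application, session, startTime, endTime, battery):

ae = Appevents(df,
               add_categories=True,       # annotate rows with Play Store categories
               add_date_annotation=False,
               strip=True)                # strips to the longest uninterrupted sequence, capped at 14 days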
Example #7
    sns.set_palette('Accent')
    sns.set_style('white')
    sns.distplot(age)
    plt.show()

    #age = age[age>35]
    #selection_ids = list(age.index)
    #ae.filter(users=selection_ids,inplace=True)

    # Annotate (already scraped so set to False)
    ae.add_category(scrape=False)
    ae.add_time_of_day()
    ae.add_date_type()

    # BUILD FEATURES
    log('Getting agnostic features.')
    feature_list = []

    apps = ae.get_applications()[:30].index.tolist()
    categories = ae.get_categories()[:30].index.to_list()
    times_of_day = [
        'late_night', 'early_morning', 'morning', 'noon', 'eve', 'night'
    ]
    from_push = [False, True, None]
    '''find_apps = lambda term: [app for app in apps if app.__contains__(term)]
    apps = list(ae.get_applications().index)
    facebook = find_apps('facebook')
    whatsapp = find_apps('whatsapp')'''

    # Build features
    categories.append(None)
Example #8
def scrape_play_store(app_names: list,
                      cache: dict,
                      overwrite=False) -> (dict, list):
    """
    Scrape app meta data from Google play store.

    :param app_name: the official app name (e.g., com.facebook.katana)
    :return: dict with meta data for apps that got a hit, list with remaining apps
    """
    '''try:
        cache = np.load(file=join(hlp.CACHE_DIR, 'app_meta_custom.npy'), allow_pickle=True).item()
    except:
        log('No cache was found for app meta data.', lvl=3)'''

    # Play store URL prefix
    play_store_url = 'https://play.google.com/store/apps/details?id='

    # Initialize dict of knowns and list of unknowns
    known_apps = {}
    unknown_apps = []
    cached_apps = 0

    # Loop over app names
    t_app_names = app_names if hlp.LOG_LEVEL > 1 else tqdm(
        app_names, position=0, leave=True)
    if hlp.LOG_LEVEL == 1:
        t_app_names.set_description('Scraping')
    for app_name in t_app_names:

        # Check with local cache, which must be a dict
        if isinstance(cache, dict):

            # Is the app name in the cache's keys? Is the genre attached to it a NaN?
            if app_name in cache.keys() and not pd.isna(
                    cache[app_name]['genre']):

                log(f"Info for f{app_name} is in cache.", lvl=3)
                cached_apps += 1

                # If we don't want to overwrite, skip this one
                if not overwrite:
                    continue

        # Combine into a full URL for this app
        url = f'{play_store_url}{app_name}'

        # Get HTML from URL
        response = get(url)

        # Create BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get attributes
        try:

            # Store all meta data for this app here
            meta = {'source': 'play_store'}

            # Find the name
            name = soup.find('h1', {'class': 'AHFaub'})

            # If we can't even find that, get out of here
            if not name:
                raise Exception(f'Could not find anything on {app_name}.')
            else:
                meta['name'] = name.text

            # Find info
            info = soup.find_all(attrs={'class': 'R8zArc'})

            # ... extract text where possible
            info_text = [info_bit.text for info_bit in info]

            # ... and fill in the blanks
            while len(info_text) < 3:
                info_text.append(None)

            meta['company'] = info_text[0]
            meta['genre'] = info_text[1]
            meta['genre2'] = info_text[2]

            # Find purchase info
            purchases = soup.find('div', {'class': 'bSIuKf'})
            if purchases:
                meta['purchases'] = purchases.text

            # Find rating info
            rating = soup.find('div', {'class': 'BHMmbe'})
            if rating:
                meta['rating'] = rating.text

            # Add it to the big dict (lol)
            log(f'Got it! <{app_name}> meta data was scraped.', lvl=3)
            known_apps[app_name] = meta

        except Exception as e:
            log(f'Problem for <{app_name}> - {e}', lvl=3)
            unknown_apps.append(app_name)

        zzz = rnd.uniform(1, 3)
        # print(f'Sleeping for {round(zzz, 2)} seconds.')
        # print()
        # time.sleep(zzz)

    log(f"Obtained info for {len(known_apps)} apps.", lvl=2)
    log(f"Failed to get info on {len(unknown_apps)} apps.", lvl=2)
    log(f"{cached_apps} apps were already cached.", lvl=2)

    # Merge new info with cache
    if isinstance(cache, dict):

        # If we specified overwrite, store scraped info in cache over old info
        if overwrite:
            # known_apps |= cache # Python3.9
            known_apps = {**known_apps, **cache}
        # ... else retain app info
        else:
            # known_apps = cache|known_apps
            known_apps = {**cache, **known_apps}

    # Store app meta data cache
    np.save(file=join(pardir, pardir, 'cache', 'app_meta.npy'), arr=known_apps)

    return known_apps, unknown_apps
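
A hedged usage sketch for scrape_play_store; the package names are only examples, and an empty dict is passed as cache when nothing has been scraped before:

known, unknown = scrape_play_store(app_names=['com.facebook.katana', 'com.whatsapp'],
                                   cache={},         # no cached meta data yet
                                   overwrite=False)
print(known.get('com.whatsapp', {}).get('genre'))    # e.g. 'Communication' if the scrape succeeded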
Example #9
        df[new_col] = hours.progress_apply(label_hour)

    return df


if __name__ == '__main__':
    # Let's go
    hlp.hi()
    hlp.set_dir(join(pardir, 'cache'))
    hlp.set_param(log_level=1,
                  data_dir=join(pardir, pardir, 'data', 'glance',
                                'processed_appevents'),
                  cache_dir=join(pardir, 'cache'))

    # Load the data and gather apps
    log('Collecting app names.', lvl=1)
    appevents_files = listdir(hlp.DATA_DIR)
    apps = {}

    # Load data
    data = hlp.load(path=join(hlp.DATA_DIR, appevents_files[0]),
                    index='appevents')

    # Add apps to the set (no duplicates)
    app_counts = Counter(list(data.application))
    apps = {**apps, **app_counts}

    data = add_date_annotation(data, ['startDate', 'endDate'])

# Sort apps by number of times they occurred in data
'''apps = {k: v for k, v in sorted(apps.items(), key=lambda item: item[1], reverse=True)}