def strip(self, number_of_days=None, min_log_days=None):

    if self.__stripped__:
        log('Already stripped this Appevents object!', lvl=1)
        return self

    log('Stripping object to longest uninterrupted sequence.')

    # Get longest uninterrupted sequence
    tqdm.pandas(desc="Finding longest uninterrupted sequence.", position=0, leave=True)
    self.__data__ = self.__data__.groupby('id').progress_apply(
        lambda df: longest_uninterrupted(df=df)).reset_index(drop=True)

    # Cut off head and tail
    tqdm.pandas(desc="Cutting off head and tail.", position=0, leave=True)
    self.__data__ = self.__data__.groupby('id').progress_apply(
        lambda df: remove_first_and_last(df=df)).reset_index(drop=True)

    # If a number of days is set
    if number_of_days:
        self.select_n_first_days(n=number_of_days, inplace=True)

    # If a minimum number of log days is set
    if min_log_days:
        self.impose_min_days(n=min_log_days, inplace=True)

    # Remember that we did this
    self.__stripped__ = True

    return self
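# Usage sketch (illustrative, not part of the class): assumes an existing Appevents
# instance `ae`; the argument values are hypothetical.
#
#   ae.strip(number_of_days=14, min_log_days=7)  # keep at most 14 days, require 7 logged days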
def get_applications(self, by: str = 'events') -> dict:
    """
    Returns applications and their frequency
    """

    if by == 'events':
        return self.__data__.application.value_counts()

    elif by == 'duration':
        return self.__data__.groupby('application').duration.sum().sort_values(ascending=False)

    else:
        log("Cannot get applications according to that metric. Choose 'events' or 'duration'.", lvl=1)
        return {}
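# Usage sketch (illustrative): top 10 applications by total usage duration, assuming an
# existing Appevents instance `ae`.
#
#   top_apps = ae.get_applications(by='duration')[:10]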
def add_category(df: pd.DataFrame, scrape=False, overwrite=False) -> pd.DataFrame:
    """
    Take a data frame and annotate rows with category field, based on application name.

    :param df: data frame (appevents or notifications)
    :param scrape: scrape Play Store for new info (set to True if no meta data is found)
    :param overwrite: overwrite cached meta data with freshly scraped info
    :return: Annotated data frame
    """

    # Load app meta data
    try:
        meta = dict(np.load(join(hlp.CACHE_DIR, 'app_meta.npy'), allow_pickle=True).item())
    except Exception as e:
        log('No app meta data found. Scraping Play store.', lvl=1)
        scrape = True
        meta = {}

    # Check if data frame has an application field
    if 'application' not in df:
        raise Exception('Cannot find <application> column in data frame!')

    # Scrape the Play store if requested
    if scrape:
        applications = list(df.application.unique())
        meta, _ = scrape_play_store(app_names=applications, cache=meta, overwrite=overwrite)

    # Add category field to row
    def adding_category_row(app: str):
        if app in meta.keys() and meta[app]['genre']:
            try:
                return meta[app]['genre'].lower()
            except Exception:
                print(f"Exception for {app}")
                return meta[app]['genre']
        else:
            return 'unknown'

    tqdm.pandas(desc="Adding category", position=0, leave=True)
    df['category'] = df.application.progress_apply(adding_category_row)

    return df
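# Usage sketch (illustrative): annotate a loaded appevents data frame with categories,
# scraping the Play Store for any apps missing from the cache. `appevents_df` is a
# hypothetical name for a data frame with an <application> column.
#
#   appevents_df = add_category(appevents_df, scrape=True, overwrite=False)
#   print(appevents_df.category.value_counts())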
def load(cls, path: str, file_type='infer', sep=',', decimal='.'):
    """
    Construct Notifications object from path

    :param path: path to the file
    :param file_type: file extension (csv, parquet, or pickle)
    :param sep: separator for csv files
    :param decimal: decimal for csv files
    :return: Notifications object
    """

    # Load data frame, depending on file type
    if file_type == 'infer':

        # Get extension
        file_type = path.split('.')[-1]

        # Only allow the following extensions
        if file_type not in ['csv', 'pickle', 'pkl', 'parquet']:
            raise Exception("ERROR: Could not infer file type!")

        log("Recognized file type as <{type}>.".format(type=file_type), lvl=3)

    # CSV
    if file_type == 'csv':
        data = pd.read_csv(filepath_or_buffer=path,
                           # usecols=,
                           sep=sep,
                           decimal=decimal,
                           error_bad_lines=False)

    # Pickle
    elif file_type == 'pickle' or file_type == 'pkl':
        data = pd.read_pickle(path=path)

    # Parquet
    elif file_type == 'parquet':
        data = pd.read_parquet(path=path, engine='auto')

    # Unknown
    else:
        raise Exception("ERROR: You want me to read what now? Invalid file type!")

    return cls(data=data)
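# Usage sketch (illustrative; the file path and separator are hypothetical):
#
#   notifications = Notifications.load('data/notifications.csv', sep=';', decimal=',')
#   parquet_notifications = Notifications.load('data/notifications.parquet')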
def get_categories(self, by: str = 'events') -> dict:
    """
    Returns categories and their frequency
    """

    # Add categories if not present
    if 'category' not in self.__data__.columns:
        log('Data not annotated with categories yet. Fixing...', lvl=1)
        self.add_category()

    if by == 'events':
        return self.__data__.category.value_counts()

    elif by == 'duration':
        return self.__data__.groupby('category').duration.sum().sort_values(ascending=False)

    else:
        log("Cannot get categories according to that metric. Choose 'events' or 'duration'.", lvl=1)
        return {}
def __init__(self, data: pd.DataFrame = None, add_categories=False, add_date_annotation=False,
             get_session_sequences=False, strip=False):

    # Drop 'Unnamed' columns
    for col in data.columns:
        if col.startswith('Unnamed'):
            data.drop(labels=[col], axis=1, inplace=True)

    # Set dtypes #
    ##############

    # Set datetimes
    try:
        data.startTime = data.startTime.astype('datetime64[ns]')
    except Exception as e:
        log('Could not convert startTime column to datetime format: ', e)

    try:
        data.endTime = data.endTime.astype('datetime64[ns]')
    except Exception as e:
        log('Could not convert endTime column to datetime format: ', e)

    # Downcast battery column
    try:
        data.battery = data.battery.astype('uint8')
    except Exception as e:
        log('Could not convert battery column to uint8 format: ', e)

    # Factorize ids
    # data.id = data.id.astype('category')

    # Factorize apps
    # data.application = data.application.astype('category')

    # Factorize sessions
    # data.session = data.session.astype('category')

    # Sort data frame
    data.sort_values(by=['id', 'startTime'], inplace=True)

    # Set data attribute
    self.__data__ = data

    # Keep track of stripping
    self.__stripped__ = False

    # Add date columns
    self.__data__ = hlp.add_dates(df=self.__data__, index='appevents')
    data.startDate = data.startDate.astype('datetime64[D]')
    data.endDate = data.endDate.astype('datetime64[D]')

    # Add duration columns
    self.__data__ = hlp.add_duration(df=self.__data__)

    # Add categories on request
    if add_categories:
        self.add_category()

    # Add date annotations on request
    if add_date_annotation:
        self.add_date_type()

    # Strip on request
    if strip:
        self.strip(number_of_days=14)

    # Initialize attributes
    self.__session_sequences__ = self.get_session_sequences() if get_session_sequences else None
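# Construction sketch (illustrative): build an Appevents object from a raw appevents
# data frame and let the constructor annotate and strip it. `raw_df` is a hypothetical
# name for a data frame loaded elsewhere (e.g., via the load classmethod above).
#
#   ae = Appevents(data=raw_df, add_categories=True, add_date_annotation=True, strip=True)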
sns.set_palette('Accent')
sns.set_style('white')
sns.distplot(age)
plt.show()

# age = age[age > 35]
# selection_ids = list(age.index)
# ae.filter(users=selection_ids, inplace=True)

# Annotate (already scraped so set to False)
ae.add_category(scrape=False)
ae.add_time_of_day()
ae.add_date_type()

# BUILD FEATURES
log('Getting agnostic features.')

feature_list = []
apps = ae.get_applications()[:30].index.tolist()
categories = ae.get_categories()[:30].index.to_list()
times_of_day = ['late_night', 'early_morning', 'morning', 'noon', 'eve', 'night']
from_push = [False, True, None]

'''find_apps = lambda term: [app for app in apps if app.__contains__(term)]
apps = list(ae.get_applications().index)
facebook = find_apps('facebook')
whatsapp = find_apps('whatsapp')'''

# Build features
categories.append(None)
def scrape_play_store(app_names: list, cache: dict, overwrite=False) -> (dict, list):
    """
    Scrape app meta data from Google play store.

    :param app_names: list of official app names (e.g., com.facebook.katana)
    :param cache: dict with previously scraped app meta data
    :param overwrite: re-scrape and replace meta data for apps already in the cache
    :return: dict with meta data for apps that got a hit, list with remaining apps
    """

    '''try:
        cache = np.load(file=join(hlp.CACHE_DIR, 'app_meta_custom.npy'), allow_pickle=True).item()
    except:
        log('No cache was found for app meta data.', lvl=3)'''

    # Play store URL prefix
    play_store_url = 'https://play.google.com/store/apps/details?id='

    # Initialize dict of knowns and list of unknowns
    known_apps = {}
    unknown_apps = []
    cached_apps = 0

    # Loop over app names
    t_app_names = app_names if hlp.LOG_LEVEL > 1 else tqdm(app_names, position=0, leave=True)
    if hlp.LOG_LEVEL == 1:
        t_app_names.set_description('Scraping')

    for app_name in t_app_names:

        # Check with local cache, which must be a dict
        if isinstance(cache, dict):

            # Is the app name in the cache's keys? Is the genre attached to it a NaN?
            if app_name in cache.keys() and not pd.isna(cache[app_name]['genre']):

                log(f"Info for {app_name} is in cache.", lvl=3)
                cached_apps += 1

                # If we don't want to overwrite, skip this one
                if not overwrite:
                    continue

        # Combine into full URL for this app
        url = f'{play_store_url}{app_name}'

        # Get HTML from URL
        response = get(url)

        # Create BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get attributes
        try:
            # Store all meta data for this app here
            meta = {'source': 'play_store'}

            # Find the name
            name = soup.find('h1', {'class': 'AHFaub'})

            # If we can't even find that, get out of here
            if not name:
                raise Exception(f'Could not find anything on {app_name}.')
            else:
                meta['name'] = name.text

            # Find info
            info = soup.find_all(attrs={'class': 'R8zArc'})

            # ... extract text where possible
            info_text = [info_bit.text for info_bit in info]

            # ... and fill in the blanks
            while len(info_text) < 3:
                info_text.append(None)

            meta['company'] = info_text[0]
            meta['genre'] = info_text[1]
            meta['genre2'] = info_text[2]

            # Find purchase info
            purchases = soup.find('div', {'class': 'bSIuKf'})
            if purchases:
                meta['purchases'] = purchases.text

            # Find rating info
            rating = soup.find('div', {'class': 'BHMmbe'})
            if rating:
                meta['rating'] = rating.text

            # Add it to the big dict (lol)
            log(f'Got it! <{app_name}> meta data was scraped.', lvl=3)
            known_apps[app_name] = meta

        except Exception as e:
            log(f'Problem for <{app_name}> - {e}', lvl=3)
            unknown_apps.append(app_name)

        zzz = rnd.uniform(1, 3)
        # print(f'Sleeping for {round(zzz, 2)} seconds.')
        # print()
        # time.sleep(zzz)

    log(f"Obtained info for {len(known_apps)} apps.", lvl=2)
    log(f"Failed to get info on {len(unknown_apps)} apps.", lvl=2)
    log(f"{cached_apps} apps were already cached.", lvl=2)

    # Merge new info with cache: freshly scraped info takes precedence over old entries,
    # and cached apps that were not re-scraped are retained
    if isinstance(cache, dict):
        # known_apps = cache | known_apps  # Python3.9
        known_apps = {**cache, **known_apps}

    # Store app meta data cache
    np.save(file=join(pardir, pardir, 'cache', 'app_meta.npy'), arr=known_apps)

    return known_apps, unknown_apps
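# Usage sketch (illustrative; the app names are examples and `cache` may simply be an
# empty dict on a first run):
#
#   meta, misses = scrape_play_store(app_names=['com.facebook.katana', 'com.whatsapp'],
#                                    cache={}, overwrite=False)
#   print(meta.get('com.whatsapp', {}).get('genre'))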
    df[new_col] = hours.progress_apply(label_hour)

    return df


if __name__ == '__main__':

    # Let's go
    hlp.hi()
    hlp.set_dir(join(pardir, 'cache'))
    hlp.set_param(log_level=1,
                  data_dir=join(pardir, pardir, 'data', 'glance', 'processed_appevents'),
                  cache_dir=join(pardir, 'cache'))

    # Load the data and gather apps
    log('Collecting app names.', lvl=1)
    appevents_files = listdir(hlp.DATA_DIR)
    apps = {}

    # Load data
    data = hlp.load(path=join(hlp.DATA_DIR, appevents_files[0]), index='appevents')

    # Add apps to the set (no duplicates)
    app_counts = Counter(list(data.application))
    apps = {**apps, **app_counts}

    data = add_date_annotation(data, ['startDate', 'endDate'])

    # Sort apps by number of times they occurred in data
    '''apps = {k: v for k, v in sorted(apps.items(), key=lambda item: item[1], reverse=True)}