def clean_df(df):
    """
    DESCR: cleans Dataframe in preparation for machine learning algorithms
    INPUT: messy Dataframe; must contain the columns listed in `worthless`
           below plus 'URL', 'name' and 'post_list' (each 'post_list' entry
           is a sized collection of posts indexable by 'modified')
    OUTPUT: clean DF restricted to single-author rows
    NOTE: the Dataframe passed in is modified in place (columns are dropped
          and added); only the final single-author filter builds a new one.
    """
    # Get rid of stuff that just has no chance of being useful
    worthless = ['_id', 'visible', 'is_private', 'is_following', 'logo', 'meta']
    df.drop(worthless, axis=1, inplace=True)

    # New Features Section
    # Feature for length of domain portion: text before the first '.', minus 8.
    # NOTE(review): the 8 presumably strips a leading 'https://' -- confirm
    # that every URL uses that scheme, otherwise this is off by one or more.
    df['url_length'] = df['URL'].apply(lambda url: len(url.split('.')[0]) - 8)
    # Is it something.wordpress.com or just something.com
    df['wordpress_in_url'] = df['URL'].apply(lambda url: 'wordpress.com' in url)
    # Chars in blog name
    df['len_blog_name'] = df['name'].apply(len)
    # Words in blog name
    df['words_in_blog_name'] = df['name'].apply(lambda name: len(name.split()))
    df.drop('name', axis=1, inplace=True)

    posts = df['post_list']

    # Check if single author
    df['single_author'] = posts.apply(
        lambda post_list: num_authors(post_list) == 1)

    # Stuff from post list
    df['num_posts'] = posts.apply(len)
    # Convert every post's 'modified' stamp once and reuse the result for both
    # the earliest and the latest date instead of converting the list twice.
    post_dates = posts.apply(
        lambda post_list: [date_convert_w_error(post['modified'])
                           for post in post_list])
    df['start_date'] = post_dates.apply(lambda dates: min(dates))
    df['most_recent_date'] = post_dates.apply(lambda dates: max(dates))
    df['blog_life_in_days'] = (
        df['most_recent_date'] - df['start_date']).apply(lambda span: span.days)

    # Create likes hist, comments hist, avg and std in post gaps.
    # The 'likts_history' spelling is kept as-is: it is a column name that
    # other code may already read.
    df['likts_history'] = posts.apply(lambda post_list: posts_to_likes(post_list))
    df['comment_history'] = posts.apply(
        lambda post_list: post_to_comments(post_list))
    # Compute the (avg, std) pair once per row, then split it into two columns,
    # rather than running the whole gap calculation once per column.
    gap_stats = posts.apply(avg_and_stddev_days_between_posts)
    df['average_gap'] = gap_stats.apply(lambda stats: stats[0])
    df['std_dev_gap'] = gap_stats.apply(lambda stats: stats[1])

    # Drop non blogs ie more than one author
    return df[df['single_author']]
def avg_and_stddev_days_between_posts(post_list):
    """
    DESCR: calculates average time between posts and standard deviation
    INPUT: list of posts, each indexable by 'modified'
    OUTPUT: tuple (avg, std) of the whole-day gaps between consecutive posts;
            (nan, nan) when there are fewer than two posts, since no gap
            exists to measure
    """
    dates = [date_convert_w_error(post['modified']) for post in post_list]
    # Gaps are taken over the reversed list.
    # NOTE(review): presumably post_list arrives newest-first so that the
    # reversed order is chronological -- confirm against the caller; with any
    # other ordering individual gaps can come out negative.
    dates.reverse()

    # `.days` keeps only the whole-day component of each difference.
    day_gaps = [(later - earlier).days
                for earlier, later in zip(dates, dates[1:])]

    if not day_gaps:
        # np.mean / np.std of an empty list also yield nan, but emit
        # RuntimeWarnings on the way; return the same values quietly.
        return (np.nan, np.nan)

    return (np.mean(day_gaps), np.std(day_gaps))