def clean_df(df):
    """
    DESCR: cleans Dataframe in preparation for machine learning algorithms
    INPUT: messy Dataframe; must contain the columns '_id', 'visible',
           'is_private', 'is_following', 'logo', 'meta', 'URL', 'name' and
           'post_list' (each 'post_list' entry is a sequence of posts indexed
           by 'modified'). NOTE: the frame passed in is modified in place
           (columns dropped and feature columns added).
    OUTPUT: clean DF; a copy holding only the rows where 'single_author'
            is True
    """
    # Get rid of stuff that just has no chance of being useful
    worthless = ['_id', 'visible', 'is_private', 'is_following', 'logo', 'meta']
    df.drop(worthless, axis=1, inplace=True)

    # New Features Section
    # Feature for length of domain portion: text before the first '.' minus 8
    # characters (presumably the 'https://' scheme prefix -- TODO confirm)
    df['url_length'] = df['URL'].apply(lambda x: len(x.split('.')[0]) - 8)

    # Does the URL mention wordpress, or is it a plain custom domain?
    # The original tested for the empty string, which is True for every URL.
    df['wordpress_in_url'] = df['URL'].apply(lambda x: 'wordpress' in x)

    # chars in blog name
    df['len_blog_name'] = df['name'].apply(lambda x: len(x))

    # words in blog name
    df['words_in_blog_name'] = df['name'].apply(lambda x: len(x.split()))
    df.drop('name', axis=1, inplace=True)

    # Check if single author
    df['single_author'] = df['post_list'].apply(lambda x: num_authors(x) == 1)

    # Stuff from post list
    df['num_posts'] = df['post_list'].apply(lambda x: len(x))
    # Parse every post date once per row and reuse it for both min and max
    post_dates = df['post_list'].apply(
        lambda posts: [date_convert_w_error(post['modified']) for post in posts])
    df['start_date'] = post_dates.apply(lambda dates: min(dates))
    df['most_recent_date'] = post_dates.apply(lambda dates: max(dates))
    df['blog_life_in_days'] = (df['most_recent_date'] - df['start_date']).apply(lambda x: x.days)

    # Create likes hist, comments hist, avg and std in post gaps
    # NOTE: 'likts_history' is misspelled, but the column name is part of the
    # output schema, so it is left unchanged for downstream consumers.
    df['likts_history'] = df['post_list'].apply(lambda x: posts_to_likes(x))
    df['comment_history'] = df['post_list'].apply(lambda x: post_to_comments(x))
    # Compute the (avg, std) pair once per row, then split it into two columns
    gap_stats = df['post_list'].apply(lambda x: avg_and_stddev_days_between_posts(x))
    df['average_gap'] = gap_stats.apply(lambda stats: stats[0])
    df['std_dev_gap'] = gap_stats.apply(lambda stats: stats[1])

    # Drop non blogs ie more than one author; copy so later assignments on the
    # result do not hit pandas' chained-assignment warning
    df = df[df['single_author']].copy()

    return df
def avg_and_stddev_days_between_posts(post_list):
    """
    DESCR: calculates average time between posts and standard deviation
    INPUT: list of posts; each post is indexed by 'modified' and that value
           is handed to date_convert_w_error
    OUTPUT: tuple (avg, std) of the whole-day gaps between chronologically
            consecutive posts; (nan, nan) when there are fewer than two posts
    """
    # Sort first so each gap is measured between chronologically adjacent
    # posts; a newest-first or unordered list would otherwise produce
    # negative or meaningless gaps.
    dates = sorted(date_convert_w_error(post['modified']) for post in post_list)

    # With fewer than two posts there is no gap to measure. Return nan
    # explicitly rather than letting numpy warn about an empty mean.
    if len(dates) < 2:
        return (np.nan, np.nan)

    day_gaps = [(later - earlier).days for earlier, later in zip(dates, dates[1:])]

    return (np.mean(day_gaps), np.std(day_gaps))