def initialize_analytics_reporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        get_config_field('GA', 'key_file_location'),
        [get_config_field('GA', 'scopes')])

    # Build the Analytics Reporting API v4 service object.
    analytics = build('analyticsreporting', 'v4', credentials=credentials)

    return analytics
def run_etlw_pipeline(date_str, clean_up=False, plotly=False, gsheets=False, metrics=False, postgres=False):

    # ##0&1. DOWNLOAD DATA and BASIC PARSE
    dfs_cleaned = get_collections_cleaned()

    # ##2. ENRICH COLLECTIONS
    today = dfs_cleaned['views']['createdAt'].max().strftime('%Y%m%d')  # treat max date in collections as "today"
    dfs_enriched = enrich_collections(dfs_cleaned, date_str=today)

    # ##3. WRITE OUT ENRICHED COLLECTIONS
    write_collections(dfs_enriched, date_str=today)

    # ##4. METRIC STUFF - PLOTS AND SHEETS
    if metrics:
        run_metric_pipeline(dfs_enriched, online=True, sheets=True, plots=True)

    # ##5. PLOT GRAPHS TO PLOTLY DASHBOARD
    if plotly:
        run_plotline(dfs_enriched, start_date='2019-04-01', size=(700, 350), online=True)

    # ##6. WRITE TABLES TO GSHEETS
    if gsheets:
        create_and_update_all_sheets(dfs_enriched,
                                     spreadsheet_name=get_config_field('GSHEETS', 'spreadsheet_name'))

    # ##7. LOAD DATA FILES TO POSTGRES DB
    if postgres and ENV == 'ec2':
        run_pg_pandas_transfer(dfs_enriched, date_str=date_str)

    # ##8. CLEAN UP OLD FILES TO SAVE SPACE
    if clean_up:
        clean_up_old_files(days_to_keep=2)

    return None
def update_petrov():
    dfs = et.load_from_file(date_str='most_recent', coll_names=['users'])
    dfu = dfs['users']

    have_codes = pd.read_csv('/Users/rbloom/Downloads/petrov_day_list.csv')
    codes = pd.read_csv('/Users/rbloom/Downloads/codez_secret_no_use.csv', usecols=['codes'])
    have_codes.head()

    projection = ['_id', 'username', 'displayName', 'karma', 'petrovPressedButtonDate',
                  'petrovCodesEntered', 'petrovCodesEnteredDate']
    users_raw = et.get_collection('users',
                                  query_filter={'petrovPressedButtonDate': {'$exists': True}},
                                  projection=projection,
                                  db=et.get_mongo_db_object())

    users = users_raw.merge(have_codes[['Username']], left_on='username', right_on='Username',
                            indicator=True, how='left')
    users = users.merge(dfu[['_id', 'num_days_present_last_30_days', 'createdAt', 'true_earliest']].fillna(0),
                        left_on='_id', right_on='_id', how='left')
    users['has_codes'] = (users['_merge'] == 'both')
    users['valid_code'] = users['petrovCodesEntered'].apply(lambda x: x in codes.values)
    users.loc[:, ['petrovPressedButtonDate', 'petrovCodesEnteredDate']] = (
        users.loc[:, ['petrovPressedButtonDate', 'petrovCodesEnteredDate']] - pd.Timedelta('7 hours'))
    users['karma'] = users['karma'].fillna(0)
    users = users.sort_values('petrovPressedButtonDate', ascending=False)
    users = users[['displayName', 'karma', 'petrovPressedButtonDate', 'petrovCodesEntered',
                   'petrovCodesEnteredDate', 'has_codes', 'valid_code',
                   'num_days_present_last_30_days', 'true_earliest']]

    users_pressed = users.dropna(subset=['petrovPressedButtonDate'])
    users_pressed = users_pressed.sort_values('petrovPressedButtonDate', ascending=True)
    users_pressed = users_pressed.reset_index(drop=True).reset_index()
    users_pressed['index'] = users_pressed['index'] + 1
    users_pressed = users_pressed.sort_values('petrovPressedButtonDate', ascending=False)
    print('num users pressed button: {}'.format(users_pressed.shape[0]))

    users_pressed_and_entered = users.dropna(subset=['petrovPressedButtonDate', 'petrovCodesEntered'])
    print('num users pressed button and entered codes: {}'.format(users_pressed_and_entered.shape[0]))

    users_pressed_and_entered_has_codes = users_pressed_and_entered[users_pressed_and_entered['has_codes']]
    print('num users pressed button and entered codes who have true codes: {}'.format(
        users_pressed_and_entered_has_codes.shape[0]))

    # plot_table(users_pressed, title='Users Who Pressed Button', online=True)
    # plot_table(users_pressed_and_entered, title='Users Who Pressed Button and Entered Codes', online=True)
    # plot_table(users_pressed_and_entered_has_codes, title='Users Who Pressed Button and Entered Some Codes Who Have True Codes', online=True)

    users_pressed['birth'] = pd.Timestamp.now()

    spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
    s = Spread(get_config_field('GSHEETS', 'user'), spreadsheet_name, sheet='Trust & Doom',
               create_spread=True, create_sheet=True)
    s.df_to_sheet(users_pressed, replace=True, sheet='Trust & Doom', index=False)
def run_metric_pipeline(dfs, online=False, sheets=False, plots=False):
    dfp = dfs['posts']
    dfc = dfs['comments']

    allVotes, baseScoresD4, docScores = compute_karma_metric(dfs)

    if plots:
        plot_karma_metric(allVotes, online=online)

    if sheets:
        spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
        spreadsheet_user = get_config_field('GSHEETS', 'user')
        s = Spread(spreadsheet_user, spreadsheet_name, sheet='Users',
                   create_spread=True, create_sheet=True)

        pr_dict = {'D': 'Daily', 'W': 'Weekly'}
        for pr in ['D', 'W']:
            votes2posts = agg_votes_to_posts(allVotes, dfp, dfc, pr=pr)
            data = votes2posts.reset_index().sort_values(['votedAt', 'rank'], ascending=[False, True]).copy()
            data['birth'] = pd.Timestamp.now()
            data.columns = [col.replace('_', ' ').title() for col in data.columns]
            s.df_to_sheet(data, replace=True, sheet='KM: Posts/{}'.format(pr_dict[pr]), index=False)

            votes2items = agg_votes_to_items(allVotes, dfp, dfc, pr=pr)
            data = votes2items.reset_index().sort_values(['votedAt', 'rank'], ascending=[False, True]).copy()
            data['birth'] = pd.Timestamp.now()
            data.columns = [col.replace('_', ' ').title() for col in data.columns]
            s.df_to_sheet(data, replace=True, sheet='KM: Items/{}'.format(pr_dict[pr]), index=False)
def upload_to_gsheets(df, spreadsheet_name, sheet_name, create_spread=False, create_sheet=False,
                      grant_access=None, index=False, format_columns=False, start=(1, 1), headers=True):
    df = df.copy()
    if format_columns:
        df.columns = df.columns.to_series().str.replace('_', ' ').str.title()

    spreadsheet = Spread(spread=spreadsheet_name, sheet=sheet_name,
                         create_spread=create_spread, create_sheet=create_sheet,
                         user=get_config_field('GSHEETS', 'user'))
    spreadsheet.df_to_sheet(df, index=index, start=start, headers=headers)

    if grant_access == 'primary':
        permissions_list = ['{email}|writer'.format(email=get_config_field('GSHEETS', 'primary_email'))]
    elif grant_access == 'team':
        emails = get_config_field('GSHEETS', 'team_emails').split(',')
        permissions_list = ['{email}|writer'.format(email=email) for email in emails]
    elif grant_access == 'public':
        permissions_list = ['anyone|reader']
    else:
        permissions_list = None

    if permissions_list:
        spreadsheet.add_permissions(permissions_list)

    print(spreadsheet.url)

    return spreadsheet.url
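# A minimal usage sketch of upload_to_gsheets. The dataframe and spreadsheet/sheet
# names below are hypothetical examples; GSHEETS credentials and the team_emails
# config field are assumed to already be populated in the config file.
def _example_upload_to_gsheets():
    import pandas as pd

    # small illustrative frame; real callers pass an enriched collection slice
    report_df = pd.DataFrame({'post_id': ['abc123', 'def456'], 'karma': [42, 7]})

    upload_to_gsheets(report_df,
                      spreadsheet_name='Example Analytics Spreadsheet',  # hypothetical
                      sheet_name='Example Sheet',                        # hypothetical
                      create_spread=True,
                      create_sheet=True,
                      grant_access='team',
                      format_columns=True)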
def create_and_update_all_sheets(dfs, spreadsheet_name):
    dfu = dfs['users']
    dfp = dfs['posts']
    # dfv = dfs['votes']

    s = Spread(spread=spreadsheet_name, sheet=None, create_spread=True, create_sheet=True,
               user=get_config_field('GSHEETS', 'user'))

    _ = create_and_update_user_sheet(dfu, s)
    _ = create_and_update_posts_sheet(dfp, s)
    # _ = create_and_update_votes_sheet(dfv, s)  # we never use this

    return s
def get_events_sheet(only_top=True):
    spreadsheet_user = get_config_field('GSHEETS', 'user')
    s = Spread(user=spreadsheet_user, spread='Release & PR Events', sheet='Events',
               create_spread=False, create_sheet=False)
    events = s.sheet_to_df()
    events.index = pd.to_datetime(events.index)
    events = events.reset_index().reset_index().set_index('date')
    events['top'] = events['top'] == 'TRUE'

    if only_top:
        events = events[events['top']]

    return events
def create_and_update_all_sheets(dfs, spreadsheet_name):
    dfu = dfs['users']
    dfp = dfs['posts']
    dfv = dfs['votes']

    s = Spread(get_config_field('GSHEETS', 'user'), spreadsheet_name, sheet='Users',
               create_spread=True, create_sheet=True)

    _ = create_and_update_user_sheet(dfu, s)
    _ = create_and_update_posts_sheet(dfp, s)
    _ = create_and_update_votes_sheet(dfv, s)

    return s
def get_report(dims, metrics, start_date=None, end_date=None, days=None, page_size=None):
    view_id = get_config_field('GA', 'view_id')
    analytics = initialize_analytics_reporting()

    if start_date and end_date and days:
        raise Exception('Argument error: Cannot specify all of start_date, end_date, and days')
    elif not (start_date or end_date or days):
        raise Exception('Argument error: Must specify two of {start_date, end_date, days}')
    elif end_date and not start_date:
        start_date = (pd.to_datetime(end_date) - pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')
    elif not end_date and start_date:
        end_date = (pd.to_datetime(start_date) + pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')
    elif not (end_date or start_date):
        end_date = pd.Timestamp.today().strftime('%Y-%m-%d')
        start_date = (pd.to_datetime(end_date) - pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')

    df = get_report_recursive(analytics, view_id, dims, metrics, start_date, end_date, page_size=page_size)

    if 'ga:date' in df.columns:
        df['date'] = pd.to_datetime(df['ga:date'])
        df = df.drop(['ga:date'], axis=1)

    return df
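# A minimal usage sketch of get_report. GA credentials and view_id are assumed to be
# configured; 'ga:date', 'ga:users', and 'ga:pageviews' are standard Reporting API v4
# names, but the exact dims/metrics format expected by get_report_recursive is assumed
# here to be plain lists of those names.
def _example_get_report():
    # daily users and pageviews for the 30 days ending on an arbitrary example date
    df = get_report(dims=['ga:date'],
                    metrics=['ga:users', 'ga:pageviews'],
                    end_date='2019-12-31',
                    days=30)
    print(df.head())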
import pandas as pd
import numpy as np
import configparser
import sqlalchemy as sqa
import re
from table_schemas import table_creation_commands
from utils import timed, get_config_field, print_and_log
from io import StringIO
import csv
from IPython.display import display

BASE_PATH = get_config_field('PATHS', 'base')


def camel_to_snake(name):
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()


def clean_dataframe_text(df):
    def replace_strings(col, pat, repl):
        df.loc[:, col] = df.loc[:, col].str.replace(pat, repl)

    # strip backslashes and escape whitespace characters in every string column
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]):
            _ = [replace_strings(col, pat, repl) for pat, repl in
                 [('\\', ''), ('\t', ' '), ('\n', '\\n'), ('\r', '\\r')]]
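# A quick sanity-check sketch of camel_to_snake on the kinds of column names that
# appear elsewhere in this pipeline; the expected outputs are shown in comments.
def _example_camel_to_snake():
    print(camel_to_snake('postedAt'))                 # posted_at
    print(camel_to_snake('petrovPressedButtonDate'))  # petrov_pressed_button_date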
from pymongo import MongoClient
import pandas as pd
import pathlib
import html2text
import os
import shutil
import configparser
from losttheplotly import run_plotline
from cellularautomaton import *
from karmametric import run_metric_pipeline
from flipthetable import run_pg_pandas_transfer
from utils import timed, print_and_log, get_config_field

MONGO_DB_NAME = get_config_field('MONGODB', 'db_name')
MONGO_DB_URL = get_config_field('MONGODB', 'prod_db_url')
BASE_PATH = get_config_field('PATHS', 'base')
ENV = get_config_field('ENV', 'env')


def get_mongo_db_object():
    client = MongoClient(MONGO_DB_URL)
    db = client[MONGO_DB_NAME]
    return db


def get_collection(coll_name, db, projection=None, query_filter=None, limit=None):
    """Downloads a single collection from MongoDB and returns it as a dataframe."""
    # minimal sketch of the fetch, assuming the standard pymongo find()/limit() interface
    cursor = db[coll_name].find(filter=query_filter, projection=projection)
    if limit:
        cursor = cursor.limit(limit)
    return pd.DataFrame(list(cursor))
import pandas as pd
import core_pipeline as et
from utils import get_config_field
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from gspread_pandas import Spread, Client

plotly.tools.set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                                  api_key=get_config_field('PLOTLY', 'api_key'))
init_notebook_mode(connected=True)

from plotly_ops import plot_table


def update_petrov():
    dfs = et.load_from_file(date_str='most_recent', coll_names=['users'])
    dfu = dfs['users']

    have_codes = pd.read_csv('/Users/rbloom/Downloads/petrov_day_list.csv')
    codes = pd.read_csv('/Users/rbloom/Downloads/codez_secret_no_use.csv', usecols=['codes'])
    have_codes.head()

    projection = ['_id', 'username', 'displayName', 'karma', 'petrovPressedButtonDate',
                  'petrovCodesEntered', 'petrovCodesEnteredDate']
    users_raw = et.get_collection('users',
                                  query_filter={'petrovPressedButtonDate': {'$exists': True}},
                                  projection=projection,
                                  db=et.get_mongo_db_object())

    users = users_raw.merge(have_codes[['Username']], left_on='username', right_on='Username',
                            indicator=True, how='left')
    users = users.merge(dfu[['_id', 'num_days_present_last_30_days', 'createdAt', 'true_earliest']].fillna(0),
                        left_on='_id', right_on='_id', how='left')
    users['has_codes'] = (users['_merge'] == 'both')
    users['valid_code'] = users['petrovCodesEntered'].apply(lambda x: x in codes.values)
def run_metric_pipeline(collections, end_date_str, online=False, sheets=False, plots=False):

    def prepare_upload(aggregated_votes):
        return (aggregated_votes
                .reset_index()
                .sort_values(['votedAt', 'rank'], ascending=[False, True])
                .copy()
                .assign(birth=pd.Timestamp.now())
                .rename(lambda x: x.replace('_', ' ').title(), axis=1)
                .head(int(1e4))  # we don't need that many rows by default
                )

    posts = collections['posts']
    comments = collections['comments']

    allVotes, baseScoresD4, docScores = compute_karma_metric(collections)

    end_date = pd.to_datetime(end_date_str).strftime('%Y-%m-%d')
    start_date = (pd.to_datetime(end_date) - pd.Timedelta(180, unit='d')).strftime('%Y-%m-%d')
    start_date_sheets = (pd.to_datetime(end_date) - pd.Timedelta(30, unit='d')).strftime('%Y-%m-%d')

    if plots:
        _ = plot_karma_metric(allVotes, online=online, start_date=start_date, end_date=end_date,
                              period='D', ma=7)
        _ = plot_karma_metric(allVotes, online=online, start_date=start_date, end_date=end_date,
                              period='W', ma=4)

    if sheets:
        spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
        spreadsheet_user = get_config_field('GSHEETS', 'user')
        s = Spread(spread=spreadsheet_name, sheet=None, create_spread=True, create_sheet=True,
                   user=spreadsheet_user)

        for period in ['D', 'W']:
            votes2posts = agg_votes_to_posts(allVotes, posts, comments, period=period,
                                             start_date=start_date_sheets)
            votes2items = agg_votes_to_items(allVotes, posts, comments, period=period,
                                             start_date=start_date_sheets)
            s.df_to_sheet(prepare_upload(votes2posts), replace=True,
                          sheet='KM: Posts/{}'.format(PERIOD_DICT[period]), index=False)
            s.df_to_sheet(prepare_upload(votes2items), replace=True,
                          sheet='KM: Items/{}'.format(PERIOD_DICT[period]), index=False)
def plot_karma_metric(allVotes, start_date, end_date, online=False, period='D', ma=7):
    votes_ts = allVotes.set_index('votedAt').resample(period)['effect'].sum()
    votes_ts = votes_ts.reset_index().iloc[:-1]
    votes_ts_ma = votes_ts.set_index('votedAt')['effect'].rolling(ma).mean().round(1).reset_index()

    days_in_period = {'D': 1, 'W': 7, 'M': 365 / 12, 'Y': 365}
    # trends = create_trend_frame(days_in_period[period] * 550, period)

    # plotly section
    date_col = 'votedAt'
    title = 'effect'
    color = 'red'
    size = (1200, 500)

    data = [
        go.Scatter(x=votes_ts[date_col],
                   y=votes_ts['effect'].round(1),
                   line={'color': color, 'width': 0.5},
                   name='{}-value'.format(PERIOD_DICT[period]),
                   hoverinfo='x+y+name'),
        go.Scatter(x=votes_ts_ma[date_col],
                   y=votes_ts_ma['effect'].round(1),
                   line={'color': color, 'width': 4},
                   name='average of last {} {}s'.format(ma, PERIOD_DICT2[period]),
                   hoverinfo='x+y+name')
        # go.Scatter(x=trends['date'], y=trends['5%'], line={'color': 'grey', 'width': 1, 'dash': 'dash'},
        #            mode='lines', name='5% growth', hoverinfo='skip'),
        # go.Scatter(x=trends['date'], y=trends['7%'], line={'color': 'black', 'width': 2, 'dash': 'dash'},
        #            mode='lines', name='7% growth', hoverinfo='x+y'),
        # go.Scatter(x=trends['date'], y=trends['10%'], line={'color': 'grey', 'width': 1, 'dash': 'dash'},
        #            mode='lines', name='10% growth', hoverinfo='skip')
    ]

    layout = go.Layout(
        autosize=True,
        width=size[0],
        height=size[1],
        title='Net Karma, 4x Downvote, {}, 1.2 item exponent'.format(PERIOD_DICT[period].capitalize()),
        xaxis={'range': [start_date, end_date], 'title': None},
        yaxis={'range': [0, votes_ts.set_index(date_col)[start_date:]['effect'].max() * 1.1],
               'title': 'net karma'})

    fig = go.Figure(data=data, layout=layout)

    set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                         api_key=get_config_field('PLOTLY', 'api_key'))
    init_notebook_mode(connected=True)

    filename = 'Net Karma Metric - {}'.format(PERIOD_DICT[period].capitalize())
    if online:
        py.iplot(fig, filename=filename)
    else:
        iplot(fig, filename=filename)

    return votes_ts
def run_core_pipeline(date_str, from_file=False, clean_up=True, dash=True, gsheets=True, metrics=True,
                      postgres=True, tags=True, ga=True, urls=True, gather_town=True, limit=None):

    # ##1. LOAD DATA
    if from_file:
        dfs_enriched = load_from_file(date_str)
    else:
        dfs_cleaned = get_collections_cleaned(limit=limit)
        today = dfs_cleaned['views']['createdAt'].max().strftime('%Y%m%d')  # treat max date in collections as "today" in case of load from file from older date

        # ##2. PREPARE DATA
        dfs_enriched = enrich_collections(dfs_cleaned, date_str=today)

        # ##3. WRITE OUT ENRICHED COLLECTIONS
        write_collections(dfs_enriched, date_str=today)

    # ##4. METRIC STUFF - PLOTS AND SHEETS
    if metrics:
        run_metric_pipeline(dfs_enriched, date_str, online=True, sheets=True, plots=True)

    # ##5. GENERATE TIMESERIES FOR DASH
    if dash:
        run_dash_aggregations_pipeline(dfs_enriched, date_str)

    # ##6. WRITE TABLES TO GSHEETS
    if gsheets:
        create_and_update_all_sheets(dfs_enriched,
                                     spreadsheet_name=get_config_field('GSHEETS', 'spreadsheet_name'))

    # ##7. LOAD DATA FILES TO POSTGRES DB
    if postgres:
        run_pg_pandas_transfer(dfs_enriched)

    # ##8. GOOGLE ANALYTICS PIPELINE
    if ga:
        run_ga_pipeline()

    # ##9. URLS TABLE UPDATE
    if urls:
        run_url_table_update(dfs_enriched)

    # ##10. GATHER TOWN
    if gather_town:
        run_gather_town_pipeline()

    # ##11. CLEAN UP OLD FILES TO SAVE SPACE
    if clean_up:
        clean_up_old_files(days_to_keep=2)

    return None
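# A minimal usage sketch of the core pipeline entry point. The date below is an
# arbitrary example, and the flags shown simply skip the stages that need external
# infrastructure (Postgres, Google Analytics); config credentials are assumed.
def _example_run_core_pipeline():
    run_core_pipeline(date_str='20200101',  # hypothetical run date
                      from_file=False,
                      postgres=False,
                      ga=False)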
def run_plotline(dfs, online=False, start_date=None, end_date=None, size=(1000, 400), pr='D',
                 ma=[1, 7], widths={1: 0.75, 7: 3}, annotations=False, hidden_by_default=[]):
    set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                         api_key=get_config_field('PLOTLY', 'api_key'))
    init_notebook_mode(connected=True)

    dpv = dfs['views']  # pv = post-views

    minimum_post_views = 1
    valid_users = get_valid_users(dfs, required_minimum_posts_views=minimum_post_views)
    valid_posts = get_valid_posts(dfs, required_upvotes=1)
    valid_comments = get_valid_comments(dfs)
    valid_votes = get_valid_votes(dfs)
    valid_views = get_valid_views(dfs)
    valid_views['hour'] = valid_views['createdAt'].dt.round('H')
    valid_views_deduped = valid_views.drop_duplicates(subset=['userId', 'documentId', 'hour'])

    plotly_args = {
        'start_date': start_date,
        'end_date': end_date,
        'period': pr,
        'moving_average_lengths': ma,
        'widths': widths,
        'size': size,
        'online': online,
        'annotations': annotations,
        'hidden_by_default': hidden_by_default
    }

    timeseries_plot(title='Accounts Created, {}+ posts_viewed'.format(minimum_post_views),
                    datapoints=valid_users, date_col='true_earliest', color='grey', **plotly_args)
    timeseries_plot(title='Num Logged-In Users',
                    datapoints=dpv[dpv['userId'].isin(valid_users['_id'])],
                    date_col='createdAt', color='black', unique_on='userId', **plotly_args)
    timeseries_plot(title='Num Posts with 2+ upvotes', datapoints=valid_posts,
                    date_col='postedAt', color='blue', **plotly_args)
    timeseries_plot(title='Num Unique Posters', datapoints=valid_posts,
                    date_col='postedAt', color='darkblue', unique_on='userId', **plotly_args)
    timeseries_plot(title='Num Comments', datapoints=valid_comments,
                    date_col='postedAt', color='green', **plotly_args)
    timeseries_plot(title='Num Unique Commenters', datapoints=valid_comments,
                    date_col='postedAt', color='darkgreen', unique_on='userId', **plotly_args)
    timeseries_plot(title='Num Votes (excluding self-votes)', datapoints=valid_votes,
                    date_col='votedAt', color='orange', **plotly_args)
    timeseries_plot(title='Num Unique Voters', datapoints=valid_votes,
                    date_col='votedAt', color='darkorange', unique_on='userId', **plotly_args)
    timeseries_plot(title='Num Logged-In Post Views', datapoints=valid_views_deduped,
                    date_col='createdAt', color='red', **plotly_args)