Example #1
def initialize_analytics_reporting():
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        get_config_field('GA', 'key_file_location'),
        [get_config_field('GA', 'scopes')])
    # Build the service object.
    analytics = build('analyticsreporting', 'v4', credentials=credentials)
    return analytics
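The scraped example omits its imports. A minimal sketch of what this helper assumes, based on where these names come from (oauth2client and google-api-python-client; get_config_field is the repo's own config reader):

# Assumed imports -- not shown in the original example.
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build
from utils import get_config_field

analytics = initialize_analytics_reporting()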
Example #2
def run_etlw_pipeline(date_str, clean_up=False, plotly=False, gsheets=False,
                      metrics=False, postgres=False):
    # ##0&1. DOWNLOAD DATA and BASIC PARSE
    dfs_cleaned = get_collections_cleaned()

    # ##2. ENRICHING OF COLLECTIONS
    today = dfs_cleaned['views']['createdAt'].max().strftime('%Y%m%d')  # treat max date in collections as "today"
    dfs_enriched = enrich_collections(dfs_cleaned, date_str=today)

    # ##3. WRITE OUT ENRICHED COLLECTIONS
    write_collections(dfs_enriched, date_str=today)

    # ##4 METRIC STUFF - PLOTS AND SHEETS
    if metrics:
        run_metric_pipeline(dfs_enriched, online=True, sheets=True, plots=True)

    # ##5. PLOT GRAPHS TO PLOTLY DASHBOARD
    if plotly:
        run_plotline(dfs_enriched, start_date='2019-04-01', size=(700, 350), online=True)

    # ##6. WRITE TABLES TO GSHEETS
    if gsheets:
        create_and_update_all_sheets(dfs_enriched, spreadsheet_name=get_config_field('GSHEETS', 'spreadsheet_name'))

    # ##7. LOAD DATA FILES TO POSTGRES DB
    if postgres and ENV == 'ec2':
        run_pg_pandas_transfer(dfs_enriched, date_str=date_str)

    # ##8. CLEAN UP OLD FILES TO SAVE SPACE
    if clean_up:
        clean_up_old_files(days_to_keep=2)

    return None
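For context, a hypothetical invocation of this pipeline; the date and flag values below are illustrative, not taken from the source:

# Run the full ETL pass, pushing metrics, plots, and sheets, then prune old files.
run_etlw_pipeline('20190901', metrics=True, plotly=True, gsheets=True, clean_up=True)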
Example #3
def update_petrov():
    dfs = et.load_from_file(date_str='most_recent', coll_names=['users'])
    dfu = dfs['users']

    have_codes = pd.read_csv('/Users/rbloom/Downloads/petrov_day_list.csv')
    codes = pd.read_csv('/Users/rbloom/Downloads/codez_secret_no_use.csv', usecols=['codes'])

    have_codes.head()

    projection = ['_id', 'username', 'displayName', 'karma', 'petrovPressedButtonDate', 'petrovCodesEntered', 'petrovCodesEnteredDate']
    users_raw = et.get_collection('users', query_filter={'petrovPressedButtonDate': {'$exists':True}},
                              projection=projection, db=et.get_mongo_db_object())

    users = users_raw.merge(have_codes[['Username']], left_on='username', right_on='Username', indicator=True, how='left')
    users = users.merge(dfu[['_id', 'num_days_present_last_30_days', 'createdAt', 'true_earliest']].fillna(0), left_on='_id', right_on='_id', how='left')
    users['has_codes'] = (users['_merge']=='both')
    users['valid_code'] = users['petrovCodesEntered'].apply(lambda x: x in codes.values)

    users.loc[:,['petrovPressedButtonDate', 'petrovCodesEnteredDate']] = users.loc[:,['petrovPressedButtonDate', 'petrovCodesEnteredDate']] - pd.Timedelta('7 hours')
    users['karma'] = users['karma'].fillna(0)
    users = users.sort_values('petrovPressedButtonDate', ascending=False)

    users = users[ ['displayName', 'karma', 'petrovPressedButtonDate', 'petrovCodesEntered', 'petrovCodesEnteredDate',
                        'has_codes', 'valid_code', 'num_days_present_last_30_days', 'true_earliest']]

    users_pressed = users.dropna(subset=['petrovPressedButtonDate'])
    users_pressed = users_pressed.sort_values('petrovPressedButtonDate', ascending=True)
    users_pressed = users_pressed.reset_index(drop=True).reset_index()
    users_pressed['index'] = users_pressed['index'] + 1
    users_pressed = users_pressed.sort_values('petrovPressedButtonDate', ascending=False)
    print('num users pressed button: {}'.format(users_pressed.shape[0]))

    users_pressed_and_entered = users.dropna(subset=['petrovPressedButtonDate','petrovCodesEntered'])
    print('num users pressed button and entered codes: {}'.format(users_pressed_and_entered.shape[0]))

    users_pressed_and_entered_has_codes = users_pressed_and_entered[users_pressed_and_entered['has_codes']]
    print('num users pressed button, entered codes, and have true codes: {}'.format(users_pressed_and_entered_has_codes.shape[0]))

    # plot_table(users_pressed, title='Users Who Pressed Button', online=True)
    # plot_table(users_pressed_and_entered, title='Users Who Pressed Button and Entered Codes', online=True)
    # plot_table(users_pressed_and_entered_has_codes, title='Users Who Pressed Button and Entered Some Codes Who Have True Codes', online=True)

    users_pressed['birth'] = pd.Timestamp.now()
    spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
    s = Spread(get_config_field('GSHEETS', 'user'), spreadsheet_name, sheet='Trust & Doom', create_spread=True,
               create_sheet=True)
    s.df_to_sheet(users_pressed, replace=True, sheet='Trust & Doom', index=False)
Example #4
def run_metric_pipeline(dfs, online=False, sheets=False, plots=False):
    dfp = dfs['posts']
    dfc = dfs['comments']

    allVotes, baseScoresD4, docScores = compute_karma_metric(dfs)

    if plots:
        plot_karma_metric(allVotes, online=online)

    if sheets:
        spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
        spreadsheet_user = get_config_field('GSHEETS', 'user')
        s = Spread(spreadsheet_user,
                   spreadsheet_name,
                   sheet='Users',
                   create_spread=True,
                   create_sheet=True)

        pr_dict = {'D': 'Daily', 'W': 'Weekly'}

        for pr in ['D', 'W']:
            votes2posts = agg_votes_to_posts(allVotes, dfp, dfc, pr=pr)
            data = votes2posts.reset_index().sort_values(
                ['votedAt', 'rank'], ascending=[False, True]).copy()
            data['birth'] = pd.Timestamp.now()
            data.columns = [
                col.replace('_', ' ').title() for col in data.columns
            ]
            s.df_to_sheet(data,
                          replace=True,
                          sheet='KM: Posts/{}'.format(pr_dict[pr]),
                          index=False)

            votes2items = agg_votes_to_items(allVotes, dfp, dfc, pr=pr)
            data = votes2items.reset_index().sort_values(
                ['votedAt', 'rank'], ascending=[False, True]).copy()
            data['birth'] = pd.Timestamp.now()
            data.columns = [
                col.replace('_', ' ').title() for col in data.columns
            ]
            s.df_to_sheet(data,
                          replace=True,
                          sheet='KM: Items/{}'.format(pr_dict[pr]),
                          index=False)
Example #5
def upload_to_gsheets(df,
                      spreadsheet_name,
                      sheet_name,
                      create_spread=False,
                      create_sheet=False,
                      grant_access=None,
                      index=False,
                      format_columns=False,
                      start=(1, 1),
                      headers=True):

    df = df.copy()

    if format_columns:
        df.columns = df.columns.to_series().str.replace('_', ' ').str.title()

    spreadsheet = Spread(spread=spreadsheet_name,
                         sheet=sheet_name,
                         create_spread=create_spread,
                         create_sheet=create_sheet,
                         user=get_config_field('GSHEETS', 'user'))
    spreadsheet.df_to_sheet(df, index=index, start=start, headers=headers)

    if grant_access == 'primary':
        permissions_list = [
            '{email}|writer'.format(
                email=get_config_field('GSHEETS', 'primary_email'))
        ]
    elif grant_access == 'team':
        emails = get_config_field('GSHEETS', 'team_emails').split(',')
        permissions_list = [
            '{email}|writer'.format(email=email) for email in emails
        ]
    elif grant_access == 'public':
        permissions_list = ['anyone|reader']
    else:
        permissions_list = None

    if permissions_list:
        spreadsheet.add_permissions(permissions_list)

    print(spreadsheet.url)
    return spreadsheet.url
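A hedged usage sketch; the spreadsheet and sheet names below are made up for illustration:

# Upload a dataframe with prettified headers and share it with the team.
url = upload_to_gsheets(df,
                        spreadsheet_name='Analytics',  # hypothetical name
                        sheet_name='Daily Stats',      # hypothetical name
                        create_spread=True,
                        create_sheet=True,
                        grant_access='team',
                        format_columns=True)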
Example #6
def create_and_update_all_sheets(dfs, spreadsheet_name):
    dfu = dfs['users']
    dfp = dfs['posts']
    # dfv = dfs['votes']

    s = Spread(spread=spreadsheet_name,
               sheet=None,
               create_spread=True,
               create_sheet=True,
               user=get_config_field('GSHEETS', 'user'))
    _ = create_and_update_user_sheet(dfu, s)
    _ = create_and_update_posts_sheet(dfp, s)
    # _ = create_and_update_votes_sheet(dfv, s) // we never use this

    return s
Example #7
def get_events_sheet(only_top=True):
    spreadsheet_user = get_config_field('GSHEETS', 'user')
    s = Spread(user=spreadsheet_user,
               spread='Release & PR Events',
               sheet='Events',
               create_spread=False,
               create_sheet=False)
    events = s.sheet_to_df()
    events.index = pd.to_datetime(events.index)
    events = events.reset_index().reset_index().set_index('date')
    events['top'] = events['top'] == 'TRUE'
    if only_top:
        events = events[events['top']]

    return events
Example #8
def create_and_update_all_sheets(dfs, spreadsheet_name):
    dfu = dfs['users']
    dfp = dfs['posts']
    dfv = dfs['votes']

    s = Spread(get_config_field('GSHEETS', 'user'),
               spreadsheet_name,
               sheet='Users',
               create_spread=True,
               create_sheet=True)
    _ = create_and_update_user_sheet(dfu, s)
    _ = create_and_update_posts_sheet(dfp, s)
    _ = create_and_update_votes_sheet(dfv, s)

    return s
Example #9
def get_report(dims,
               metrics,
               start_date=None,
               end_date=None,
               days=None,
               page_size=None):

    view_id = get_config_field('GA', 'view_id')
    analytics = initialize_analytics_reporting()

    if start_date and end_date and days:
        raise Exception(
            'Argument error: Cannot specify all of start_date, end_date, and days'
        )
    elif not (start_date or end_date or days):
        raise Exception(
            'Argument error: Must specify two of {start_date, end_date, days}')
    elif end_date and not start_date:
        start_date = (pd.to_datetime(end_date) -
                      pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')
    elif not end_date and start_date:
        end_date = (pd.to_datetime(start_date) +
                    pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')
    elif not (end_date or start_date):
        end_date = pd.Timestamp.today().strftime('%Y-%m-%d')
        start_date = (pd.to_datetime(end_date) -
                      pd.Timedelta(days - 1, unit='d')).strftime('%Y-%m-%d')

    df = get_report_recursive(analytics,
                              view_id,
                              dims,
                              metrics,
                              start_date,
                              end_date,
                              page_size=page_size)
    if 'ga:date' in df.columns:
        df['date'] = pd.to_datetime(df['ga:date'])
        df = df.drop(['ga:date'], axis=1)

    return df
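An illustrative call: 'ga:date' and 'ga:pageviews' are standard Core Reporting API names, but this particular query is an example, not from the source. With end_date and days given, start_date is derived as above:

# 30 days of pageviews ending 2019-09-01 (inclusive), one row per day.
df = get_report(dims=['ga:date'], metrics=['ga:pageviews'],
                end_date='2019-09-01', days=30)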
Example #10
import pandas as pd
import numpy as np
import configparser
import sqlalchemy as sqa
import re
from table_schemas import table_creation_commands
from utils import timed, get_config_field, print_and_log
from io import StringIO
import csv

from IPython.display import display

BASE_PATH = get_config_field('PATHS', 'base')


def camel_to_snake(name):
    name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower()
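For example, tracing the two regexes above:

camel_to_snake('postedAt')  # -> 'posted_at'
camel_to_snake('HTMLBody')  # -> 'html_body'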


def clean_dataframe_text(df):
    # Escape characters that would break tab-delimited text export.
    def replace_strings(col, pat, repl):
        df.loc[:, col] = df.loc[:, col].str.replace(pat, repl, regex=False)

    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col]):
            for pat, repl in [('\\', ''), ('\t', '  '),
                              ('\n', '\\n'), ('\r', '\\r')]:
                replace_strings(col, pat, repl)
    return df
Example #11
from pymongo import MongoClient
import pathlib
import html2text

import os
import shutil
import configparser

from losttheplotly import run_plotline
from cellularautomaton import *
from karmametric import run_metric_pipeline
from flipthetable import run_pg_pandas_transfer

from utils import timed, print_and_log, get_config_field

MONGO_DB_NAME = get_config_field('MONGODB', 'db_name')
MONGO_DB_URL = get_config_field('MONGODB', 'prod_db_url')
BASE_PATH = get_config_field('PATHS','base')
ENV = get_config_field('ENV', 'env')


def get_mongo_db_object():
    client = MongoClient(MONGO_DB_URL)
    db = client[MONGO_DB_NAME]
    return db


def get_collection(coll_name, db, projection=None, query_filter=None, limit=None):
    """
    Downloads a single collection from MongoDB and returns it as a dataframe.
    """
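Hypothetical usage, mirroring the call pattern seen in update_petrov; the projection fields and limit are illustrative:

db = get_mongo_db_object()
users = get_collection('users', db=db, projection=['_id', 'karma'], limit=1000)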
Example #12
import pandas as pd
import core_pipeline as et
from utils import get_config_field
import plotly
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from gspread_pandas import Spread, Client

plotly.tools.set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                                  api_key=get_config_field('PLOTLY', 'api_key'))
init_notebook_mode(connected=True)

from plotly_ops import plot_table


def update_petrov():
    dfs = et.load_from_file(date_str='most_recent', coll_names=['users'])
    dfu = dfs['users']

    have_codes = pd.read_csv('/Users/rbloom/Downloads/petrov_day_list.csv')
    codes = pd.read_csv('/Users/rbloom/Downloads/codez_secret_no_use.csv', usecols=['codes'])

    have_codes.head()

    projection = ['_id', 'username', 'displayName', 'karma', 'petrovPressedButtonDate', 'petrovCodesEntered', 'petrovCodesEnteredDate']
    users_raw = et.get_collection('users', query_filter={'petrovPressedButtonDate': {'$exists':True}},
                              projection=projection, db=et.get_mongo_db_object())

    users = users_raw.merge(have_codes[['Username']], left_on='username', right_on='Username', indicator=True, how='left')
    users = users.merge(dfu[['_id', 'num_days_present_last_30_days', 'createdAt', 'true_earliest']].fillna(0), left_on='_id', right_on='_id', how='left')
    users['has_codes'] = (users['_merge']=='both')
    users['valid_code'] = users['petrovCodesEntered'].apply(lambda x: x in codes.values)
Example #13
def run_metric_pipeline(collections,
                        end_date_str,
                        online=False,
                        sheets=False,
                        plots=False):
    def prepare_upload(aggregated_votes):
        return (aggregated_votes
                .reset_index()
                .sort_values(['votedAt', 'rank'], ascending=[False, True])
                .copy()
                .assign(birth=pd.Timestamp.now())
                .rename(lambda x: x.replace('_', ' ').title(), axis=1)
                .head(int(1e4)))  # we don't need that many rows by default

    posts = collections['posts']
    comments = collections['comments']

    allVotes, baseScoresD4, docScores = compute_karma_metric(collections)

    end_date = pd.to_datetime(end_date_str).strftime('%Y-%m-%d')
    start_date = (pd.to_datetime(end_date) -
                  pd.Timedelta(180, unit='d')).strftime('%Y-%m-%d')
    start_date_sheets = (pd.to_datetime(end_date) -
                         pd.Timedelta(30, unit='d')).strftime('%Y-%m-%d')

    if plots:
        _ = plot_karma_metric(allVotes,
                              online=online,
                              start_date=start_date,
                              end_date=end_date,
                              period='D',
                              ma=7)
        _ = plot_karma_metric(allVotes,
                              online=online,
                              start_date=start_date,
                              end_date=end_date,
                              period='W',
                              ma=4)

    if sheets:
        spreadsheet_name = get_config_field('GSHEETS', 'spreadsheet_name')
        spreadsheet_user = get_config_field('GSHEETS', 'user')
        s = Spread(spread=spreadsheet_name,
                   sheet=None,
                   create_spread=True,
                   create_sheet=True,
                   user=spreadsheet_user)

        for period in ['D', 'W']:
            votes2posts = agg_votes_to_posts(allVotes,
                                             posts,
                                             comments,
                                             period=period,
                                             start_date=start_date_sheets)
            votes2items = agg_votes_to_items(allVotes,
                                             posts,
                                             comments,
                                             period=period,
                                             start_date=start_date_sheets)

            s.df_to_sheet(prepare_upload(votes2posts),
                          replace=True,
                          sheet='KM: Posts/{}'.format(PERIOD_DICT[period]),
                          index=False)
            s.df_to_sheet(prepare_upload(votes2items),
                          replace=True,
                          sheet='KM: Items/{}'.format(PERIOD_DICT[period]),
                          index=False)
Example #14
def plot_karma_metric(allVotes,
                      start_date,
                      end_date,
                      online=False,
                      period='D',
                      ma=7):
    votes_ts = allVotes.set_index('votedAt').resample(period)['effect'].sum()
    votes_ts = votes_ts.reset_index().iloc[:-1]
    votes_ts_ma = votes_ts.set_index('votedAt')['effect'].rolling(
        ma).mean().round(1).reset_index()

    days_in_period = {'D': 1, 'W': 7, 'M': 365 / 12, 'Y': 365}

    # trends = create_trend_frame(days_in_period[period] * 550, period)

    # plotly section
    date_col = 'votedAt'
    title = 'effect'
    color = 'red'
    size = (1200, 500)

    data = [
        go.Scatter(x=votes_ts[date_col],
                   y=votes_ts['effect'].round(1),
                   line={
                       'color': color,
                       'width': 0.5
                   },
                   name='{}-value'.format(PERIOD_DICT[period]),
                   hoverinfo='x+y+name'),
        go.Scatter(x=votes_ts_ma[date_col],
                   y=votes_ts_ma['effect'].round(1),
                   line={
                       'color': color,
                       'width': 4
                   },
                   name='average of last {} {}s'.format(
                       ma, PERIOD_DICT2[period]),
                   hoverinfo='x+y+name')  #,
        # go.Scatter(x=trends['date'], y=trends['5%'], line={'color': 'grey', 'width': 1, 'dash': 'dash'}, mode='lines',
        #            name='5% growth', hoverinfo='skip'),
        # go.Scatter(x=trends['date'], y=trends['7%'], line={'color': 'black', 'width': 2, 'dash': 'dash'}, mode='lines',
        #            name='7% growth', hoverinfo='x+y'),
        # go.Scatter(x=trends['date'], y=trends['10%'], line={'color': 'grey', 'width': 1, 'dash': 'dash'}, mode='lines',
        #            name='10% growth', hoverinfo='skip')
    ]

    layout = go.Layout(
        autosize=True,
        width=size[0],
        height=size[1],
        title='Net Karma, 4x Downvote, {}, 1.2 item exponent'.format(
            PERIOD_DICT[period].capitalize()),
        xaxis={
            'range': [start_date, end_date],
            'title': None
        },
        yaxis={
            'range': [
                0,
                votes_ts.set_index(date_col)[start_date:]['effect'].max() * 1.1
            ],
            'title':
            'net karma'
        })

    fig = go.Figure(data=data, layout=layout)

    set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                         api_key=get_config_field('PLOTLY', 'api_key'))
    init_notebook_mode(connected=True)

    filename = 'Net Karma Metric - {}'.format(PERIOD_DICT[period].capitalize())
    if online:
        py.iplot(fig, filename=filename)
    else:
        iplot(fig, filename=filename)

    return votes_ts
Example #15
def run_core_pipeline(date_str,
                      from_file=False,
                      clean_up=True,
                      dash=True,
                      gsheets=True,
                      metrics=True,
                      postgres=True,
                      tags=True,
                      ga=True,
                      urls=True,
                      gather_town=True,
                      limit=None):
    # ##1. LOAD DATA
    if from_file:
        dfs_enriched = load_from_file(date_str)
    else:
        dfs_cleaned = get_collections_cleaned(limit=limit)
        today = dfs_cleaned['views']['createdAt'].max().strftime('%Y%m%d')
        # treat the max date in the collections as "today", in case an older file is loaded
        # ##2. PREPARE DATA
        dfs_enriched = enrich_collections(dfs_cleaned, date_str=today)
        # ##3. WRITE OUT ENRICHED COLLECTIONS
        write_collections(dfs_enriched, date_str=today)

    # ##4. METRIC STUFF - PLOTS AND SHEETS
    if metrics:
        run_metric_pipeline(dfs_enriched,
                            date_str,
                            online=True,
                            sheets=True,
                            plots=True)

    # ##5. GENERATE TIMESERIES FOR DASH
    if dash:
        run_dash_aggregations_pipeline(dfs_enriched, date_str)

    # ##6. WRITE TABLES TO GSHEETS
    if gsheets:
        create_and_update_all_sheets(dfs_enriched,
                                     spreadsheet_name=get_config_field(
                                         'GSHEETS', 'spreadsheet_name'))

    # ##7. LOAD DATA FILES TO POSTGRES DB
    if postgres:
        run_pg_pandas_transfer(dfs_enriched)

    # ##8. GOOGLE ANALYTICS PIPELINE
    if ga:
        run_ga_pipeline()

    # ##9. URLS TABLE UPDATE
    if urls:
        run_url_table_update(dfs_enriched)

    # ##10. GATHER TOWN
    if gather_town:
        run_gather_town_pipeline()

    # ##11. CLEAN UP OLD FILES TO SAVE SPACE
    if clean_up:
        clean_up_old_files(days_to_keep=2)

    return None
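An illustrative invocation; the date and flag values are examples, not from the source:

# Reuse previously written enriched collections and skip the Postgres load.
run_core_pipeline('20201001', from_file=True, postgres=False)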
Example #16
def run_plotline(dfs,
                 online=False,
                 start_date=None,
                 end_date=None,
                 size=(1000, 400),
                 pr='D',
                 ma=[1, 7],
                 widths={
                     1: 0.75,
                     7: 3
                 },
                 annotations=False,
                 hidden_by_default=[]):
    set_credentials_file(username=get_config_field('PLOTLY', 'username'),
                         api_key=get_config_field('PLOTLY', 'api_key'))
    init_notebook_mode(connected=True)

    dpv = dfs['views']  # pv = post-views

    minimum_post_views = 1
    valid_users = get_valid_users(
        dfs, required_minimum_posts_views=minimum_post_views)
    valid_posts = get_valid_posts(dfs, required_upvotes=1)
    valid_comments = get_valid_comments(dfs)
    valid_votes = get_valid_votes(dfs)
    valid_views = get_valid_views(dfs)
    valid_views['hour'] = valid_views['createdAt'].dt.round('H')
    valid_views_deduped = valid_views.drop_duplicates(
        subset=['userId', 'documentId', 'hour'])

    plotly_args = {
        'start_date': start_date,
        'end_date': end_date,
        'period': pr,
        'moving_average_lengths': ma,
        'widths': widths,
        'size': size,
        'online': online,
        'annotations': annotations,
        'hidden_by_default': hidden_by_default
    }

    timeseries_plot(
        title='Accounts Created, {}+ posts_viewed'.format(minimum_post_views),
        datapoints=valid_users,
        date_col='true_earliest',
        color='grey',
        **plotly_args)
    timeseries_plot(title='Num Logged-In Users',
                    datapoints=dpv[dpv['userId'].isin(valid_users['_id'])],
                    date_col='createdAt',
                    color='black',
                    unique_on='userId',
                    **plotly_args)

    timeseries_plot(title='Num Posts with 2+ upvotes',
                    datapoints=valid_posts,
                    date_col='postedAt',
                    color='blue',
                    **plotly_args)
    timeseries_plot(title='Num Unique Posters',
                    datapoints=valid_posts,
                    date_col='postedAt',
                    color='darkblue',
                    unique_on='userId',
                    **plotly_args)

    timeseries_plot(title='Num Comments',
                    datapoints=valid_comments,
                    date_col='postedAt',
                    color='green',
                    **plotly_args)
    timeseries_plot(title='Num Unique Commenters',
                    datapoints=valid_comments,
                    date_col='postedAt',
                    color='darkgreen',
                    unique_on='userId',
                    **plotly_args)

    timeseries_plot(title='Num Votes (excluding self-votes)',
                    datapoints=valid_votes,
                    date_col='votedAt',
                    color='orange',
                    **plotly_args)
    timeseries_plot(title='Num Unique Voters',
                    datapoints=valid_votes,
                    date_col='votedAt',
                    color='darkorange',
                    unique_on='userId',
                    **plotly_args)

    timeseries_plot(title='Num Logged-In Post Views',
                    datapoints=valid_views_deduped,
                    date_col='createdAt',
                    color='red',
                    **plotly_args)