def create_last_events():
    '''
    Create the "last_events" view through create_view.

    The view joins first_seen_from_events (alias f) with events_all (alias e)
    on distinct_id and keeps the events for which
    e."timestamp" - 2*60*60 < f.first_seen_ts.

    Returns whatever create_view returns.

    NOTE(review): despite the view name, the filter is anchored on
    first_seen_ts rather than a last-seen column -- confirm the intent.
    NOTE(review): SELECT * over this join yields two distinct_id columns;
    verify the database accepts that in a view definition.
    '''
    query = """SELECT * FROM first_seen_from_events f JOIN events_all e ON f.distinct_id = e.distinct_id AND e."timestamp" - 2*60*60 < f.first_seen_ts ; """
    return create_view("last_events", query)
def create_age_view():
    '''
    Create the "age_from_events" view through create_view.

    For every distinct_id present in both first_seen_from_events and
    last_seen_from_events the view exposes:
      age_s    -- last_seen_ts - first_seen_ts
      age_days -- that difference divided by 60*60*24, rounded up with ceil
                  (presumably the timestamps are epoch seconds -- TODO confirm)

    Returns whatever create_view returns.
    '''
    query = """ SELECT f.distinct_id, last_seen_ts - first_seen_ts as age_s, ceil( (last_seen_ts::numeric - first_seen_ts::numeric)/(60*60*24)) as age_days FROM first_seen_from_events f JOIN last_seen_from_events l ON f.distinct_id = l.distinct_id"""
    return create_view("age_from_events", query)
def create_last_events():
    '''
    Build the "last_events" view by delegating to create_view.

    Query: first_seen_from_events f joined to events_all e on distinct_id,
    restricted to rows where e."timestamp" - 2*60*60 < f.first_seen_ts.

    Returns the result of create_view unchanged.

    NOTE(review): the condition compares against first_seen_ts even though
    the view is called "last_events" -- confirm this is intended.
    NOTE(review): SELECT * on the join produces distinct_id twice; check
    that the target database allows this for a view.
    '''
    target_view = "last_events"
    select_sql = """SELECT * FROM first_seen_from_events f JOIN events_all e ON f.distinct_id = e.distinct_id AND e."timestamp" - 2*60*60 < f.first_seen_ts ; """
    return create_view(target_view, select_sql)
def create_age_view():
    '''
    Build the "age_from_events" view by delegating to create_view.

    The query joins first_seen_from_events (f) and last_seen_from_events (l)
    on distinct_id and returns, per distinct_id, the raw difference
    last_seen_ts - first_seen_ts as age_s and the same difference divided by
    60*60*24 and rounded up (ceil) as age_days.

    Returns the result of create_view unchanged.
    '''
    target_view = "age_from_events"
    select_sql = """ SELECT f.distinct_id, last_seen_ts - first_seen_ts as age_s, ceil( (last_seen_ts::numeric - first_seen_ts::numeric)/(60*60*24)) as age_days FROM first_seen_from_events f JOIN last_seen_from_events l ON f.distinct_id = l.distinct_id"""
    return create_view(target_view, select_sql)
def create_dates_during_users_life():
    '''
    Create the "dates_during_users_life" view through create_view.

    The view pairs each distinct_id in first_last_seen with every row of the
    dates table whose date lies between the date of first_seen_ts and the
    date of last_seen_ts (both inclusive). The *_ts columns are converted by
    adding them, as seconds, to TIMESTAMP 'epoch'.

    Returns whatever create_view returns.
    '''
    query = """ SELECT distinct_id, dates.date, dates.week FROM dates JOIN first_last_seen fl ON dates.date >= DATE(TIMESTAMP 'epoch' + fl.first_seen_ts * INTERVAL '1 Second ') AND dates.date <= DATE(TIMESTAMP 'epoch' + fl.last_seen_ts * INTERVAL '1 Second ') """
    return create_view("dates_during_users_life", query)
def create_dates_during_users_life():
    '''
    Build the "dates_during_users_life" view by delegating to create_view.

    Output columns: distinct_id, dates.date, dates.week -- one row for each
    (user, calendar date) pair where the date falls within the inclusive
    range spanned by the user's first_seen_ts and last_seen_ts in
    first_last_seen. The query turns each *_ts value into a date by
    multiplying it by a one-second interval and adding TIMESTAMP 'epoch'.

    Returns the result of create_view unchanged.
    '''
    target_view = "dates_during_users_life"
    select_sql = """ SELECT distinct_id, dates.date, dates.week FROM dates JOIN first_last_seen fl ON dates.date >= DATE(TIMESTAMP 'epoch' + fl.first_seen_ts * INTERVAL '1 Second ') AND dates.date <= DATE(TIMESTAMP 'epoch' + fl.last_seen_ts * INTERVAL '1 Second ') """
    return create_view(target_view, select_sql)
def cut_events_by_time(view_name, condition):
    '''
    Create a view with the events_all rows that satisfy a join condition.

    The view selects distinct_id plus every column reported by
    get_all_columns_without_id("events_all"), all read from events_all
    (alias e), joined to first_seen_from_events (alias f) on distinct_id
    AND the supplied condition.

    view_name -- name handed to create_view.
    condition -- SQL boolean expression appended verbatim to the JOIN ... ON
                 clause; it may refer to the aliases f and e. It is not
                 escaped, so it must come from trusted code only.

    Returns whatever create_view returns.
    '''
    # Must match the alias given to events_all in the FROM clause below.
    events_alias = "e"

    # distinct_id goes first and unquoted; every other column is wrapped in
    # double quotes, exactly as before.
    select_items = [events_alias + ".distinct_id"]
    for column in get_all_columns_without_id("events_all"):
        select_items.append(events_alias + '."' + column + '"')
    col_sql = ", ".join(select_items)

    # The separating space before FROM is required: without it an empty
    # column list produced "e.distinct_idFROM ...", which is not valid SQL.
    query = ("SELECT " + col_sql
             + " FROM first_seen_from_events f JOIN events_all e"
             + " ON f.distinct_id = e.distinct_id AND "
             + condition + " ; ")
    return create_view(view_name, query)
def cut_events_by_time(view_name, condition):
    '''
    Build a view of events_all restricted by an extra join condition.

    Selected columns are e.distinct_id followed by each name returned by
    get_all_columns_without_id("events_all"), double-quoted and prefixed
    with the events_all alias e. Rows come from first_seen_from_events f
    joined to events_all e on distinct_id AND the given condition.

    view_name -- name passed through to create_view.
    condition -- raw SQL predicate spliced into the ON clause as-is (may use
                 the aliases f and e); nothing is escaped, so only pass
                 trusted, code-defined expressions.

    Returns the result of create_view unchanged.
    '''
    # Alias used for events_all; keep in sync with the FROM clause below.
    main_table = "e"

    quoted_columns = [
        main_table + '."' + name + '"'
        for name in get_all_columns_without_id("events_all")
    ]
    col_sql = ", ".join([main_table + ".distinct_id"] + quoted_columns)

    # A space now separates the column list from FROM. Previously the two
    # were glued together, yielding e.g. "e.distinct_idFROM ..." when there
    # were no extra columns.
    sql = ("SELECT " + col_sql
           + " FROM first_seen_from_events f JOIN events_all e"
           + " ON f.distinct_id = e.distinct_id AND "
           + condition + " ; ")
    return create_view(view_name, sql)
def create_first_events():
    '''
    Create the "first_events" view through create_view.

    Needed to tell a clean session apart from one orphaned by a bug.

    For each distinct_id the view exposes the event_type, "new user" and
    result columns of the events_all rows whose "timestamp" equals the
    user's first_seen_ts, renamed with a first_event_ prefix.

    Note: a user gets several rows when more than one event shares the
    first (or last) second.

    Returns whatever create_view returns.
    '''
    query = """SELECT f.distinct_id, event_type as first_event_type, "new user" as first_event_new_user,result as first_event_result FROM first_seen_from_events f JOIN events_all e ON f.distinct_id = e.distinct_id AND f.first_seen_ts = e."timestamp" ; """
    return create_view("first_events", query)
def create_first_events():
    '''
    Build the "first_events" view by delegating to create_view.

    Purpose: establish which sessions are clean and which were orphaned due
    to a bug.

    The query joins first_seen_from_events (f) to events_all (e) on
    distinct_id and on f.first_seen_ts = e."timestamp", returning
    first_event_type, first_event_new_user and first_event_result.

    Note: the resulting table holds many rows for a user if multiple events
    occur on the first and last second.

    Returns the result of create_view unchanged.
    '''
    target_view = "first_events"
    select_sql = """SELECT f.distinct_id, event_type as first_event_type, "new user" as first_event_new_user,result as first_event_result FROM first_seen_from_events f JOIN events_all e ON f.distinct_id = e.distinct_id AND f.first_seen_ts = e."timestamp" ; """
    return create_view(target_view, select_sql)
def create_users():
    '''
    Create the "users" view through create_view.

    The view holds the distinct combinations of distinct_id,
    mp_country_code, "$region", "$city" and "$email" found in events_all.

    Returns whatever create_view returns.
    '''
    query = """SELECT DISTINCT distinct_id, mp_country_code, "$region", "$city", "$email" FROM events_all"""
    return create_view("users", query)
def make_events_lower_case():
    '''
    Create the "events_all" view over raw_copy with a lower-cased event_type.

    Redshift forces column and table names to lower case, and a table/column
    per event type is planned for the future. To avoid thinking about case
    during lookups, event_type values are lower-cased here, at the import
    stage.

    The view lists the raw_copy columns explicitly, replaces event_type with
    LOWER(event_type), and keeps only rows whose "time" is after
    '2015-01-15' and before '2015-02-28 23:59:59'.

    Returns whatever create_view returns.
    '''
    query = """ SELECT "Scene Card Title", domain, "channel type", "$distinct_id", "User Id", type, "Upper History Navigation", "Author Name", "Bookmark navigation", "Operating System", "Read Time", "Gallery Object Type", "$radio", os_version, "$app_release", "Screen Width", "$device", "New User", "Upper Browser", "$ip", "User Continued Authentication", mp_device_model, "Scene Title", "$manufacturer", "$os_version", screen_width, "URL", "timestamp", "$lib_version", "Upper Channel Type", "$carrier", distinct_id, "Topic ID", "Account Created", "$screen_width", e, recipient, "AccountCreated", mp_country_code, "Story URL", "Default Topic Title", "Curator Name", mp_lib, "Menu Item", "Bookmark Title", n, "Bookmark ID", t, referrer, "$app_version", "$browser", "History navigation", "Author2 Name", browser, "Number of Bookmarks", delivery_id, "Default Topic ID", "$initial_referring_domain", "Trial to Signup", campaign_id, "External Link", "Scene ID", "Source", "$city", "Bookmark Navigation Clicked", "Curator ID", "OS Version", "Story Read Time", category, "$referrer", "Author ID", "Screen Height", "Drop Down Follow Through", "$os", "Accepted", message_id, "Time Since Story Opened", "User Created", "$wifi", "Scene Card ID", "$ios_ifa", "$email", "$initial_referrer", "Story ID", "$referring_domain", "Story Title", "$screen_height", a, "Topic Title", "Scene Card Number", "$model", "$region", "Share Source", message_type, "Story Completion", "Scene Cards", "First Launch", "Default Topic", "Result", "time", "Message", "Navigation Clicked", os, "Author2 ID", screen_height, v, id, LOWER(event_type) AS event_type FROM raw_copy WHERE "time" > '2015-01-15' AND time < 
'2015-02-28 23:59:59' """
    return create_view("events_all", query)
def create_clean_users():
    '''
    Create the "clean_users" view through create_view.

    Only sessions where the user has not signed up, not logged in and not
    used any feature that forces profile creation are analyzed. This throws
    away only 15k signed-up users out of 300k; only 2k of them logged in a
    second time on the web. The same 5% proportion holds among users with
    age > 10 days, so registration is assumed not to affect whether someone
    is a power user. See cleaning_target.md.

    The view returns the users_all_features rows whose distinct_id has both
    first_event_type and last_event_type (in users_first_last_events) within
    a fixed list of browsing events, and whose onboarding / registration /
    login / profile / bookmark counters are all 0.

    Returns whatever create_view returns.
    '''
    # The bookmark_navigation_total predicate appears twice in the WHERE
    # clause; the repetition is redundant but harmless and is kept as-is.
    query = """ SELECT u.* FROM users_all_features u JOIN ( SELECT DISTINCT distinct_id FROM users_first_last_events WHERE first_event_type IN ( 'app viewed', 'app navigation clicked', 'story navigation clicked', 'story viewed', 'topic viewed', 'explore topic clicked', 'explore viewed', 'story completion' ) AND last_event_type IN ( 'app viewed', 'app navigation clicked', 'story navigation clicked', 'story viewed', 'topic viewed', 'explore topic clicked', 'explore viewed', 'story completion' ) ) c ON c.distinct_id = u.distinct_id WHERE onboard_viewed_total =0 AND onboard_navigation_total = 0 AND email_register_total = 0 AND login_result_total = 0 AND Bookmarks_Viewed_total = 0 AND bookmark_navigation_total =0 AND profile_viewed_total = 0 AND profile_page_viewed_total = 0 AND profile_navigation_clicked_total = 0 AND login_page_viewed_total = 0 AND login_viewed_total = 0 AND logging_in_user_id_total = 0 AND bookmark_viewed_total = 0 AND bookmark_story_total = 0 AND bookmark_remove_total = 0 AND forgot_password_page_viewed_total = 0 AND bookmark_navigation_total = 0 ;"""
    return create_view("clean_users", query)
def create_users_first_last_events():
    '''
    Create the "users_first_last_events" view through create_view.

    The view's SQL is produced by aggregate_all_features from the tables
    returned by get_all_first_last_event_tables.

    Returns whatever create_view returns.
    '''
    source_tables = get_all_first_last_event_tables()
    combined_sql = aggregate_all_features(source_tables)
    return create_view("users_first_last_events", combined_sql)
def make_events_lower_case():
    '''
    Build the "events_all" view by delegating to create_view.

    Background: Redshift forces column and table names to be lower case, and
    the plan is to create a table/column for every event type later on.
    Lower-casing event_type here, at the import stage, means lookups never
    have to care about case.

    The query selects an explicit list of raw_copy columns together with
    LOWER(event_type) AS event_type, limited to rows with "time" greater
    than '2015-01-15' and less than '2015-02-28 23:59:59'.

    Returns the result of create_view unchanged.
    '''
    target_view = "events_all"
    select_sql = """ SELECT "Scene Card Title", domain, "channel type", "$distinct_id", "User Id", type, "Upper History Navigation", "Author Name", "Bookmark navigation", "Operating System", "Read Time", "Gallery Object Type", "$radio", os_version, "$app_release", "Screen Width", "$device", "New User", "Upper Browser", "$ip", "User Continued Authentication", mp_device_model, "Scene Title", "$manufacturer", "$os_version", screen_width, "URL", "timestamp", "$lib_version", "Upper Channel Type", "$carrier", distinct_id, "Topic ID", "Account Created", "$screen_width", e, recipient, "AccountCreated", mp_country_code, "Story URL", "Default Topic Title", "Curator Name", mp_lib, "Menu Item", "Bookmark Title", n, "Bookmark ID", t, referrer, "$app_version", "$browser", "History navigation", "Author2 Name", browser, "Number of Bookmarks", delivery_id, "Default Topic ID", "$initial_referring_domain", "Trial to Signup", campaign_id, "External Link", "Scene ID", "Source", "$city", "Bookmark Navigation Clicked", "Curator ID", "OS Version", "Story Read Time", category, "$referrer", "Author ID", "Screen Height", "Drop Down Follow Through", "$os", "Accepted", message_id, "Time Since Story Opened", "User Created", "$wifi", "Scene Card ID", "$ios_ifa", "$email", "$initial_referrer", "Story ID", "$referring_domain", "Story Title", "$screen_height", a, "Topic Title", "Scene Card Number", "$model", "$region", "Share Source", message_type, "Story Completion", "Scene Cards", "First Launch", "Default Topic", "Result", "time", "Message", "Navigation Clicked", os, "Author2 ID", screen_height, v, id, LOWER(event_type) AS event_type FROM raw_copy WHERE "time" > '2015-01-15' AND time < 
'2015-02-28 23:59:59' """
    return create_view(target_view, select_sql)
def createView():
    """Return JSON true if the view was created, otherwise JSON false.

    Calls create_view with the "user_id" entry of the session and the
    "viewname" query argument of the request (None when the argument is
    absent), then reports the truthiness of its result via jsonify.
    """
    created = create_view(session["user_id"], request.args.get("viewname"))
    return jsonify(bool(created))
def create_users():
    '''
    Build the "users" view by delegating to create_view.

    Query: SELECT DISTINCT over distinct_id, mp_country_code, "$region",
    "$city" and "$email" from events_all.

    Returns the result of create_view unchanged.
    '''
    target_view = "users"
    select_sql = """SELECT DISTINCT distinct_id, mp_country_code, "$region", "$city", "$email" FROM events_all"""
    return create_view(target_view, select_sql)