def query_classification_data(**params):
    """
    Run the classification-sample query against staging.leila_edits.

    keyword parameters (bound to the %(name)s placeholders in the SQL):
        horizon_month: month whose rows form the sample (alias x); also
            the upper bound for the cumulative per-user sums (alias z)
        prediction_month_start: lower bound (inclusive) of the month
            window used for the survive flag
        prediction_month_end: upper bound (inclusive) of that window
        n: maximum number of rows to return (LIMIT)

    The result has one row per sampled horizon-month row, in random
    order (ORDER BY RAND()). survive is 1 when the user_id also has a
    row whose month lies in the prediction window, otherwise 0.

    returns:
        whatever query_s1 returns for the query and params
    """
    # Adjacent literals are concatenated by the parser. Every fragment
    # starts with one space, so the joined text stays single-space
    # separated, with one leading and one trailing space.
    sql = (
        " SELECT"
        # casted columns of the horizon-month row
        " CAST(x.cla_0 AS INT) as num_edits_0,"
        " CAST(x.cla_1 AS INT) as num_edits_1,"
        " CAST(x.cla_2 AS INT) as num_edits_2,"
        " CAST(x.cla_3 AS INT) as num_edits_3,"
        " CAST(x.cla_tot AS INT) as num_edits_total,"
        " CAST(x.cla_tot - x.cla_0 - x.cla_1 - x.cla_2 - x.cla_3 AS INT)"
        " as num_edits_rest,"
        " CAST(x.v_no_months AS INT) as months_since_registration,"
        " CAST(x.aeb_no_months AS INT) as months_since_5_edits,"
        " CAST(x.talk_counts AS INT) as talk_counts,"
        " CAST(x.friend_count AS INT) as friend_count,"
        # cumulative sums up to and including the horizon month
        " z.num_edits_to_date_0,"
        " z.num_edits_to_date_1,"
        " z.num_edits_to_date_2,"
        " z.num_edits_to_date_3,"
        " z.num_edits_to_date_tot,"
        " z.num_edits_got_archived_to_date_0,"
        " z.num_edits_got_reverted_to_date_0,"
        # label: does the user have a row in the prediction window?
        " CASE WHEN y.user_id IS NOT NULL THEN 1 ELSE 0 END as survive"
        # x: rows of the horizon month
        " FROM (SELECT * from staging.leila_edits"
        " WHERE month = %(horizon_month)s ) x"
        # y: distinct users with a row inside the prediction window
        " LEFT JOIN (SELECT distinct(user_id)"
        " FROM staging.leila_edits"
        " WHERE month <= %(prediction_month_end)s"
        " AND month >= %(prediction_month_start)s ) y"
        " ON (x.user_id = y.user_id)"
        # z: per-user totals over all months <= horizon month
        " LEFT JOIN (SELECT user_id,"
        " SUM(cla_0) as num_edits_to_date_0,"
        " SUM(cla_1) as num_edits_to_date_1,"
        " SUM(cla_2) as num_edits_to_date_2,"
        " SUM(cla_3) as num_edits_to_date_3,"
        " SUM(cla_tot) as num_edits_to_date_tot,"
        " SUM(cala_0) as num_edits_got_archived_to_date_0,"
        " SUM(crla_0) as num_edits_got_reverted_to_date_0"
        " FROM staging.leila_edits"
        " WHERE month <= %(horizon_month)s"
        " GROUP BY user_id) z"
        " ON (x.user_id = z.user_id)"
        " ORDER BY RAND()"
        " LIMIT %(n)s "
    )
    return query_s1(sql, params)
def query_career_cluster_data():
    """
    Fetch per-user edit statistics from staging.leila_edits.

    Rows are grouped by user_id and only groups with more than 5 rows
    are kept (presumably one row per user and month -- confirm against
    the table definition). For each kept user the query returns AVG and
    STD of cla_0 and cla_1. At most 50000 rows are returned.

    Note: the STD(...) columns are aliased var_monthly_edits_*.

    returns:
        the result of query_s1 with missing values replaced by 0 via
        .fillna(0) (presumably a pandas DataFrame -- confirm against
        query_s1)
    """
    # Further aggregates that are NOT part of the query below, kept
    # here for reference:
    #   AVG(cla_2) as avg_monthly_edits_2,
    #   STD(cla_2) as var_monthly_edits_2,
    #   AVG(cla_3) as avg_monthly_edits_3,
    #   STD(cla_3) as var_monthly_edits_3,
    #   AVG(friend_count) as avg_friend_count,
    #   STD(friend_count) as var_friend_count,
    #   AVG(cla_tot) as avg_monthly_edits_tot,
    #   STD(cla_tot) as var_monthly_edits_tot,
    #   COUNT(*) as num_active_months,
    #   SUM(cala_0)/SUM(cla_0) as percent_edits_got_archived_0,
    #   SUM(crla_0)/SUM(cla_0) as percent_edits_got_reverted_0

    # Adjacent literals are concatenated by the parser. Every fragment
    # starts with one space, so the joined text stays single-space
    # separated, with one leading and one trailing space.
    sql = (
        " SELECT"
        " AVG(cla_0) as avg_monthly_edits_0,"
        " STD(cla_0) as var_monthly_edits_0,"
        " AVG(cla_1) as avg_monthly_edits_1,"
        " STD(cla_1) as var_monthly_edits_1"
        " FROM staging.leila_edits"
        " GROUP BY user_id"
        " HAVING COUNT(*) > 5"
        " LIMIT 50000 "
    )
    # The query has no placeholders, so an empty parameter dict is passed.
    stats = query_s1(sql, {})
    return stats.fillna(0)
def query_monthly_behaviour_data():
    """
    Fetch the per-row edit shares for the month '2013-05-31'.

    For every row of staging.leila_edits with that month, the query
    divides cla_0, cla_1, cla_2 and cla_3 by cla_tot and returns the
    ratios as percent_main, percent_talk, percent_user and
    percent_usertalk.

    returns:
        the result of query_s1 with missing values replaced by 0 via
        .fillna(0) (presumably a pandas DataFrame -- confirm against
        query_s1)
    """
    # Adjacent literals are concatenated by the parser. Every fragment
    # starts with one space, so the joined text stays single-space
    # separated, with one leading and one trailing space.
    sql = (
        " SELECT"
        " cla_0/cla_tot AS percent_main,"
        " cla_1/cla_tot AS percent_talk,"
        " cla_2/cla_tot AS percent_user,"
        " cla_3/cla_tot AS percent_usertalk"
        " FROM staging.leila_edits"
        " WHERE month = '2013-05-31' "
    )
    # The month is hard-coded in the SQL, so no parameters are bound.
    shares = query_s1(sql, {})
    return shares.fillna(0)