user_topic - tuple of (user, topic ID) user_segment - group(s) that the user is a member of for dashboard comparison purposes (eg. A/B test experiments, has coach, etc.) """ is_test = lambda info: info.get('purpose', None) == 'randomized' test_cards = [(i, x[0]) for i, x in enumerate(attempts, 1) if is_test(x[2])] for i in range(1, len(test_cards)): prev_card, curr_card = test_cards[i - 1], test_cards[i] total_gain = float(curr_card[1]) - float(prev_card[1]) incremental_gain = total_gain / (curr_card[0] - prev_card[0]) topic = user_topic[1] if user_topic and len(user_topic) >= 2 else None if topic == "any": # it is not cool to analyze cards done from various stacks # as if they were done with one big, generic stack. for example, # if a user moved from an easy to a difficult topic, you would # likely see accuracy drop on the randomized cards, even though # this is very healthy user behavior. return for i in range(prev_card[0], curr_card[0]): # TODO(david): Output and group by user segments (eg. # experiments the user was in). print '%s\t%s\t%s\t%s\t%s' % (topic, user_segment, len(attempts), i, incremental_gain) if __name__ == '__main__': table_parser.parse_user_topic_input(emit_accuracy_deltas)
def emit_topic_retention(attempts, user_topic, user_segment):
    """Output a row for every (topic, segment, card number) to be
    aggregated in Hive.

    More precisely, each output row has the tab-separated values
    <topic, user segment, randomized?, "card_number", card #, correct (1 or 0)>

    attempts - a list of exercise card attempts in a topic, ordered by
        time done. Each element is a tuple
        (bool correct, int problem_number, dict scheduler_info).
    user_topic - tuple of (user, topic ID)
    user_segment - group(s) that the user is a member of for dashboard
        comparison purposes (eg. A/B test experiments, has coach, etc.)
    """

    def is_randomized(info):
        # Hive only casts empty strings from custom scripts to false,
        # so emit 'TRUE' / '' rather than Python booleans.
        return 'TRUE' if info.get('purpose') == 'randomized' else ''

    # The user half of user_topic is not emitted; keep the unpacking
    # but mark the name as intentionally unused.
    _user, topic = user_topic

    # Output retention stats by card number
    # TODO(david): Output time taken buckets
    for i, attempt in enumerate(attempts, 1):
        # Single-argument print() is valid and prints identically under
        # both Python 2 and Python 3; the bare print statement was
        # Python 2-only and broke importing this module on Python 3.
        print('%s\t%s\t%s\t%s\t%s\t%s' % (topic, user_segment,
                is_randomized(attempt[2]), "card_number", i,
                int(attempt[0])))


if __name__ == '__main__':
    table_parser.parse_user_topic_input(emit_topic_retention)
def emit_topic_retention(attempts, user_topic, user_segment):
    """Output one Hive-aggregatable row per (topic, segment, card number).

    Each printed row is the tab-separated tuple
    <topic, user segment, randomized?, "card_number", card #, correct (1 or 0)>

    attempts - a list of exercise card attempts in a topic, ordered by
        time done. Each element is a tuple
        (bool correct, int problem_number, dict scheduler_info).
    user_topic - tuple of (user, topic ID)
    user_segment - group(s) that the user is a member of for dashboard
        comparison purposes (eg. A/B test experiments, has coach, etc.)
    """

    def is_randomized(info):
        # Hive only casts empty strings from custom scripts to false,
        # hence 'TRUE' / '' instead of Python booleans.
        return 'TRUE' if info.get('purpose') == 'randomized' else ''

    # Only the topic ID is emitted; the user id is unused here.
    _user, topic = user_topic

    # Output retention stats by card number
    # TODO(david): Output time taken buckets
    for card_number, (correct, _problem_number, scheduler_info) in \
            enumerate(attempts, 1):
        # Parenthesized single-argument print() behaves identically on
        # Python 2 and 3; the bare Python 2 print statement was a syntax
        # error under Python 3 and prevented the module from importing.
        print('%s\t%s\t%s\t%s\t%s\t%s' % (topic, user_segment,
                is_randomized(scheduler_info), "card_number",
                card_number, int(correct)))


if __name__ == '__main__':
    table_parser.parse_user_topic_input(emit_topic_retention)