def _context_series(group):
    """Collect one answer series per user for a single context group.

    Returns [] when the group holds fewer than ``context_answer_limit``
    answers.  Each user's answers are mapped through ``save_fun``; series
    shorter than ``user_length`` are dropped.  When ``require_length`` is
    set, every series is truncated to the last/first ``length`` items and
    padded with ``None`` up to ``length`` (padding at the front when
    ``reverse`` is set, at the back otherwise).

    NOTE(review): depends on closure variables from the enclosing scope:
    context_answer_limit, save_fun, reverse, require_length, length,
    user_length — confirm against the enclosing definition.
    """
    if len(group) < context_answer_limit:
        return []

    per_user = defaultdict(list)
    for row in iterdicts(group):
        per_user[row['user_id']].append(save_fun(row))

    def _normalize(series):
        # Work on the reversed series so truncation keeps the tail
        # of the original order when ``reverse`` is set.
        if reverse:
            series = series[::-1]
        padding = []
        if require_length:
            series = series[:min(len(series), length)]
            padding = [None] * (length - len(series))
        if reverse:
            # Restore original order; padding goes in front.
            return padding + series[::-1]
        return series + padding

    return [
        _normalize(series)
        for series in per_user.itervalues()
        if user_length is None or len(series) >= user_length
    ]
def learning_points(data, length=5):
    """Compute success rates bucketed by attempt number and by the delay
    since the user's previous answer in the same context.

    Args:
        data: answer data; rows need 'user_id', 'term_type',
            'context_name', 'time', 'item_asked_id', 'item_answered_id'.
        length (int): number of attempts (0-based) to report.

    Returns:
        list: (attempt, bucket_lower_bound_seconds, mean_success_or_None,
        observation_count) tuples; the mean is None when fewer than 30
        observations fall into the bucket.
    """
    # Per (term_type, context_name) context, per user, record for each
    # answer: (time, seconds since previous answer, attempt index, correct).
    context_answers = defaultdict(dict)
    for row in iterdicts(data):
        user_answers = context_answers[row['term_type'], row['context_name']]
        correct = row['item_asked_id'] == row['item_answered_id']
        if row['user_id'] in user_answers:
            history = user_answers[row['user_id']]
            history.append((
                row['time'],
                (row['time'] - history[-1][0]).total_seconds(),
                len(history),  # attempt index, taken before this append
                correct,
            ))
        else:
            user_answers[row['user_id']] = [(row['time'], 0, 0, correct)]
    answers = [
        a
        for user_answers in context_answers.values()
        for uas in user_answers.values()
        for a in uas
    ]
    # Fixed delay buckets in seconds (1 min .. 1 day); the last bucket is
    # capped at roughly 375 days.  (A dead numpy.percentile-based bucket
    # computation that was immediately overwritten has been removed.)
    bounds = [60, 120, 300, 600, 3600, 3600 * 24]
    # Materialize as a list: the pairs are iterated once per attempt.
    thresholds = list(zip([0] + bounds, bounds + [60 * 60 * 24 * 375]))
    result = []
    for attempt in range(length):
        for lower, upper in thresholds:
            bucket = [
                xs[3] for xs in answers
                if lower <= xs[1] < upper and xs[2] == attempt
            ]
            result.append((
                attempt,
                lower,
                None if len(bucket) < 30 else numpy.mean(bucket),
                len(bucket),
            ))
    return result
def _attrition_bias(group):
    """Per-user error series for one context group.

    Returns one boolean list per user (True where the answer was wrong),
    or [] when the group has fewer than ``context_answer_limit`` answers.

    NOTE(review): ``context_answer_limit`` is a closure variable from the
    enclosing scope.
    """
    if len(group) < context_answer_limit:
        return []
    errors_by_user = defaultdict(list)
    for row in iterdicts(group):
        wrong = row['item_asked_id'] != row['item_answered_id']
        errors_by_user[row['user_id']].append(wrong)
    return errors_by_user.values()
def test_questions(data, length=100):
    """Histogram of test questions by position within practice sequences.

    Counts, across all (user, context, term_type) sequences ordered by id,
    how often a test question (metainfo_id == 1) appears at each position
    0..length-1.

    Args:
        data (pandas.DataFrame): answer data.
        length (int): only positions below this value are counted.

    Returns:
        dict: position -> number of test questions observed there.
    """
    counts = defaultdict(lambda: 0)
    current_key = None
    position = None
    ordered = data.sort(['user_id', 'context_name', 'term_type', 'id'])
    for row in iterdicts(ordered):
        key = (row['user_id'], row['context_name'], row['term_type'])
        if key != current_key:
            # New (user, context) sequence starts: reset the position.
            current_key = key
            position = 0
        if position < length and row['metainfo_id'] == 1:
            counts[position] += 1
        position += 1
    return dict(counts.items())
def success_before(feedback, answers, override=False):
    """Add a 'success_before' column to ``feedback``: the user's success
    rate (as a percentage rounded to the nearest multiple of 5) over the
    answers given before each feedback entry.

    Args:
        feedback (pandas.DataFrame): rows with 'user', 'id', 'inserted';
            sorted in place by user and id.
        answers (pandas.DataFrame): rows with 'user', 'inserted',
            'place_asked', 'place_answered'.
        override (bool): recompute even if the column already exists.

    Returns:
        pandas.DataFrame: ``feedback`` with the 'success_before' column.

    NOTE(review): an identical definition appears later in this file and
    shadows this one at import time — confirm which copy is intended.
    NOTE(review): raises ZeroDivisionError when a user has no answers
    inserted before their feedback row (len(interval_data) == 0) —
    confirm callers guarantee prior answers exist.
    """
    if len(feedback) == 0:
        return feedback
    if not override and 'success_before' in feedback:
        return feedback
    feedback.sort(['user', 'id'], inplace=True)
    last_user = None
    user_data = None
    last_date = None
    success_before_dict = {}
    for row in iterdicts(feedback):
        if row['user'] != last_user:
            last_user = row['user']
            # NOTE(review): last_date is reset here and never assigned
            # again, so the "x['inserted'] > last_date" branch of
            # filter_fun below is effectively dead code.
            last_date = None
            user_data = answers[answers['user'] == last_user]
        # Answers by this user inserted strictly before this feedback row.
        filter_fun = lambda x: x['inserted'] < row['inserted'] and (last_date is None or x['inserted'] > last_date)
        interval_data = user_data[user_data.apply(filter_fun, axis=1)]
        prob = sum(interval_data['place_asked'] == interval_data['place_answered']) / float(len(interval_data))
        # Round the percentage to the nearest multiple of 5.
        success_before_dict[row['id']] = int(5 * round(float(prob * 100) / 5))
    feedback['success_before'] = feedback.apply(lambda x: success_before_dict[x['id']], axis=1)
    return feedback
def load_data(answer_limit, filter_invalid_tests=True, filter_invalid_response_time=True, rolling_success=False):
    """Load and clean answer data from the local CSV files.

    Reads ./answers.csv, ./flashcards.csv and ./ip_address.csv, applies
    the user-level filters below, and decorates the result with session
    number, school flag and (optionally) rolling-success columns.

    Args:
        answer_limit (int): keep only users with at least this many answers.
        filter_invalid_tests (bool): drop users who answered in the
            invalid test context (context_id 17) or whose test questions
            (metainfo_id 1) do not fall on every 10th position of their
            practice sequence.
        filter_invalid_response_time (bool): drop users with any negative
            response time.
        rolling_success (bool): also decorate last-in-session and
            rolling-success columns.

    Returns:
        pandas.DataFrame: cleaned answers sorted by user_id, id.
    """
    answers = pandas.read_csv('./answers.csv', index_col=False, parse_dates=['time'])
    flashcards = pandas.read_csv('./flashcards.csv', index_col=False)
    user_ip = pandas.read_csv('./ip_address.csv', index_col=False)
    answers['experiment_setup_name'] = answers['experiment_setup_id'].apply(lambda i: SETUP[i])

    answers = _filter_few_answers(answers, answer_limit)
    if filter_invalid_response_time:
        # A negative response time indicates broken client-side logging;
        # drop every answer of the affected users.
        invalid_users = answers[answers['response_time'] < 0]['user_id'].unique()
        answers = _drop_users(answers, invalid_users)
    answers = pandas.merge(answers, flashcards, on='item_id', how='inner')
    if filter_invalid_tests:
        # Context 17 is a known-invalid test context.
        invalid_users = answers[answers['context_id'] == 17]['user_id'].unique()
        answers = _drop_users(answers, invalid_users)
        answers = _drop_users(answers, _users_with_misplaced_tests(answers))

    # 10-hour gap starts a new session.
    answers = pa.decorate_session_number(answers, 3600 * 10)
    answers = decorate_school(answers, user_ip=user_ip)
    if rolling_success:
        answers = pa.decorate_last_in_session(answers)
        answers = pa.decorate_rolling_success(answers)
    return answers.sort(['user_id', 'id'])


def _filter_few_answers(answers, answer_limit):
    # Keep only users who produced at least answer_limit answers.
    counts = answers.groupby('user_id').apply(len).to_dict()
    valid_users = [user for user, n in counts.items() if n >= answer_limit]
    return answers[answers['user_id'].isin(valid_users)]


def _drop_users(answers, invalid_users):
    # Remove every answer belonging to the given users.
    return answers[~answers['user_id'].isin(invalid_users)]


def _users_with_misplaced_tests(answers):
    # Users with a test question (metainfo_id == 1) at a position not
    # divisible by 10 within their (user, context, term_type) sequence.
    invalid_users = set()
    last_user = None
    last_context = None
    counter = None
    for row in iterdicts(answers.sort(['user_id', 'context_name', 'term_type', 'id'])):
        if last_user != row['user_id'] or last_context != (row['context_name'], row['term_type']):
            last_user = row['user_id']
            last_context = (row['context_name'], row['term_type'])
            counter = 0
        if row['metainfo_id'] == 1 and counter % 10 != 0:
            invalid_users.add(row['user_id'])
        counter += 1
    return invalid_users
def prepare_difficulty_and_prior_skill(answers, difficulty=None):
    '''
    Compute the difficulty for places and the prior skill for users.

    Args:
        answers (pandas.DataFrame): data frame containing answer data
        difficulty (dict, optional): precomputed place difficulties to
            preserve; when given (even empty) the environment keeps them
            fixed instead of estimating from scratch

    Returns:
        tuple: (place -> difficulty dict, user's id -> prior skill dict)
    '''
    # Only each user's first answer contributes to the prior skill
    # estimate.  (A redundant duplicate .sort('id') call was removed.)
    first = first_answers(answers, ['user']).sort('id')
    # Use "is not None" so an explicitly supplied empty dict still
    # selects the preserving environment.
    if difficulty is not None:
        env = PreserveDifficultyEnvironment(difficulty)
    else:
        env = InMemoryEnvironment()
    stream = DefaultAnswerStream(env)
    for a in iterdicts(first):
        stream.stream_answer(a)
    # Dict comprehension also handles the no-users case, where the
    # original zip(*items) unpacking raised a ValueError.
    prior_skill = {
        user_id: predict_simple(skill, 0)[0]
        for user_id, skill in env.export_prior_skill().items()
    }
    return env.export_difficulty(), prior_skill
def success_before(feedback, answers, override=False):
    """Add a 'success_before' column to ``feedback``: the user's success
    rate (as a percentage rounded to the nearest multiple of 5) over the
    answers given before each feedback entry.

    Args:
        feedback (pandas.DataFrame): rows with 'user', 'id', 'inserted';
            sorted in place by user and id.
        answers (pandas.DataFrame): rows with 'user', 'inserted',
            'place_asked', 'place_answered'.
        override (bool): recompute even if the column already exists.

    Returns:
        pandas.DataFrame: ``feedback`` with the 'success_before' column.

    NOTE(review): this is a duplicate of an earlier definition in this
    file; being later, this copy shadows the earlier one at import time —
    confirm the duplication is intentional and remove one copy.
    NOTE(review): raises ZeroDivisionError when a user has no answers
    inserted before their feedback row (len(interval_data) == 0) —
    confirm callers guarantee prior answers exist.
    """
    if len(feedback) == 0:
        return feedback
    if not override and 'success_before' in feedback:
        return feedback
    feedback.sort(['user', 'id'], inplace=True)
    last_user = None
    user_data = None
    last_date = None
    success_before_dict = {}
    for row in iterdicts(feedback):
        if row['user'] != last_user:
            last_user = row['user']
            # NOTE(review): last_date is reset here and never assigned
            # again, so the "x['inserted'] > last_date" branch of
            # filter_fun below is effectively dead code.
            last_date = None
            user_data = answers[answers['user'] == last_user]
        # Answers by this user inserted strictly before this feedback row.
        filter_fun = lambda x: x['inserted'] < row['inserted'] and (
            last_date is None or x['inserted'] > last_date)
        interval_data = user_data[user_data.apply(filter_fun, axis=1)]
        prob = sum(interval_data['place_asked'] == interval_data['place_answered']) / float(len(interval_data))
        # Round the percentage to the nearest multiple of 5.
        success_before_dict[row['id']] = int(5 * round(float(prob * 100) / 5))
    feedback['success_before'] = feedback.apply(
        lambda x: success_before_dict[x['id']], axis=1)
    return feedback