from collections import defaultdict

import numpy
import pandas


def _context_series(group):
    # Helper extracted from a closure: context_answer_limit, save_fun, reverse,
    # require_length, length and user_length are expected to be bound in the
    # enclosing scope.
    if len(group) < context_answer_limit:
        return []
    user_answers_dict = defaultdict(list)
    for row in iterdicts(group):
        user_answers_dict[row['user_id']].append(save_fun(row))

    def _user_answers(answers):
        if reverse:
            answers = answers[::-1]
        if require_length:
            # Keep at most `length` answers and pad the remainder with Nones.
            answers = answers[:length]
            nones = [None] * (length - len(answers))
        else:
            nones = []
        if reverse:
            # Restore chronological order; the padding goes to the front.
            answers = answers[::-1]
            return nones + answers
        else:
            return answers + nones

    return [
        _user_answers(answers)
        for answers in user_answers_dict.values()
        if user_length is None or len(answers) >= user_length
    ]
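
# The snippets in this listing rely on an iterdicts() helper that is not
# shown here; a plausible minimal implementation (an assumption, not the
# original) iterates over DataFrame rows as plain dicts:
def iterdicts(df):
    for _, row in df.iterrows():
        yield row.to_dict()
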
def learning_points(data, length=5):
    # For each (user, context) pair, collect per-answer tuples of the form
    # (time, seconds since the previous answer, attempt number, correct?).
    context_answers = defaultdict(dict)
    for row in iterdicts(data):
        user_answers = context_answers[row['term_type'], row['context_name']]
        if row['user_id'] in user_answers:
            user_answers[row['user_id']].append((
                row['time'],
                (row['time'] - user_answers[row['user_id']][-1][0]).total_seconds(),
                len(user_answers[row['user_id']]),
                row['item_asked_id'] == row['item_answered_id']
            ))
        else:
            user_answers[row['user_id']] = [(
                row['time'],
                0,
                0,
                row['item_asked_id'] == row['item_answered_id']
            )]
    answers = [a for user_answers in context_answers.values() for uas in user_answers.values() for a in uas]
    # A data-driven alternative would bucket by delay deciles, e.g.
    # numpy.percentile([xs[1] for xs in answers], range(10, 100, 10)),
    # but fixed bucket boundaries are used instead:
    # 1 min, 2 min, 5 min, 10 min, 1 hour, 1 day.
    thresholds = [60, 120, 300, 600, 3600, 3600 * 24]
    thresholds = list(zip([0] + thresholds, thresholds + [60 * 60 * 24 * 375]))
    result = []
    for attempt in range(length):
        for lower, upper in thresholds:
            filtered = [xs[3] for xs in answers if lower <= xs[1] < upper and xs[2] == attempt]
            # Report a mean success rate only for buckets with at least
            # 30 observations.
            result.append((attempt, lower, None if len(filtered) < 30 else numpy.mean(filtered), len(filtered)))
    return result
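
# Hypothetical smoke test for learning_points() on a toy frame; the column
# names are taken from the function above, the data itself is invented.
import datetime

_toy = pandas.DataFrame({
    'user_id': [1, 1, 1],
    'term_type': ['state'] * 3,
    'context_name': ['Europe'] * 3,
    'time': [datetime.datetime(2015, 1, 1, 12, 0, s) for s in (0, 30, 90)],
    'item_asked_id': [10, 10, 10],
    'item_answered_id': [10, 11, 10],
})
# Each entry of the result is (attempt, bucket lower bound in seconds,
# mean success rate or None, number of observations); with only three
# answers every bucket stays below the 30-observation threshold.
_points = learning_points(_toy, length=3)
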
def _attrition_bias(group):
    # As above, context_answer_limit is expected to be bound in the
    # enclosing scope.
    if len(group) < context_answer_limit:
        return []
    user_answers_dict = defaultdict(list)
    for row in iterdicts(group):
        # True marks an incorrect answer.
        user_answers_dict[row['user_id']].append(row['item_asked_id'] != row['item_answered_id'])
    return list(user_answers_dict.values())
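
# A minimal sketch (not part of the source) of how the per-user error lists
# might be summarized: mean error on the first answer, grouped by how many
# answers the user eventually gave; differences between groups are the
# attrition bias the helper's name refers to.
def first_answer_error_by_survival(user_error_lists):
    by_survival = defaultdict(list)
    for errors in user_error_lists:
        by_survival[len(errors)].append(errors[0])
    return {survival: numpy.mean(first_errors)
            for survival, first_errors in by_survival.items()}
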
def test_questions(data, length=100):
    # Count test questions (metainfo_id == 1) by their position within each
    # user's practice of a single context.
    last_user = None
    last_context = None
    counter = None
    result = defaultdict(int)
    for row in iterdicts(data.sort_values(['user_id', 'context_name', 'term_type', 'id'])):
        if last_user != row['user_id'] or last_context != (row['context_name'], row['term_type']):
            last_user = row['user_id']
            last_context = (row['context_name'], row['term_type'])
            counter = 0
        if row['metainfo_id'] == 1 and counter < length:
            result[counter] += 1
        counter += 1
    return dict(result)
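
# Hypothetical sanity check (assumes an `answers` frame with the columns
# used above): with tests scheduled at every tenth question, the counts
# should concentrate on positions 0, 10, 20, ...
# position_counts = test_questions(answers, length=50)
# unexpected = {pos: n for pos, n in position_counts.items() if pos % 10 != 0}
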
def success_before(feedback, answers, override=False):
    if len(feedback) == 0:
        return feedback
    if not override and 'success_before' in feedback:
        return feedback
    feedback.sort_values(['user', 'id'], inplace=True)
    last_user = None
    user_data = None
    last_date = None
    success_before_dict = {}
    for row in iterdicts(feedback):
        if row['user'] != last_user:
            last_user = row['user']
            last_date = None
            user_data = answers[answers['user'] == last_user]
        # Answers between the previous feedback (if any) and this one.
        filter_fun = lambda x: x['inserted'] < row['inserted'] and (last_date is None or x['inserted'] > last_date)
        interval_data = user_data[user_data.apply(filter_fun, axis=1)]
        if len(interval_data) == 0:
            # Guard against an empty interval, which would divide by zero.
            success_before_dict[row['id']] = None
        else:
            prob = sum(interval_data['place_asked'] == interval_data['place_answered']) / float(len(interval_data))
            # Round the success percentage to the nearest multiple of five.
            success_before_dict[row['id']] = int(5 * round(float(prob * 100) / 5))
        last_date = row['inserted']
    feedback['success_before'] = feedback['id'].map(success_before_dict)
    return feedback
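
# Toy illustration of success_before(); the schemas are inferred from the
# code above, the data is invented.
import datetime

_fb = pandas.DataFrame({
    'id': [1],
    'user': [7],
    'inserted': [datetime.datetime(2015, 1, 2)],
})
_past = pandas.DataFrame({
    'user': [7, 7],
    'inserted': [datetime.datetime(2015, 1, 1, 10), datetime.datetime(2015, 1, 1, 11)],
    'place_asked': [1, 2],
    'place_answered': [1, 3],
})
# One of the two earlier answers was correct, so the rounded percentage is 50.
print(success_before(_fb, _past)['success_before'].iloc[0])
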
def load_data(answer_limit, filter_invalid_tests=True, filter_invalid_response_time=True, rolling_success=False):
    answers = pandas.read_csv('./answers.csv', index_col=False, parse_dates=['time'])
    flashcards = pandas.read_csv('./flashcards.csv', index_col=False)
    user_ip = pandas.read_csv('./ip_address.csv', index_col=False)

    # SETUP maps experiment setup ids to names; it is defined elsewhere.
    answers['experiment_setup_name'] = answers['experiment_setup_id'].apply(lambda i: SETUP[i])

    # Keep only users with at least answer_limit answers.
    valid_users = [user for user, count in answers.groupby('user_id').size().items() if count >= answer_limit]
    answers = answers[answers['user_id'].isin(valid_users)]

    if filter_invalid_response_time:
        # Negative response times are invalid; drop the affected users entirely.
        invalid_users = answers[answers['response_time'] < 0]['user_id'].unique()
        answers = answers[~answers['user_id'].isin(invalid_users)]

    answers = pandas.merge(answers, flashcards, on='item_id', how='inner')

    if filter_invalid_tests:
        # Users who ever answered in context 17 are excluded.
        invalid_users = answers[answers['context_id'] == 17]['user_id'].unique()
        answers = answers[~answers['user_id'].isin(invalid_users)]

        # Test questions (metainfo_id == 1) are expected only at every tenth
        # position within a user's practice of a context; drop users for whom
        # this invariant is broken.
        invalid_users = set()
        last_user = None
        last_context = None
        counter = None
        for row in iterdicts(answers.sort_values(['user_id', 'context_name', 'term_type', 'id'])):
            if last_user != row['user_id'] or last_context != (row['context_name'], row['term_type']):
                last_user = row['user_id']
                last_context = (row['context_name'], row['term_type'])
                counter = 0
            if row['metainfo_id'] == 1 and counter % 10 != 0:
                invalid_users.add(row['user_id'])
            counter += 1
        answers = answers[~answers['user_id'].isin(invalid_users)]

    # decorate_* helpers come from the surrounding project (module aliased as pa).
    answers = pa.decorate_session_number(answers, 3600 * 10)
    answers = decorate_school(answers, user_ip=user_ip)
    if rolling_success:
        answers = pa.decorate_last_in_session(answers)
        answers = pa.decorate_rolling_success(answers)
    return answers.sort_values(['user_id', 'id'])
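
# Typical invocation (commented out because it needs the three CSV files in
# the working directory plus the SETUP mapping, decorate_school and the `pa`
# helper module):
# answers = load_data(answer_limit=10, rolling_success=True)
# print(answers.groupby('experiment_setup_name').size())
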
def prepare_difficulty_and_prior_skill(answers, difficulty=None):
    '''
    Compute the difficulty of places and the prior skill of users.

    Args:
        answers (pandas.DataFrame):
            data frame containing answer data
        difficulty (dict, optional):
            precomputed difficulty to preserve; when given, only prior
            skill is estimated

    Returns:
        (dict, dict): place id -> difficulty, user id -> prior skill
    '''
    first = first_answers(answers, ['user']).sort_values('id')
    if difficulty:
        env = PreserveDifficultyEnvironment(difficulty)
    else:
        env = InMemoryEnvironment()
    stream = DefaultAnswerStream(env)
    for a in iterdicts(first):
        stream.stream_answer(a)
    prior_skill = {
        user: predict_simple(skill, 0)[0]
        for user, skill in env.export_prior_skill().items()
    }
    return env.export_difficulty(), prior_skill
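
# Hedged usage sketch; the environment and stream classes come from the
# surrounding model code, and treating larger values as harder places is an
# assumption here.
# difficulty, prior_skill = prepare_difficulty_and_prior_skill(answers)
# hardest = sorted(difficulty.items(), key=lambda kv: kv[1], reverse=True)[:10]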