Exemple #1
0
def find_interesting_association_rules(support_count_threshold, target_course, target_grade, max_length=999, consider_non_attempted_courses=False):
    target_course = str(target_course)
    target_grade = int(target_grade)
    target_int = int(target_course) * 10 + target_grade
    transcripts = get_until_course_grade(simple_data, target_course, [0, 2, 4], False)
    transcripts_with_target_course = get_until_course_grade(simple_data, target_course, [0, 2, 4], True)
    transcripts = [set(int(a['id']) * 10 + int(a['grade']) for a in t) for t in transcripts]
    transcripts_with_target_course = [set(int(a['id']) * 10 + int(a['grade']) for a in t) for t in transcripts_with_target_course]
    if consider_non_attempted_courses:
        add_never_attempted_courses(support_count_threshold, transcripts)
        add_never_attempted_courses(support_count_threshold, transcripts_with_target_course)
    # for t in transcripts:
    #     has = False
    #     for a in t:
    #         if a/10 == 581305:
    #             has = True
    #             break
    #     if not has:
    #         t |= {5813059}
    #apriori_initial_itemsets = [{x} | {target_int} for x in unique_courses_frhom_codes(transcripts) if x // 10 != int(target_course)]
    apriori_initial_itemsets = [{x} for x in unique_courses_from_codes(transcripts)]
    frequent_itemsets = alg.apriori_new(support_count_threshold, apriori_initial_itemsets, transcripts, max_length)
    frequent_itemsets.sort(key=lambda x: x[1], reverse=True)
    itemset_supportc_supportcls = []
    for x in frequent_itemsets:
        itemset = x[0] | {target_int}
        itemset_supportc_supportcls.append((x[0], alg.support_count(itemset, transcripts_with_target_course), x[1]))
    itemset_supportc_supportcls_confidences = [x + (x[1] / x[2],) for x in itemset_supportc_supportcls]
    itemset_supportc_supportcls_confidence_lifts = [x + (x[-1] / alg.support({target_int}, transcripts_with_target_course),) for x in itemset_supportc_supportcls_confidences]
    #itemset_supportc_supportcls_confidence_lifts_interestingness = [x + (x[-1] if x[-1] >= 1 else 1.0/(x[-1] + 0.001),) for x in itemset_supportc_supportcls_confidence_lifts]
    itemset_supportc_supportcls_confidence_lifts.sort(key=lambda x: x[-1], reverse=True)
    print()
    redundant_rules = 0
    for i, x in enumerate(itemset_supportc_supportcls_confidence_lifts):
        if len(x[0]) + 1 > max_length:
            continue
        redundant = False
        if x[4] > 1:
            redundant = any(y[0] <= x[0] for y in itemset_supportc_supportcls_confidence_lifts[0:i])
        else:
            redundant = any(y[0] <= x[0] for y in itemset_supportc_supportcls_confidence_lifts[i + 1:])
        if redundant:
            redundant_rules += 1
            continue
        print("{} -> {{{}}}".format(x[0] - {target_int}, target_int))
        print("{} -> {{{}}}".format(set(code_to_str(a) for a in (x[0] - {target_int})), code_to_str(target_int)))
        print("support count lhs {}\nsupport lhs {}".format(x[2], round(x[2] / len(transcripts), decimals)))
        print("support count {}\nsupport {}".format(x[1], round(x[1] / len(transcripts), decimals)))
        print("confidence", round(x[3], decimals))
        print("lift", round(x[4], decimals))
        #print("interestingness", round(x[5], decimals))
        print()
    print("Transcripts in pruned data set:", len(transcripts))
    print("{} rules found".format(len(itemset_supportc_supportcls_confidence_lifts) - redundant_rules))
    print("{} redundant rules pruned".format(redundant_rules))
Exemple #2
0
def add_never_attempted_courses(support_count_threshold, transcripts):
    for course_code in unique_courses_from_codes([set(a['id'] for a in t) for t in simple_data]):
        support_count = 0
        for g in [0, 2, 4]:
            support_count += alg.support_count({int(course_code) * 10 + g}, transcripts)
        if support_count >= support_count_threshold:
            for t in transcripts:
                found = False
                for g in [0, 2, 4]:
                    if int(course_code) * 10 + g in t:
                        found = True
                        break
                if not found:
                    t |= {int(course_code) * 10 + 9}