def start():
    """
    Serve the recommender page.

    Three cases are handled:

    * ``GET``: render the empty page (no courses, no section, no
      recommendation).
    * non-``GET`` with a JSON body: read ``courses`` and ``section`` from the
      body, compute a recommendation with ``predict`` and render it.
    * non-``GET`` without a JSON body: read ``section`` from the submitted
      form and render the sorted list of courses of that section.
    """
    # Initial website
    if request.method == "GET":
        return render_template("recommender.html",
                               units=UNITS,
                               courses=None,
                               selected_section=None,
                               recommendation=None)

    payload = request.get_json()

    # The user wants a recommendation
    if payload:
        # html.unescape turns HTML entities in the submitted names back into
        # plain characters
        courses_found = [html.unescape(name) for name in payload['courses']]
        section_found = html.unescape(payload['section'])
        return render_template("recommender.html",
                               units=UNITS,
                               courses=None,
                               selected_section=section_found,
                               recommendation=predict(section_found,
                                                      courses_found))

    # The user selected a section
    section_found = request.form['section']
    enrolments = load_enrolment_matrix(unit_name=section_found,
                                       from_pickle=True)
    found_courses = sorted(enrolments.columns.tolist())
    return render_template("recommender.html",
                           units=UNITS,
                           courses=found_courses,
                           selected_section=section_found,
                           recommendation=None)
def load_co_enrolment_matrix(unit_name="Informatique", from_pickle=False, verbose=False):
    """
    Loads the co-enrolment matrix of a unit from disk, or computes it.

    Parameters:
        unit_name: key into ``UNITS`` identifying the unit.
        from_pickle: when true, return the pickled matrix stored under
            ``DATA_FOLDER`` without recomputing anything.
        verbose: when true, print a progress message; also forwarded to
            ``load_enrolment_matrix``.

    Returns:
        A courses x courses DataFrame. Before normalisation, cell (a, b)
        with a != b is the number of rows of the enrolment matrix in which
        both course a and course b equal 1, and the diagonal is 0. Each
        column is then divided by its sum. A column whose sum is 0 ends up
        as NaN; nothing is dropped.
    """
    if verbose:
        print("Loading the {} co enrolment matrix".format(unit_name))

    if from_pickle:
        return pd.read_pickle(
            DATA_FOLDER + '{}_co_enrolment_matrix.pkl'.format(UNITS[unit_name]))

    courses_matrix = load_enrolment_matrix(unit_name,
                                           from_pickle=True,
                                           verbose=verbose)

    # Indicator array: 1 where the student/course cell equals 1, else 0.
    taken = (courses_matrix == 1).values.astype('int64')

    # taken.T . taken counts, for every pair of courses, the rows in which
    # both are taken. This replaces a per-row, per-course Python loop and
    # yields the same symmetric counts in a single matrix product.
    counts = taken.T.dot(taken)

    # The diagonal would hold each course's own enrolment count; a course is
    # not co-enrolled with itself, so it is cleared.
    for position in range(counts.shape[0]):
        counts[position, position] = 0

    co_enrolments = pd.DataFrame(data=counts,
                                 columns=courses_matrix.columns,
                                 index=courses_matrix.columns)

    # Transform to probabilities: each column is divided by its total.
    co_enrolments = co_enrolments / co_enrolments.sum(axis=0)
    return co_enrolments
def training_weight_coenrolments(user_index, unit_name="Informatique"):
    """
    Returns the co-enrolment training weights of one stored user.

    ``user_index`` is the positional row (``.iloc``) of the user in the
    pickled enrolment matrix of ``unit_name``. The result is a list with one
    ``get_coenrolment`` value per column of that matrix, in column order.
    """
    enrolments = load_enrolment_matrix(unit_name, from_pickle=True)

    # Labels of the columns whose value is 1 for this user
    user_row = enrolments.iloc[user_index]
    courses_taken = user_row[user_row == 1].index.tolist()

    weights = []
    for course in enrolments.columns.tolist():
        weights.append(get_coenrolment(course, courses_taken, unit_name))
    return weights
def training_weight_grade_corr(user_index, unit_name="Informatique"):
    """
    Returns the grade-correlation weights of one stored user.

    ``user_index`` is the positional row (``.iloc``) of the user in the
    pickled enrolment matrix of ``unit_name``. The result is a list with one
    ``get_grades_corr`` value per column of that matrix, in column order.
    """
    enrolments = load_enrolment_matrix(unit_name, from_pickle=True)

    # Labels of the columns whose value is 1 for this user
    user_row = enrolments.iloc[user_index]
    courses_taken = user_row[user_row == 1].index.tolist()

    weights = []
    for course in enrolments.columns.tolist():
        # unit_name is not forwarded here: get_grades_corr only receives the
        # course and the user's courses
        weights.append(get_grades_corr(course, courses_taken))
    return weights
def train_all_individual_models(dropout=0.998, hidden_layers=27, verbosity=2):
    """
    Trains one model per entry of ``UNITS``.

    For every unit, the pickled enrolment matrix is loaded and handed to
    ``train_model`` together with ``dropout``, ``hidden_layers`` and
    ``verbosity``; ``save=unit`` is passed so the models can be stored on
    disk for dynamic loading later on. A progress line is printed per unit.
    """
    total = len(UNITS)
    for position, unit in enumerate(UNITS, start=1):
        print("Training the model for {} ({}/{})".format(
            unit, position, total))
        enrolments = load_enrolment_matrix(unit, from_pickle=True)
        train_model(enrolments, dropout, hidden_layers, verbosity, save=unit)
def load_grade_corr_matrix(from_pickle=False):
    """
    Returns the matrix of grade correlations between courses.

    With ``from_pickle`` true, the pickled matrix under ``DATA_FOLDER`` is
    returned as is. Otherwise the matrix is rebuilt from
    'correlation-subject-pair.csv', written back to
    'grade_correlation_matrix.pkl' and returned.

    The rebuilt matrix has one column per course found in the CSV plus one
    column per enrolment-matrix course missing from it; every column is
    divided by its own sum at the end.
    """
    if from_pickle:
        return pd.read_pickle(DATA_FOLDER + 'grade_correlation_matrix.pkl')

    # Retrieve courses correlations
    grade_corr = pd.read_csv(DATA_FOLDER + 'correlation-subject-pair.csv')
    grade_corr = grade_corr[['sub1', 'sub2', "cor1", "cor2"]]
    # Combine the two correlation columns of each pair into a single value;
    # how they are combined is decided by correlation_series_mean (defined
    # elsewhere).
    grade_corr['cor_mean'] = grade_corr[['cor1', 'cor2']].apply(
        lambda x: correlation_series_mean(x[0], x[1]), axis=1)
    grade_corr = grade_corr[['sub1', 'sub2', 'cor_mean']]

    # Use SubjectName instead of SubjectID
    # presumably course_id_mapper maps subject IDs to names; pairs for which
    # either mapping (or cor_mean) is NaN are dropped by the dropna() below.
    grade_corr['sub1_name'] = grade_corr.sub1.map(course_id_mapper)
    grade_corr['sub2_name'] = grade_corr.sub2.map(course_id_mapper)
    grade_corr = grade_corr.dropna()[['sub1_name', 'sub2_name', 'cor_mean']]

    # In case there are no correlations, we set to the mean of all of them
    # NOTE(review): grade_corr still holds the two name columns here; whether
    # .mean() skips them or raises depends on the pandas version - confirm.
    mean_correlations = grade_corr.mean()

    # Let's make it a matrix: after unstack(level=0) the sub1 names become
    # the columns (under a 'cor_mean' top level) and the sub2 names the rows.
    # NOTE(review): fillna receives a Series while the columns are a
    # MultiIndex - confirm the missing pairs are really filled.
    grade_corr_matrix = grade_corr.set_index(
        ["sub1_name", "sub2_name"]).unstack(level=0).fillna(mean_correlations)

    # Normalize correlations with (x + 1) / 2
    grade_corr_matrix = (grade_corr_matrix + 1) / 2

    # Set not found courses correlations to the mean of all correlations.
    # "Not found" means: a column of the enrolment matrix (loaded with its
    # default unit) that is not among the rows of grade_corr_matrix.
    no_corr_courses = [
        c for c in load_enrolment_matrix(from_pickle=True).columns.tolist()
        if c not in grade_corr_matrix.index.tolist()
    ]
    # NOTE(review): the filler is the raw mean, whereas the values already in
    # grade_corr_matrix were rescaled with (x + 1) / 2 just above - confirm
    # this difference is intended.
    missing_correlations = pd.DataFrame(np.full(
        fill_value=mean_correlations,
        shape=(grade_corr_matrix.shape[0], len(no_corr_courses))),
                                        columns=no_corr_courses,
                                        index=grade_corr_matrix.index.tolist())
    # Drop the 'cor_mean' column level so both frames have flat course-name
    # columns before they are concatenated side by side.
    grade_corr_matrix.columns = grade_corr_matrix.columns.droplevel()
    grade_corr_matrix = pd.concat([grade_corr_matrix, missing_correlations],
                                  axis=1)

    # Let's transform it into probabilistic: each column divided by its sum
    grade_corr_matrix = grade_corr_matrix / grade_corr_matrix.sum(axis=0)
    grade_corr_matrix.to_pickle(DATA_FOLDER + 'grade_correlation_matrix.pkl')
    return grade_corr_matrix
def predict(unit="Informatique", courses=COURSES):
    """
    Recommends a list of courses from the ones you took or plan to take,
    and from your school unit.

    Parameters:
        unit: key into ``UNITS``; selects the enrolment matrix, the stored
            model and the last-year registrations that are used.
        courses: labels of the courses already taken or planned.

    Returns:
        Up to 10 course names, best score first. Only courses present in
        last year's registrations and not already in ``courses`` are
        returned.

    The score of a course is the model output multiplied by its
    co-enrolment weight with the given courses. A variant that additionally
    multiplies by grade-correlation weights existed but was disabled.
    """
    courses_matrix = load_enrolment_matrix(unit_name=unit, from_pickle=True)

    # One-row 0/1 profile over the same columns as the enrolment matrix
    my_courses = pd.DataFrame(data=0,
                              columns=courses_matrix.columns,
                              index=[USERNAME])
    my_courses[courses] = 1
    profile = my_courses.loc[USERNAME]
    taken_courses = profile[profile == 1].index.tolist()

    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the
    # same ndarray on old and new versions.
    my_binary_courses = my_courses.values
    # Second model input: a single [[1]] int32 array.
    # presumably the user-node input of the CDAE - TODO confirm
    binary_courses_format = np.array([[1]], dtype=np.int32)

    model = models.load_model(DATA_FOLDER +
                              '{}_cdae_model.hd5'.format(UNITS[unit]))
    prediction = model.predict(x=[my_binary_courses, binary_courses_format])

    # CDAE + co-enrolment.
    # The weights must describe the courses submitted by the caller. Looking
    # them up by batch position in the stored enrolment matrix would use the
    # courses of the first stored student instead.
    coenrolment_weights = np.array([
        get_coenrolment(c, taken_courses, unit)
        for c in courses_matrix.columns.tolist()
    ])
    prediction = np.array([
        coenrolment_weights * np.array(nn_weights)
        for nn_weights in prediction
    ])

    # argsort is ascending: the best courses are at the end of the ranking
    ranking = np.argsort(prediction)
    predicted_courses = [courses_matrix.columns[i] for i in ranking[0]]

    # Sets make the two membership tests below constant-time
    last_year_courses = set(
        get_last_year_registrations(unit_name=unit, from_pickle=True).index)
    already_taken = set(taken_courses)
    predicted_courses = [
        c for c in predicted_courses
        if c in last_year_courses and c not in already_taken
    ]

    # Reverse to get the best first, then keep the top 10
    return predicted_courses[::-1][:10]