# Imports used throughout this module (project helpers such as get_student_list,
# get_user_data, ass_extract, get_age_df, etc. are defined elsewhere in the repo):
import gc
import pickle
import zlib
from collections import OrderedDict

import numpy
import pandas
from pandas import Timestamp


def build_dob_cache(dob_cache, assts):
    for ix, ass in enumerate(assts.iterrows()):
        _id, ts, gb_id, gr_id = ass_extract(ass)
        students = list(get_student_list(gr_id)["user_id"])
        # print("#{}: PREP: grp {} at {}".format(ix, gr_id, ts))
        group_df = get_user_data(students)
        for psi in students:
            if psi not in dob_cache:
                # print("age gen...")
                age_df = get_age_df(ts, group_df)
                age_df["dob"] = pandas.to_datetime(age_df["dob"])
                # age = age_df.loc[psi, "age"]
                for psi_inner in students:
                    # was age_df.loc[psi, "dob"], which gave every student in the
                    # group psi's date of birth rather than their own
                    dob = age_df.loc[psi_inner, "dob"]
                    # print(type(dob))
                    assert isinstance(dob, Timestamp)
                    dob_cache[psi_inner] = dob
    return dob_cache
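
# Usage sketch (editor's addition, not part of the original pipeline): warm the
# DOB cache once up front. Assumes get_all_assignments() (used by make_data()
# below) and that creation_date has already been parsed, as in __init__ below.
# build_dob_cache both mutates the dict it is given and returns it.
def _demo_build_dob_cache():
    assts = get_all_assignments()
    assts["creation_date"] = pandas.to_datetime(assts["creation_date"])
    dob_cache = build_dob_cache({}, assts)
    print("cached DOBs for {} students".format(len(dob_cache)))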
def filter_assignments(assignments, book_only):
    # query = "select id, gameboard_id, group_id, owner_user_id, creation_date from assignments order by creation_date asc"
    assignments["include"] = True
    print(assignments.shape)
    gb_map = make_gb_question_map()  # renamed from `map`, which shadowed the builtin
    meta = get_meta_data()
    for ix in range(assignments.shape[0]):
        include = True
        gr_id = assignments.loc[ix, "group_id"]
        if book_only:
            gb_id = assignments.loc[ix, "gameboard_id"]
            hexes = gb_map[gb_id]
            for hx in hexes:
                hx = hx.split("|")[0]
                if not (hx.startswith("ch_") or hx.startswith("ch-i")):
                    include = False
                    break
        if include:
            students = get_student_list([gr_id])
            if students.empty:
                include = False
        if include:
            include = False
            for psi in list(students["user_id"]):
                # print("checking", psi)
                atts = get_attempts_from_db(psi)
                if not atts.empty:
                    # print("OK")
                    include = True
                    break
        if not include:
            assignments.loc[ix, "include"] = False
    # assignments = assignments[assignments["include"] == True]
    print(assignments.shape)
    return assignments
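
# Usage sketch (editor's addition): keep only book-chapter assignments whose
# group has at least one student with recorded attempts, then mask on the
# "include" flag that filter_assignments() writes back onto the frame.
def _demo_filter_assignments():
    ass_df = get_all_assignments()
    flagged = filter_assignments(ass_df, book_only=True)
    kept = flagged[flagged["include"]]
    print("{} of {} assignments kept".format(len(kept), len(flagged)))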
def __init__(self, assts, batch_size=512, FRESSSH=False, return_qhist=False):
    self.assts: pandas.DataFrame = assts
    self.assts.loc[:, 'creation_date'] = pandas.to_datetime(assts['creation_date'])
    self.gb_qmap = make_gb_question_map()
    self.batch_size = batch_size if batch_size != "assignment" else 0
    self.return_qhist = return_qhist
    if not FRESSSH:
        print("APPEND mode")  # recycle old profiles
        try:
            f = open(prof_fname, 'rb')
            self.profiles = pickle.load(f)
            print("got this many profiles:", len(self.profiles))
            # print(list(profiles.keys())[0:10])
            f.close()
        except (FileNotFoundError, EOFError, pickle.UnpicklingError):  # was a bare `except:`
            self.profiles = {}
        # d = open(dob_cache, 'rb')
        # self.dob_cache = pickle.load(d)
        # print("loaded dob cache with {} entries".format(self.dob_cache))
        # d.close()
    else:
        print("Baking FRESH, like cinnamon!")
        self.profiles = {}
        # self.dob_cache = {}

    self.ts_cache = {}
    self.assid_list = []
    self.ts_master_list = []
    self.gb_id_list = []
    self.gr_id_list = []
    self.students_list = []

    print("building dob_cache")
    empty_cache = {}
    self.dob_cache = build_dob_cache(empty_cache, assts)
    print(len(empty_cache))  # build_dob_cache mutates and returns the same dict
    print("done")

    for ix, ass in enumerate(self.assts.iterrows()):
        ass_id, ts, gb_id, gr_id = ass_extract(ass)  # renamed from `id`, which shadowed the builtin
        self.assid_list.append(ass_id)
        self.ts_master_list.append(ts)
        self.gb_id_list.append(gb_id)
        self.gr_id_list.append(gr_id)
        students = list(get_student_list(gr_id)["user_id"])
        self.students_list.append(students)
        # print("#{}: PREP: grp {} at {}".format(ix, gr_id, ts))
        for psi in students:
            if psi in self.ts_cache:
                self.ts_cache[psi].append(ts)
            else:
                self.ts_cache[psi] = [ts]

    c = -1
    for i, ts, gb_id, gr_id in zip(self.assid_list, self.ts_master_list,
                                   self.gb_id_list, self.gr_id_list):
        c += 1
        has_changed = False
        students = list(get_student_list(gr_id)["user_id"])
        for psi in students:
            # set up the training arrays here
            fn = "prof_{}_{}".format(psi, ts)
            if fn not in self.profiles:
                print("{}- - - - profile for {} .. not found .. will create all ={}".format(c, psi, SAVE_TO_PROF_CACHE))
                has_changed = True
                group_df = get_user_data(students)
                ts_list = self.ts_cache[psi]
                print("ts_list", ts_list)
                print("s..")
                s_psi_list = gen_semi_static(psi, self.dob_cache, ts_list)
                print("done")
                print("x..")
                x_psi_list = gen_experience(psi, ts_list)
                print("done")
                print("u..")
                u_psi_list = gen_success(psi, ts_list)
                print("done")
                for ts_i, s_psi, x_psi, u_psi in zip(sorted(ts_list), s_psi_list, x_psi_list, u_psi_list):
                    key = "prof_{}_{}".format(psi, ts_i)
                    # was `self.profiles[fn] = ...`, which overwrote the single outer
                    # key on every iteration instead of storing one profile per timestamp
                    self.profiles[key] = zlib.compress(pickle.dumps((s_psi, x_psi, u_psi)))
                    print("created profile for ", key, "xp=", numpy.sum(x_psi), "sxp=", numpy.sum(u_psi), "S=", s_psi)
            else:
                print(".. {} f/cache".format(fn))
        if has_changed:
            f = open(prof_fname, 'wb')
            pickle.dump(self.profiles, f)
            f.close()
            print("*** *** *** SAVED")
def __iter__(self):
    b = 0  # batch counter
    c = 0  # cumulative counter
    S = []
    X = []
    U = []
    len_assts = len(self.assts)
    y = []
    awgt = []
    assids = []
    psi_list = []
    qhist_list = []
    last_i = None
    for i, ts, gb_id, gr_id in zip(self.assid_list, self.ts_master_list,
                                   self.gb_id_list, self.gr_id_list):
        c += 1
        hexagons = [self.gb_qmap[gb_id][0]]
        students = get_student_list(gr_id)
        students = list(students["user_id"])
        print("...", ts, students, hexagons)
        for psi in students:
            # set up the training arrays here
            hexagons = [hx.split("|")[0] for hx in hexagons]
            fn = "prof_{}_{}".format(psi, ts)
            if fn not in self.profiles:
                print(fn, "not in profiles, why??")
                continue
            tripat = pickle.loads(zlib.decompress(self.profiles[fn]))
            if tripat is None:
                print(fn, "gives none")
            else:
                (s_psi, x_psi, u_psi) = tripat
                for hx in hexagons:
                    if self.pid_override is not None and hx not in self.pid_override:
                        print("pid problem", hx)
                        continue
                    print(">>>", ts, psi, hx, s_psi, numpy.sum(x_psi), numpy.sum(u_psi))
                    S.append(s_psi)
                    X.append(x_psi)
                    U.append(u_psi)
                    y.append([hx])
                    assids.append(i)
                    awgt.append([len(hexagons)])
                    psi_list.append(psi)
                    if self.return_qhist:
                        qhist_list.append(gen_qhist(psi, ts))
                    else:
                        qhist_list.append(None)
        print(len(X), "in the pipe...")
        bs = self.batch_size
        if (bs == 0 and i != last_i) or (bs > 0 and len(X) >= bs):
            if last_i is None:
                last_i = i
                continue  # special first no-op case
            print("b={}, n samples = {} ({}/{}={:.1f}%)".format(b, len(X), c, len_assts, 100.0 * c / len_assts))
            b += 1
            yield S, X, U, y, assids, awgt, psi_list, qhist_list
            last_i = i
            S = []
            X = []
            U = []
            y = []
            assids = []
            awgt = []
            psi_list = []
            qhist_list = []
            gc.collect()
    print("out of assts")
    yield S, X, U, y, assids, awgt, psi_list, qhist_list
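
# Consumption sketch (editor's addition). The class that owns __init__/__iter__
# above is not named in this file, so `AssignmentProfileIterator` is a
# hypothetical stand-in; each yielded batch unpacks into the eight parallel
# lists built in __iter__.
def _demo_iterate_batches(assts):
    gen = AssignmentProfileIterator(assts, batch_size=512)  # hypothetical class name
    for S, X, U, y, assids, awgt, psi_list, qhist_list in gen:
        print("batch of {} samples from {} assignments".format(len(X), len(set(assids))))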
def evaluate_phybook_loss(tt, sxua, model, sc):
    aid_list, s_list, x_list, u_list, a_list, y_list = augment_data(tt, sxua)
    # hex_list = []
    # all_page_ids = pid_override
    # ailist = []
    for row in tt.iterrows():
        aid = row[1]["id"]
        # ts = row[1]["creation_date"]
        gr_id = row[1]["group_id"]
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)
        for _ in student_ids:
            aid_list.append(aid)
            # hex_list.append(hexes)
    s_list = sc.transform(s_list)
    s_list = numpy.array(s_list)
    x_list = numpy.array(x_list)
    u_list = numpy.array(u_list)
    a_list = numpy.array(a_list)
    print(s_list.shape, x_list.shape, u_list.shape, a_list.shape)
    print("results")
    print(model.get_input_shape_at(0))
    predictions = model.predict([s_list, u_list])

    j_max = 0
    thresh_max = 0
    dir_hits_max = 0
    for j_thresh in [0.01, 0.025, .05, 0.075, .1, .2, 0.3, 0.4, 0.5, 0.6, 0.7]:
        # for j_thresh in [0.4]:
        j_sum = 0
        # dir_sum = 0
        incl_sum = 0
        dir_hits = 0
        N = len(predictions)
        this_ai = None
        for ai, p, s, x, a, y in zip(aid_list, predictions, s_list, x_list, a_list, y_list):
            t = [pid_override[yix] for yix, yval in enumerate(y) if yval == 1]
            if ai != this_ai:
                print("\n...new asst", ai)
                this_ai = ai
            phxs = []
            probs = []
            print("pshape", p.shape)
            maxpox = numpy.argmax(p)
            print(maxpox, len(pid_override))
            max_guess = pid_override[maxpox]
            phxs.append(max_guess)
            probs.append(p[maxpox])
            for ix, el in enumerate(p):
                if el > j_thresh and pid_override[ix] not in phxs:
                    phxs.append(pid_override[ix])
                    probs.append(p[ix])
            probs_shortlist = list(reversed(sorted(probs)))
            Z = list(reversed([x for _, x in sorted(zip(probs, phxs))]))
            # if Z:
            #     for t_el in t:
            #         if t_el in Z:  # 'direct hit'
            #             dir_sum += 1.0 / len(t)
            print(t, Z)
            print(probs_shortlist)
            # print([all_page_ids[hx] for hx, el in enumerate(a) if el == 1])
            if max_guess not in t:
                robot = "BAD ROBOT"
            else:
                if max_guess == t[0]:
                    robot = "GREAT ROBOT"
                    dir_hits += 1
                else:
                    robot = "GOOD ROBOT"
            print("{} {}, XP={}".format(robot, sc.inverse_transform(s), numpy.sum(x)))
            t = set(t)
            phxs = set(phxs)
            if len(t.intersection(phxs)) > 0:
                incl_sum += 1
            j_sum += len(t.intersection(phxs)) / len(t.union(phxs))
        j_score = j_sum / N
        # dir_score = dir_sum / N
        if dir_hits > dir_hits_max:
            j_max = j_score
            thresh_max = j_thresh
            dir_hits_max = dir_hits
            # dir_for_j_max = dir_score
        print("j_thresh =", j_thresh)
        print("Jaccard:", j_score)
        print("Incl:", incl_sum / N)
        print("D/H:", dir_hits / N)
        print("~ ~ ~ ~")
    print("max thresh/jacc:", thresh_max, j_max, dir_hits_max / N)
    print("num examples", N)
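
# Worked example (editor's addition) of the per-sample Jaccard score that
# evaluate_phybook_loss accumulates in j_sum: intersection over union of the
# true hex set and the set of hexes the model scored above j_thresh.
def _demo_jaccard():
    t = {"ch_a", "ch_b"}     # hexes the teacher actually assigned
    phxs = {"ch_b", "ch_c"}  # hexes the model predicted
    j = len(t.intersection(phxs)) / len(t.union(phxs))
    print(j)  # 1 shared hex out of 3 distinct -> 0.333...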
def create_student_scorecards(tt, sxua, model, sc):
    names_df = get_q_names()
    names_df.index = names_df["question_id"]
    cat_list = []
    ailist = []
    # all_page_ids = pid_override
    aids = []
    for row in tt.iterrows():
        aid_list = []
        a_list = []
        x_list = []
        u_list = []
        y_list = []
        s_list = []
        hex_list = []
        psi_list = []
        print(row)
        aid = row[1]["id"]
        ts = row[1]["creation_date"]
        gr_id = row[1]["group_id"]
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        print(student_ids)
        student_data = get_user_data(student_ids)
        hexes = list(gb_qmap[gb_id])
        print(hexes)
        # n-hot binarise the y vector here
        y_true = numpy.zeros(len(pid_override), dtype=numpy.int8)
        for hx in hexes:
            hxix = pid_override.index(hx)
            y_true[hxix] = 1.0
        aid_list.append(aid)
        incl_psis = []
        for psi in student_ids:
            S, X, U, A = pickle.loads(zlib.decompress(sxua[psi][ts]))
            if S[0] < 10:
                print("s0 under 10")
                continue
            if S[1] == 0:
                print("no time on platform recorded")
                continue
            psi_list.append(psi)
            hex_list.append(hexes)
            y_list.append(y_true)
            # print(psi)
            # S, X, U, A = sxua[psi][ts]
            s_list.append(S)
            x_list.append(X)
            u_list.append(U)
            a_list.append(A)
            incl_psis.append(psi)
            print("student {} done".format(psi))
        if len(s_list) == 0:
            continue
        s_arr = numpy.array(s_list)
        x_arr = numpy.array(x_list)
        u_arr = numpy.array(u_list)
        a_arr = numpy.array(a_list)
        predictions = model.predict([s_arr, u_arr])
        save_class_report_card(ts, aid, gr_id, s_list, x_list, u_list, a_list,
                               y_list, predictions, incl_psis, names_df)
        # was never populated in the original, so the summary file below listed
        # nothing; collect each processed assignment id here
        aids.append(aid)
    with open("a_ids.txt", "w+") as f:
        f.write("({})\n".format(len(aids)))
        f.writelines([str(a) + "\n" for a in sorted(aids)])
        f.write("\n")
def augment_data(tr, sxua):
    concept_list = list(set().union(*concept_map.values()))
    print(concept_list)
    # yship = []
    # qlist = pid_override
    # print(qlist)
    # print("investigate this mofo")
    # exit()
    # hex_counter = Counter()
    # tot = 0
    # last_ts = None
    # for i, ass in enumerate(tr.iterrows()):
    #     ass_id = ass[1]["gameboard_id"]
    #     gb_id = ass[1]["gameboard_id"]
    #     gr_id = ass[1]["group_id"]
    #     ts = ass[1]["creation_date"]
    #     hex_acc = []
    #     if last_ts is not None and ((ts - last_ts).days == 0):
    #         print("skipping same-day assignment")
    #         hexagons = [gb_qmap[gb_id][0]]
    #         for hx in hexagons:
    #             if hx not in hex_acc:
    #                 hex_acc.append(hx)
    #         continue  # do not add same-day assignments
    #     last_ts = ts
    #     last_ass_id = ass_id
    #     hexagons = [gb_qmap[gb_id][0]]
    #     students = get_student_list(gr_id)
    #     for psi in students:
    #         for hx in hexagons:
    #             if hx not in pid_override:
    #                 print(hx, " not in qlist")
    #                 pid_override.append(hx)
    #             yship.append(hx)
    #             hex_counter[hx] += 1
    #             tot += 1
    # yship = list(concept_map.keys()) + yship
    # ylb = LabelBinarizer()  # (classes=qlist)
    # qlist = numpy.unique(yship)
    # ylb.fit(qlist)
    # ylb.classes_ = yship  # start by fitting the binariser to the shortlist of book qns
    # for hx in hex_counter.most_common():
    #     print(hx[0], hx[1])
    # print(tot)
    # print(qlist)
    # print(ylb.classes_)
    # assert len(list(qlist)) == len(list(ylb.classes_))
    # assert list(qlist) == list(ylb.classes_)
    # weights = {}
    # class_wgt = compute_class_weight('balanced', ylb.classes_, yship)
    # for clix, (cls, wgt) in enumerate(zip(ylb.classes_, class_wgt)):
    #     print(clix, cls, wgt)
    #     weights[clix] = wgt

    group_ids = pandas.unique(tr["group_id"])
    aid_list = []
    s_list = []
    x_list = []
    u_list = []
    a_list = []
    y_list = []
    fout = open("tr_summ.csv", "w")
    for gr_id in group_ids:
        gr_ass = tr[tr["group_id"] == gr_id]
        last_ts = None
        for row in gr_ass.iterrows():
            aid = row[1]["id"]
            ts = row[1]["creation_date"]
            gr_id = row[1]["group_id"]
            gb_id = row[1]["gameboard_id"]
            student_ids = list(get_student_list(gr_id)["user_id"])
            hexes = list(gb_qmap[gb_id])
            if last_ts is not None and ((ts - last_ts).days == 0):
                print("skipping same-day assignment")
                continue  # do not add same-day assignments
            last_ts = ts
            for psi in student_ids:
                S, X, U, A = pickle.loads(zlib.decompress(sxua[psi][ts]))
                if S[0] < 10:  # i.e. if student has no valid age
                    continue
                if S[1] == 0:  # no time in platform
                    continue
                hexes_tried = []
                hexes_to_try = []
                # if len(hexes) == 1:
                #     hexes_to_try = hexes
                # else:
                #     for ix, el in enumerate(X):
                #         if el > 0:
                #             page = all_qids[ix].split("|")[0]
                #             if page not in hexes_tried:
                #                 hexes_tried.append(page)
                #     for hx in hexes:
                #         if hx not in hexes_tried:
                #             hexes_to_try.append(hx)
                y_true = numpy.zeros(len(pid_override))  # numpy.zeros(len(all_page_ids))
                # for hx in hexes_to_try:
                for hx in hexes:
                    hxix = pid_override.index(hx)
                    if X[hxix] == 0:
                        hexes_to_try.append(hx)
                if hexes_to_try == []:
                    print("no hexes to try")
                    continue
                # decay = 0.5
                # w = 1.0
                # for hx in sorted(hexes_to_try):
                #     hxix = pid_override.index(hx)
                #     y_true[hxix] = 1  # / len(hexes_to_try)
                #     if len(hexes_to_try) > 1:
                #         print("trying", hx, w)
                #         input("")
                #     w = w * decay
                # y_true = y_true / y_true.sum()
                hxix = pid_override.index(sorted(hexes_to_try)[0])
                y_true[hxix] = 1.0
                # else:
                #     hexes_tried.append(hx)
                # hexes_tried = []
                # for i, el in enumerate(X):
                #     if el > 0:
                #         pid = all_qids[i].split("|")[0]
                #         if pid not in hexes_tried:
                #             hexes_tried.append(pid)
                # print("hexes tried: {}".format(hexes_tried))
                print("hexes t try: {}".format(hexes_to_try))
                print("hexes      : {}".format(hexes))
                # print(numpy.sum(A))
                # print([all_page_ids[hx] for hx, el in enumerate(A) if el == 1])
                aid_list.append(aid)
                # hex_list.append(hexes_to_try)
                s_list.append(S)
                # x_list.append(numpy.concatenate((X, U, A)))
                x_list.append(X)
                u_list.append(U)
                a_list.append(A)
                y_list.append(y_true)
                fout.write("{},{},{},{},{},{},{}\n".format(
                    ts, psi, ",".join(map(str, S)), X.sum(),
                    numpy.sum(X > 0), numpy.sum(U), " ".join(hexes_to_try)))
    fout.close()
    # exit()
    # input("nibit")
    gc.collect()
    s_list = numpy.array(s_list)
    x_list = numpy.array(x_list, dtype=numpy.int16)
    u_list = numpy.array(u_list, dtype=numpy.int8)
    a_list = numpy.array(a_list, dtype=numpy.int8)
    return aid_list, s_list, x_list, u_list, a_list, y_list
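
# Usage sketch (editor's addition): augment_data() is what both training and
# evaluation lean on; the model above consumes the (scaled) S and the U arrays.
# `train_df` and `sc` are stand-ins for whatever split and fitted scaler the
# caller uses (sc is used the same way in evaluate_phybook_loss).
def _demo_augment(train_df, sxua, sc):
    aid_list, s_list, x_list, u_list, a_list, y_list = augment_data(train_df, sxua)
    s_scaled = sc.transform(s_list)
    print(s_scaled.shape, u_list.shape, y_list.shape)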
print(len(assignments))
print(len(group_ids))
print(group_ids[0:20])
# exit()
for gr_id in group_ids:
    gr_ass = assignments[assignments["group_id"] == gr_id]
    for row in gr_ass.iterrows():
        # for row in assignments.iterrows():
        aid = row[1]["id"]
        # print(row)
        ts = row[1]["creation_date"]
        # gr_id = row[1]["group_id"]
        gc.collect()
        gb_id = row[1]["gameboard_id"]
        student_ids = list(get_student_list(gr_id)["user_id"])
        # print(student_ids)
        student_data = get_user_data(student_ids)
        now_hexes = list(gb_qmap[gb_id])
        # print(now_hexes)
        # if 118651 not in student_ids:
        #     continue
        for psi in student_ids:
            # if psi != 118651:
            #     continue
            # print(psi)
            if psi not in SXUA:
                S = numpy.zeros(6)
                X = numpy.zeros(len(all_qids), dtype=numpy.int16)
                U = numpy.zeros(len(all_qids), dtype=numpy.int8)
                A = numpy.zeros(len(pid_override), dtype=numpy.int8)
def make_data(ass_n, pickle_at, APPEND=True):
    user_cache = {}
    ass_df = get_all_assignments()
    # ass_df = ass_df.iloc[27000:, :]
    # sprofs = pandas.read_csv(base + "student_profiling/users_all.csv")
    # sprofs["date_of_birth"] = pandas.to_datetime(sprofs["date_of_birth"])
    gb_qmap = make_gb_question_map()
    ass_ct = 0
    ass_df["creation_date"] = pandas.to_datetime(ass_df["creation_date"])
    # ass_df = ass_df[ass_df.event_details != "{}"]
    # ass_df["event_details"] = ass_df["event_details"].str.replace("0L,", "0,")
    profile_df = get_user_data("*")
    profile_df["date_of_birth"] = pandas.to_datetime(profile_df["date_of_birth"])
    ct = 0
    if APPEND:
        print("APPEND mode")  # recycle old profiles
        f = open(asst_fname, 'rb')
        asses = pickle.load(f)
        f.close()
        tracking = open("tracking.dat", "w+")
        print("loaded {} existing assignments".format(len(asses)))
    else:
        f = open(asst_fname, 'wb')
        f.truncate(0)
        f.close()
        tracking = open("tracking.dat", "w")
        print("FRESH mode")  # bake it fresh
        asses = OrderedDict()
    start_at = len(asses)
    number_to_do = ass_n - start_at
    if number_to_do <= 0:
        print("We already have {}>{} samples".format(start_at, ass_n))
        exit(1)
    # if ass_n is -1 then this overrides the trimming of the assts
    ass_df = ass_df.iloc[start_at:, :] if (ass_n > 0) else ass_df
    for ass in ass_df.iterrows():
        ass_id = ass[1]["id"]  # renamed from `id`, which shadowed the builtin
        if ass_id in asses and not FORCE_OVERWRITE:  # was `False == FORCE_OVERWRITE`
            # print("this assignment has already been processed, skipping!")
            continue
        print("assct {} of {} ({} users cached)".format(ass_ct, ass_n, len(user_cache)))
        ts = ass[1]['creation_date']
        # print(ts)
        # event_details = eval(ass[1]['event_details'])
        gb_id = ass[1]["gameboard_id"]
        if gb_id not in gb_qmap:
            print("gb id unknown")
            continue
        this_concepts = set()
        raw_qns = gb_qmap[gb_id]
        this_levels = []
        this_qns = raw_qns
        if type(raw_qns) is str:
            this_qns = eval(raw_qns)  # TODO make sure this works hitting the database as well
        for q in this_qns:
            if "|" in q:
                q = q.split("|")[0]
            this_levels.append(lev_page_lookup[q])
            cs = concept_extract(q)
            this_concepts.update(cs)
        gr_id = ass[1]["group_id"]
        students = get_student_list([gr_id])
        if students.empty:
            print(gr_id, "no students")
            continue
        else:
            print(gr_id, "students!")
        students = list(students["user_id"])
        profile_df = get_user_data(list(students))
        # print("get group attempts")
        # attempts_df = get_attempts_from_db(students)
        # print("got group attempts")
        profiles = profile_students(students, profile_df, ts, concepts_all,
                                    hwdf, user_cache, attempts_df=None)
        print(len(profiles), len(students))
        assert len(profiles) == len(students)
        assert len(profiles) > 0
        # if len(profiles) == 0:
        #     print("no profiles")
        #     continue
        print("compressing_profiles")
        c_profiles = zlib.compress(pickle.dumps(profiles))
        print("compressed")
        ass_entry = (ts, gb_id, gr_id, this_qns, this_concepts, this_levels, students, c_profiles)
        tracking.write(str(ass_entry[0:7] + (len(profiles),)))
        tracking.write("\n")
        # asses.append(ass_entry)
        asses[ass_id] = ass_entry
        ass_ct += 1
        print("...{} students".format(len(profiles)))
        # ct += 1
        # afile.write(str(ass_entry) + "\n")
        # if ct > 100:
        #     afile.flush()
        #     ct = 0
        print("ass_ct", ass_ct)
        print("pickle at", pickle_at)
        print("%", (ass_ct % pickle_at))
        if (ass_ct == number_to_do) or (ass_ct % pickle_at) == 0:
            f = open(asst_fname, 'wb')
            pickle.dump(asses, f)
            f.flush()
            print("***SAVED (hallelujah)")
        if ass_ct == number_to_do:
            print("we have hit maximum ass limit")
            break
        # print("taking massive dump")
        # # afile.write("]\n")
        # # afile.close()
        # # joblib.dump(asses, asst_fname)
        # # with gzip.open(asst_fname, 'w') as f:
        # #     # _pickle.dump(asses, f)
        # #     f.write(_pickle.dumps(asses))
        # with open(asst_fname, 'wb') as f:
        #     pickle.dump(asses, f)
    f.close()
    print("We now have {} assignments on disc".format(len(asses)))
    # was `return tracking.close()`; split for clarity, behaviour unchanged
    tracking.close()
    return
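
# Usage sketch (editor's addition): build (or extend) the on-disc assignment
# pickle in checkpointed chunks. `pickle_at` controls how often progress is
# flushed to asst_fname, so a crash loses at most pickle_at assignments; in
# APPEND mode a re-run resumes from the assignments already on disc.
def _demo_make_data():
    make_data(1000, pickle_at=100, APPEND=True)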
if __name__ == "__main__":
    # pandas.DataFrame.from_csv has been removed from pandas; read_csv with
    # index_col is the equivalent
    teachers_df = pandas.read_csv("teachers.dat", header=0, index_col=0)
    teacher_ids = list(teachers_df.index)
    model = load_model(base + "hwg_model.hd5")
    (ylb, clb) = joblib.load(base + 'hwg_mlb.pkl')
    up_to_ts = pandas.Timestamp.now()  # pandas.datetime is deprecated
    fout = open("predictions.out", "w")
    for t in teacher_ids:
        class_list = get_group_list(t)["id"]
        print("groups:", class_list)
        for c in class_list:
            print("get student list for =>", c)
            students = get_student_list(c)
            students = list(students["user_id"])
            print("students:", students)
            if not students:
                continue
            # students = list(students)
            profile_df = get_user_data(students)
            # print("profiles:", profile_df)
            X = []
            for u in students:
                x_psi = gen_experience(u, up_to_ts)
                X.append(x_psi)
            X = numpy.array(X)
            predictions = model.predict(X)
            ymax = ylb.inverse_transform(predictions)