def get_log_reg_features(data):
    """Build logistic-regression inputs from HMM hidden-state distributions.

    `data` holds `num_weeks` consecutive rows per student.  For every student
    still active at the last observed week, the external ./HMM_EM binary is
    queried once per observed week and the returned state distributions are
    concatenated into one feature row.

    Returns (logreg_X, logreg_Y): stacked feature rows and the matching
    labels read `lead` weeks past the observation window.

    Relies on module globals: models_dir, num_weeks, lag, lead.
    """
    dropped = 0  # bin value meaning the student has dropped out
    base_cmd = ["./HMM_EM", "PredictStateDistribution", models_dir]
    logreg_X = None
    logreg_Y = []
    last_week = lag - 1            # index of the final observed week
    label_week = lead + last_week  # week whose dropout flag is the label
    for stud in range(len(data) / num_weeks):  # Python 2 integer division
        rows = data[stud * num_weeks:(stud + 1) * num_weeks]
        observed = rows[0:last_week + 1, :].flatten()
        label = rows[label_week, 0]
        if rows[last_week, 0] == dropped:
            continue  # already dropped out: no prediction to make
        obs_args = observed.astype(str).tolist()
        feats = np.array([])
        for week in range(last_week + 1):
            # Ask the binary for the hidden-state distribution at `week`;
            # the full observation window is passed on every call.
            raw = subprocess.check_output(base_cmd + [str(week)] + obs_args)
            dist = np.fromstring(raw, sep=";")[1:-1]
            feats = np.concatenate([feats, np.atleast_1d(dist[:-1])])
        logreg_X = utils.add_to_data(logreg_X, feats)
        logreg_Y.append(label)
    return logreg_X, logreg_Y
def execute_hmm(params):
    """Train one cross-validation HMM fold and score it over leads 1..13.

    `params` is a single "___"-joined string of
    (config_prefix, config_suffix, data_file_base, num_support, crossval_num);
    a single string is used because this function is invoked via Pool.map.

    Returns a 2-D array of [lead, roc] rows, or np.atleast_2d(None)
    (i.e. [[None]]) when every lead failed.
    """
    config_prefix, config_suffix, data_file_base, num_support, crossval_num = params.split(
        "___")
    config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix
    temp_dir = "temp_%s" % (crossval_num)
    # Stagger worker start-up; presumably avoids workers racing on shared
    # setup -- TODO confirm this is still needed.
    time.sleep(1 * int(crossval_num))
    utils.remove_and_make_dir(temp_dir)
    os.chdir(temp_dir)
    # Paths are prefixed with "../" because we run from inside temp_dir.
    HMM_command = ["./../HMM_EM", "Train", "../" + config_file
                   ]  # need to concatenate since we are running binary
    results = subprocess.check_output(HMM_command)
    test_data = None
    for lead in range(1, 14):
        try:
            roc = run_inference_hmm.run_inference(data_file_base,
                                                  num_support,
                                                  "test",
                                                  lead,
                                                  plot_roc=False,
                                                  crossval=True,
                                                  crossval_num=crossval_num)
            test_data = utils.add_to_data(test_data, [lead, roc])
        except Exception:
            # Best-effort: some leads may not be scorable for this fold.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed to Exception.)
            pass
    os.chdir("..")
    return np.atleast_2d(test_data)
def get_log_reg_features(data): dropout_value = 0 #bin value for a student dropped out command_base = ["./HMM_EM", "PredictStateDistribution", models_dir] logreg_X = None logreg_Y = [] for student in range(len(data) / num_weeks): stud_data = data[student * num_weeks: (student + 1) * num_weeks] end_week = lag -1 label_week = lead + end_week X = stud_data[0: end_week + 1, :].flatten() truth_val = stud_data[label_week, 0] if stud_data[end_week, 0] == dropout_value: continue #student has already dropped out features = np.array([]) for prediction_week in range(end_week + 1): # get hidden state distribution for each prediction_week command = command_base + [str(prediction_week)]+ X.astype(str).tolist() #need to pass lead+end_week in- API asks for week to predict results = subprocess.check_output(command) state_dist = np.fromstring(results, sep=";")[1:-1] prediction_week_features = state_dist[:-1] features = np.concatenate([features, np.atleast_1d(prediction_week_features)]) logreg_X = utils.add_to_data(logreg_X, features) logreg_Y += [truth_val] return logreg_X, logreg_Y
def execute_hmm(params):
    """Train one cross-validation HMM fold and score it over leads 1..13.

    `params` is a single "___"-joined string of
    (config_prefix, config_suffix, data_file_base, num_support, crossval_num)
    because this function is invoked via Pool.map with one argument.

    Returns a 2-D array of [lead, roc] rows; np.atleast_2d(None) == [[None]]
    when every lead failed.
    """
    config_prefix, config_suffix, data_file_base, num_support, crossval_num = params.split("___")
    config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix
    temp_dir = "temp_%s" % (crossval_num)
    # Stagger worker start-up; presumably avoids races on shared setup -- TODO confirm.
    time.sleep(1 * int(crossval_num))
    utils.remove_and_make_dir(temp_dir)
    os.chdir(temp_dir)
    # Paths carry "../" because we execute from inside temp_dir.
    HMM_command = ["./../HMM_EM", "Train", "../" + config_file]  # need to concatenate since we are running binary
    results = subprocess.check_output(HMM_command)
    test_data = None
    for lead in range(1, 14):
        try:
            roc = run_inference_hmm.run_inference(
                data_file_base, num_support, "test", lead, plot_roc=False, crossval=True, crossval_num=crossval_num
            )
            test_data = utils.add_to_data(test_data, [lead, roc])
        except:  # NOTE(review): bare except silently drops failed leads (and KeyboardInterrupt)
            pass
    os.chdir("..")
    return np.atleast_2d(test_data)
def run_hmm(data_file_base, num_support, num_pools, num_iterations, train=True):
    """Cross-validate (and optionally train) an HMM, then write test AUCs per lead.

    Results go to results/hmm_<base>_support_<n>_test.csv with columns lead,auc.
    """
    #run crossval
    run_hmm_cross_val.do_crossval(data_file_base, num_support, num_iterations=num_iterations, num_pools=num_pools)
    #If train is true- actually build the model
    if train:
        run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools, num_iterations=num_iterations)
    header = "lead,auc"
    #create results_file name
    test_results_file = "results/hmm_" + data_file_base + "_support_%s_test.csv" % (num_support)
    test_data = None
    pool = Pool(num_pools)
    # NOTE(review): this joins only 3 fields, but the execute_hmm defined in
    # this codebase splits its argument into 5 fields -- verify which
    # execute_hmm is imported here, or this map will raise on unpacking.
    rocs = pool.map(execute_hmm, ["___".join([data_file_base, str(num_support), str(lead)]) for lead in range(1,14)])
    for idx, roc in enumerate(rocs):
        lead = idx + 1  # pool.map preserves order, so index maps back to lead
        if roc is not None:
            test_data = utils.add_to_data(test_data, [lead, roc])
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header= header, comments='')
def run_experiments(data_file_base, num_support, num_pools, num_iterations):
    """Run logistic-regression-on-HMM experiments over every (lead, lag) pair.

    Trains the HMM once (logreg mode), fans the (lead, lag) grid out over a
    worker pool, then writes train/test/crossval AUC tables to results/.
    """
    header = "lead,lag,support,auc"
    features_base = "features_"
    # Strip the "features_" prefix and the "_bin_5" suffix to recover the
    # cohort name.  len("_bin_5") * -1 == -6, i.e. a slice to 6 chars from
    # the end -- assumes data_file_base always ends in "_bin_5"; TODO confirm.
    cohort = data_file_base[len(features_base):len("_bin_5") * -1]
    start_time = time.time()
    train_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_train" % num_support + ".csv"
    test_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_test" % num_support + ".csv"
    crossval_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_crossval" % num_support + ".csv"
    train_data = None
    test_data = None
    crossval_data = None
    data_file_base = features_base + cohort + "_bin_5"  # rebuilt in canonical form
    run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools, num_iterations=num_iterations, logreg=True, do_parallel=True)
    pool = Pool(num_pools)
    args_list = []
    # One "___"-joined argument string per (lead, lag) cell; lead+lag <= 14.
    for lead in range(1, 14):
        for lag in range(1, 15 - lead):
            args_list += [
                "___".join([
                    features_base, cohort, str(num_support), str(lead), str(lag), str(num_pools), str(num_iterations)
                ])
            ]
    lead_lag_train_test_crossvals = pool.map(execute_log_reg_hmm, args_list)
    # Each worker returns a "___"-joined result string (or a falsy value on failure).
    for lead_lag_train_test_crossval in lead_lag_train_test_crossvals:
        if lead_lag_train_test_crossval:
            lead, lag, train_auc, test_auc, crossval_auc = lead_lag_train_test_crossval.split(
                "___")
            if train_auc:
                train_data = utils.add_to_data(
                    train_data, [int(lead), int(lag), num_support, float(train_auc)])
            if test_auc:
                test_data = utils.add_to_data(
                    test_data, [int(lead), int(lag), num_support, float(test_auc)])
            if crossval_auc:
                crossval_data = utils.add_to_data(
                    crossval_data, [int(lead), int(lag), num_support, float(crossval_auc)])
    print "Ran logistic regression for %s support %s in %s seconds" % (
        cohort, num_support, time.time() - start_time)
    start_time = time.time()
    np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s", delimiter=",", header=header, comments='')
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header=header, comments='')
    np.savetxt(crossval_results_file, np.atleast_2d(crossval_data), fmt="%s", delimiter=",", header=header, comments='')
def do_crossval(data_file_base, num_support, num_iterations=100, num_pools=12):
    """Run shuffled cross-validation for the HMM and write per-fold AUCs.

    Splits the training file into `num_crossval` shuffled 90/10 folds
    (each student contributes `num_weeks` consecutive rows), writes one
    train/test CSV pair and one HMM config per fold, trains/scores every
    fold in parallel via execute_hmm, and writes crossval,lead,auc rows to
    results/hmm_<base>_support_<n>_crossval.csv.
    """
    num_crossval = 10
    num_weeks = 15
    data_prefix = "data/"
    config_prefix = "configs/"
    data_suffix = ".csv"
    config_suffix = ".txt"
    in_data_file = data_prefix + data_file_base + "_train" + data_suffix
    assert os.path.exists(
        in_data_file), "There is no data file %s" % (in_data_file)
    train_data = np.genfromtxt(in_data_file, delimiter=';', skip_header=0)
    # Split into num_crossval shuffled folds, keyed by student.
    # (The original comment said "5 folds"; num_crossval is 10.)
    num_students = len(train_data) / num_weeks  # Python 2 integer division
    rs = cross_validation.ShuffleSplit(num_students,
                                       n_iter=num_crossval,
                                       test_size=0.1,
                                       indices=True)
    crossval_train = None
    crossval_test = None
    crossval_num = 0
    for train_index, test_index in rs:
        data_file_crossval_train = data_prefix + data_file_base + "_train_%s_train" % crossval_num + data_suffix
        data_file_crossval_test = data_prefix + data_file_base + "_train_%s_test" % crossval_num + data_suffix
        config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix
        # Expand student indices back into their num_weeks-row blocks.
        for stud_idx in train_index:
            stud_data = train_data[stud_idx * num_weeks:(stud_idx + 1) * num_weeks]
            crossval_train = utils.add_to_data(crossval_train, stud_data)
        for stud_idx in test_index:
            stud_data = train_data[stud_idx * num_weeks:(stud_idx + 1) * num_weeks]
            crossval_test = utils.add_to_data(crossval_test, stud_data)
        np.savetxt(data_file_crossval_train, crossval_train, fmt="%d", delimiter=";")
        np.savetxt(data_file_crossval_test, crossval_test, fmt="%d", delimiter=";")
        num_features = crossval_train.shape[1]
        observed_support = 5
        # Hidden chain has support 2; every observed feature gets observed_support.
        hidden_supports = " ".join(
            str(x) for x in [2] + [observed_support] * (num_features - 1))
        features = " ".join(str(x) for x in range(num_features))
        crossval_num += 1
        crossval_train = None  # reset accumulators for the next fold
        crossval_test = None
        # Line-oriented config consumed by the HMM_EM binary; the data path
        # is "../"-prefixed because execute_hmm runs from inside a temp dir.
        config_file_contents = \
"""%s
%s
%s
%s
%s
.0000001
%s
OTHER""" % (num_features, hidden_supports, num_support, num_iterations,
            "../" + data_file_crossval_train, features)
        with open(config_file, "w") as text_file:
            text_file.write(config_file_contents)
    pool = Pool(num_pools)
    crossval_rocs = pool.map(execute_hmm, [
        "___".join([
            config_prefix, config_suffix, data_file_base,
            str(num_support),
            str(crossval_num)
        ]) for crossval_num in range(num_crossval)
    ])
    for x in range(num_crossval):
        shutil.rmtree("temp_%s/" % x)  # clean up per-fold temp dirs
    header = "crossval,lead,auc"
    crossval_file = "results/hmm_" + data_file_base + "_support_%s_crossval.csv" % (
        num_support)
    data = None
    for crossval_num, rocs in enumerate(crossval_rocs):
        # execute_hmm returns [[None]] when every lead failed; skip those folds.
        if rocs[0][0] is not None:
            for (lead, auc) in rocs:
                data = utils.add_to_data(data, [crossval_num, lead, auc])
    # The original wrote this file twice back-to-back; once is enough.
    np.savetxt(crossval_file,
               np.atleast_2d(data),
               fmt="%s",
               delimiter=",",
               header=header,
               comments='')
# Top-level driver: run plain logistic regression for every cohort over the
# full (lead, lag) grid and dump train/test AUC tables.  Depends on
# `features_base`, `cohorts` and `header` defined earlier in this file.
data_file_prefix = "data/" + features_base
data_file_suffix = ".csv"
for cohort in cohorts:
    start_time = time.time()
    # figure out how to save and graph both train and test set
    train_results_file = "results/logistic_reg_" + features_base + cohort + "_train" + ".csv"
    train_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_train"
    test_results_file = "results/logistic_reg_" + features_base + cohort + "_test" + ".csv"
    test_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_test"
    train_data = None
    test_data = None
    crossval_data = None
    for lead in range (1,14):
        for lag in range(1, 15 - lead):
            # File names are loop-invariant; recomputed each iteration as-is.
            train_file = data_file_prefix + cohort + "_train" + data_file_suffix
            test_file = data_file_prefix + cohort + "_test" + data_file_suffix
            try:
                train_auc, test_auc, crossval_auc = logistic_regression.load_and_run_regression(train_file, test_file, lead, lag)
                train_data = utils.add_to_data(train_data, [lead, lag, train_auc])
                test_data = utils.add_to_data(test_data, [lead, lag, test_auc])
                crossval_data = utils.add_to_data(crossval_data, [lead, lag, crossval_auc])
            except:  # NOTE(review): bare except hides failures for this (lead, lag) cell
                pass
    print "Ran logistic regression for %s in %s seconds" % (cohort, time.time() - start_time)
    # NOTE(review): crossval_data is collected but never written here --
    # confirm whether a crossval results file is expected.
    np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s", delimiter=",", header= header, comments='')
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header= header, comments='')
def do_crossval(data_file_base, num_support, num_iterations=100, num_pools=12):
    """Run shuffled cross-validation for the HMM and write per-fold AUCs.

    Splits the training file into num_crossval shuffled 90/10 folds (each
    student is a block of num_weeks rows), writes per-fold CSVs and HMM
    configs, scores every fold in parallel, and writes crossval,lead,auc
    rows to results/.
    """
    num_crossval = 10
    num_weeks = 15
    data_prefix = "data/"
    config_prefix = "configs/"
    data_suffix = ".csv"
    config_suffix = ".txt"
    in_data_file = data_prefix + data_file_base + "_train" + data_suffix
    assert os.path.exists(in_data_file), "There is no data file %s" % (in_data_file)
    train_data = np.genfromtxt(in_data_file, delimiter=";", skip_header=0)
    # split into num_crossval (10) shuffled folds, keyed by student
    num_students = len(train_data) / num_weeks  # Python 2 integer division
    rs = cross_validation.ShuffleSplit(num_students, n_iter=num_crossval, test_size=0.1, indices=True)
    crossval_train = None
    crossval_test = None
    crossval_num = 0
    for train_index, test_index in rs:
        data_file_crossval_train = data_prefix + data_file_base + "_train_%s_train" % crossval_num + data_suffix
        data_file_crossval_test = data_prefix + data_file_base + "_train_%s_test" % crossval_num + data_suffix
        config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix
        # Expand student indices back into their num_weeks-row blocks.
        for stud_idx in train_index:
            stud_data = train_data[stud_idx * num_weeks : (stud_idx + 1) * num_weeks]
            crossval_train = utils.add_to_data(crossval_train, stud_data)
        for stud_idx in test_index:
            stud_data = train_data[stud_idx * num_weeks : (stud_idx + 1) * num_weeks]
            crossval_test = utils.add_to_data(crossval_test, stud_data)
        np.savetxt(data_file_crossval_train, crossval_train, fmt="%d", delimiter=";")
        np.savetxt(data_file_crossval_test, crossval_test, fmt="%d", delimiter=";")
        num_features = crossval_train.shape[1]
        observed_support = 5
        # Hidden chain has support 2; each observed feature gets observed_support.
        hidden_supports = " ".join(str(x) for x in [2] + [observed_support] * (num_features - 1))
        features = " ".join(str(x) for x in range(num_features))
        crossval_num += 1
        crossval_train = None  # reset accumulators for the next fold
        crossval_test = None
        # Line-oriented config consumed by the HMM_EM binary; the data path is
        # "../"-prefixed because execute_hmm runs from inside a temp dir.
        config_file_contents = """%s
%s
%s
%s
%s
.0000001
%s
OTHER""" % (
            num_features,
            hidden_supports,
            num_support,
            num_iterations,
            "../" + data_file_crossval_train,
            features,
        )
        with open(config_file, "w") as text_file:
            text_file.write(config_file_contents)
    pool = Pool(num_pools)
    crossval_rocs = pool.map(
        execute_hmm,
        [
            "___".join([config_prefix, config_suffix, data_file_base, str(num_support), str(crossval_num)])
            for crossval_num in range(num_crossval)
        ],
    )
    for x in range(num_crossval):
        shutil.rmtree("temp_%s/" % x)  # clean up per-fold temp dirs
    header = "crossval,lead,auc"
    crossval_file = "results/hmm_" + data_file_base + "_support_%s_crossval.csv" % (num_support)
    data = None
    for crossval_num, rocs in enumerate(crossval_rocs):
        # execute_hmm returns [[None]] when every lead failed; skip those folds.
        if not rocs[0][0] == None:  # NOTE(review): prefer `is not None`
            for (lead, auc) in rocs:
                data = utils.add_to_data(data, [crossval_num, lead, auc])
    # NOTE(review): the same file is written twice back-to-back; the second
    # call is redundant.
    np.savetxt(crossval_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments="")
    np.savetxt(crossval_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments="")
# Per-cohort regression-weight averaging.  Depends on `cohorts`,
# `data_file_prefix`, `features_base`, `data_file_suffix`, `results_prefix`,
# `results_suffix`, `results_file`, `header` and `data` defined earlier.
for cohort in cohorts:
    data_file = data_file_prefix + features_base + cohort + data_file_suffix
    results_file_time = results_prefix + features_base + cohort + "_time_averaged" + results_suffix
    total_weights = [0] * 27
    num_weights = 0
    lags_averaged = np.zeros([13, 13])
    for lead in range(1, 14):
        for lag in range(1, 15 - lead):
            try:
                weights = run_regression(data_file, lead, lag)
                # Rows are weeks, columns are the 27 features -- TODO confirm.
                averaged_weights = np.mean(np.reshape(weights, (-1, 27)), axis=0)
                averaged_weights_weeks = np.mean(np.reshape(weights, (-1, 27)), axis=1)
                lags_averaged[lag][0:lag] += averaged_weights_weeks.tolist()
                data = utils.add_to_data(data, [cohort, lead, lag] + averaged_weights.tolist())
                # NOTE(review): total_weights is a Python list, so `+=` with an
                # ndarray EXTENDS it by 27 elements instead of adding
                # element-wise -- likely a bug; see the ndarray-accumulator fix.
                total_weights += averaged_weights
                num_weights += 1
            except Exception as e:  # best-effort: skip unfit (lead, lag) cells
                pass
    np.savetxt(results_file_time, lags_averaged, fmt="%s", delimiter=",")
    average_weights = [weight / num_weights for weight in total_weights]
    data = utils.add_to_data(data, [cohort, "-", "-"] + average_weights)
np.savetxt(results_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments='')
# Per-cohort regression-weight averaging: for every cohort, average the
# fitted regression weights over all (lead, lag) cells and write both a
# per-week time-averaged table and one summary row per cohort.
# Depends on `cohorts`, `data_file_prefix`, `features_base`,
# `data_file_suffix`, `results_prefix`, `results_suffix` and `results_file`
# defined earlier in this file.
header = "cohort,lead,lag," + ",".join(["feature_%s" % x for x in range(2, 29)])
data = None
for cohort in cohorts:
    data_file = data_file_prefix + features_base + cohort + data_file_suffix
    results_file_time = results_prefix + features_base + cohort + "_time_averaged" + results_suffix
    # Bug fix: this was `[0] * 27` (a Python list), so `total_weights +=
    # averaged_weights` EXTENDED the list by 27 entries per success instead
    # of accumulating element-wise, making the final "average" row
    # 27 * num_weights wide instead of the 27 columns the header declares.
    total_weights = np.zeros(27)
    num_weights = 0
    lags_averaged = np.zeros([13, 13])
    for lead in range(1, 14):
        for lag in range(1, 15 - lead):
            try:
                weights = run_regression(data_file, lead, lag)
                per_cell = np.reshape(weights, (-1, 27))
                averaged_weights = np.mean(per_cell, axis=0)       # per-feature means
                averaged_weights_weeks = np.mean(per_cell, axis=1) # per-week means
                lags_averaged[lag][0:lag] += averaged_weights_weeks.tolist()
                data = utils.add_to_data(data, [cohort, lead, lag] + averaged_weights.tolist())
                total_weights += averaged_weights  # element-wise ndarray accumulate
                num_weights += 1
            except Exception:  # best-effort: skip (lead, lag) cells that fail to fit
                pass
    np.savetxt(results_file_time, lags_averaged, fmt="%s", delimiter=",")
    # Mean of the per-cell feature averages for this cohort (27 values,
    # matching the header's feature_2..feature_28 columns).
    average_weights = [weight / num_weights for weight in total_weights]
    data = utils.add_to_data(data, [cohort, "-", "-"] + average_weights)
np.savetxt(results_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments='')
train_results_file = "results/logistic_reg_" + features_base + cohort + "_train" + ".csv" train_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_train" test_results_file = "results/logistic_reg_" + features_base + cohort + "_test" + ".csv" test_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_test" train_data = None test_data = None crossval_data = None for lead in range(1, 14): for lag in range(1, 15 - lead): train_file = data_file_prefix + cohort + "_train" + data_file_suffix test_file = data_file_prefix + cohort + "_test" + data_file_suffix try: train_auc, test_auc, crossval_auc = logistic_regression.load_and_run_regression( train_file, test_file, lead, lag) train_data = utils.add_to_data(train_data, [lead, lag, train_auc]) test_data = utils.add_to_data(test_data, [lead, lag, test_auc]) crossval_data = utils.add_to_data(crossval_data, [lead, lag, crossval_auc]) except: pass print "Ran logistic regression for %s in %s seconds" % ( cohort, time.time() - start_time) np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s", delimiter=",", header=header, comments='') np.savetxt(test_results_file,