def get_log_reg_features(data):
    dropout_value = 0  # bin value indicating a student has dropped out
    command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]

    logreg_X = None
    logreg_Y = []
    for student in range(len(data) / num_weeks):
        stud_data = data[student * num_weeks:(student + 1) * num_weeks]

        end_week = lag - 1
        label_week = lead + end_week
        X = stud_data[0:end_week + 1, :].flatten()
        truth_val = stud_data[label_week, 0]

        if stud_data[end_week, 0] == dropout_value:
            continue  # student has already dropped out

        features = np.array([])
        for prediction_week in range(end_week + 1):
            # get the hidden state distribution for each prediction_week;
            # the HMM_EM API takes the week to predict followed by the observed weeks
            command = command_base + [str(prediction_week)] + X.astype(str).tolist()
            results = subprocess.check_output(command)
            state_dist = np.fromstring(results, sep=";")[1:-1]
            prediction_week_features = state_dist[:-1]
            features = np.concatenate([features, np.atleast_1d(prediction_week_features)])
        logreg_X = utils.add_to_data(logreg_X, features)
        logreg_Y += [truth_val]
    return logreg_X, logreg_Y
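A minimal usage sketch (the file name is hypothetical, and it assumes the module-level globals num_weeks, lag, lead, and models_dir referenced above are set, with the data in the same semicolon-delimited week-per-row layout as the other examples): the returned matrix and labels feed directly into a scikit-learn logistic regression.

import numpy as np
from sklearn.linear_model import LogisticRegression

data = np.genfromtxt("data/features_example_bin_5_train.csv", delimiter=";")
logreg_X, logreg_Y = get_log_reg_features(data)
clf = LogisticRegression()
clf.fit(logreg_X, logreg_Y)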
Example #2
def execute_hmm(params):
    config_prefix, config_suffix, data_file_base, num_support, crossval_num = params.split("___")
    config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix
    temp_dir = "temp_%s" % (crossval_num)
    time.sleep(1 * int(crossval_num))  # stagger worker start-up by the crossval index
    utils.remove_and_make_dir(temp_dir)
    os.chdir(temp_dir)
    # we run from inside temp_dir, so the binary and the config file live one level up
    HMM_command = ["./../HMM_EM", "Train", "../" + config_file]
    results = subprocess.check_output(HMM_command)
    test_data = None
    for lead in range(1, 14):
        try:
            roc = run_inference_hmm.run_inference(data_file_base, num_support, "test", lead,
                                                  plot_roc=False, crossval=True,
                                                  crossval_num=crossval_num)
            test_data = utils.add_to_data(test_data, [lead, roc])
        except:
            pass  # skip leads for which inference fails
    os.chdir("..")
    return np.atleast_2d(test_data)
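Pool.map hands each worker exactly one argument, which is why execute_hmm receives its five parameters packed into a single "___"-delimited string. A minimal dispatch sketch (values are illustrative; the worker assumes the configs/ directory, the HMM_EM binary, and the data files are laid out as in Example #7 below):

from multiprocessing import Pool

params = "___".join(["configs/", ".txt", "features_example_bin_5", "5", "0"])
pool = Pool(4)
results = pool.map(execute_hmm, [params])  # each worker unpacks with params.split("___")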
Example #5
def run_hmm(data_file_base, num_support, num_pools, num_iterations, train=True):
	# run cross-validation
	run_hmm_cross_val.do_crossval(data_file_base, num_support, num_iterations=num_iterations, num_pools=num_pools)

	# if train is True, actually build the model
	if train:
		run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools, num_iterations=num_iterations)

	header = "lead,auc"

	# create the results file name
	test_results_file = "results/hmm_" + data_file_base + "_support_%s_test.csv" % (num_support)

	test_data = None

	pool = Pool(num_pools)
	rocs = pool.map(execute_hmm, ["___".join([data_file_base, str(num_support), str(lead)]) for lead in range(1, 14)])
	for idx, roc in enumerate(rocs):
		lead = idx + 1
		if roc is not None:
			test_data = utils.add_to_data(test_data, [lead, roc])

	np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header=header, comments='')
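A hypothetical invocation (data file base and hyperparameters are illustrative). Note that the execute_hmm this example maps over evidently takes a three-field string (data_file_base, num_support, lead), unlike the five-field variant in Example #2, so the two come from different modules.

run_hmm("features_example_bin_5", num_support=5, num_pools=12, num_iterations=100, train=True)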
Example #6
def run_experiments(data_file_base, num_support, num_pools, num_iterations):
    header = "lead,lag,support,auc"
    features_base = "features_"
    # strip the "features_" prefix and the "_bin_5" suffix to recover the cohort name
    cohort = data_file_base[len(features_base):-len("_bin_5")]

    start_time = time.time()
    train_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_train" % num_support + ".csv"
    test_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_test" % num_support + ".csv"
    crossval_results_file = "results/logistic_reg_hmm_" + features_base + cohort + "_bin_5_support_%s_crossval" % num_support + ".csv"

    train_data = None
    test_data = None
    crossval_data = None
    data_file_base = features_base + cohort + "_bin_5"
    run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools,
                              num_iterations=num_iterations, logreg=True, do_parallel=True)
    pool = Pool(num_pools)
    args_list = []
    for lead in range(1, 14):
        for lag in range(1, 15 - lead):
            args_list += ["___".join([features_base, cohort, str(num_support), str(lead),
                                      str(lag), str(num_pools), str(num_iterations)])]
    lead_lag_train_test_crossvals = pool.map(execute_log_reg_hmm, args_list)
    for lead_lag_train_test_crossval in lead_lag_train_test_crossvals:
        if lead_lag_train_test_crossval:
            lead, lag, train_auc, test_auc, crossval_auc = lead_lag_train_test_crossval.split("___")
            if train_auc:
                train_data = utils.add_to_data(train_data, [int(lead), int(lag), num_support, float(train_auc)])
            if test_auc:
                test_data = utils.add_to_data(test_data, [int(lead), int(lag), num_support, float(test_auc)])
            if crossval_auc:
                crossval_data = utils.add_to_data(crossval_data, [int(lead), int(lag), num_support, float(crossval_auc)])

    print "Ran logistic regression for %s support %s in %s seconds" % (cohort, num_support, time.time() - start_time)
    start_time = time.time()
    np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s", delimiter=",", header=header, comments='')
    np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header=header, comments='')
    np.savetxt(crossval_results_file, np.atleast_2d(crossval_data), fmt="%s", delimiter=",", header=header, comments='')
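The parsing loop above implies that each execute_log_reg_hmm worker (not shown here) reports its results packed into the same kind of "___"-delimited string its arguments arrive in, with empty fields for any AUC it could not compute. A hypothetical return value for lead 3, lag 2:

result = "3___2___0.91___0.85___0.84"
lead, lag, train_auc, test_auc, crossval_auc = result.split("___")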
Example #7
def do_crossval(data_file_base, num_support, num_iterations=100, num_pools=12):
    num_crossval = 10
    num_weeks = 15

    data_prefix = "data/"
    config_prefix = "configs/"
    data_suffix = ".csv"
    config_suffix = ".txt"

    in_data_file = data_prefix + data_file_base + "_train" + data_suffix
    assert os.path.exists(in_data_file), "There is no data file %s" % (in_data_file)
    train_data = np.genfromtxt(in_data_file, delimiter=';', skip_header=0)

    # split the students into num_crossval shuffled train/test folds
    num_students = len(train_data) / num_weeks
    rs = cross_validation.ShuffleSplit(num_students, n_iter=num_crossval, test_size=0.1, indices=True)

    crossval_train = None
    crossval_test = None
    crossval_num = 0
    for train_index, test_index in rs:
        data_file_crossval_train = data_prefix + data_file_base + "_train_%s_train" % crossval_num + data_suffix
        data_file_crossval_test = data_prefix + data_file_base + "_train_%s_test" % crossval_num + data_suffix
        config_file = config_prefix + data_file_base + "_%s" % crossval_num + config_suffix

        for stud_idx in train_index:
            stud_data = train_data[stud_idx * num_weeks:(stud_idx + 1) * num_weeks]
            crossval_train = utils.add_to_data(crossval_train, stud_data)
        for stud_idx in test_index:
            stud_data = train_data[stud_idx * num_weeks:(stud_idx + 1) * num_weeks]
            crossval_test = utils.add_to_data(crossval_test, stud_data)

        np.savetxt(data_file_crossval_train, crossval_train, fmt="%d", delimiter=";")
        np.savetxt(data_file_crossval_test, crossval_test, fmt="%d", delimiter=";")

        num_features = crossval_train.shape[1]
        observed_support = 5
        hidden_supports = " ".join(str(x) for x in [2] + [observed_support] * (num_features - 1))
        features = " ".join(str(x) for x in range(num_features))

        crossval_num += 1
        crossval_train = None
        crossval_test = None

        config_file_contents = """%s
%s
%s
%s
%s
.0000001
%s
OTHER""" % (num_features, hidden_supports, num_support, num_iterations,
            "../" + data_file_crossval_train, features)

        with open(config_file, "w") as text_file:
            text_file.write(config_file_contents)

    pool = Pool(num_pools)
    crossval_rocs = pool.map(execute_hmm, [
        "___".join([config_prefix, config_suffix, data_file_base, str(num_support), str(crossval_num)])
        for crossval_num in range(num_crossval)
    ])

    for x in range(num_crossval):
        shutil.rmtree("temp_%s/" % x)

    header = "crossval,lead,auc"
    crossval_file = "results/hmm_" + data_file_base + "_support_%s_crossval.csv" % (num_support)
    data = None
    for crossval_num, rocs in enumerate(crossval_rocs):
        if rocs[0][0] is not None:
            for lead, auc in rocs:
                data = utils.add_to_data(data, [crossval_num, lead, auc])
                # rewrite the results file after each row
                np.savetxt(crossval_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments='')
    np.savetxt(crossval_file, np.atleast_2d(data), fmt="%s", delimiter=",", header=header, comments='')
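For reference, here is one rendered config file under illustrative values (a fold with 4 data columns, num_support=3, 100 iterations, fold 0); the arrows name the template variable substituted on each line. What the HMM_EM binary does with each line is not shown in this code.

4                                                   <- num_features
2 5 5 5                                             <- hidden_supports
3                                                   <- num_support
100                                                 <- num_iterations
../data/features_example_bin_5_train_0_train.csv    <- "../" + data_file_crossval_train
.0000001                                            <- literal in the template
0 1 2 3                                             <- features
OTHER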
data_file_prefix = "data/" + features_base
data_file_suffix = ".csv"


for cohort in cohorts:
	start_time = time.time()
	# figure out how to save and graph both the train and test sets
	train_results_file = "results/logistic_reg_" + features_base + cohort + "_train" + ".csv"
	train_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_train"
	test_results_file = "results/logistic_reg_" + features_base + cohort + "_test" + ".csv"
	test_graph_file = "results/images/logistic_reg_" + features_base + cohort + "_test"

	train_data = None
	test_data = None
	crossval_data = None
	for lead in range(1, 14):
		for lag in range(1, 15 - lead):
			train_file = data_file_prefix + cohort + "_train" + data_file_suffix
			test_file = data_file_prefix + cohort + "_test" + data_file_suffix
			try:
				train_auc, test_auc, crossval_auc = logistic_regression.load_and_run_regression(train_file, test_file, lead, lag)
				train_data = utils.add_to_data(train_data, [lead, lag, train_auc])
				test_data = utils.add_to_data(test_data, [lead, lag, test_auc])
				crossval_data = utils.add_to_data(crossval_data, [lead, lag, crossval_auc])
			except:
				pass  # skip (lead, lag) pairs whose regression fails
	print "Ran logistic regression for %s in %s seconds" % (cohort, time.time() - start_time)

	np.savetxt(train_results_file, np.atleast_2d(train_data), fmt="%s", delimiter=",", header=header, comments='')
	np.savetxt(test_results_file, np.atleast_2d(test_data), fmt="%s", delimiter=",", header=header, comments='')
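The nested loops sweep every (lead, lag) pair with lead + lag <= 14: with 15 weeks of data indexed 0..14, lag weeks of observed history end at week lag - 1 and the label sits lead weeks later, at week lead + lag - 1 (see get_log_reg_features above). A quick check of the grid:

pairs = [(lead, lag) for lead in range(1, 14) for lag in range(1, 15 - lead)]
assert all(lead + lag <= 14 for lead, lag in pairs)
print len(pairs)  # 91 combinations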
header = "cohort,lead,lag," + ",".join(["feature_%s" % x for x in range(2,29)])

data = None
for cohort in cohorts:
	data_file = data_file_prefix + features_base + cohort + data_file_suffix
	results_file_time = results_prefix + features_base  + cohort + "_time_averaged" + results_suffix
	total_weights = [0]*27
	num_weights = 0
	lags_averaged = np.zeros([13, 13])
	for lead in range (1,14):
		for lag in range(1, 15 - lead):
			try:
				weights = run_regression(data_file, lead, lag)
				averaged_weights = np.mean(np.reshape(weights, (-1, 27)), axis=0)
				averaged_weights_weeks = np.mean(np.reshape(weights, (-1, 27)), axis=1)
				lags_averaged[lag][0: lag] += averaged_weights_weeks.tolist()
				data = utils.add_to_data(data, [cohort, lead, lag] + averaged_weights.tolist())
				total_weights += averaged_weights
				num_weights += 1
			except Exception as e:
				pass
	np.savetxt(results_file_time, lags_averaged, fmt="%s", delimiter=",")
	average_weights = [weight / num_weights for weight in  total_weights]
	data = utils.add_to_data(data, [cohort, "-", "-"] + average_weights)

	
np.savetxt(results_file, np.atleast_2d(data), fmt="%s", delimiter=",", header= header, comments='')
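The reshape assumes the weight vector concatenates 27 features for each of the lag weeks: averaging over axis 0 yields one mean weight per feature, averaging over axis 1 one mean weight per week. A toy shape check (sizes are illustrative):

import numpy as np

weights = np.arange(3 * 27, dtype=float)  # e.g. lag = 3 weeks x 27 features each
per_feature = np.mean(np.reshape(weights, (-1, 27)), axis=0)  # shape (27,)
per_week = np.mean(np.reshape(weights, (-1, 27)), axis=1)     # shape (3,)
assert per_feature.shape == (27,) and per_week.shape == (3,)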

