def run_log_reg_hmm(data_file_base,
                    num_support,
                    num_pools,
                    num_iterations,
                    lead,
                    lag,
                    train=False,
                    do_parallel=True):
    """Run logistic regression on hidden-state features produced by an HMM.

    Pipeline, as implemented below:

    1. Load ``data/<data_file_base>_train.csv`` and
       ``data/<data_file_base>_test.csv`` (``;``-delimited, no header).
    2. Split the training rows by student: the first half of the students is
       written to ``data/<data_file_base>_train_logreg.csv`` for HMM
       training, the remaining students are used to fit the regression.
    3. Train the HMM through ``run_train_hmm.train_model`` when the models
       directory does not exist yet or ``train`` is true.
    4. For every remaining student, call the external ``./HMM_EM`` binary
       once per observed week to obtain a state distribution, and
       concatenate those distributions into one feature vector.
    5. Hand the train/test features and labels to
       ``logistic_regression.run_regression``.

    Args:
        data_file_base: Base name used to build the data file paths and the
            models directory name.
        num_support: Support size; part of the models directory name and
            forwarded to ``run_train_hmm.train_model``.
        num_pools: Forwarded to ``run_train_hmm.train_model``.
        num_iterations: Forwarded to ``run_train_hmm.train_model``.
        lead: Number of weeks ahead of the last observed week at which the
            label is read (label week index is ``lead + lag - 1``).
        lag: Number of leading weeks (indices ``0 .. lag - 1``) used as
            observations.
        train: Force HMM (re)training even if the models directory exists.
        do_parallel: Forwarded to ``run_train_hmm.train_model``.

    Returns:
        Whatever ``logistic_regression.run_regression`` returns.

    Raises:
        AssertionError: If the models directory is still missing after the
            optional training step.
        subprocess.CalledProcessError: If ``./HMM_EM`` exits non-zero.
    """
    start_time = time.time()
    num_weeks = 15  # number of consecutive rows that make up one student

    data_prefix = "data/"
    data_suffix = ".csv"
    models_prefix = "models/"
    models_suffix = "_support_%s_logreg" % (num_support)
    data_file_train_input = data_prefix + data_file_base + "_train" + data_suffix
    data_file_train_hmm = data_prefix + data_file_base + "_train_logreg" + data_suffix
    data_file_test = data_prefix + data_file_base + "_test" + data_suffix
    models_dir = models_prefix + data_file_base + models_suffix

    train_data = np.genfromtxt(data_file_train_input,
                               delimiter=';',
                               skip_header=0)
    test_data = np.genfromtxt(data_file_test, delimiter=";", skip_header=0)

    # Split into train 1 (HMM) and train 2 (logistic regression).
    # Floor division keeps the counts integral on Python 3 as well; with
    # true division they would be floats and break the slicing below.
    num_students = len(train_data) // num_weeks
    num_students_train_hmm = num_students // 2
    split_row = num_students_train_hmm * num_weeks
    train_hmm_data = train_data[:split_row]
    train_logreg_data = train_data[split_row:]

    # Persist the HMM half of the training data.
    # NOTE(review): presumably train_model(logreg=True) reads this
    # "_train_logreg" file -- confirm against run_train_hmm.
    np.savetxt(data_file_train_hmm, train_hmm_data, fmt="%d", delimiter=";")

    if not os.path.exists(models_dir) or train:
        run_train_hmm.train_model(data_file_base,
                                  num_support,
                                  num_pools=num_pools,
                                  num_iterations=num_iterations,
                                  logreg=True,
                                  do_parallel=do_parallel)

    assert os.path.exists(
        models_dir), "There is no trained model in directory %s" % (models_dir)

    def get_log_reg_features(data):
        """Return ``(features, labels)`` for every non-dropped student in ``data``."""
        dropout_value = 0  # bin value for a student dropped out
        command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]

        # Both week indices depend only on lead/lag, not on the student.
        end_week = lag - 1
        label_week = lead + end_week

        logreg_X = None
        logreg_Y = []
        for student in range(len(data) // num_weeks):
            stud_data = data[student * num_weeks:(student + 1) * num_weeks]

            X = stud_data[0:end_week + 1, :].flatten()
            truth_val = stud_data[label_week, 0]

            if stud_data[end_week, 0] == dropout_value:
                continue  # student has already dropped out

            # The observation arguments are identical for every week of this
            # student, so build them once instead of once per subprocess call.
            observations = X.astype(str).tolist()

            features = np.array([])
            for prediction_week in range(end_week + 1):
                # Get the hidden state distribution for each observed week.
                # NOTE(review): the original comment here said "need to pass
                # lead+end_week in- API asks for week to predict", yet the
                # code passes each observed week index -- confirm which one
                # HMM_EM expects.
                command = command_base + [str(prediction_week)] + observations
                results = subprocess.check_output(command)
                # Drop the first and last parsed values of the ';'-separated
                # output, then the last remaining entry.
                # NOTE(review): presumably a leading field / trailing
                # separator and a redundant final probability -- confirm
                # against the HMM_EM output format.
                state_dist = np.fromstring(results, sep=";")[1:-1]
                prediction_week_features = state_dist[:-1]
                features = np.concatenate(
                    [features,
                     np.atleast_1d(prediction_week_features)])
            # NOTE(review): add_to_data presumably appends one row and copes
            # with logreg_X being None on the first call -- see utils.
            logreg_X = utils.add_to_data(logreg_X, features)
            logreg_Y.append(truth_val)
        return logreg_X, logreg_Y

    # Do inference on the HMM to get features for the regression.
    # A single pre-formatted argument makes print behave the same whether it
    # is a statement (Python 2) or a function (Python 3).
    X_train, Y_train = get_log_reg_features(train_logreg_data)
    print("got train log_reg features for lead %s lag %s cohort %s support %s %s seconds" % (
        lead, lag, data_file_base, num_support, time.time() - start_time))

    # Do inference on the test set to get regression features.
    start_time = time.time()
    X_test, Y_test = get_log_reg_features(test_data)
    print("got test log_reg features for lead %s lag %s cohort %s support %s %s seconds" % (
        lead, lag, data_file_base, num_support, time.time() - start_time))

    return logistic_regression.run_regression(X_train, Y_train, X_test, Y_test,
                                              lead, lag)
Ejemplo n.º 2
0
	#Split data into training and testing sets
	x_train, x_test, y_train, y_test = train_test_split(X, T, test_size=0.40)
	
	#Run SVM with a linear kernel
	#svm.svm_linear(x_train, y_train, x_test, y_test)
	
	#Run SVM with a polynomial kernel
	#svm.svm_poly(x_train, y_train, x_test, y_test)

	#Run SVM with a radial basis function kernel
	#svm.svm_rbf(x_train, y_train, x_test, y_test)

	#Run SVM with a sigmoid kernel
	#svm.svm_sigmoid(x_train, y_train, x_test, y_test)

	#Run Logistic Regression
	lr.run_regression(True, 30, "Logistic Regression", x_train, y_train, x_test, y_test)

	#Run Scaled Logistic Regression
	lr.run_regression(True, 30, "Scaled Logistic Regression", x_train, y_train, x_test, y_test)

	#Run Log-transform Logistic Regression
	lr.run_regression(True, 30, "Log-transform Logistic Regression", x_train, y_train, x_test, y_test)

	#Run Binary-transform Logistic Regression
	lr.run_regression(True, 30, "Binary-transform Logistic Regression", x_train, y_train, x_test, y_test)




def run_log_reg_hmm(data_file_base, num_support, num_pools, num_iterations, lead, lag, train=False, do_parallel=True):
	"""Run logistic regression on hidden-state features produced by an HMM.

	Steps performed:
	  * load data/<data_file_base>_train.csv and data/<data_file_base>_test.csv
	    (';'-delimited, no header);
	  * give the first half of the training students to the HMM (written to
	    data/<data_file_base>_train_logreg.csv) and keep the rest for the
	    regression;
	  * train the HMM via run_train_hmm.train_model when the models directory
	    is missing or `train` is true;
	  * for each remaining student, query the external ./HMM_EM binary once
	    per observed week and concatenate the returned state distributions
	    into a feature vector;
	  * pass train/test features and labels to
	    logistic_regression.run_regression.

	Args:
		data_file_base: base name used to build data paths and the models
			directory name.
		num_support: support size; part of the models directory name and
			forwarded to run_train_hmm.train_model.
		num_pools: forwarded to run_train_hmm.train_model.
		num_iterations: forwarded to run_train_hmm.train_model.
		lead: weeks ahead of the last observed week at which the label is
			read (label week index is lead + lag - 1).
		lag: number of leading weeks (indices 0 .. lag - 1) used as
			observations.
		train: force HMM (re)training even if the models directory exists.
		do_parallel: forwarded to run_train_hmm.train_model.

	Returns:
		Whatever logistic_regression.run_regression returns.

	Raises:
		AssertionError: the models directory is still missing after the
			optional training step.
		subprocess.CalledProcessError: ./HMM_EM exited with a non-zero status.
	"""
	start_time = time.time()
	num_weeks = 15  # number of consecutive rows that make up one student

	data_prefix = "data/"
	data_suffix = ".csv"
	models_prefix = "models/"
	models_suffix = "_support_%s_logreg" % (num_support)
	data_file_train_input = data_prefix + data_file_base + "_train" + data_suffix
	data_file_train_hmm = data_prefix + data_file_base + "_train_logreg" + data_suffix
	data_file_test = data_prefix + data_file_base + "_test" + data_suffix
	models_dir = models_prefix + data_file_base + models_suffix

	train_data = np.genfromtxt(data_file_train_input, delimiter=';', skip_header=0)
	test_data = np.genfromtxt(data_file_test, delimiter=";", skip_header=0)

	# Split into train 1 (HMM) and train 2 (logistic regression).
	# Floor division keeps the counts integral on Python 3 too; true division
	# would yield floats, which cannot be used as slice bounds.
	num_students = len(train_data) // num_weeks
	num_students_train_hmm = num_students // 2
	split_row = num_students_train_hmm * num_weeks
	train_hmm_data = train_data[:split_row]
	train_logreg_data = train_data[split_row:]

	# Persist the HMM half of the training data.
	# NOTE(review): presumably train_model(logreg=True) reads this
	# "_train_logreg" file -- confirm against run_train_hmm.
	np.savetxt(data_file_train_hmm, train_hmm_data, fmt="%d", delimiter=";")

	if not os.path.exists(models_dir) or train:
		run_train_hmm.train_model(data_file_base, num_support, num_pools=num_pools, num_iterations=num_iterations, logreg=True, do_parallel=do_parallel)

	assert os.path.exists(models_dir), "There is no trained model in directory %s" % (models_dir)

	def get_log_reg_features(data):
		"""Return (features, labels) for every non-dropped student in `data`."""
		dropout_value = 0  # bin value for a student dropped out
		command_base = ["./HMM_EM", "PredictStateDistribution", models_dir]

		# Both week indices depend only on lead/lag, not on the student.
		end_week = lag - 1
		label_week = lead + end_week

		logreg_X = None
		logreg_Y = []
		for student in range(len(data) // num_weeks):
			stud_data = data[student * num_weeks:(student + 1) * num_weeks]

			X = stud_data[0:end_week + 1, :].flatten()
			truth_val = stud_data[label_week, 0]

			if stud_data[end_week, 0] == dropout_value:
				continue  # student has already dropped out

			# Same observation arguments for every week of this student:
			# build them once rather than once per subprocess call.
			observations = X.astype(str).tolist()

			features = np.array([])
			for prediction_week in range(end_week + 1):
				# Get the hidden state distribution for each observed week.
				# NOTE(review): the original comment here said "need to pass
				# lead+end_week in- API asks for week to predict", yet the
				# code passes each observed week index -- confirm which one
				# HMM_EM expects.
				command = command_base + [str(prediction_week)] + observations
				results = subprocess.check_output(command)
				# Drop the first and last parsed values of the ';'-separated
				# output, then the last remaining entry.
				# NOTE(review): presumably a leading field / trailing
				# separator and a redundant final probability -- confirm
				# against the HMM_EM output format.
				state_dist = np.fromstring(results, sep=";")[1:-1]
				prediction_week_features = state_dist[:-1]
				features = np.concatenate([features, np.atleast_1d(prediction_week_features)])
			# NOTE(review): add_to_data presumably appends one row and copes
			# with logreg_X being None on the first call -- see utils.
			logreg_X = utils.add_to_data(logreg_X, features)
			logreg_Y.append(truth_val)
		return logreg_X, logreg_Y

	# Do inference on the HMM to get features for the regression.
	# A single pre-formatted argument makes print behave the same whether it
	# is a statement (Python 2) or a function (Python 3).
	X_train, Y_train = get_log_reg_features(train_logreg_data)
	print("got train log_reg features for lead %s lag %s cohort %s support %s %s seconds" % (lead, lag, data_file_base, num_support, time.time() - start_time))

	# Do inference on the test set to get regression features.
	start_time = time.time()
	X_test, Y_test = get_log_reg_features(test_data)
	print("got test log_reg features for lead %s lag %s cohort %s support %s %s seconds" % (lead, lag, data_file_base, num_support, time.time() - start_time))

	return logistic_regression.run_regression(X_train, Y_train, X_test, Y_test, lead, lag)