Example #1
# Imports assumed by this snippet (not shown on this page).
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import randint as sp_randint

def return_best_rf_regressor(df, target, num_trees_hyperparameter, num_trees_final_clf, num_iterations):
	print "entering return best rf regressor function"
	if df.shape[0] < 10000:
		num_samples = df.shape[0]
	else:
		num_samples = int(df.shape[0]*0.7)

	print "Sample dataframe"
	# use a sample of the dataframe for the hyperparameter search
	X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)

	# TODO: figure out a way to vary this somehow
	"""
	param_dist = {"max_depth": [5, None],
              "max_features": sp_randint(1, df.shape[1]),
              "min_samples_split": sp_randint(1, 15),
              "min_samples_leaf": sp_randint(1, 15),
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}
    """
	param_dist = {"max_depth": [5, None], "max_features": sp_randint(1, df.shape[1]), "min_samples_split": sp_randint(1, 15), "min_samples_leaf": sp_randint(1, 15), "bootstrap": [True]}

	clf = RandomForestRegressor(n_estimators=num_trees_hyperparameter)
	print "starting hyperparameter search"
	clf_best, best_params = hyperparameter_search_random(X, y, clf, param_dist, num_iterations)

	print "sample data for fitting model"
	# train the final regressor on the entire dataset
	X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples=df.shape[0])

	clf_final = RandomForestRegressor(n_estimators=num_trees_final_clf,
	                                  max_depth=best_params["max_depth"],
	                                  min_samples_leaf=best_params["min_samples_leaf"],
	                                  min_samples_split=best_params["min_samples_split"],
	                                  bootstrap=best_params["bootstrap"],
	                                  max_features=best_params["max_features"])

	print "Fitting Random Forest Regressor"
	clf_final.fit(X,y)
	return clf_final, column_list_for_sampled
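The hyperparameter_search_random helper is not shown on this page. A minimal sketch of what it might look like, assuming it wraps scikit-learn's RandomizedSearchCV (sklearn.model_selection, available since 0.18; older releases used sklearn.grid_search) and returns the refit best estimator together with the winning parameter dict:

# Hypothetical sketch only; the real helper in the source project may differ.
from sklearn.model_selection import RandomizedSearchCV

def hyperparameter_search_random(X, y, clf, param_dist, num_iterations):
	# try num_iterations random draws from param_dist, scoring each by cross-validation
	search = RandomizedSearchCV(clf, param_distributions=param_dist,
	                            n_iter=num_iterations, cv=3)
	search.fit(X, y)
	# best_estimator_ is refit on all of X, y with the best parameters found
	return search.best_estimator_, search.best_params_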
Example #2
# Imports assumed by this snippet (not shown on this page); RandomizedLasso lived in
# sklearn.linear_model before its removal in scikit-learn 0.21.
from sklearn.linear_model import LassoLarsCV, RandomizedLasso

def run_lasso_on_input(df, target):

	X_part, y_part, _ = sample_data_frame_return_x_y_column_name(df, True, target, int(0.7*df.shape[0]))

	X_part, _ = scale_input_data(X_part)

	print "#######################################"
	print "Starting LARS CV"
	print "#######################################"

	lars_cv = LassoLarsCV(cv=10).fit(X_part, y_part)

	print "#######################################"
	print "Done with LARS CV"
	print "#######################################"

	#alphas = np.linspace(lars_cv.alphas_[0], .1 * lars_cv.alphas_[0], 6)
	
	X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, df.shape[0])

	X, _ = scale_input_data(X)

	print "#######################################"
	print "Starting main lasso"
	print "#######################################"

	clf = RandomizedLasso(alpha=lars_cv.alphas_, random_state=12, n_resampling=400, normalize=True).fit(X, y)

	print "#######################################"
	print "Done with main lasso"
	print "#######################################"

	return clf, column_list_for_sampled
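A possible way to consume the return value, assuming a DataFrame df with a target column named "price" (both placeholders). RandomizedLasso exposes per-feature selection frequencies in its scores_ attribute, and it only exists in scikit-learn versions before 0.21:

# Hypothetical usage; df and the "price" target are placeholders.
clf, columns = run_lasso_on_input(df, "price")
# rank features by how often the randomized lasso selected them
for name, score in sorted(zip(columns, clf.scores_), key=lambda pair: pair[1], reverse=True):
	print name, score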
Example #3
def orthogonal_variable_selection_cannot_query_black_box(df, target, non_linear, no_bootstrap_iter, num_samples):

	master_dictionary = {}
	for name in list(df.columns):
		if name != target:
			master_dictionary[name] = []

	for iteration in range(no_bootstrap_iter):

		print "going through iteration " + str(iteration) + " of orthogonal feature selection"

		X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(df, True, target, num_samples)

		master_dictionary = perform_orthogonal_variable_selection(X, y, column_list_for_sampled, non_linear, master_dictionary)

	return master_dictionary
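All of these examples rely on the same sample_data_frame_return_x_y_column_name helper, which is not shown here. A rough sketch under the assumption that the boolean flag means sampling with replacement and that the helper returns the feature matrix, target vector, and feature column names:

# Hypothetical sketch; the actual helper in the source project may behave differently.
def sample_data_frame_return_x_y_column_name(df, with_replacement, target, num_samples):
	# draw num_samples rows (pandas >= 0.16.1 provides DataFrame.sample)
	sampled = df.sample(n=num_samples, replace=with_replacement)
	feature_columns = [c for c in sampled.columns if c != target]
	X = sampled[feature_columns].values
	y = sampled[target].values
	return X, y, feature_columns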
Example #4
def orthogonal_variable_selection_cannot_query_black_box(
        df, target, non_linear, no_bootstrap_iter, num_samples):

    master_dictionary = {}
    for name in list(df.columns):
        if name != target:
            master_dictionary[name] = []

    for iteration in range(no_bootstrap_iter):

        print "going through iteration " + str(
            iteration) + " of orthogonal feature selection"

        X, y, column_list_for_sampled = sample_data_frame_return_x_y_column_name(
            df, True, target, num_samples)

        master_dictionary = perform_orthogonal_variable_selection(
            X, y, column_list_for_sampled, non_linear, master_dictionary)

    return master_dictionary
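The master_dictionary returned above maps each feature name to one entry per bootstrap iteration (whatever value perform_orthogonal_variable_selection appends). A hedged sketch of how the result might be summarized, assuming those entries are numeric scores:

# Hypothetical post-processing; assumes each list holds numeric per-iteration scores.
import numpy as np

def summarize_orthogonal_selection(master_dictionary):
	# average the bootstrap scores so features can be ranked
	averages = {name: np.mean(scores) for name, scores in master_dictionary.items()}
	for name, avg in sorted(averages.items(), key=lambda pair: pair[1], reverse=True):
		print name, avg
	return averages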