from sklearn.feature_selection import f_regression
from sklearn.model_selection import KFold

import general_functions as general_f
import data_postprocess as postprocess

# Various parameters that can be adjusted
# Hardcoded to the values found best for this model
# NOTE(review): none of these names is read in the visible part of the file;
# the per-line notes below are assumptions drawn from the names only --
# confirm them against the code that consumes each value.
imputation_strategy = "mean"  # presumably the strategy handed to the imputer
frac_missing_values_cutoff = 1.0  # presumably a missing-value fraction threshold -- TODO confirm
K = 400  # presumably a count of features to keep -- TODO confirm
max_depth = 10  # presumably a depth limit for a tree-based model -- TODO confirm
n_estimators = 15  # presumably the number of estimators in an ensemble -- TODO confirm

# Load the data set through the shared helper
data = general_f.check_if_data_exists_if_not_open_and_read()

# Pull the grit outcome (index 2 of each training-outcome row), then drop
# every NA outcome together with its corresponding survey data
survey_rows = data['survey_data_matched_to_outcomes']
grit_outcomes = [
    outcome_row[2]
    for outcome_row in data['training_outcomes_matched_to_outcomes']
]
data_to_use, outcomes_to_use = postprocess.remove_NA_from_outcomes_and_data(
    survey_rows, grit_outcomes)

# Turn all NA values into NaN so that the imputer can impute over them.
# also_convert_negatives=True additionally turns negative values into NaN.
# NOTE(review): deepcopy=False presumably lets the helper modify the data
# it was given instead of copying it first -- confirm in data_postprocess.
data_to_use = postprocess.convert_NA_values_to_NaN(
    data_to_use,
    also_convert_negatives=True,
    deepcopy=False,
)
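# Example #2
# (separator left over from the listing this file appears to have been
# extracted from; what follows looks like a second, independent script)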
import numpy as np
import os
import general_functions as general_f

# Delete the file named by general_f.pickle_file_name before reading the data.
# NOTE(review): presumably this is the cached pickle that
# check_if_data_exists_if_not_open_and_read would otherwise reuse, so removing
# it forces a fresh read -- confirm in general_functions.
try:
    os.remove(general_f.pickle_file_name)
except OSError:
    # Best effort: any OSError raised by the removal (e.g. there is no such
    # file) is ignored so the script carries on.
    pass

# Read the data, passing remove_bad_columns=False.
# NOTE(review): presumably this keeps columns that the helper would otherwise
# drop, so they can be inspected below -- confirm in general_functions.
data = general_f.check_if_data_exists_if_not_open_and_read(
    remove_bad_columns=False)

# Positions of every entry of the survey header that contains 'fathid'.
fathid_column = [
    position
    for position, header_name in enumerate(data['survey_data_header'])
    if 'fathid' in header_name
]
# A parenthesised single-argument print is valid on Python 3 and produces the
# same output as the print statement on Python 2.
print("Father ID columns: " + str(fathid_column))

# Positions of every entry of the survey header that contains 'mothid'.
mothid_column = [
    position
    for position, header_name in enumerate(data['survey_data_header'])
    if 'mothid' in header_name
]
print("Mother ID columns: " + str(mothid_column))

all_NA_j = []

for j in range(len(data['survey_data_matched_to_outcomes'][0])):
    if all('NA' == item[j]
           for item in data['survey_data_matched_to_outcomes']):
        all_NA_j.append(j)
    elif all('Missing' == item[j]