# Preamble of the modelling script: fix the model parameters, load the data,
# and keep only the samples that have a usable grit outcome.
from sklearn.feature_selection import f_regression
from sklearn.model_selection import KFold

import general_functions as general_f
import data_postprocess as postprocess

# ---------------------------------------------------------------------------
# Adjustable parameters.
# Hardcoded to the values found best for this model. None of them is read in
# this section; presumably they are consumed further down the script.
# ---------------------------------------------------------------------------
imputation_strategy = "mean"
frac_missing_values_cutoff = 1.0
K = 400
max_depth = 10
n_estimators = 15

# ---------------------------------------------------------------------------
# Load the data.
# ---------------------------------------------------------------------------
data = general_f.check_if_data_exists_if_not_open_and_read()

# ---------------------------------------------------------------------------
# Drop every sample whose outcome is NA, along with its survey data.
# ---------------------------------------------------------------------------
survey_rows = data['survey_data_matched_to_outcomes']

# Index 2 of each training-outcome row corresponds to grit.
grit_outcomes = [
    outcome_row[2]
    for outcome_row in data['training_outcomes_matched_to_outcomes']
]

data_to_use, outcomes_to_use = postprocess.remove_NA_from_outcomes_and_data(
    survey_rows, grit_outcomes)

# ---------------------------------------------------------------------------
# Turn all NA values into NaN so the imputer can impute over them.
# also_convert_negatives=True makes negative values become NaN as well.
# ---------------------------------------------------------------------------
data_to_use = postprocess.convert_NA_values_to_NaN(
    data_to_use,
    also_convert_negatives=True,
    deepcopy=False)
import numpy as np import os import general_functions as general_f try: os.remove(general_f.pickle_file_name) except OSError: pass data = general_f.check_if_data_exists_if_not_open_and_read( remove_bad_columns=False) fathid_column = [ i for i in range(len(data['survey_data_header'])) if 'fathid' in data['survey_data_header'][i] ] print "Father ID columns: " + str(fathid_column) mothid_column = [ i for i in range(len(data['survey_data_header'])) if 'mothid' in data['survey_data_header'][i] ] print "Mother ID columns: " + str(mothid_column) all_NA_j = [] for j in range(len(data['survey_data_matched_to_outcomes'][0])): if all('NA' == item[j] for item in data['survey_data_matched_to_outcomes']): all_NA_j.append(j) elif all('Missing' == item[j]