def aggregate_orthogonal_rankings(input_dictionary, ranking_path):
    """Summarise per-feature score collections and pickle the summary.

    For every key of ``input_dictionary`` the associated scores are turned
    into a numpy array and reduced to a ``(mean, standard deviation)`` pair.
    The resulting ``{feature: (mean, std)}`` dictionary is handed to
    ``pickle_this_variable_with_this_name_to_this_folder`` together with the
    path ``ranking_path + '/orthogonal_projection_feature_ranking.pickle'``.

    Args:
        input_dictionary: mapping from feature to a collection of scores.
        ranking_path: folder path (string) the pickle file name is appended to.

    Returns:
        Whatever ``pickle_this_variable_with_this_name_to_this_folder``
        returns.
    """
    destination = ranking_path + '/' + "orthogonal_projection_feature_ranking.pickle"

    summary = {}
    for feature in input_dictionary:
        values = np.array(input_dictionary[feature])
        summary[feature] = (np.mean(values), np.std(values))

    return pickle_this_variable_with_this_name_to_this_folder(summary, destination)
def aggregate_orthogonal_rankings(input_dictionary, ranking_path):
    """Reduce each feature's scores to (mean, std) and pickle the result.

    Args:
        input_dictionary: mapping from feature to a collection of scores.
        ranking_path: folder path (string); the file name
            ``orthogonal_projection_feature_ranking.pickle`` is appended
            to it with a ``/`` separator.

    Returns:
        The return value of
        ``pickle_this_variable_with_this_name_to_this_folder`` called with
        the ``{feature: (mean, std)}`` dictionary and the output path.
    """
    def _mean_and_std(scores):
        # Convert once, then compute both statistics on the same array.
        as_array = np.array(scores)
        return (np.mean(as_array), np.std(as_array))

    output_path = '/'.join(
        [ranking_path, "orthogonal_projection_feature_ranking.pickle"])
    statistics_by_feature = {
        feature: _mean_and_std(input_dictionary[feature])
        for feature in input_dictionary
    }
    return pickle_this_variable_with_this_name_to_this_folder(
        statistics_by_feature, output_path)
def confirm_input_arguments_and_set_analysis_folders(options):
    """Validate the input CSV described by ``options`` and set up analysis folders.

    The function
      1. checks that ``options.input_file`` can be opened and parsed by pandas,
      2. checks for duplicate column names, for the presence of the target
         column and of every sensitive attribute (all case-insensitively),
      3. drops rows containing missing values and maps every non-float
         column through ``transform_column_float``,
      4. calls ``create_analysis_folders(options)``,
      5. writes a copy of the data with columns renamed to ``feature<i>`` to
         ``<mrmr_input>/input_file.csv`` and pickles the
         ``feature<i> -> original name`` mapping into the ranking folder.

    Args:
        options: object exposing ``input_file``, ``target`` and
            ``sensitive_variable_list`` attributes; it is also passed on to
            ``create_analysis_folders``.

    Returns:
        A ``(full_input_data, folder_paths_and_target)`` tuple: the cleaned
        DataFrame and the dictionary returned by ``create_analysis_folders``
        extended with the keys ``"target"`` (target column name as spelled in
        the CSV) and ``"target_mrmr"`` (its ``feature<i>`` alias).

    Raises:
        ValueError: if the file has duplicate column names, lacks the target
            column, or lacks one of the sensitive attributes. (The original
            code raised bare strings, which Python rejects with a TypeError
            and which never showed the intended message.)
        Exception: any unexpected error while opening or parsing the file is
            reported and re-raised unchanged.
    """
    print("checking input file")
    # Probe that the file can be opened; the context manager closes the
    # handle again immediately so it is not leaked.
    try:
        with open(options.input_file):
            pass
    except IOError as e:
        # Reported only, not re-raised here (behaviour of the original code).
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
    except:
        # Re-raised below, so nothing is swallowed.
        print("Unexpected error: {0}".format(sys.exc_info()[0]))
        raise

    print("checking if file can be read by pandas")
    # now read the file and check if target variable is in there.
    try:
        full_input_data = pd.read_csv(filepath_or_buffer=options.input_file,
                                      sep=',')
    except:
        print("Unexpected error while reading the input file {0}".format(
            sys.exc_info()[0]))
        raise

    # now input file read as csv
    # get list of column names
    column_names = list(full_input_data.columns)

    print("checking duplicate columns")
    # check for duplicate attributes
    # NOTE(review): pandas may rename duplicated header names while parsing,
    # in which case this check would not fire - confirm.
    if len(column_names) != len(set(column_names)):
        raise ValueError(
            "Your Input File has duplicate attributes, please remove duplicates")

    print("checking if specified target is in input file")
    # check if target is in list of column names (case-insensitive)
    column_names_lower = [a.lower() for a in column_names]
    target_lower = options.target.lower()
    if target_lower not in column_names_lower:
        raise ValueError(
            "Your input file does not contain the target variable that you "
            "specificied. Please check that you have the correct spelling ")
    target_name_in_csv = column_names[column_names_lower.index(target_lower)]

    # check that the sensitive attributes are in the list of variables
    if options.sensitive_variable_list is not None:
        for attribute in (a.lower() for a in options.sensitive_variable_list):
            if attribute not in column_names_lower:
                raise ValueError(
                    "Your input file does not contain the %s that you "
                    "specificied. Please check that you have the correct "
                    "spelling " % attribute)

    print("dropping n/a from dataframe")
    # drop every row that contains at least one missing value
    full_input_data.dropna(axis=0, how='any', inplace=True)

    print("converting all columns to floats")
    # now convert all the columns to float64
    for name in column_names:
        if full_input_data[name].dtype != float:
            full_input_data[name] = full_input_data[name].map(
                transform_column_float)
    print("done converting all columns to floats")

    print("creating analysis folders")
    # set up the relevant analysis folders.
    folder_paths_and_target = create_analysis_folders(options)

    # now write input file for mrmr analysis, with columns renamed to
    # positional aliases feature0, feature1, ...
    where_to_write_mrmr_input = folder_paths_and_target[
        "mrmr_input"] + "/input_file.csv"
    no_columns_in_df = full_input_data.shape[1]
    new_column_name = ['feature' + str(i) for i in range(no_columns_in_df)]
    new_full_input_data = full_input_data.copy()
    new_full_input_data.columns = new_column_name
    original_columns = list(full_input_data.columns)
    mapping_for_new_columns_names = dict(
        zip(new_column_name, original_columns))

    # write out mapping to ranking
    mapping_name = "mrmr_column_feature_name_mapping.pickle"
    pickle_this_variable_with_this_name_to_this_folder(
        mapping_for_new_columns_names,
        folder_paths_and_target["ranking_results"] + "/" + mapping_name)
    new_full_input_data.to_csv(path_or_buf=where_to_write_mrmr_input,
                               sep=',',
                               index=False)

    # target name mrmr
    folder_paths_and_target["target_mrmr"] = new_column_name[
        original_columns.index(target_name_in_csv)]
    # and the name of target in the csv into the dictionary
    folder_paths_and_target["target"] = target_name_in_csv
    print("target name is " + folder_paths_and_target["target"])
    print("mrmr target name is " + folder_paths_and_target["target_mrmr"])
    return full_input_data, folder_paths_and_target
def confirm_input_arguments_and_set_analysis_folders(options):
    """Check the input CSV named in ``options`` and prepare the analysis folders.

    Processing order:
      * open ``options.input_file`` once as a readability probe, then parse
        it with ``pd.read_csv``;
      * reject duplicate column names, a missing target column and missing
        sensitive attributes (names are compared in lower case);
      * drop rows with missing values and map each non-float column through
        ``transform_column_float``;
      * obtain the folder dictionary from ``create_analysis_folders(options)``;
      * write the data, with columns renamed to ``feature<i>``, to
        ``<mrmr_input>/input_file.csv`` and pickle the alias-to-original-name
        mapping to ``<ranking_results>/mrmr_column_feature_name_mapping.pickle``.

    Args:
        options: object with ``input_file``, ``target`` and
            ``sensitive_variable_list`` attributes; also forwarded to
            ``create_analysis_folders``.

    Returns:
        ``(full_input_data, folder_paths_and_target)`` - the cleaned DataFrame
        and the folder dictionary, to which the keys ``"target"`` and
        ``"target_mrmr"`` have been added.

    Raises:
        ValueError: on duplicate columns, a missing target column or a missing
            sensitive attribute. The original ``raise "<text>"`` statements
            are not valid exceptions and produced a TypeError instead of the
            intended message.
        Exception: unexpected errors while opening or parsing the file are
            printed and re-raised unchanged.
    """
    print("checking input file")
    # check input file: open it once and close it right away (the original
    # left the handle open).
    try:
        with open(options.input_file):
            pass
    except IOError as e:
        # Only reported at this point, exactly as the original did.
        print("I/O error({0}): {1}".format(e.errno, e.strerror))
    except:
        # Always re-raised after being reported.
        print("Unexpected error: {0}".format(sys.exc_info()[0]))
        raise

    print("checking if file can be read by pandas")
    # now read the file and check if target variable is in there.
    try:
        full_input_data = pd.read_csv(filepath_or_buffer=options.input_file, sep=',')
    except:
        print("Unexpected error while reading the input file {0}".format(
            sys.exc_info()[0]))
        raise

    # now input file read as csv
    # get list of column names
    column_names = list(full_input_data.columns)
    duplicate_columns = len(column_names) - len(set(column_names))

    print("checking duplicate columns")
    # check for duplicate attributes
    # NOTE(review): pandas may de-duplicate header names on read, which would
    # make this check unreachable - confirm.
    if duplicate_columns > 0:
        raise ValueError(
            "Your Input File has duplicate attributes, please remove duplicates")

    print("checking if specified target is in input file")
    # check if target is in list of column names
    column_names_lower = [a.lower() for a in column_names]
    wanted_target = options.target.lower()
    if wanted_target not in column_names_lower:
        raise ValueError(
            "Your input file does not contain the target variable that you "
            "specificied. Please check that you have the correct spelling ")
    target_name_in_csv = column_names[column_names_lower.index(wanted_target)]

    # check that the sensitive attributes are in the list of variables
    if options.sensitive_variable_list is not None:
        sensitive = [a.lower() for a in options.sensitive_variable_list]
        for attribute in sensitive:
            if attribute not in column_names_lower:
                # Interpolate the attribute name; the original passed it as a
                # second operand to ``raise`` and never formatted it.
                raise ValueError(
                    "Your input file does not contain the %s that you "
                    "specificied. Please check that you have the correct "
                    "spelling " % attribute)

    print("dropping n/a from dataframe")
    # drop rows that contain any missing value
    full_input_data.dropna(axis=0, how='any', inplace=True)

    print("converting all columns to floats")
    # now convert all the columns to float64
    for name in column_names:
        if full_input_data[name].dtype != float:
            full_input_data[name] = full_input_data[name].map(transform_column_float)
    print("done converting all columns to floats")

    print("creating analysis folders")
    # set up the relevant analysis folders.
    folder_paths_and_target = create_analysis_folders(options)

    # now write input file for mrmr analysis; columns get positional aliases
    where_to_write_mrmr_input = folder_paths_and_target["mrmr_input"] + "/input_file.csv"
    no_columns_in_df = full_input_data.shape[1]
    new_column_name = ['feature' + str(i) for i in range(no_columns_in_df)]
    new_full_input_data = full_input_data.copy()
    new_full_input_data.columns = new_column_name
    original_columns = list(full_input_data.columns)
    mapping_for_new_columns_names = dict(zip(new_column_name, original_columns))

    # write out mapping to ranking
    mapping_name = "mrmr_column_feature_name_mapping.pickle"
    pickle_this_variable_with_this_name_to_this_folder(
        mapping_for_new_columns_names,
        folder_paths_and_target["ranking_results"] + "/" + mapping_name)
    new_full_input_data.to_csv(path_or_buf=where_to_write_mrmr_input, sep=',', index=False)

    # target name mrmr
    folder_paths_and_target["target_mrmr"] = new_column_name[
        original_columns.index(target_name_in_csv)]
    # and the name of target in the csv into the dictionary
    folder_paths_and_target["target"] = target_name_in_csv
    print("target name is " + folder_paths_and_target["target"])
    print("mrmr target name is " + folder_paths_and_target["target_mrmr"])
    return full_input_data, folder_paths_and_target