Code example #1
0
def aggregate_orthogonal_rankings(input_dictionary, ranking_path):
	"""Summarize per-feature orthogonal-projection scores and pickle the result.

	Parameters:
		input_dictionary: dict mapping each feature name to a list of
			numeric scores collected across runs.
		ranking_path: folder into which the pickled ranking is written.

	Returns:
		Whatever pickle_this_variable_with_this_name_to_this_folder returns
		after writing the {feature: (mean, std)} dictionary.
	"""
	file_name = "orthogonal_projection_feature_ranking.pickle"
	file_path = ranking_path + '/' + file_name

	final_output_dictionary = {}
	for feature in input_dictionary:
		# Convert the score list to an ndarray once instead of twice
		# (the original rebuilt the array for both mean and std).
		score_array = np.asarray(input_dictionary[feature])
		final_output_dictionary[feature] = (np.mean(score_array),
		                                    np.std(score_array))

	return pickle_this_variable_with_this_name_to_this_folder(final_output_dictionary, file_path)
Code example #2
0
def aggregate_orthogonal_rankings(input_dictionary, ranking_path):
    """Collapse each feature's score list to (mean, std) and pickle the map."""
    file_name = "orthogonal_projection_feature_ranking.pickle"
    file_path = ranking_path + '/' + file_name

    # One (mean, std) pair per feature, built in a single comprehension.
    final_output_dictionary = {
        feature: (np.mean(np.array(input_dictionary[feature])),
                  np.std(np.array(input_dictionary[feature])))
        for feature in input_dictionary
    }

    return pickle_this_variable_with_this_name_to_this_folder(
        final_output_dictionary, file_path)
Code example #3
0
def confirm_input_arguments_and_set_analysis_folders(options):

    print "checking input file"
    #check input file
    try:
        f = open(options.input_file)
    except IOError as e:
        print "I/O error({0}): {1}".format(e.errno, e.strerror)
    except:
        print "Unexpected error:", sys.exc_info()[0]
        raise

    print "checking if file can be read by pandas"
    #now read the file and check if target variable is in there.
    try:
        full_input_data = pd.read_csv(filepath_or_buffer=options.input_file,
                                      sep=',')
    except:
        print "Unexpected error while reading the input file", sys.exc_info(
        )[0]
        raise

    #now input file read as csv
    #get list of column names
    column_names = list(full_input_data.columns)
    duplicate_columns = len(column_names) - len(set(column_names))

    print "checking duplicate columns"
    #check for duplicate attributes
    if duplicate_columns > 0:
        raise "Your Input File has duplicate attributes, please remove duplicates"

    print "checking if specified target is in input file"
    #check if target is in list of column names
    column_names_lower = [a.lower() for a in column_names]
    if options.target.lower() not in column_names_lower:
        raise "Your input file does not contain the target variable that you specificied. Please check \
			 that you have the correct spelling "

    target_name_in_csv = column_names[column_names_lower.index(
        options.target.lower())]
    #check that the sensitive attributes are in the list of variables
    if options.sensitive_variable_list != None:
        sensitive = [a.lower() for a in options.sensitive_variable_list]
        for attribute in sensitive:
            if attribute not in column_names_lower:
                raise "Your input file does not contain the %s that you specificied. Please check \
			 	that you have the correct spelling ", attribute

    print "dropping n/a from dataframe"

    #drop rows with no na
    full_input_data.dropna(axis=0, how='any', inplace=True)

    print "converting all columns to floats"
    #now convert all the columns to float64
    for name in column_names:
        #print "we are working with {0} now".format(name)
        if full_input_data[name].dtype != float:
            full_input_data[name] = full_input_data[name].map(
                transform_column_float)

    print "done converting all columns to floats"

    print "creating analysis folders"
    #set up the relevant analysis folders.
    folder_paths_and_target = create_analysis_folders(options)

    #now write input file for mrmr analysis
    #ricci_new_df.to_csv(path_or_buf="ricci_data_processed.csv", sep=',', index=False)

    where_to_write_mrmr_input = folder_paths_and_target[
        "mrmr_input"] + "/input_file.csv"

    no_columns_in_df = full_input_data.shape[1]

    new_column_name = ['feature' + str(i) for i in range(no_columns_in_df)]
    new_full_input_data = full_input_data.copy()
    new_full_input_data.columns = new_column_name

    original_columns = list(full_input_data.columns)

    mapping_for_new_columns_names = {}
    for i in range(len(new_column_name)):
        mapping_for_new_columns_names[new_column_name[i]] = original_columns[i]

    #write out mapping to ranking
    mapping_name = "mrmr_column_feature_name_mapping.pickle"
    pickle_this_variable_with_this_name_to_this_folder(
        mapping_for_new_columns_names,
        folder_paths_and_target["ranking_results"] + "/" + mapping_name)

    new_full_input_data.to_csv(path_or_buf=where_to_write_mrmr_input,
                               sep=',',
                               index=False)

    #target name mrmr

    folder_paths_and_target["target_mrmr"] = new_column_name[
        original_columns.index(target_name_in_csv)]

    #and the name of target in the csv into the dictionary
    folder_paths_and_target["target"] = target_name_in_csv

    print "target name is " + folder_paths_and_target["target"]
    print "mrmr target name is " + folder_paths_and_target["target_mrmr"]
    return full_input_data, folder_paths_and_target
Code example #4
0
File: fairml.py  Project: adebayoj/FairML
def confirm_input_arguments_and_set_analysis_folders(options):

	print "checking input file"
	#check input file
	try:
		f = open(options.input_file)
	except IOError as e:
		print "I/O error({0}): {1}".format(e.errno, e.strerror)
	except:
		print "Unexpected error:", sys.exc_info()[0]
		raise

	print "checking if file can be read by pandas"
	#now read the file and check if target variable is in there. 
	try:
		full_input_data = pd.read_csv(filepath_or_buffer=options.input_file, sep=',')
	except:
		print "Unexpected error while reading the input file", sys.exc_info()[0]
		raise

	#now input file read as csv
	#get list of column names
	column_names = list(full_input_data.columns)
	duplicate_columns = len(column_names) - len(set(column_names))

	print "checking duplicate columns"
	#check for duplicate attributes
	if duplicate_columns > 0:
		raise "Your Input File has duplicate attributes, please remove duplicates"

	print "checking if specified target is in input file"
	#check if target is in list of column names
	column_names_lower = [a.lower() for a in column_names]
	if options.target.lower() not in column_names_lower:
		raise "Your input file does not contain the target variable that you specificied. Please check \
			 that you have the correct spelling "

	target_name_in_csv = column_names[column_names_lower.index(options.target.lower())]
	#check that the sensitive attributes are in the list of variables
	if options.sensitive_variable_list != None:
		sensitive = [a.lower() for a in options.sensitive_variable_list]
		for attribute in sensitive:
			if attribute not in column_names_lower:
				raise "Your input file does not contain the %s that you specificied. Please check \
			 	that you have the correct spelling ", attribute


	print "dropping n/a from dataframe"

	#drop rows with no na
	full_input_data.dropna(axis=0, how='any', inplace=True)


	print "converting all columns to floats"
	#now convert all the columns to float64
	for name in column_names:
		#print "we are working with {0} now".format(name)
		if full_input_data[name].dtype != float:
			full_input_data[name] = full_input_data[name].map(transform_column_float)

	print "done converting all columns to floats"

	print "creating analysis folders"
	#set up the relevant analysis folders. 
	folder_paths_and_target = create_analysis_folders(options)

	#now write input file for mrmr analysis
	#ricci_new_df.to_csv(path_or_buf="ricci_data_processed.csv", sep=',', index=False)

	where_to_write_mrmr_input = folder_paths_and_target["mrmr_input"] + "/input_file.csv"
	
	no_columns_in_df = full_input_data.shape[1]


	new_column_name = ['feature'+str(i) for i in range(no_columns_in_df)]
	new_full_input_data = full_input_data.copy()
	new_full_input_data.columns = new_column_name

	original_columns = list(full_input_data.columns)

	mapping_for_new_columns_names = {}
	for i in range(len(new_column_name)):
		mapping_for_new_columns_names[new_column_name[i]] = original_columns[i]

	#write out mapping to ranking
	mapping_name = "mrmr_column_feature_name_mapping.pickle"
	pickle_this_variable_with_this_name_to_this_folder(mapping_for_new_columns_names, folder_paths_and_target["ranking_results"] + "/" + mapping_name)

	new_full_input_data.to_csv(path_or_buf=where_to_write_mrmr_input , sep=',', index=False)

	#target name mrmr
	
	folder_paths_and_target["target_mrmr"] = new_column_name[original_columns.index(target_name_in_csv)]

	#and the name of target in the csv into the dictionary
	folder_paths_and_target["target"] = target_name_in_csv

	print "target name is " + folder_paths_and_target["target"]
	print "mrmr target name is " + folder_paths_and_target["target_mrmr"]
	return full_input_data, folder_paths_and_target