def landmarker_metafeatures(X, y, categorical, folds):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    # TODO: When folds are given, do cross-validation instead; it takes more time
    # but also gives a fairer indication of predictive accuracy.
    for i in [1, 2, 4]:
        accuracy, total_time, mean_time = cross_validate_classifier(
            DecisionTreeClassifier(criterion='gini', max_depth=i), X, y, folds)
        features["DecisionTreeGiniDepth{}Accuracy".format(i)] = accuracy
        features["DecisionTreeGiniDepth{}TimeSum".format(i)] = total_time
        features["DecisionTreeGiniDepth{}TimeMean".format(i)] = mean_time

        accuracy, total_time, mean_time = cross_validate_classifier(
            DecisionTreeClassifier(criterion='entropy', max_depth=i), X, y, folds)
        features["DecisionTreeEntropyDepth{}Accuracy".format(i)] = accuracy
        features["DecisionTreeEntropyDepth{}TimeSum".format(i)] = total_time
        features["DecisionTreeEntropyDepth{}TimeMean".format(i)] = mean_time

    # The landmarkers below do not depend on the depth parameter, so they are
    # evaluated once, after the loop.
    accuracy, total_time, mean_time = cross_validate_classifier(
        DecisionTreeClassifier(), X, y, folds)
    features["DecisionTreeGiniAccuracy"] = accuracy
    features["DecisionTreeGiniTimeSum"] = total_time
    features["DecisionTreeGiniTimeMean"] = mean_time

    accuracy, total_time, mean_time = cross_validate_classifier(
        GaussianNB(), X, y, folds)
    features["GaussianNBAccuracy"] = accuracy
    features["GaussianNBTimeSum"] = total_time
    features["GaussianNBTimeMean"] = mean_time

    accuracy, total_time, mean_time = cross_validate_classifier(
        KNeighborsClassifier(n_neighbors=1), X, y, folds)
    features["1NNAccuracy"] = accuracy
    features["1NNTimeSum"] = total_time
    features["1NNTimeMean"] = mean_time

    return features
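# `cross_validate_classifier` is used by the landmarking and subsampling
# meta-features but is not shown in this collection. A minimal sketch of such a
# helper, under the assumption that it returns (mean accuracy, total time, mean
# time per fold) from stratified k-fold cross-validation; this is a hypothetical
# stand-in, not the original implementation.
import time

import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold


def cross_validate_classifier(clf, X, y, folds):
    """Hypothetical helper: k-fold accuracy plus total/mean fit-and-predict time."""
    accuracies, times = [], []
    for train_idx, test_idx in StratifiedKFold(n_splits=folds).split(X, y):
        start = time.perf_counter()
        clf.fit(X[train_idx], y[train_idx])
        predictions = clf.predict(X[test_idx])
        times.append(time.perf_counter() - start)
        accuracies.append(accuracy_score(y[test_idx], predictions))
    return np.mean(accuracies), np.sum(times), np.mean(times)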
def information_theoretic_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    classes, counts = np.unique(y, return_counts=True)
    features["ClassEntropy"] = scipy.stats.entropy(counts, base=2)

    # The information-theoretic meta-features below only apply to categorical features.
    if sum(categorical) == 0:
        return OrderedDict.fromkeys(information_theoretic_metafeature_names(), -1)

    with utils.stopwatch() as sw:
        feature_entropies = [
            scipy.stats.entropy(column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_feature_entropy = np.mean(feature_entropies)
        features["MeanFeatureEntropy"] = mean_feature_entropy

        mutual_informations = [
            sklearn.metrics.mutual_info_score(y, column[0])
            for column in X[:, np.where(categorical)].T
        ]
        mean_mutual_information = np.mean(mutual_informations)
        features["MeanMutualInformation"] = mean_mutual_information

        # Guard against division by zero when there is no mutual information.
        if mean_mutual_information == 0:
            features["NoiseToSignalRatio"] = 0
        else:
            features["NoiseToSignalRatio"] = (
                mean_feature_entropy - mean_mutual_information) / mean_mutual_information

    features["InformationFeatureTime"] = sw.duration
    return features
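# The meta-feature functions rely on a `utils` module (input_check, stopwatch)
# that is not shown in this collection. A minimal sketch of what `utils.stopwatch`
# might look like; this is an assumption, not the original implementation.
import time


class stopwatch:
    """Hypothetical stand-in for utils.stopwatch: a context manager whose
    `duration` property gives the elapsed wall-clock time in seconds, both
    inside and after the `with` block."""

    def __enter__(self):
        self._start = time.perf_counter()
        self._stop = None
        return self

    def __exit__(self, *exc):
        self._stop = time.perf_counter()
        return False

    @property
    def duration(self):
        end = self._stop if self._stop is not None else time.perf_counter()
        return end - self._start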
def simple_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    n = X.shape[0]
    p = X.shape[1]

    with utils.stopwatch() as sw:
        features["NumberOfInstances"] = n
        features["LogNumberOfInstances"] = np.log(n)
        features["NumberOfFeatures"] = p
        features["LogNumberOfFeatures"] = np.log(p)
        features["DatasetRatio"] = p / n
        features["LogDatasetRatio"] = np.log(p / n)
        features["InverseDatasetRatio"] = n / p
        features["LogInverseDatasetRatio"] = np.log(n / p)

        classes, counts = np.unique(y, return_counts=True)
        nrNominal = sum(categorical)
        nrNumeric = len(categorical) - sum(categorical)
        features["NumberOfClasses"] = classes.shape[0]
        features["NumberOfCategoricalFeatures"] = nrNominal
        features["NumberOfNumericFeatures"] = nrNumeric
        features["RatioNumericalToNominal"] = nrNumeric / nrNominal if nrNominal > 0 else 0
        features["RatioNominalToNumerical"] = nrNominal / nrNumeric if nrNumeric > 0 else 0

        class_probabilities = [count / n for count in counts]
        features["ClassProbabilityMin"] = np.min(class_probabilities)
        features["ClassProbabilityMax"] = np.max(class_probabilities)
        features["ClassProbabilityMean"] = np.mean(class_probabilities)
        features["ClassProbabilitySTD"] = np.std(class_probabilities)

        symbols_per_column = [
            np.unique(column).shape[0]
            for column in X[:, np.where(categorical)].T
        ]
        if len(symbols_per_column) > 0:
            features["SymbolsMin"] = np.min(symbols_per_column)
            features["SymbolsMax"] = np.max(symbols_per_column)
            features["SymbolsMean"] = np.mean(symbols_per_column)
            features["SymbolsSTD"] = np.std(symbols_per_column)
            features["SymbolsSum"] = np.sum(symbols_per_column)
        else:
            features["SymbolsMin"] = features["SymbolsMax"] = features[
                "SymbolsMean"] = features["SymbolsSTD"] = features["SymbolsSum"] = 0

    features["SimpleFeatureTime"] = sw.duration

    # Missing-value features are omitted for now, since only datasets without
    # missing values were selected.
    return features
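# A minimal usage sketch for the meta-feature functions above, assuming they sit
# next to a `utils` module providing `input_check` and `stopwatch`. The dataset
# here is synthetic and purely illustrative.
import numpy as np

rng = np.random.RandomState(0)
# Hypothetical toy dataset: 100 instances, 3 numeric features, 1 categorical feature.
X = np.hstack([rng.randn(100, 3), rng.randint(0, 4, size=(100, 1))])
y = rng.randint(0, 2, size=100)
categorical = [False, False, False, True]

features = simple_metafeatures(X, y, categorical)
print(features["NumberOfInstances"], features["NumberOfClasses"], features["SymbolsSum"])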
def subsample_metafeatures(X, y, categorical, folds):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    accuracy, total_time, mean_time = cross_validate_classifier(
        ensemble.RandomForestClassifier(), X, y, folds)
    features["SubsampleRandomForestAccuracy"] = accuracy
    features["SubsampleRandomForestMeanTime"] = mean_time

    accuracy, total_time, mean_time = cross_validate_classifier(
        svm.SVC(), X, y, folds)
    features["SubsampleSVCAccuracy"] = accuracy
    features["SubsampleSVCMeanTime"] = mean_time

    accuracy, total_time, mean_time = cross_validate_classifier(
        ensemble.GradientBoostingClassifier(), X, y, folds)
    features["SubsampleBoostingAccuracy"] = accuracy
    features["SubsampleBoostingMeanTime"] = mean_time

    return features
def statistical_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    numerical = [not cat for cat in categorical]

    # Statistical meta-features only apply to numerical attributes. If there are
    # none, we list them all as -1. A better sentinel may be needed, since -1 is
    # a valid value for some of these features.
    if sum(numerical) == 0:
        return OrderedDict.fromkeys(statistical_metafeature_names(), -1)

    with utils.stopwatch() as sw:
        # Taking kurtosis of kurtosis and skewness of kurtosis is suggested by
        # Reif et al. in Meta2-features (2012).
        kurtosisses = [
            scipy.stats.kurtosis(column[0])
            for column in X[:, np.where(numerical)].T
        ]
        features["KurtosisMin"] = np.min(kurtosisses)
        features["KurtosisMax"] = np.max(kurtosisses)
        features["KurtosisMean"] = np.mean(kurtosisses)
        features["KurtosisSTD"] = np.std(kurtosisses)
        features["KurtosisKurtosis"] = scipy.stats.kurtosis(kurtosisses)
        features["KurtosisSkewness"] = scipy.stats.skew(kurtosisses)

        skewnesses = [
            scipy.stats.skew(column[0])
            for column in X[:, np.where(numerical)].T
        ]
        features["SkewnessMin"] = np.min(skewnesses)
        features["SkewnessMax"] = np.max(skewnesses)
        features["SkewnessMean"] = np.mean(skewnesses)
        features["SkewnessSTD"] = np.std(skewnesses)
        features["SkewnessKurtosis"] = scipy.stats.kurtosis(skewnesses)
        features["SkewnessSkewness"] = scipy.stats.skew(skewnesses)

        standard_deviations = [
            np.std(column[0]) for column in X[:, np.where(numerical)].T
        ]
        features["MeanSTDOfNumerical"] = np.mean(standard_deviations)
        features["STDSTDOfNumerical"] = np.std(standard_deviations)

    features["StatisticalFeatureTime"] = sw.duration
    return features
def load_data_from_db(prop=None, optsets=None, structsets=None, calcsets=None):
    input_check(prop=prop, optsets=optsets, structsets=structsets, calcsets=calcsets)

    where_cond1 = "structset in %s" % (tuple(structsets), )
    where_cond2 = "optset in %s and calcset in %s" % (tuple(optsets), tuple(calcsets))
    where_cond1 = where_cond1.replace(",)", ")")
    where_cond2 = where_cond2.replace(",)", ")")

    # The doubly nested selects fix an issue where the empty data points were
    # getting joined together into the last empty data point.
    string = """
    SELECT a.name, group_concat(a.prop)
    FROM (
        SELECT d.name_id, d.name, IFNULL(data.{prop}, '') as prop
        FROM (
            SELECT names.id as name_id, names.name, ds.id as ds_id
            FROM (SELECT id, name FROM names WHERE {where1}) as names
            CROSS JOIN (SELECT id FROM datasets WHERE {where2}) as ds
        ) as d
        LEFT JOIN data
            ON d.name_id = data.name_id and d.ds_id = data.dataset_id
    ) as a
    GROUP BY a.name_id;
    """.format(prop=prop, where1=where_cond1, where2=where_cond2)

    data = []
    with sqlite3.connect("database.sqlite") as conn:
        for name, values in conn.execute(string):
            try:
                data.append(
                    (name, [float(x) if x else None for x in values.split(',')]))
            except ValueError:
                # Skip rows whose values cannot be parsed as floats.
                pass

        string = "SELECT optset, calcset FROM datasets WHERE %s" % (where_cond2, )
        columns = [x for x in conn.execute(string)]
    return data, columns
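# A usage sketch for load_data_from_db, assuming a local database.sqlite with
# the schema implied above (tables names, datasets, data). The property and set
# names below are hypothetical placeholders; substitute the ones present in
# your database.
data, columns = load_data_from_db(
    prop="homo",            # hypothetical column of the `data` table
    optsets=["b3lyp"],      # hypothetical optset name
    structsets=["set1"],    # hypothetical structset name
    calcsets=["b3lyp"])     # hypothetical calcset name

for name, values in data[:5]:
    print(name, values)
print("dataset columns (optset, calcset):", columns)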
def MW_for_PCA(n,
               m,
               d,
               B,
               weight=None,
               alpha=None,
               beta=None,
               eta=1,
               T=10,
               verbose=False,
               report_all_obj=False,
               n_X_last=0,
               dual_function=None,
               stopping_gap=1e-6,
               primal_function=None,
               NSW_update=False):
    '''
    Arguments:
        n: size of the matrices in B
        m: number of objectives
        d: target dimension
        B: list of m matrices
        weight: initial weight, if the user wants to specify one.
            By default, this is set to all 1/m.
        alpha: list of m numbers, used in the objectives.
            By default, this is set to all 1.
        beta: list of m numbers, used in the objectives.
            By default, this is set to all 0.
        eta: learning rate of MW, or an array of length T. A changing learning
            rate can be used by specifying the rate for each of the T iterations
            as an array of length T.
        T: number of iterations
        verbose: print the objective value in every iteration. This may not be
            needed, as the function already outputs a pandas DataFrame of all
            statistics when report_all_obj=True.
        report_all_obj: whether the objective and weight of each group are
            included in the output statistics DataFrame.
        n_X_last: number of last iterates to keep. If specified, the solutions
            of the last n_X_last iterates (iterates T-n_X_last up to T) are kept
            rather than just the last one. Note that none are kept if MW
            terminates early due to a small duality gap; in that case, the last
            or average iterate is the one to use, not the last few iterates.
        dual_function(w, B, X): given a weight vector w=[w_1,...,w_m] and
            B=[B_1,...,B_m], the dual objective function to calculate. This
            function can be derived once the social welfare objective is known
            (see 'Obj' in the fairDimReduction_MW method). By default it is
            None, so no dual is calculated. Optionally, to speed up the runtime,
            the dual also receives the optimal solution of the weighted PCA of
            that iteration (this can be ignored in the function definition).
        stopping_gap: if not None and positive, and if a dual_function is given,
            MW stops automatically once the primal-dual gap is no more than this
            value. By default, this is set to 1e-6.
        primal_function(B, X): by default the primal is the minimum of
            alpha_i <B_i,X> + beta_i. One can also specify another primal, such
            as NSW, here. This can be used to compare MW on other objectives.
        NSW_update: apply a different update rule, in which the gradient of the
            dual objective with respect to the weights is used.
            *** Deprecated. NSW_update is not recommended due to bad performance
            when the dual space is not the simplex. ***

    Note: in theory, for an eps < 1 approximation to an optimization problem
    with objective bounded in [0,1],
        eta = eps/8
        T = 32 log(m) / eps^2
    as analyzed in "The Price of Fair PCA: One Extra Dimension."

    Given m objectives to maximize simultaneously
        alpha_1 <B_1,X> + beta_1
        alpha_2 <B_2,X> + beta_2
        ...
        alpha_m <B_m,X> + beta_m
    subject to
        tr(X) <= d
        0 << X << I  (matrix inequality)
    the function uses MW to maximize the minimum of the m objectives.

    Output: [X_last, X_avg, runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average over the T n x n matrices X, one from each iterate.
        runstats: value of the weights and objectives in each iterate.
    or, when n_X_last > 0,
        X_last: list of the n x n matrices X from the last n_X_last iterates
            (ordered from the (T - n_X_last + 1)-th iterate to the very last).
    '''
    # input check
    if input_check(n, m, d, B, function_name='MW_for_PCA') > 0:
        return -1
    if weight is None:
        weight = np.full(m, 1 / m)
    # Without loss of generality, make sure the weights sum to 1.
    weight = weight / weight.sum()
    if alpha is None:
        alpha = np.full(m, 1)
    if beta is None:
        beta = np.zeros(m)
    if weight.shape != (m, ) or alpha.shape != (m, ) or beta.shape != (m, ):
        print("Error: MW_for_PCA is called with wrong weight or alpha or beta "
              "coefficient size. They should be numpy vectors of length m")
        return -1
    if isinstance(eta, list):  # given a list
        if len(eta) < T:
            print("Error: MW_for_PCA is called with a list of eta having fewer "
                  "than T numbers")
            return -1
    elif eta > 0:
        # The good case: make eta an array to simplify the code below.
        eta = [eta for i in range(T)]
    else:
        print("Error: MW_for_PCA is called with eta neither a positive real "
              "number nor a list.")
        return -1

    run_stats = pd.DataFrame()
    X_avg = np.zeros((n, n))
    if n_X_last > 0:  # keep a few of the last iterates, not just the last one
        list_X = []

    for t in range(T):
        [X, _, Obj] = weightedPCA(n, m, d, B,
                                  weight=weight,
                                  alpha=alpha,
                                  beta=beta,
                                  calculate_objective=True)
        if n_X_last > 0:  # keep this iterate if it is close enough to the end
            if t + n_X_last >= T:
                list_X.append(X)

        # Update the average solution X. In MW, the average is guaranteed to
        # converge, not the last iterate, at least in theory.
        X_avg = t * (X_avg / (t + 1)) + X / (t + 1)

        # The stats below keep the weight and objective value of this iterate.
        this_t_stats = {'iteration': t}
        if report_all_obj:
            this_t_stats.update(
                dict(('weight' + str(i), weight[i]) for i in range(m)))
        this_t_stats.update({'minimum of m objective, that iterate': min(Obj)})
        avg_Obj = min([
            alpha[i] * np.multiply(B[i], X_avg).sum() + beta[i]
            for i in range(m)
        ])
        this_t_stats.update({'minimum of m objective, avg iterate': avg_Obj})

        # Add the primal objective, if specified.
        if primal_function is not None:
            this_t_stats.update(
                {'primal objective, that iterate': primal_function(B, X)})
            this_t_stats.update(
                {'primal objective, avg iterate': primal_function(B, X_avg)})

        # Add the dual objective.
        if dual_function is not None:
            dual_val = dual_function(weight, B, X)
            # The dual bound is the best one seen so far.
            if t > 0:
                dual_val = min([dual_val, dual_val_previous])
            this_t_stats.update({'dual objective': dual_val})
            dual_val_previous = dual_val

        # Add the per-group objectives.
        if report_all_obj:
            this_t_stats.update(
                dict(('Obj' + str(i), Obj[i]) for i in range(m)))
        if verbose:
            print("stats at iteration " + str(t) + " is:")
            print(this_t_stats)
        run_stats = run_stats.append(pd.DataFrame(this_t_stats, index=[t]))

        # Now the update of the weights.
        Loss = np.multiply(-1, Obj)
        for i in range(m):
            # The gradient of the dual also has a -1/w_i term.
            if NSW_update:
                Loss[i] -= 1 / weight[i]
            weight[i] = math.exp(eta[t] * Loss[i]) * weight[i]
            # Bound the weight away from 0 to bound the gradient norm 1/weight[i].
            if NSW_update:
                weight[i] = max([1e-4, weight[i]])
        if NSW_update == False:  # normal MW; otherwise no renormalization is needed
            weight = weight / weight.sum()

        if (dual_function is not None) and (stopping_gap is not None) and (stopping_gap > 0):
            # Check whether we can stop early. min(Obj) is the minimum utility
            # over all groups, which is the social welfare in the MM_Loss and
            # MM_Var cases.
            if abs(dual_val - min(Obj)) < stopping_gap:
                print("MW terminated at T=", t,
                      " iterations: current iterate solution achieved "
                      "primal-dual gap of", stopping_gap)
                break
            elif abs(dual_val - avg_Obj) < stopping_gap:
                print("MW terminated at T=", t,
                      " iterations: average iterate solution achieved "
                      "primal-dual gap of", stopping_gap)
                break

    if (n_X_last > 0) and (len(list_X) > 0):
        # Return the whole list of the last few X's if it is not empty. The list
        # is empty when the gap is reached earlier than T iterations.
        return [list_X, X_avg, run_stats]
    else:
        return [X, X_avg, run_stats]
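# A small usage sketch for MW_for_PCA, assuming numpy/pandas are imported as
# above and the weightedPCA and input_check helpers are available. The group
# matrices are random and purely illustrative.
import numpy as np

n, m, d = 5, 2, 2
rng = np.random.RandomState(0)
B = []
for _ in range(m):
    A = rng.randn(20, n)
    B.append(A.T @ A)  # an n x n PSD matrix summarizing one group

X_last, X_avg, stats = MW_for_PCA(n, m, d, B, eta=0.1, T=20, report_all_obj=True)
print(stats[['iteration', 'minimum of m objective, avg iterate']].tail())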
def fairDimReductionFractional(n,
                               k,
                               B,
                               list_n='one',
                               Obj='MM_Loss',
                               list_d='all',
                               verbose=1,
                               print_other_obj=True,
                               return_option='run_statistics',
                               save=True,
                               savedPath='fairDimReductionFractional.csv'):
    """
    Given k PSD n-by-n matrices B1,...,Bk, solve the (fractional) convex
    optimization of fair dimensionality reduction.

    Arguments:
        k: number of groups
        n: original number of features (size of all B_i's)
        B: list of PSD matrices, as numpy matrices. It must contain at least k
            matrices. If more than k matrices are provided, the first k are used
            as the k groups.
        list_n: by default ('one'), this is simply n, the total number of
            features. If 'all', n0 (the number of dimensions; the first n0
            features are used) ranges from d0+1 to n, where d0 is the target
            dimension of that iteration. Otherwise, a list of n0 values can be
            given.
        list_d: list of target dimensions to project to. By default ('all'),
            it is 1 to n-1.
        print_other_obj: if True, also print the other welfare economics
            objectives (four in total, including the one specified as Obj).
        verbose: set to 1 to print a summary. Set to 2 to print the information
            table of each iteration.
        save: save the results to csv if True.
        savedPath: path of the file to export the results to.
        Obj: the objective to optimize. Must be MM_Var (maximize the minimum
            variance), MM_Loss (default; minimize the maximum loss, output as a
            negative number), or NSW (Nash social welfare).
        return_option: by default, return the runtime, n, d, the rank, and
            several objectives of each group. The other option, 'frac_sol',
            additionally returns a list of fractional solutions X. list_X (the
            additional output) is a list with one row per run of a particular
            (n, d). Each row contains the values of n and d and the solution X
            as a cvx matrix. Convert back to numpy with:
                import numpy as np
                array(list_X[0][2])  # numpy solution X of the first (n, d) setting
                array(list_X[i][2])  # numpy solution X of the (i+1)-th (n, d) setting
    """
    # input check
    if input_check(n, k, 1, B, function_name='fairDimReductionFractional') > 0:
        return -1

    # for storing the results of the optimization
    runstats = pd.DataFrame()
    if return_option == 'frac_sol':
        list_X = []

    # list of all d
    if list_d == 'all':
        list_d = range(1, n)

    for d in list_d:
        # valid values of n_0
        if list_n == 'one':
            list_n_this_d = [n]
        elif list_n == 'all':
            list_n_this_d = range(d + 1, n + 1)
        else:
            list_n_this_d = list_n

        for n0 in list_n_this_d:
            # Shortened version of the matrices, in case we want to delete some
            # of the later features for experiments.
            Bnumpy_s = [B[i][np.ix_(range(0, n0), range(0, n0))] for i in range(k)]

            # now define the problem
            B_s = [matrix(B[i][np.ix_(range(0, n0), range(0, n0))]) for i in range(k)]
            fairPCA = pic.Problem()
            n = n0
            # identity matrix
            I = pic.new_param('I', cvx.spmatrix([1] * n, range(n), range(n), (n, n)))

            # Add the symmetric matrix variable: the projection matrix, which
            # should be rank d but is relaxed.
            X = fairPCA.add_variable('X', (n, n), 'symmetric')
            z = fairPCA.add_variable('z', 1)  # scalar, for the objective

            # Add parameters for each group.
            A = [pic.new_param('A' + str(i), B_s[i]) for i in range(k)]

            # Best possible variance for each group.
            best = [np.sum(np.sort(np.linalg.eigvalsh(Bnumpy_s[i]))[-d:]) for i in range(k)]

            # Constrain the trace of X.
            fairPCA.add_constraint(I | X <= d)
            # Constrain X to be positive semidefinite and dominated by I.
            fairPCA.add_constraint(X >> 0)
            fairPCA.add_constraint(X << I)

            # The following depends on the type of problem. Three are coded here:
            # 1) max-min variance, 2) min-max loss, 3) Nash social welfare of variance.
            if Obj == 'MM_Loss':
                # Add the loss constraints.
                fairPCA.add_list_of_constraints(
                    [(A[i] | X) - best[i] >= z for i in range(k)])
                # Set the objective.
                fairPCA.set_objective('max', z)
            elif Obj == 'MM_Var':
                # Add the variance constraints.
                fairPCA.add_list_of_constraints(
                    [(A[i] | X) >= z for i in range(k)])
                # Set the objective.
                fairPCA.set_objective('max', z)
            elif Obj == 'NSW':
                s = fairPCA.add_variable('s', k)  # vector of variances
                # Add the variance constraints.
                fairPCA.add_list_of_constraints(
                    [(A[i] | X) >= s[i] for i in range(k)])
                # Set the objective.
                fairPCA.add_constraint(z <= pic.geomean(s))
                fairPCA.set_objective('max', z)
            else:
                fairPCA.set_objective('max', z)
                print("Error: fairDimReductionFractional is called with an invalid "
                      "objective. Supported Obj arguments are MM_Loss, MM_Var, and "
                      "NSW. Exiting the method.")
                return

            solveInfo = fairPCA.solve(verbose=0, solver='cvxopt')

            var = [np.sum(np.multiply(Bnumpy_s[i], X.value)) for i in range(k)]
            loss = [var[i] - best[i] for i in range(k)]

            # dictionary of info for this iterate
            solveInfoShort = dict(
                (key, solveInfo[key]) for key in ('time', 'obj', 'status'))
            if print_other_obj:
                solveInfoShort.update({
                    'MM_Var': np.amin(var),
                    'MM_loss': np.amin(loss),
                    'NSW': geo_mean_through_log(var),
                    'Total_Var': np.sum(var)
                })
            solveInfoShort.update({
                'n': n0,
                'gap': solveInfo['cvxopt_sol']['gap'],
                'd': d,
                'rank': np.linalg.matrix_rank(array(X.value), tol=1e-6, hermitian=True)
            })
            for i in range(k):
                solveInfoShort.update({
                    'Loss' + str(i): loss[i],
                    'Var' + str(i): var[i],
                    'Best' + str(i): best[i]
                })

            # add the information of this optimization for this (d, n0)
            runstats = runstats.append(pd.DataFrame(solveInfoShort, index=[n0]))

            if return_option == 'frac_sol':
                list_X.append([n0, d, X.value])

            if verbose == 2:
                print(runstats)

    if verbose == 1:
        print("The total number of cases tested is:")
        print(len(runstats))
        print("The number of cases where the rank is exact is:")
        print(len(runstats[runstats['d'] == runstats['rank']]))
    if save:
        runstats.to_csv(savedPath, index=False)
    if return_option == 'frac_sol':
        return [runstats, list_X]
    return runstats
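# A usage sketch for fairDimReductionFractional, assuming picos (pic),
# cvxopt (cvx, matrix), numpy (np, array), pandas (pd) and the
# geo_mean_through_log helper used above are importable. The data are synthetic.
import numpy as np

n, k = 5, 2
rng = np.random.RandomState(3)
B = []
for _ in range(k):
    A = rng.randn(40, n)
    B.append(A.T @ A)

# Solve only for target dimension d=2 at the full feature count n.
stats, solutions = fairDimReductionFractional(
    n, k, B, list_d=[2], Obj='MM_Var', verbose=0, save=False,
    return_option='frac_sol')
n0, d, X_cvx = solutions[0]
X = np.array(X_cvx)  # fractional (possibly higher-rank) solution as a numpy array
print(stats[['d', 'rank', 'MM_Var']])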
def MW_for_PCA(n, m, d, B, weight=None, alpha=None, beta=None, eta=1, T=10, verbose=False):
    '''
    Arguments:
        n: size of the matrices in B
        m: number of objectives
        d: target dimension
        B: list of m matrices
        weight: initial weight, if the user wants to specify one.
            By default, this is set to all 1/m.
        alpha: list of m numbers, used in the objectives.
            By default, this is set to all 1.
        beta: list of m numbers, used in the objectives.
            By default, this is set to all 0.
        eta: learning rate of MW.
        T: number of iterations
        verbose: print the objective value in every iteration. This may not be
            needed, as the function already outputs a pandas DataFrame of all
            statistics.

    Note: in theory, for an eps < 1 approximation to an optimization problem
    with objective bounded in [0,1],
        eta = eps/8
        T = 32 log(m) / eps^2
    as analyzed in "The Price of Fair PCA: One Extra Dimension."

    Given m objectives to maximize simultaneously
        alpha_1 <B_1,X> + beta_1
        alpha_2 <B_2,X> + beta_2
        ...
        alpha_m <B_m,X> + beta_m
    subject to
        tr(X) <= d
        0 << X << I  (matrix inequality)
    the function uses MW to maximize the minimum of the m objectives.

    Output: [X_last, X_avg, runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average over the T n x n matrices X, one from each iterate.
        runstats: value of the weights and objectives in each iterate.
    '''
    # input check
    if input_check(n, m, d, B, function_name='MW_for_PCA') > 0:
        return -1
    if weight is None:
        weight = np.full(m, 1 / m)
    # Without loss of generality, make sure the weights sum to 1.
    weight = weight / weight.sum()
    if alpha is None:
        alpha = np.full(m, 1)
    if beta is None:
        beta = np.zeros(m)
    if weight.shape != (m, ) or alpha.shape != (m, ) or beta.shape != (m, ):
        print("Error: MW_for_PCA is called with wrong weight or alpha or beta "
              "coefficient size. They should be numpy vectors of length m")
        return -1
    if (eta > 0) == False:
        print("Error: MW_for_PCA is called with eta not a positive real number.")
        return -1

    run_stats = pd.DataFrame()
    X_avg = np.zeros((n, n))
    for t in range(T):
        [X, _, Obj] = weightedPCA(n, m, d, B,
                                  weight=weight,
                                  alpha=alpha,
                                  beta=beta,
                                  calculate_objective=True)
        # Update the average solution X. In MW, the average is guaranteed to
        # converge, not the last iterate, at least in theory.
        X_avg = t * (X_avg / (t + 1)) + X / (t + 1)

        # The stats below keep the weight and objective value of this iterate.
        this_t_stats = {'iteration': t}
        this_t_stats.update(
            dict(('weight' + str(i), weight[i]) for i in range(m)))
        this_t_stats.update({'minimum of m objective, that iterate': min(Obj)})
        this_t_stats.update({
            'minimum of m objective, avg iterate': min([
                alpha[i] * np.multiply(B[i], X_avg).sum() + beta[i]
                for i in range(m)
            ])
        })
        this_t_stats.update(dict(('Obj' + str(i), Obj[i]) for i in range(m)))
        if verbose:
            print("stats at iteration " + str(t) + " is:")
            print(this_t_stats)
        run_stats = run_stats.append(pd.DataFrame(this_t_stats, index=[t]))

        # Now the update of the weights.
        Loss = np.multiply(-1, Obj)
        for i in range(m):
            weight[i] = math.exp(eta * Loss[i]) * weight[i]
        weight = weight / weight.sum()

    return [X, X_avg, run_stats]
def weightedPCA(n, k, d, B, weight=None, alpha=None, beta=None, calculate_objective=False):
    '''
    Arguments:
        n: dimension of the (symmetric real) matrices B
        k: number of B_i
        d: target dimension of the PCA
        weight: vector of k numbers (numpy array) specifying the weight with
            which each group's PCA objective is combined. All 1/k by default.
        alpha: additional weight multiplied into <B_i,X>, if any. All 1 by default.
        beta: constant added to the objective: alpha_i <B_i,X> + beta_i.

    Task:
        Given the objective
            sum_{i=1}^k w_i * (alpha_i <B_i,X> + beta_i)
        where B_i and X are n-by-n matrices, solve for the rank-d solution X.
        This is simply standard PCA on the weighted data
            sum_{i=1}^k w_i alpha_i B_i

    Note:
        - The solution is independent of beta. Beta only affects the objective value.
        - It may seem redundant to have both weight and alpha, as they are just
          weights multiplied together. The only reason for separating them is to
          calculate the objective value alpha_i <B_i,X> + beta_i, which is
          independent of weight but depends on alpha.
        - The matrices B[i] are assumed to be symmetric and real. If not, change
          'np.linalg.eigh(W)' to 'np.linalg.eig(W)'.

    Output:
        [X = P P^T, P, Obj_list = [Obj_1, ..., Obj_k]], or [X = P P^T, P] if
        calculate_objective = False
        - solution X, an n x n matrix of rank d.
        - P, the n x d matrix whose columns are the d principal eigenvectors,
          sorted from largest eigenvalue to smallest.
        - objective values Obj_i = alpha_i <B_i,X> + beta_i. To save time,
          calculate_objective can be left as False (the default) when the Obj_i
          are not needed.
    '''
    if input_check(n, k, d, B, function_name='weightedPCA') > 0:
        return -1

    # The default values have to be defined inside the function, not in the
    # declaration, because they depend on k.
    if weight is None:
        weight = np.full(k, 1 / k)
    if alpha is None:
        alpha = np.full(k, 1)
    if beta is None:
        beta = np.zeros(k)
    if weight.shape != (k, ) or alpha.shape != (k, ) or beta.shape != (k, ):
        print("Error: weightedPCA is called with wrong weight or alpha or beta "
              "coefficient size. They should be numpy vectors of length k")
        return -1

    # normalization
    weight = weight / weight.sum()
    # alpha is deliberately not normalized or modified: it is needed as-is when
    # computing the objective values below.

    W = np.zeros((n, n))
    for i in range(k):
        W = W + (weight[i] * alpha[i]) * B[i]
    [eigenValues, eigenVectors] = np.linalg.eigh(W)
    # Note: numerical issues can sometimes introduce small complex parts. Since W
    # should be PSD, np.linalg.eigh is used instead of np.linalg.eig; alternatively,
    # one could take np.real of the eigenvalues and eigenvectors.

    # Sort the eigenvalues and eigenvectors in descending order.
    idx = eigenValues.argsort()[::-1]
    eigenValues = eigenValues[idx]
    eigenVectors = eigenVectors[:, idx]

    # Take the first d eigenvectors to obtain the solution.
    P = eigenVectors[:, :d]
    X = P @ P.T

    if calculate_objective == False:
        return [X, P]
    else:
        Obj_list = [
            alpha[i] * np.multiply(B[i], X).sum() + beta[i] for i in range(k)
        ]
        return [X, P, Obj_list]
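# A short usage sketch for weightedPCA, assuming numpy is imported as np and the
# input_check helper is available. The group matrices are synthetic.
import numpy as np

n, k, d = 4, 2, 2
rng = np.random.RandomState(1)
B = []
for _ in range(k):
    A = rng.randn(50, n)
    B.append(A.T @ A)  # n x n PSD matrix summarizing one group

X, P, obj = weightedPCA(n, k, d, B, calculate_objective=True)
print("rank of X:", np.linalg.matrix_rank(X))  # should equal d
print("per-group objectives <B_i, X>:", obj)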
def fairDimReduction_MW(n,
                        k,
                        d,
                        B,
                        Obj='MM_Loss',
                        eta=1,
                        T=10,
                        verbose=False,
                        timed=True,
                        n_X_last=0,
                        return_time_only=False,
                        calculate_dual=False,
                        eps=1e-9,
                        stopping_gap=1e-6):
    '''
    Arguments:
        n: size of the matrices in B
        k: number of objectives
        d: target dimension
        B: list of k matrices
        Obj: the objective to optimize. Must be MM_Var (maximize the minimum
            variance) or MM_Loss (default; minimize the maximum loss, output as
            the negative number variance - best).
            *** Obj can also be NSW (Nash social welfare, the sum of the log of
            the variances across groups), but due to the bad performance of MW
            when the dual space is not the simplex, we do not recommend solving
            NSW by MW. Use Frank-Wolfe for NSW instead. ***
        eta: learning rate of MW. See MW_for_PCA for using a changing learning
            rate by passing eta as an array of T numbers.
        T: number of iterations
        verbose: print the objective value in every iteration of MW. This may
            not be needed, as the function already outputs a pandas DataFrame of
            all statistics.
        timed: print the total amount of time used by this method, in seconds.
        n_X_last: the number of last iterates to keep. If specified, the
            solutions of the last n_X_last iterates are kept.
        return_time_only: set to True to measure the runtime only. Only the time
            in seconds taken by this method is returned.
        calculate_dual: if True, for each weight vector w during the
            multiplicative weights update, the method calculates the dual
            objective at w:
                D(w) := max_{n-by-n X: tr(X)<=d, 0<=X<=I} sum_{group i} w_i*<B_i,X> - f_*(w).
            Here f is the objective function (for example, with MM_Var,
            f(z_1,...,z_k) = min_i z_i), and f_* is the concave conjugate of f.
            Notes on the dual formulation:
                1) Strong duality holds for concave f.
                2) We maximize f, and try to minimize the dual objective D(w) over real w.
                3) Some of the concave conjugates (denote z_i = <B_i,X>):
                    f = MM_Var = min_i z_i
                        --> f_*(w) = 0 if w >= 0 and w_1+...+w_k = 1; -infinity otherwise.
                    f = MM_Loss = min_i {z_i + beta_i}, with beta_i the best variance of group i
                        --> f_*(w) = -sum_i w_i*beta_i if w >= 0 and w_1+...+w_k = 1; -infinity otherwise.
                    f = NSW = sum_i log z_i
                        --> f_*(w) = sum_i (1 + log w_i) for w > 0; -infinity otherwise.
        eps: numerical error threshold for checking that w satisfies sum w_i = 1 and w >= 0.
        stopping_gap: if not None and positive, and if calculate_dual is True,
            MW stops automatically once the primal-dual gap is no more than this
            value. By default, this is set to 1e-6.

    Output: [X_last, X_avg, runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average over the T n x n matrices X, one from each iterate.
        runstats: value of the weights and objectives in each iterate.
        If n_X_last > 0:
            X_last: list of the n x n matrices X from the last n_X_last iterates
            (ordered from the (T - n_X_last + 1)-th iterate to the very last).
    OR Output: runtime
        runtime: time in seconds it takes to run all iterations (helpful for
        checking the running time).
    '''
    # input check
    if input_check(n, k, d, B, function_name='fairDimReduction_MW') > 0:
        return -1

    # Take Obj and convert it into the alpha-beta notation of MW_for_PCA.
    if Obj == 'MM_Loss':
        # The best possible PCA projection for each group is easy to calculate:
        # take the d largest eigenvalues.
        best = [np.sum(np.sort(np.linalg.eigvalsh(B[i]))[-d:]) for i in range(k)]
        # Shift the objective by the best possible PCA for that single group.
        beta = np.multiply(-1, best)
        # no need to modify anything in the MW method
        primal_function = None
    elif Obj == 'MM_Var':
        beta = np.zeros(k)
        # no need to modify anything in the MW method
        primal_function = None
    elif Obj == 'NSW':
        beta = np.zeros(k)

        # Modify the objective, since NSW is not of the min-max form covered by
        # the MW method as-is.
        def primal_function(B, X):
            utility = 0
            for i in range(len(B)):
                dot_product = np.multiply(B[i], X).sum()
                if dot_product < 0:
                    print("Warning: the dot product <B[", i,
                          "],X> is not positive. The value is", dot_product)
                    print("Eigenvalues of X are", np.linalg.eig(X)[0])
                    print("Eigenvalues of B[i] are", np.linalg.eig(B[i])[0])
                utility += math.log(dot_product)
            return utility
    else:
        print("Error: fairDimReduction_MW is called with an invalid input objective.")
        return -1

    # Specify the dual objective, if needed.
    dual_function = None
    if calculate_dual:
        if Obj == 'MM_Var':

            def dual_function(w, B, X):
                if abs(np.sum(w) - 1) > 1e-9:
                    print("Warning: dual is infeasible with w not summing to 1")
                if np.amin(w) < 0:
                    print("Warning: dual is infeasible with some w_i < 0")
                # Create a matrix of the same size as B[0], initially all zeros.
                weighted_matrix = np.full_like(B[0], 0)
                for i in range(len(B)):
                    weighted_matrix += w[i] * B[i]
                # dot product <sum_i w_i B_i, X> - f_*(w), where f_*(w) = 0
                return np.sum(np.multiply(weighted_matrix, X))

        elif Obj == 'MM_Loss':

            def dual_function(w, B, X):
                if abs(np.sum(w) - 1) > 1e-9:
                    print("Warning: dual is infeasible with w not summing to 1")
                if np.amin(w) < 0:
                    print("Warning: dual is infeasible with some w_i < 0")
                # Create a matrix of the same size as B[0], initially all zeros.
                weighted_matrix = np.full_like(B[0], 0)
                for i in range(len(B)):
                    weighted_matrix += w[i] * B[i]
                # dot product <sum_i w_i B_i, X> - f_*(w), with f_*(w) = -sum_i w_i*beta_i
                return np.sum(np.multiply(weighted_matrix, X)) + np.sum(np.multiply(w, beta))

        elif Obj == 'NSW':

            def dual_function(w, B, X):
                if np.amin(w) < 0:
                    print("Warning: dual is infeasible with some w_i < 0. "
                          "The minimum found is", np.amin(w))
                elif np.amin(w) == 0:
                    return float("inf")  # log(0) is -infinity
                dual = 0
                for i in range(len(B)):
                    dual += np.multiply(B[i], X).sum() - 1 - math.log(w[i])
                return dual

    if stopping_gap <= 0:
        stopping_gap = None

    start = timeit.default_timer()
    [X_last, X_avg, Obj] = MW_for_PCA(n, k, d, B,
                                      weight=None,
                                      alpha=None,
                                      beta=beta,
                                      eta=eta,
                                      T=T,
                                      verbose=verbose,
                                      n_X_last=n_X_last,
                                      dual_function=dual_function,
                                      stopping_gap=stopping_gap,
                                      primal_function=primal_function,
                                      NSW_update=(Obj == 'NSW'))
    stop = timeit.default_timer()

    if timed:
        print("fairDimReduction_MW is called. Total time used is:",
              stop - start, "seconds.")
    if calculate_dual:
        # Summarize the best primal value seen and the primal-dual gap
        # (only meaningful when the dual was computed).
        best_obj = max([
            Obj['minimum of m objective, that iterate'].max(),
            Obj['minimum of m objective, avg iterate'].max()
        ])
        best_dual = Obj['dual objective'].min()
        print('The best solution found from the avg and single iterates achieves primal',
              best_obj, '. The dual is', best_dual, '. The gap is',
              best_dual - best_obj, 'which is',
              abs((best_dual - best_obj) / best_dual) * 100, '%.')

    if return_time_only:
        return stop - start
    return [X_last, X_avg, Obj]
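# A usage sketch for fairDimReduction_MW, assuming the MW_for_PCA, weightedPCA
# and input_check functions above are importable and numpy/pandas are available.
# The group matrices are synthetic.
import numpy as np

n, k, d = 6, 2, 2
rng = np.random.RandomState(2)
B = []
for _ in range(k):
    A = rng.randn(100, n)
    B.append(A.T @ A)

X_last, X_avg, stats = fairDimReduction_MW(
    n, k, d, B, Obj='MM_Var', eta=0.05, T=30, calculate_dual=True)
print("best min-variance over iterations:",
      stats['minimum of m objective, avg iterate'].max())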
def FW(n,
       k,
       d,
       B,
       Obj='NSW',
       delta=1e-4,
       start_solution='uniform',
       update_rule='1/t',
       duality_gap=1e-4,
       num_iterations=None):
    """
    Solve fair PCA with the social welfare objective specified by Obj.
    The algorithm is Frank-Wolfe. The parameters update_rule, duality_gap and
    num_iterations are parameters of the Frank-Wolfe algorithm. It returns the
    last iterate of Frank-Wolfe.

    Arguments:
        n: original dimension of the data
        k: number of groups
        d: target dimension
        B: list of all k groups' data. Must be a list of k n-by-n matrices.
        delta: buffer in the objective function for its stability and Lipschitzness.
        start_solution: options for the starting solution X_0.
            'uniform' --> X_0 = d/n I_n
            'standard_PCA' --> [not yet implemented]
        update_rule, duality_gap, num_iterations: see the documentation of the
            FrankWolfe method.
    """
    # input check
    if input_check(n, k, d, B, function_name='FW_NSW') > 0:
        return -1

    # Specify the parameters to feed to the FrankWolfe algorithm.
    if start_solution == 'uniform':
        init_X = d * np.eye(n) / n
    else:
        print("Warning: this starting X_0 for FW_NSW is not yet implemented. "
              "Using the uniform starting rule.")
        init_X = d * np.eye(n) / n

    def linear_oracle(G):
        P = std_PCA(G, d)  # n x d matrix of the d top singular vectors of G
        return P @ P.T  # return an n x n matrix

    # Define the functions based on the objective f.
    if Obj == 'NSW':
        # If using line_search, the function definition also has to be passed
        # into FrankWolfe.
        def primal(X):
            # sum of the log of the variances
            return sum(
                [np.log(np.sum(np.multiply(B[i], X))) for i in range(k)])

        def grad_function(X):
            # sum of 1/(<B_i,X> + delta) * B[i] is the gradient of f(X)
            return sum([
                1 / (np.multiply(B[i], X).sum() + delta) * B[i]
                for i in range(len(B))
            ])

    elif Obj == 'MM_Var':

        def primal(X):
            return np.min([np.sum(np.multiply(B[i], X)) for i in range(k)])

        def grad_function(X):
            # B[j] of the group j with the lowest objective
            return B[np.argmin(
                [np.sum(np.multiply(B[i], X)) for i in range(k)])]

    elif Obj == 'MM_Loss':
        # The best possible variance for each group; a constant independent of X.
        best = [np.sum(np.sort(np.linalg.eigvalsh(B[i]))[-d:]) for i in range(k)]

        def primal(X):
            return np.min(
                [np.sum(np.multiply(B[i], X)) - best[i] for i in range(k)])

        def grad_function(X):
            # B[j] of the group j with the lowest objective:
            # return B[np.argmin([np.sum(np.multiply(B[i], X)) - best[i] for i in range(k)])]
            # -------------------
            # Try a softmax instead. lam should be -log(k)/eps for error eps.
            # Use a negative lam for softmin, positive for softmax.
            lam = -50
            w = dict()  # weights of the groups
            for i in range(k):
                # weight of group i
                w[i] = 0
                is_0_w = False  # True if this weight is vanishingly small
                # find the sum in the denominator
                for j in range(k):
                    exponent = lam * (np.sum(np.multiply(B[j] - B[i], X)) -
                                      best[j] + best[i])
                    # If the exponent is too low, ignore the term. If it is too
                    # high, this weight (after inverting) is essentially 0.
                    if exponent < -20:
                        continue
                    elif exponent > 20:
                        is_0_w = True
                        break  # done with this w_i
                    else:
                        w[i] += math.exp(exponent)
                if is_0_w:
                    w[i] = 0
                else:
                    w[i] = 1 / w[i]
            # for checking
            print('sanity check: sum of w is', sum([w[i] for i in range(k)]),
                  'w is', [w[i] for i in range(k)])
            return sum([w[i] * B[i] for i in range(k)])

    else:
        print('Error: the objective for FW is invalid. Returning None from the FW method.')
        return None

    # perform FrankWolfe
    X_final, dual_gap = FrankWolfe(init_X,
                                   grad_function=grad_function,
                                   linear_oracle=linear_oracle,
                                   update_rule=update_rule,
                                   duality_gap=duality_gap,
                                   num_iterations=num_iterations,
                                   function=primal)
    primal = primal(X_final)
    dual = primal + dual_gap
    if Obj == 'NSW':
        print('NSW primal value (sum of log) is', primal,
              '. The (multiplicative) gap of the product objective is',
              (np.exp(dual_gap) - 1) * 100, '%.')
    elif (Obj == 'MM_Var') or (Obj == 'MM_Loss'):
        print(Obj, 'primal value is', primal, '. Dual is', dual,
              '. The gap is', dual_gap, ', which is',
              abs((dual - primal) / dual) * 100, '%.')
    return X_final
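# std_PCA and FrankWolfe are referenced above but not included in this
# collection. Below is a minimal sketch of what they might look like, assuming
# the call signatures used in FW (a linear oracle over the spectrahedron
# {tr(X) <= d, 0 << X << I} and a '1/t'-style diminishing step size). These are
# assumptions for illustration, not the original implementations.
import numpy as np


def std_PCA(G, d):
    """Hypothetical helper: top-d eigenvectors of a symmetric matrix G, as columns."""
    eigenvalues, eigenvectors = np.linalg.eigh(G)
    return eigenvectors[:, np.argsort(eigenvalues)[::-1][:d]]


def FrankWolfe(init_X, grad_function, linear_oracle, update_rule='1/t',
               duality_gap=1e-4, num_iterations=None, function=None):
    """Hypothetical Frank-Wolfe loop matching the call in FW above.

    Returns (X, gap), where gap = <grad f(X), S - X> is the last Frank-Wolfe
    duality gap, an upper bound on the primal suboptimality for concave f.
    """
    X = init_X
    T = num_iterations if num_iterations is not None else 1000
    gap = float('inf')
    for t in range(1, T + 1):
        G = grad_function(X)
        S = linear_oracle(G)  # maximizer of <G, S> over the feasible set
        gap = np.sum(np.multiply(G, S - X))
        if gap <= duality_gap:
            break
        # '1/t' rule: the classic diminishing step size 2/(t+2); line search is
        # not implemented in this sketch.
        step = 2 / (t + 2)
        X = (1 - step) * X + step * S
    return X, gap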