def landmarker_metafeatures(X, y, categorical, folds):
	utils.input_check(X, y, categorical)
	features = OrderedDict()

	# TODO: when folds are given, do cross-validation instead, as it takes more time
	# and also can give a fair indication of predictive accuracy
	for i in [1,2,4]:
		accuracy, total_time, mean_time = cross_validate_classifier(DecisionTreeClassifier(criterion='gini', max_depth=i), X, y, folds)
		features["DecisionTreeGiniDepth{}Accuracy".format(i)] = accuracy
		features["DecisionTreeGiniDepth{}TimeSum".format(i)] = total_time
		features["DecisionTreeGiniDepth{}TimeMean".format(i)] = mean_time

		accuracy, total_time, mean_time = cross_validate_classifier(DecisionTreeClassifier(criterion='entropy', max_depth=i), X, y, folds)
		features["DecisionTreeEntropyDepth{}Accuracy".format(i)] = accuracy
		features["DecisionTreeEntropyDepth{}TimeSum".format(i)] = total_time
		features["DecisionTreeEntropyDepth{}TimeMean".format(i)] = mean_time

	accuracy, total_time, mean_time = cross_validate_classifier(DecisionTreeClassifier(), X, y, folds)
	features["DecisionTreeGiniAccuracy"] = accuracy
	features["DecisionTreeGiniTimeSum"] = total_time
	features["DecisionTreeGiniTimeMean"] = mean_time

	accuracy, total_time, mean_time = cross_validate_classifier(GaussianNB(), X, y, folds)
	features["GaussianNBAccuracy"] = accuracy
	features["GaussianNBTimeSum"] = total_time
	features["GaussianNBTimeMean"] = mean_time

	accuracy, total_time, mean_time = cross_validate_classifier(KNeighborsClassifier(n_neighbors=1), X, y, folds)
	features["1NNAccuracy"] = accuracy
	features["1NNTimeSum"] = total_time
	features["1NNTimeMean"] = mean_time
	return features
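# cross_validate_classifier(clf, X, y, folds) is used throughout this listing but is not
# defined here. A minimal sketch of what it presumably does (hypothetical implementation,
# assuming scikit-learn and that folds is the number of folds; the real helper may differ):
# fit and score the classifier on each fold, returning the mean accuracy together with the
# total and mean per-fold time.
import time
import numpy as np
from sklearn.model_selection import StratifiedKFold

def cross_validate_classifier(classifier, X, y, folds):
	accuracies, times = [], []
	for train_idx, test_idx in StratifiedKFold(n_splits=folds).split(X, y):
		start = time.time()
		classifier.fit(X[train_idx], y[train_idx])
		accuracies.append(classifier.score(X[test_idx], y[test_idx]))
		times.append(time.time() - start)
	return np.mean(accuracies), np.sum(times), np.mean(times)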
def information_theoretic_metafeatures(X, y, categorical):
	utils.input_check(X, y, categorical)
	features = OrderedDict()

	classes, counts = np.unique(y, return_counts = True)
	features["ClassEntropy"] = scipy.stats.entropy(counts, base = 2)

	# Information theoretic meta-features below only apply to categorical values
	if(sum(categorical) == 0):
		return OrderedDict.fromkeys(information_theoretic_metafeature_names(), -1)  # fromkeys takes the fill value positionally

	with utils.stopwatch() as sw:
		# entropy of each categorical attribute, computed from its value counts
		feature_entropies = [scipy.stats.entropy(np.unique(column[0], return_counts = True)[1]) for column in X[:,np.where(categorical)].T]
		mean_feature_entropy = np.mean(feature_entropies)
		features["MeanFeatureEntropy"] = mean_feature_entropy

		mutual_informations = [sklearn.metrics.mutual_info_score(y, column[0]) for column in X[:, np.where(categorical)].T]
		mean_mutual_information = np.mean(mutual_informations)
		features["MeanMutualInformation"] = mean_mutual_information

		if(mean_mutual_information == 0):
			features["NoiseToSignalRatio"] = 0
		else:
			features["NoiseToSignalRatio"] = (mean_feature_entropy - mean_mutual_information) / mean_mutual_information

	features["InformationFeatureTime"] = sw.duration
	return features
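# utils.stopwatch() is used as a context manager in several functions of this listing, but its
# definition is not shown. A plausible minimal sketch (hypothetical; the real utils module may
# differ) that supports reading sw.duration after the with-block:
import time
from contextlib import contextmanager

class _Stopwatch:
	duration = 0.0

@contextmanager
def stopwatch():
	sw = _Stopwatch()
	start = time.perf_counter()
	try:
		yield sw
	finally:
		sw.duration = time.perf_counter() - start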
Example #3
def simple_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()
    n = X.shape[0]
    p = X.shape[1]

    with utils.stopwatch() as sw:
        features["NumberOfInstances"] = n
        features["LogNumberOfInstances"] = np.log(n)
        features["NumberOfFeatures"] = p
        features["LogNumberOfFeatures"] = np.log(p)
        features["DatasetRatio"] = p / n
        features["LogDatasetRatio"] = np.log(p / n)
        features["InverseDatasetRatio"] = n / p
        features["LogInverseDatasetRatio"] = np.log(n / p)

        classes, counts = np.unique(y, return_counts=True)
        nrNominal = sum(categorical)
        nrNumeric = len(categorical) - sum(categorical)

        features["NumberOfClasses"] = classes.shape[0]
        features["NumberOfCategoricalFeatures"] = nrNominal
        features["NumberOfNumericFeatures"] = nrNumeric

        features[
            "RatioNumericalToNominal"] = nrNumeric / nrNominal if nrNominal > 0 else 0
        features[
            "RatioNominalToNumerical"] = nrNominal / nrNumeric if nrNumeric > 0 else 0

        class_probabilities = [count / n for count in counts]
        features["ClassProbabilityMin"] = np.min(class_probabilities)
        features["ClassProbabilityMax"] = np.max(class_probabilities)
        features["ClassProbabilityMean"] = np.mean(class_probabilities)
        features["ClassProbabilitySTD"] = np.std(class_probabilities)

        symbols_per_column = [
            np.unique(column).shape[0] for column in X[:,
                                                       np.where(categorical)].T
        ]
        if len(symbols_per_column) > 0:
            features["SymbolsMin"] = np.min(symbols_per_column)
            features["SymbolsMax"] = np.max(symbols_per_column)
            features["SymbolsMean"] = np.mean(symbols_per_column)
            features["SymbolsSTD"] = np.std(symbols_per_column)
            features["SymbolsSum"] = np.sum(symbols_per_column)
        else:
            features["SymbolsMin"] = features["SymbolsMax"] = features[
                "SymbolsMean"] = features["SymbolsSTD"] = features[
                    "SymbolsSum"] = 0

    features["SimpleFeatureTime"] = sw.duration
    # Missing-value features are omitted for now, since only datasets without missing values were selected.

    return features
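# A small usage example of simple_metafeatures (illustrative only; assumes the imports used
# above, i.e. numpy as np, collections.OrderedDict, and a utils module providing input_check
# and stopwatch):
import numpy as np

X_toy = np.array([[1.0, 0], [2.0, 1], [3.0, 0], [4.0, 1]])  # second feature is categorical
y_toy = np.array([0, 0, 1, 1])
categorical_toy = [False, True]

mf = simple_metafeatures(X_toy, y_toy, categorical_toy)
print(mf["NumberOfInstances"])        # 4
print(mf["NumberOfClasses"])          # 2
print(mf["RatioNominalToNumerical"])  # 1.0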
def subsample_metafeatures(X, y, categorical, folds):
	utils.input_check(X, y, categorical)
	features = OrderedDict()

	accuracy, total_time, mean_time = cross_validate_classifier(ensemble.RandomForestClassifier(), X, y, folds)
	features["SubsampleRandomForestAccuracy"] = accuracy
	features["SubsampleRandomForestMeanTime"] = mean_time

	accuracy, total_time, mean_time = cross_validate_classifier(svm.SVC(), X, y, folds)
	features["SubsampleSVCAccuracy"] = accuracy
	features["SubsampleSVCMeanTime"] = mean_time

	accuracy, total_time, mean_time = cross_validate_classifier(ensemble.GradientBoostingClassifier(), X, y, folds)
	features["SubsampleBoostingAccuracy"] = accuracy
	features["SubsampleBoostingMeanTime"] = mean_time

	return features
Example #6
def statistical_metafeatures(X, y, categorical):
    utils.input_check(X, y, categorical)
    features = OrderedDict()

    numerical = [not cat for cat in categorical]

    # Statistical meta-features only apply to numerical attributes; if there are none, we list them as -1.
    # We should see if there is a better way to deal with this, as -1 is a valid value for some of these features.
    if (sum(numerical) == 0):
        return OrderedDict.fromkeys(statistical_metafeature_names(), -1)  # fromkeys takes the fill value positionally

    with utils.stopwatch() as sw:
        # Taking the kurtosis of kurtosis and the skewness of kurtosis is suggested by Reif et al. in Meta2-features (2012)
        kurtosisses = [
            scipy.stats.kurtosis(column[0])
            for column in X[:, np.where(numerical)].T
        ]
        features["KurtosisMin"] = np.min(kurtosisses)
        features["KurtosisMax"] = np.max(kurtosisses)
        features["KurtosisMean"] = np.mean(kurtosisses)
        features["KurtosisSTD"] = np.std(kurtosisses)
        features["KurtosisKurtosis"] = scipy.stats.kurtosis(kurtosisses)
        features["KurtosisSkewness"] = scipy.stats.skew(kurtosisses)

        skewnesses = [
            scipy.stats.skew(column[0]) for column in X[:,
                                                        np.where(numerical)].T
        ]
        features["SkewnessMin"] = np.min(skewnesses)
        features["SkewnessMax"] = np.max(skewnesses)
        features["SkewnessMean"] = np.mean(skewnesses)
        features["SkewnessSTD"] = np.std(skewnesses)
        features["SkewnessKurtosis"] = scipy.stats.kurtosis(skewnesses)
        features["SkewnessSkewness"] = scipy.stats.skew(skewnesses)

        standard_deviations = [
            np.std(column[0]) for column in X[:, np.where(numerical)].T
        ]
        features["MeanSTDOfNumerical"] = np.mean(standard_deviations)
        features["STDSTDOfNumerical"] = np.std(standard_deviations)

    features["StatisticalFeatureTime"] = sw.duration

    return features
Example #8
def load_data_from_db(prop=None, optsets=None, structsets=None, calcsets=None):
    input_check(
        prop=prop, optsets=optsets, structsets=structsets, calcsets=calcsets)

    where_cond1 = "structset in %s" % (tuple(structsets), )
    where_cond2 = "optset in %s and calcset in %s" % (
        tuple(optsets), tuple(calcsets))
    where_cond1 = where_cond1.replace(",)", ")")
    where_cond2 = where_cond2.replace(",)", ")")

    # The double nested selects are to fix an issue where the empty data points were
    # getting joined together into the last empty data point.
    string = """
    SELECT a.name, group_concat(a.prop)
    FROM (
    SELECT d.name_id, d.name, IFNULL(data.{prop}, '') as prop
    FROM (
        SELECT names.id as name_id, names.name, ds.id as ds_id
        FROM (SELECT id, name FROM names WHERE {where1}) as names
        CROSS JOIN (SELECT id FROM datasets WHERE {where2}) as ds
        ) as d
    LEFT JOIN data
        ON d.name_id = data.name_id and d.ds_id = data.dataset_id
    ) as a
    GROUP BY a.name_id;
    """.format(prop=prop, where1=where_cond1, where2=where_cond2)

    data = []
    with sqlite3.connect("database.sqlite") as conn:
        for name, values in conn.execute(string):
            try:
                data.append(
                    (name, [float(x) if x else None for x in values.split(',')]))
            except ValueError:
                # skip rows whose values cannot be parsed as floats
                pass

        string = "SELECT optset, calcset FROM datasets WHERE %s" % (where_cond2, )
        columns = [x for x in conn.execute(string)]
    return data, columns
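# Illustrative call (hypothetical names; the actual property/optset/structset/calcset values
# depend on the database schema, which is not shown in this listing):
#   data, columns = load_data_from_db(prop="energy",
#                                     optsets=("b3lyp",),
#                                     structsets=("set1", "set2"),
#                                     calcsets=("b3lyp",))
# `data` is a list of (name, [float or None per dataset]) rows and `columns` lists the
# (optset, calcset) pairs in matching order.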
def MW_for_PCA(n,
               m,
               d,
               B,
               weight=None,
               alpha=None,
               beta=None,
               eta=1,
               T=10,
               verbose=False,
               report_all_obj=False,
               n_X_last=0,
               dual_function=None,
               stopping_gap=1e-6,
               primal_function=None,
               NSW_update=False):
    '''
    Arguments:
        n: size of matrices in B
        m: number of objectives
        d: target dimension
        B: list of m matrices
        weight: any initial weight, if the user wants to specify one. By default, this is set to all 1/m.
        alpha: list of m numbers, used in the objectives. By default, this is set to all 1.
        beta: list of m numbers, used in the objectives. By default, this is set to all 0.
        eta: learning rate of MW, either a scalar or an array of length T. A changing learning rate can be specified by giving the rate for each of the T iterations as an array of length T.
        T: number of iterations
        verbose: will print the objective value in every single iteration.
                 This may not be needed, as the function already outputs a pandas dataframe of all statistics when report_all_obj=True.
        report_all_obj: whether the objective and weight of each group are included in the output dataframe of statistics
        n_X_last: number of last iterates to keep. If specified, the solutions of the last n_X_last iterates (iterates T-n_X_last up to T) are kept rather than just the last one. Note that none are kept if MW terminates early due to a small duality gap - in that case, the last or average iterate would be the choice to use, not the last few iterates.
        dual_function(w,B,X): given the weight vector w=[w_1,...,w_m] and B=[B_1,...,B_m], the dual objective function to calculate. This function can be obtained once the social welfare objective is known (see 'Obj' in the fairDimReduction_MW method). By default, it is None, so no dual is calculated.
        Optionally, to speed up runtime, the dual also receives the optimum solution of the weighted PCA of that iteration. (Can be ignored when specifying the function definition.)

        stopping_gap: if not None and positive, and if dual_function is given, MW stops automatically once the primal-dual gap is no more than the specified value. By default, this is set to 1e-6.

        primal_function(B,X): by default the primal is specified as the minimum of alpha_i <B_i,X> + beta_i. One can also specify others here, such as for NSW. This can be used to compare MW on other objectives.

        NSW_update: a different update rule based on the gradient of the dual objective with respect to the weights
        *** Deprecated. NSW_update is not recommended due to bad performance when the dual space is not the simplex ***
        
    Note: in theory, for eps < 1 approximation to optimization problem with bounded objective in [0,1],
        eta = eps/8
        T = 32log(m)/eps^2
    analyzed in "The Price of Fair PCA: One Extra Dimension."
        
    Given m objectives to maximize simultaneously
        alpha_1 <B_1,X> + beta_1
        alpha_2 <B_2,X> + beta_2
        ...
        alpha_m <B_m,X> + beta_m
    subject to
        tr(X) <= d
        0 << X << I (matrix inequality)
    the function uses MW to maximize the minimum of m objectives.
    
    Output:
    [X_last,X_avg,runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average of the n x n matrices X over the T iterates.
        runstats: value of the weights and objectives in each iterate
    or [when n_X_last > 0]
        X_last: list of n_X_last n x n matrices X from the last n_X_last iterates (ordered from the (T - n_X_last + 1)th iterate up to the very last iterate).
    
    '''
    #input check
    if (input_check(n, m, d, B, function_name='MW_for_PCA') > 0):
        return -1

    if (weight is None):
        weight = np.full(m, 1 / m)
    weight = weight / weight.sum()  #without loss of generality, make sure the weights sum to 1

    if (alpha is None):
        alpha = np.full(m, 1)

    if (beta is None):
        beta = np.zeros(m)

    if (weight.shape != (m, ) or alpha.shape != (m, ) or beta.shape != (m, )):
        print(
            "Error: MW_for_PCA is called with wrong weight or alpha or beta coefficient size. They should be numpy vectors of length m"
        )
        return -1

    if isinstance(eta, list):  #given a list
        if len(eta) < T:
            print(
                'Error: MW_for_PCA is called with a list of eta that has fewer than T values'
            )
            return -1

    elif (eta > 0):
        #eta is a positive scalar: expand it into an array to keep the code simple
        eta = [eta for i in range(T)]

    else:
        print(
            "Error: MW_for_PCA is called eta not a positive real number nor a list."
        )
        return -1

    run_stats = pd.DataFrame()
    X_avg = np.zeros((n, n))

    if (n_X_last >
            0):  # I want to keep a few last iterates, not just the last one
        list_X = []

    for t in range(T):

        [X, _, Obj] = weightedPCA(n,
                                  m,
                                  d,
                                  B,
                                  weight=weight,
                                  alpha=alpha,
                                  beta=beta,
                                  calculate_objective=True)

        if (n_X_last > 0):  # I want to keep this if it is closer to the end
            if (t + n_X_last >= T):
                list_X.append(X)

        #update the average solution of X. In MW, the average is guaranteed to converge, not the last iterate, at least in theory
        X_avg = t * (X_avg / (t + 1)) + X / (t + 1)

        #this stats below keeps the weight and objective value of this iterate
        this_t_stats = {'iteration': t}
        if report_all_obj:
            this_t_stats.update(
                dict(('weight' + str(i), weight[i]) for i in range(m)))
        this_t_stats.update({'minimum of m objective, that iterate': min(Obj)})
        avg_Obj = min([
            alpha[i] * np.multiply(B[i], X_avg).sum() + beta[i]
            for i in range(m)
        ])
        this_t_stats.update({'minimum of m objective, avg iterate': avg_Obj})

        #add the primal objective, if specified
        if (primal_function is not None):
            this_t_stats.update(
                {'primal objective, that iterate': primal_function(B, X)})
            this_t_stats.update(
                {'primal objective, avg iterate': primal_function(B, X_avg)})

        #add the dual objective
        if (dual_function is not None):
            dual_val = dual_function(weight, B, X)

            #dual bound is the best we see so far
            if (t > 0): dual_val = min([dual_val, dual_val_previous])
            this_t_stats.update({'dual objective': dual_val})

            dual_val_previous = dual_val

        #update with the objective
        if report_all_obj:
            this_t_stats.update(
                dict(('Obj' + str(i), Obj[i]) for i in range(m)))
        if (verbose):
            print("stats at iteration " + str(t) + " is :")
            print(this_t_stats)
        run_stats = run_stats.append(pd.DataFrame(this_t_stats, index=[t]))

        #now the update of the weight
        Loss = np.multiply(-1, Obj)

        for i in range(m):
            #gradient of dual will also have -1/w_i term
            if (NSW_update): Loss[i] -= 1 / (weight[i])

            weight[i] = math.exp(eta[t] * Loss[i]) * weight[i]

            #bound the weight away from 0 to bound the gradient norm 1/(weight[i])
            if (NSW_update): weight[i] = max([1e-4, weight[i]])
        if (NSW_update == False):  #normal MW, else no need to do this
            weight = weight / weight.sum()

        if ((dual_function is not None) and (stopping_gap is not None)
                and (stopping_gap > 0)):
            #we have to check if we need to stop
            # min(Obj) is the minimum of utility of all groups, which is the social welfare in MM_Loss and MM_Var case
            if (abs(dual_val - min(Obj)) < stopping_gap):
                print(
                    "MW terminated at T=", t,
                    " iterations: current iterate solution achieved primal-dual gap of",
                    stopping_gap)
                break

            elif (abs(dual_val - avg_Obj) < stopping_gap):
                print(
                    "MW terminated at T=", t,
                    " iterations: average iterate solution achieved primal-dual gap of",
                    stopping_gap)
                break

    if (
        (n_X_last > 0) and (len(list_X) > 0)
    ):  #return the whole list of last few X's if not empty. Else, this happens when gap is reached earlier than T iterations
        return [list_X, X_avg, run_stats]
    else:
        return [X, X_avg, run_stats]
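# Illustrative call (assumes this module's imports, e.g. numpy as np, and weightedPCA defined
# later in this listing; the matrices below are made-up toy data):
#   B1 = np.diag([2.0, 0.5]); B2 = np.diag([0.5, 1.0])
#   X_last, X_avg, stats = MW_for_PCA(n=2, m=2, d=1, B=[B1, B2], eta=0.5, T=20)
# stats is a pandas dataframe with one row per iteration.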
Example #13
def fairDimReductionFractional(n,k,B,list_n='one',Obj='MM_Loss',list_d='all',verbose=1,print_other_obj=True,return_option='run_statistics',save=True,savedPath='fairDimReductionFractional.csv'):
    """
    Given k PSD n-by-n matrices B1,...,Bk, solve the (fractional) convex optimization of fair dimensional reduction.
    Arguments:
        k: number of groups
        n: original number of features (size of all B_i's)
        B: list of PSD matrices, as numpy matrices. It must contain at least k matrices. If there are more than k matrices provided, the first k will be used as k groups.
        list_n: by default ('one'), only n itself is used, i.e. the total number of features.
                If 'all', n0 (the number of dimensions; the first n0 features are used) ranges from d+1 to n (d is the target dimension of that iteration).
                Otherwise, you can specify a list of n0 values.
        list_d: list of target dimensions to project to. By default ('all'), it is from 1 to n-1.
        print_other_obj: setting this to True will also print the other economic welfare objectives (four in total, including the one specified as input Obj)
        verbose: set to 1 if the details are to be printed. Set to 2 to also print the information table of each iteration
        save: will save to csv if set to True
        savedPath: path of the file to export the result to.
        Obj: the objective to optimize. Must be MM_Var (maximize the minimum variance), MM_Loss (default) (minimize the maximum loss; the output is the negative), or NSW (Nash social welfare)
        return_option:  by default, it returns the runtime, n,d, the rank, and several objectives of each group. 
                        Another option 'frac_sol' is to return a list of fractional solution X. list_X (the additional output) will be a list, each row containing one run of particular value of n and d.
                        In each row, it contains value of n,d and solution X as cvx matrix. One can convert this back to numpy by:
                            import numpy as np
                            array(list_X[0][2]) --> this gives numpy solution matrix X of the first setting of (n,d).
                            array(list_X[i][2]) --> this gives numpy solution matrix X of the (i+1)th setting of (n,d).
        
    """
    
    #input check
    if (input_check(n, k, 1, B, function_name='fairDimReductionFractional') > 0):
        return -1
        
    #for storing results of the optimization
    runstats = pd.DataFrame()
    if (return_option == 'frac_sol'):
        list_X = []
    
    #list of all d
    if (list_d == 'all'):
        list_d = range(1,n)

    for d in list_d:
        #valid value of n_0
        if (list_n == 'one'):
            list_n_this_d = [n]
        elif (list_n == 'all'):
            list_n_this_d = range(d+1,n+1)
        else:
            list_n_this_d = list_n
            
        for n0 in list_n_this_d:
            #truncated version of the matrices (first n0 features only), in case we want to drop some features for experiments
            Bnumpy_s = [B[i][np.ix_(range(0,n0),range(0,n0))] for i in range(k)]

            #now define the problem
            B_s = [matrix(B[i][np.ix_(range(0,n0),range(0,n0))]) for i in range(k)]

            fairPCA = pic.Problem()
            n = n0

            I =pic.new_param('I',cvx.spmatrix([1]*n,range(n),range(n),(n,n))) #identity matrix

            # Add the symmetric matrix variable.
            X=fairPCA.add_variable('X',(n,n),'symmetric') #projection matrix, should be rank d but relaxed
            z=fairPCA.add_variable('z',1) #scalar, for the objective

            # Add parameters for each group
            A = [pic.new_param('A'+str(i),B_s[i]) for i in range(k)]

            #best possible variance for each group
            best = [np.sum(np.sort(np.linalg.eigvalsh(Bnumpy_s[i]))[-d:]) for i in range(k)]
            
            # Constrain X on trace
            fairPCA.add_constraint(I|X<=d)

            # Constrain X to be positive semidefinite.

            fairPCA.add_constraint(X>>0)
            fairPCA.add_constraint(X<<I)
            
            #the following depends on the type of the problems. Here we coded 3 of them: 
            #1) max min variance 2) min max loss 3) Nash social welfare of variance
            
            if(Obj=='MM_Loss'):
                # Add the loss constraint
                fairPCA.add_list_of_constraints([(A[i]|X) - best[i] >= z for i in range(k)]) #constraints

                # Set the objective.
                fairPCA.set_objective('max',z)
                
            elif (Obj=='MM_Var'):
                # Add the variance constraint
                fairPCA.add_list_of_constraints([(A[i]|X) >= z for i in range(k)]) #constraints

                # Set the objective.
                fairPCA.set_objective('max',z)
                
            elif (Obj=='NSW'):
                s=fairPCA.add_variable('s',k) #vector of variances
                # Add the variance constraint
                fairPCA.add_list_of_constraints([(A[i]|X) >= s[i] for i in range(k)]) #constraints

                # Set the objective.
                fairPCA.add_constraint( z <= pic.geomean(s) )
                fairPCA.set_objective('max',z)
                
            else:
                fairPCA.set_objective('max',z)
                print("Error: fairDimReductionFractional is called with invalid Objective. Supported Obj augements are: ... Exit the method")
                return

            solveInfo=fairPCA.solve(verbose = 0,solver='cvxopt')

            var = [np.sum(np.multiply(Bnumpy_s[i],X.value)) for i in range(k)]
            loss = [var[i] - best[i] for i in range(k)]
            

            #dictionary of info for this iterate
            solveInfoShort = dict((key, solveInfo[key]) for key in ('time','obj','status'))
            if (print_other_obj):
                solveInfoShort.update({'MM_Var':np.amin(var),'MM_loss':np.amin(loss),'NSW':geo_mean_through_log(var),'Total_Var':np.sum(var)})
            
            solveInfoShort.update({'n':n0,'gap':solveInfo['cvxopt_sol']['gap'],'d':d,'rank':np.linalg.matrix_rank(array(X.value),tol=1e-6,hermitian =True)})
            
            for i in range(k):
                solveInfoShort.update({'Loss'+str(i):loss[i],'Var'+str(i):var[i],'Best'+str(i):best[i]})
            
            #add information of this optimization for this d,n0
            runstats = runstats.append(pd.DataFrame(solveInfoShort,index=[n0])) #add this info
            
            if (return_option == 'frac_sol'):
                list_X.append([n0,d,X.value])

    if(verbose==2):
        print(runstats)
        
    if(verbose==1):
        print("The total number of cases tested is:")
        print(len(runstats))
        print("The number of cases where the rank is exact is:")
        print(len(runstats[runstats['d']==runstats['rank']]))
        
    if(save):
        runstats.to_csv(savedPath,index=False)
    
    if (return_option == 'frac_sol'):
        return [runstats,list_X]
    
    return runstats
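# Illustrative call (requires the picos/cvxopt imports used by the surrounding module; B must
# contain at least k PSD n-by-n numpy matrices; the names below are made up):
#   stats = fairDimReductionFractional(n=10, k=2, B=[B1, B2], Obj='MM_Var',
#                                      list_d=[3], verbose=1, save=False)
# stats is a pandas dataframe with one row per (n0, d) setting solved.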
Example #14
def MW_for_PCA(n,
               m,
               d,
               B,
               weight=None,
               alpha=None,
               beta=None,
               eta=1,
               T=10,
               verbose=False):
    '''
    Arguments:
        n: size of matrices in B
        m: number of objectives
        d: target dimension
        B: list of m matrices
        weight: any initial weight, if user wants to specify. By default, this is set to all 1/m.
        alpha: list of m numbers, used in the objectives By default, this is set to all 1.
        beta: list of m numbers, used in the objectives. By default, this is set to all 0.
        eta: learning rate of MW.
        T: number of iterations
        verbose: will print the objective value in every single iteration.
                 This may not be needed, as the function already outputs a pandas dataframe of all statistics.
    Note: in theory, for eps < 1 approximation to optimization problem with bounded objective in [0,1],
        eta = eps/8
        T = 32log(m)/eps^2
    analyzed in "The Price of Fair PCA: One Extra Dimension."
        
    Given m objectives to maximize simultaneously
        alpha_1 <B_1,X> + beta_1
        alpha_2 <B_2,X> + beta_2
        ...
        alpha_m <B_m,X> + beta_m
    subject to
        tr(X) <= d
        0 << X << I (matrix inequality)
    the function uses MW to maximize the minimum of m objectives.
    
    Output:
    [X_last,X_avg,runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average of the n x n matrices X over the T iterates.
        runstats: value of the weights and objectives in each iterate
    '''
    #input check
    if (input_check(n, m, d, B, function_name='MW_for_PCA') > 0):
        return -1

    if (weight is None):
        weight = np.full(m, 1 / m)
    weight = weight / weight.sum()  #without loss of generality, make sure the weights sum to 1

    if (alpha is None):
        alpha = np.full(m, 1)

    if (beta is None):
        beta = np.zeros(m)

    if (weight.shape != (m, ) or alpha.shape != (m, ) or beta.shape != (m, )):
        print(
            "Error: MW_for_PCA is called with wrong weight or alpha or beta coefficient size. They should be numpy vectors of length m"
        )
        return -1

    if not (eta > 0):
        print("Error: MW_for_PCA is called with eta that is not a positive real number.")
        return -1

    run_stats = pd.DataFrame()
    X_avg = np.zeros((n, n))

    for t in range(T):

        [X, _, Obj] = weightedPCA(n,
                                  m,
                                  d,
                                  B,
                                  weight=weight,
                                  alpha=alpha,
                                  beta=beta,
                                  calculate_objective=True)
        #update the average solution of X. In MW, the average is guaranteed to converge, not the last iterate, at least in theory
        X_avg = t * (X_avg / (t + 1)) + X / (t + 1)

        #this stats below keeps the weight and objective value of this iterate
        this_t_stats = {'iteration': t}
        this_t_stats.update(
            dict(('weight' + str(i), weight[i]) for i in range(m)))
        this_t_stats.update({'minimum of m objective, that iterate': min(Obj)})
        this_t_stats.update({
            'minimum of m objective, avg iterate':
            min([
                alpha[i] * np.multiply(B[i], X_avg).sum() + beta[i]
                for i in range(m)
            ])
        })
        this_t_stats.update(dict(('Obj' + str(i), Obj[i]) for i in range(m)))
        if (verbose):
            print("stats at iteration " + str(t) + " is :")
            print(this_t_stats)
        run_stats = run_stats.append(pd.DataFrame(this_t_stats, index=[t]))

        #now the update of the weight
        Loss = np.multiply(-1, Obj)
        for i in range(m):
            weight[i] = math.exp(eta * Loss[i]) * weight[i]
        weight = weight / weight.sum()

    return [X, X_avg, run_stats]
def weightedPCA(n,
                k,
                d,
                B,
                weight=None,
                alpha=None,
                beta=None,
                calculate_objective=False):
    '''
    Arguments:
        n: dimension of (symmetric real) matrix B
        k: number of B_i
        d: target dimension of the PCA
        weight: vector of k numbers as numpy list specifying weight of PCA to each group to combine. This is all 1/k by default. 
        alpha: additional weight to multiply to <B_i,X>, if any. This is all 1 by default. 
        beta: constant adding to the objective: alpha_i (<B_i,X>) + beta_i.
        
    Task:
    Given the objective 
    sum_{i=1}^k w_i * (alpha_i (<B_i,X>) + beta_i).
    where B_i and X are n-by-n matrices, solve to get the rank-d solution X. This is simply standard PCA on the weighted data
    sum_{i=1}^k w_i alpha_i B_i
    
    Note:
    - The solution is independent of beta. Beta only affects the objective function.
    - It seems redundant to have weight and alpha, as both are just weights multiplied together. 
      The only reason for separating them is to calculate the objective value alpha_i (<B_i,X>) + beta_i 
      which is independent of weight but dependent on alpha.
    - Assume the matrices B[i] are all real and symmetric. If not, change 'np.linalg.eigh(W)' to 'np.linalg.eig(W)'.
    
    Output: 
    [X = P P^T , P , Obj_list = [Obj_1, ..., Obj_k] ],
    or [X = P P^T , P] if calculate_objective = False
    - solution X which is n x n matrix of rank d. 
    - P is the n x d matrix with the d principal eigenvectors as columns, sorted from largest eigenvalue to smallest.
    - objective value Obj_i = alpha_i (<B_i,X>) + beta_i
    
    to save time, calculate_objective can be set to false when calculating Obj_i is not needed. This is the default.
    
    '''
    if (input_check(n, k, d, B, function_name='weightedPCA') > 0):
        return -1

    #The default values are defined inside the function, not in the declaration, because they depend on k
    if (weight is None):
        weight = np.full(k, 1 / k)

    if (alpha is None):
        alpha = np.full(k, 1)

    if (beta is None):
        beta = np.zeros(k)

    if (weight.shape != (k, ) or alpha.shape != (k, ) or beta.shape != (k, )):
        print(
            "Error: weightedPCA is called with wrong weight or alpha or beta coefficient size. They should be numpy vectors of length k"
        )
        return -1

    #normalization
    weight = weight / weight.sum()
    #alpha_normalized = alpha/alpha.sum() #no need for this
    #Note that I define new alpha because I don't want to edit alpha when I calculate objective value later, which has alpha in the expression

    W = np.zeros((n, n))
    for i in range(k):
        W = W + (weight[i] * alpha[i]) * B[i]

    [eigenValues, eigenVectors] = np.linalg.eigh(W)
    #note: numerical issues can sometimes introduce small complex parts; since we know W should be PSD, we use np.linalg.eigh instead of np.linalg.eig to avoid them, or alternatively take the real parts as follows:
    #eigenValues = eigenValues.real
    #eigenVectors= eigenVectors.real

    #sort eigenvalues and eigenvectors in descending order
    idx = eigenValues.argsort()[::-1]
    eigenValues = eigenValues[idx]
    eigenVectors = eigenVectors[:, idx]

    #take the first d eigenvectors to obtain the solution
    P = eigenVectors[:, :d]
    X = P @ P.T

    if (calculate_objective == False):
        return [X, P]
    else:
        Obj_list = [
            alpha[i] * np.multiply(B[i], X).sum() + beta[i] for i in range(k)
        ]
        return [X, P, Obj_list]
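# Quick check of weightedPCA (illustrative; assumes this module's imports, numpy as np, and
# that input_check accepts these toy arguments):
#   B1 = np.diag([2.0, 0.5]); B2 = np.diag([0.5, 1.0])
#   X, P, Obj = weightedPCA(n=2, k=2, d=1, B=[B1, B2], calculate_objective=True)
# With equal weights the combined matrix is W = diag(1.25, 0.75), so P is (+/-) the first
# standard basis vector, X = diag(1, 0), and Obj = [<B1,X>, <B2,X>] = [2.0, 0.5].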
def fairDimReduction_MW(n,
                        k,
                        d,
                        B,
                        Obj='MM_Loss',
                        eta=1,
                        T=10,
                        verbose=False,
                        timed=True,
                        n_X_last=0,
                        return_time_only=False,
                        calculate_dual=False,
                        eps=1e-9,
                        stopping_gap=1e-6):
    '''
    Arguments:
        n: size of matrices in B
        k: number of objectives
        d: target dimension
        B: list of k matrices
        Obj: the objective to optimize. Must be MM_Var (maximize the minimum variance) or MM_Loss (default) (minimize the maximum loss; the reported value is the negative number, variance - best)
        
        *** Obj can also be NSW (Nash social welfare, which is the sum of log of variances across groups), but due to bad performance of MW when the dual space is not simplex, we do not recommend solving NSW by MW. Use Frank-Wolfe for NSW instead ***
        
        eta: learning rate of MW. See changing learning rate by putting eta as an array of T numbers in MW_for_PCA
        T: number of iteration
        verbose: will print the objective value in each single iteration of MW. 
                 This may not be needed as the function will output pandas dataframe of all statistics already. 
        timed: will print amount of time used by this method in total, in seconds.
        n_X_last: the number of X_last I will keep. It will keep n_X_last last iterates' solutions.
        return_time_only: put this to true if one wants to measure the runtime only. It will return only the time in seconds of this method.
        calculate_dual: if this is true, for each weight vector w during the multiplicative weight update, the method calculates the dual objective at w:
        D(w) := max_{n-by-n matrix X: tr(X)<=d, 0<=X<=I} {sum_{group i} w_i*<B_i,X>} - f_*(w).
        Here, f is the objective function (for example, with MM_Var, f(z_1,...,z_k)=min_i {z_i}), and f_* is the concave conjugate of f.

    Note on the dual formulation: 1) strong duality holds for concave f. 2) We maximize f, and try to minimize the dual objective D(w) over real vectors w. 3) Some of the concave conjugate functions (denote z_i=<B_i,X>):
        f = MM_Var = min_i {z_i}
            --> f_*(w) = 0 if w>=0 and w_1+...+w_k = 1; = -\infty otherwise.
        f = MM_Loss = min_i {z_i + beta_i} for beta_i the best variance of group i
            --> f_*(w) = -sum_i w_i*beta_i if w>=0 and w_1+...+w_k = 1; = -\infty otherwise.
        f = NSW = sum_i {log z_i}
            --> f_*(w) = sum_i {1 + log w_i} for w > 0; -\infty otherwise
            
        eps: numerical error threshold for checking if w satisfies sum w_i = 1 and w>=0
        stopping_gap: if not None and positive, and if calculate_dual is true, the MW will stop automatically when primal and dual is no more than the gap specified. By default, this is set to 1e-6.
    Output:
    [X_last,X_avg,runstats]
        X_last: n x n matrix X from the last iterate.
        X_avg: average of the n x n matrices X over the T iterates.
        runstats: value of weights and objectives in each iterate
    if n_X_last > 0:
        X_last: list of n_X_last n x n matrices X from the last n_X_last iterates (ordered from (T - n_X_last +1)th iterates till the very last iterate).
        
    OR
    
    Output:
    runtime
        runtime: time in seconds it takes to run all iterations. (Helpful for checking the running time)
    '''

    #input check
    if (input_check(n, k, d, B, function_name='fairDimReduction_MW') > 0):
        return -1

    #we just take Obj and convert that into alpha-beta notation for MW for PCA
    if (Obj == 'MM_Loss'):
        #best possible PCA projection for each group is easy to calculate: take d best eigenvalues
        best = [
            np.sum(np.sort(np.linalg.eigvalsh(B[i]))[-d:]) for i in range(k)
        ]

        #shift the objective by the best possible PCA for that single group
        beta = np.multiply(-1, best)

        #no need to modify anything in MW method
        primal_function = None

    elif (Obj == 'MM_Var'):
        beta = np.zeros(k)

        #no need to modify anything in MW method
        primal_function = None

    elif (Obj == 'NSW'):
        beta = np.zeros(k)

        #modify the objective, since NSW is not the min max form covered by MW method as it is
        def primal_function(B, X):
            utility = 0
            for i in range(len(B)):
                dot_product = np.multiply(B[i], X).sum()
                if (dot_product < 0):
                    print("Warning: the dot product <B[", i,
                          "],X> is not positive. The value is", dot_product)
                    print("Eigenvalues of X is", np.linalg.eig(X)[0])
                    print("Eigenvalues of B[i] is", np.linalg.eig(B[i])[0])
                utility += math.log(dot_product)
            return utility
    else:

        print(
            "Error:fairDimReduction_MW is called with an invalid input objective."
        )
        return -1

    #specify the dual objective, if need to
    dual_function = None
    if (calculate_dual):
        if (Obj == 'MM_Var'):

            def dual_function(w, B, X):
                if (abs(np.sum(w) - 1) > eps):
                    print(
                        "Warning: dual is infeasible with w not summing to 1")
                if (np.amin(w) < 0):
                    print("Warning: dual is infeasible with some w_i < 0")

                weighted_matrix = np.full_like(
                    B[0], 0
                )  #create a matrix of same size as first B initially as 0 matrix
                for i in range(len(B)):
                    weighted_matrix += w[i] * B[i]
                return np.sum(
                    np.multiply(weighted_matrix,
                                X))  #dot product <sum {w_i B_i}, X> - f_*(w)
        elif (Obj == 'MM_Loss'):

            def dual_function(w, B, X):
                if (abs(np.sum(w) - 1) > eps):
                    print(
                        "Warning: dual is infeasible with w not summing to 1")
                if (np.amin(w) < 0):
                    print("Warning: dual is infeasible with some w_i < 0")

                weighted_matrix = np.full_like(
                    B[0], 0
                )  #create a matrix of same size as first B initially as 0 matrix
                for i in range(len(B)):
                    weighted_matrix += w[i] * B[i]
                return np.sum(np.multiply(weighted_matrix, X)) + np.sum(
                    np.multiply(w, beta)
                )  #dot product <sum {w_i B_i}, X> - f_*(w) and f_*(z) = -sum_i w_i*beta_i

        elif (Obj == 'NSW'):

            def dual_function(w, B, X):
                if (np.amin(w) < 0):
                    print(
                        "Warning: dual is infeasible with some w_i < 0. The minimum found is",
                        np.amin(w))
                elif (np.amin(w) == 0):
                    return float("inf")  #log(0) is -infinity

                dual = 0
                for i in range(len(B)):
                    dual += np.multiply(B[i], X).sum() - 1 - math.log(w[i])
                return dual

    if (stopping_gap <= 0): stopping_gap = None

    start = timeit.default_timer()
    [X_last, X_avg, Obj] = MW_for_PCA(n,
                                      k,
                                      d,
                                      B,
                                      weight=None,
                                      alpha=None,
                                      beta=beta,
                                      eta=eta,
                                      T=T,
                                      verbose=verbose,
                                      n_X_last=n_X_last,
                                      dual_function=dual_function,
                                      stopping_gap=stopping_gap,
                                      primal_function=primal_function,
                                      NSW_update=(Obj == 'NSW'))

    stop = timeit.default_timer()
    if (timed):
        print("fairDimReduction_MW is called. Total time used is: ",
              stop - start, " seconds.")

    best_obj = max([
        Obj['minimum of m objective, that iterate'].max(),
        Obj['minimum of m objective, avg iterate'].max()
    ])
    print(
        'The best solution found from avg and single iterate achieves primal',
        best_obj)
    if (calculate_dual):
        #the 'dual objective' column only exists when the dual was calculated
        best_dual = Obj['dual objective'].min()
        print('The dual is', best_dual, '. Gap is', best_dual - best_obj,
              'which is',
              abs((best_dual - best_obj) / best_dual) * 100, '%.')

    if (return_time_only):
        return stop - start

    return [X_last, X_avg, Obj]
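# Illustrative call (assumes the module-level imports used above, e.g. numpy, pandas, timeit,
# and toy matrices B1, B2 as in the earlier sketches):
#   X_last, X_avg, stats = fairDimReduction_MW(n=2, k=2, d=1, B=[B1, B2], Obj='MM_Var',
#                                              eta=0.5, T=50, calculate_dual=True)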
def FW(n,
       k,
       d,
       B,
       Obj='NSW',
       delta=1e-4,
       start_solution='uniform',
       update_rule='1/t',
       duality_gap=1e-4,
       num_iterations=None):
    """
    Solve the fair PCA with social welfare objective specified by Obj
        
    The algorithm is Frank-Wolfe. The parameters update_rule, duality_gap, and num_iterations are parameters of the Frank-Wolfe algorithm.
    
    It returns the last iterate of Frank-Wolfe.
    Arguments:
        n: original dimension of the data
        k: number of groups
        d: target dimension
        B: list of all k groups' data. Must be a list of k n-by-n matrices.
        delta: buffer in the objective function for its stability and Lipschitzness. 
        start_solution: options for the starting solution X_0.
            'uniform' --> X_0 = d/n I_n
            'standard_PCA' --> [not yet implemented]
        update_rule, duality_gap, num_iterations: see documentation in FrankWolfe method.
    """

    #input check
    if (input_check(n, k, d, B, function_name='FW_NSW') > 0):
        return -1

    #specify parameters to feed to FrankWolfe algorithm
    if (start_solution == 'uniform'):
        init_X = d * np.eye(n) / n
    else:
        print(
            "Warning: starting X_0 for FW_NSW not yet implemented. Using uniform as starting rule"
        )
        init_X = d * np.eye(n) / n

    def linear_oracle(G):
        P = std_PCA(G, d)  #this is the n x d matrix of the d top eigenvectors (singular vectors) of G
        return P @ P.T  #return an n x n matrix

    #define functions based on objective function f
    if (Obj == 'NSW'):
        #if using line_search, we also need to put the function definition into FrankWolfe as well
        def primal(X):
            #return sum of log of variances
            return sum(
                [np.log(np.sum(np.multiply(B[i], X))) for i in range(k)])

        def grad_function(X):
            #sum of 1/(<B_i,X>+delta)*B[i] is the gradient of f(X)
            return sum([
                1 / (np.multiply(B[i], X).sum() + delta) * B[i]
                for i in range(len(B))
            ])

    elif (Obj == 'MM_Var'):

        def primal(X):
            return np.min([np.sum(np.multiply(B[i], X)) for i in range(k)])

        def grad_function(X):
            #B[j] of group j with lowest objective
            return B[np.argmin(
                [np.sum(np.multiply(B[i], X)) for i in range(k)])]

    elif (Obj == 'MM_Loss'):
        #the best possible variance for each group. Constant independent of X
        best = [
            np.sum(np.sort(np.linalg.eigvalsh(B[i]))[-d:]) for i in range(k)
        ]

        def primal(X):
            return np.min(
                [np.sum(np.multiply(B[i], X)) - best[i] for i in range(k)])

        def grad_function(X):
            #B[j] of group j with lowest objective

            #return B[ np.argmin([np.sum(np.multiply(B[i],X)) - best[i] for i in range(k)]) ]

            #-------------------
            #try softmax
            lam = -50  #should be -log k / eps for error eps. Use negative for softmin. positive for softmax
            w = dict()  #weights of groups
            for i in range(k):
                #weight of group i
                w[i] = 0
                is_0_w = False  #set to True when the weight of group i is negligibly small; reset for every group
                #find the sum in the denominator
                for j in range(k):
                    exponent = lam * (np.sum(np.multiply(B[j] - B[i], X)) -
                                      best[j] + best[i])
                    #if the exponent is very low, ignore the term. If it is very high, this weight (after inverting) is essentially 0
                    if exponent < -20:
                        continue
                    elif exponent > 20:
                        is_0_w = True
                        break  #done with this w_i
                    else:
                        w[i] += math.exp(exponent)
                if is_0_w:
                    w[i] = 0
                else:
                    w[i] = 1 / w[i]

            #for checking
            print('sanity check: sum of w is', sum([w[i] for i in range(k)]),
                  'w is', [w[i] for i in range(k)])

            return sum([w[i] * B[i] for i in range(k)])
    else:
        print('Error: objective for FW is invalid. Return None for FW method.')
        return None

    #perform FrankWolfe
    X_final, dual_gap = FrankWolfe(init_X,
                                   grad_function=grad_function,
                                   linear_oracle=linear_oracle,
                                   update_rule=update_rule,
                                   duality_gap=duality_gap,
                                   num_iterations=num_iterations,
                                   function=primal)

    primal = primal(X_final)
    dual = primal + dual_gap

    if (Obj == 'NSW'):
        print('NSW primal value (sum of log) is', primal,
              '. The (multiplicative) gap of product objective is ',
              (np.exp(dual_gap) - 1) * 100, '%.')

    elif (Obj == 'MM_Var') or (Obj == 'MM_Loss'):
        print(Obj, 'primal value is', primal, '. Dual is', dual,
              '. The gap is', dual_gap, ', which is',
              abs((dual - primal) / dual) * 100, '%.')

    return X_final
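# std_PCA(G, d) and FrankWolfe(...) are used above but defined elsewhere in the project.
# A plausible sketch of std_PCA (hypothetical; assumes G is a symmetric real matrix): return
# the n x d matrix of the top-d eigenvectors, as the linear oracle expects.
import numpy as np

def std_PCA(G, d):
    eigenvalues, eigenvectors = np.linalg.eigh(G)
    idx = eigenvalues.argsort()[::-1]  #sort in descending order of eigenvalue
    return eigenvectors[:, idx[:d]]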