Beispiel #1
0
def test_manova_no_formula_no_hypothesis():
    """Smoke-test ``mv_test`` on array inputs with the default hypotheses.

    The design matrix is assembled by hand (a constant plus the dummy-coded
    ``Loc`` column with its first level dropped) rather than through the
    formula interface. Only the type of the returned object is checked.
    """
    responses = X[['Basal', 'Occ', 'Max']]
    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    outcome = MANOVA(responses, design).mv_test()
    assert isinstance(outcome, MultivariateTestResults)
Beispiel #2
0
def test_manova_no_formula_no_hypothesis():
    """Smoke-test ``mv_test`` on array inputs with the default hypotheses.

    The design matrix is assembled by hand (a constant plus the dummy-coded
    ``Loc`` column with its first level dropped) rather than through the
    formula interface. Only the type of the returned object is checked.
    """
    responses = X[['Basal', 'Occ', 'Max']]
    design = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    outcome = MANOVA(responses, design).mv_test()
    assert isinstance(outcome, MultivariateTestResults)
Beispiel #3
0
def test_manova_no_formula():
    """Check the ``Loc`` statistics when hypotheses are given as matrices.

    The model is built from arrays (no formula), and the intercept / ``Loc``
    contrasts are passed explicitly to ``mv_test``. Every cell of the
    resulting ``stat`` table for ``Loc`` is compared with a reference value.
    """
    exog = add_constant(pd.get_dummies(X[['Loc']], drop_first=True))
    endog = X[['Basal', 'Occ', 'Max']]
    mod = MANOVA(endog, exog)

    # Rows of a 3x3 identity: the first row selects the constant, the
    # remaining two rows select the two dummy columns.
    contrasts = np.eye(3)
    hypotheses = [('Intercept', contrasts[:1]), ('Loc', contrasts[1:])]
    stat = mod.mv_test(hypotheses)['Loc']['stat']

    wilks = "Wilks' lambda"
    pillai = "Pillai's trace"
    hotelling = "Hotelling-Lawley trace"
    roy = "Roy's greatest root"

    # (row label, column label, reference value, decimals compared)
    expected = [
        (wilks, 'Value', 0.60143661, 8),
        (pillai, 'Value', 0.44702843, 8),
        (hotelling, 'Value', 0.58210348, 8),
        (roy, 'Value', 0.35530890, 8),
        (wilks, 'F Value', 0.77, 2),
        (pillai, 'F Value', 0.86, 2),
        (hotelling, 'F Value', 0.75, 2),
        (roy, 'F Value', 1.07, 2),
        (wilks, 'Num DF', 6, 3),
        (pillai, 'Num DF', 6, 3),
        (hotelling, 'Num DF', 6, 3),
        (roy, 'Num DF', 3, 3),
        (wilks, 'Den DF', 16, 3),
        (pillai, 'Den DF', 18, 3),
        (hotelling, 'Den DF', 9.0909, 4),
        (roy, 'Den DF', 9, 3),
        (wilks, 'Pr > F', 0.6032, 4),
        (pillai, 'Pr > F', 0.5397, 4),
        (hotelling, 'Pr > F', 0.6272, 4),
        (roy, 'Pr > F', 0.4109, 4),
    ]
    for row, column, reference, decimals in expected:
        assert_almost_equal(stat.loc[row, column], reference,
                            decimal=decimals)
def runFeatureReduce():
    """Run MANOVA, OLS, ANOVA and t-test on the dataset and log the results.

    Everything printed while the analysis runs is redirected into
    ``./best/manova.txt``. Per-feature p-values and per-group means are also
    written to CSV files in the current directory. Samples whose label is
    ``0`` are treated as controls, all others as cases.

    ``sys.stdout`` is always restored and the log file is always closed,
    even if one of the analysis steps raises.
    """
    orig_stdout = sys.stdout
    with open('./best/manova.txt', 'w') as f:
        sys.stdout = f
        try:
            print("Loading dataset...")
            X, y = loadDataset()

            # NOTE(review): X is passed as endog and y as exog here --
            # confirm this is the intended orientation.
            maov = MANOVA(X, y)

            print(len(X))
            print(len(X[0]))
            print(len(y))

            print(maov.mv_test())

            est = sm.OLS(y, X)
            est2 = est.fit()
            print(est2.summary())

            # Split the feature rows by label: 0 -> controls, else -> cases.
            cases = []
            controls = []
            n_features = len(X[0])
            for i in range(len(y)):
                row = [X[i, j] for j in range(n_features)]
                if y[i] == 0:
                    controls.append(row)
                else:
                    cases.append(row)

            controls = np.asarray(controls)
            cases = np.asarray(cases)

            ttest, pval = stats.f_oneway(controls, cases)
            print("p-value ANOVA", pval)
            pd.DataFrame(pval).to_csv("./pANOVA.csv", header=None, index=None)

            ttest, pval = stats.ttest_ind(controls, cases)
            print("p-value Two sampled T-test", pval)
            pd.DataFrame(pval).to_csv("./pttestInd.csv", header=None,
                                      index=None)

            meanControls = np.mean(controls, axis=0)
            print(meanControls)
            pd.DataFrame(meanControls).to_csv("./meanControls.csv",
                                              header=None, index=None)

            meanCases = np.mean(cases, axis=0)
            print(meanCases)
            pd.DataFrame(meanCases).to_csv("./meanCases.csv", header=None,
                                           index=None)
        finally:
            # Undo the redirection before the file is closed by ``with``.
            sys.stdout = orig_stdout
    return
Beispiel #5
0
class MANOVAAnalyzer:
    """Multivariate ANOVA analyzer class."""

    def __init__(self, independent_variables, dependent_variables):
        """Build the MANOVA model.

        :param independent_variables: passed to ``MANOVA`` as ``exog``.
        :param dependent_variables: passed to ``MANOVA`` as ``endog``.
        """
        self.model = MANOVA(dependent_variables, independent_variables)
        try:
            self.model.fit()
        except NotImplementedError:
            # NOTE(review): statsmodels' MANOVA is believed to raise
            # NotImplementedError from ``fit`` ("fit is not needed to use
            # MANOVA"); ``mv_test`` can then be called directly. Confirm
            # against the installed statsmodels version.
            pass

    def analyze(self):
        """Run the multivariate tests and wrap them in a ``MANOVAAnalysis``."""
        # self.model.mv_test() is of type MultivariateTestResults
        return MANOVAAnalysis(self.model.mv_test())
Beispiel #6
0
def get_best_thresh_for_layer(min_p1, min_p2, x_org, thresh_abs_mag, method=1, constrain=0, cur_layer=0, supervised=False, truth=[], res=40, plot=False):
    """Score candidate consensus thresholds for one layer and pick one.

    Candidate thresholds are the multiples ``i / res`` (``1 <= i < res``)
    that fall inside the central band of width ``thresh_abs_mag``, rounded
    to three decimals. For each candidate, the two partitions are merged
    with ``BGC.basic_consensus_two`` and the resulting labelling is scored
    with silhouette, Calinski-Harabasz, Davies-Bouldin, the number of
    clusters and the Hotelling-Lawley F value of a MANOVA (0 when MANOVA
    raises ``ValueError``). With ``supervised=True`` a micro-averaged F1
    score against ``truth`` is appended. A single-cluster labelling gets the
    fixed scores ``[0, 0, 1, 1, 0]`` (plus ``0`` when supervised).

    :return: whatever ``naive_max_thresh`` returns for the collected scores.
    """
    tick = 1 / res
    margin = (1 - thresh_abs_mag) / 2
    grid = (tick * step for step in range(1, res))
    thresholds_values = np.array(
        [value for value in grid if margin <= value <= 1 - margin]
    ).round(decimals=3)

    partition_measures = []
    for thresh in thresholds_values:
        consensus, _unused = BGC.basic_consensus_two(
            min_p1, min_p2, BGC.jaccard_distance, thresh)
        labels = BGC.output_to_array(consensus, x_org.shape[0])

        # MANOVA can reject degenerate labellings; score those as 0.
        try:
            mv_results = MANOVA(endog=x_org, exog=labels).mv_test().results
            man_f_res = mv_results['x0']['stat']['F Value']['Hotelling-Lawley trace']
        except ValueError:
            man_f_res = 0

        n_clusters = len(set(labels))
        if n_clusters > 1:
            scores = [
                silhouette_score(x_org, labels),
                calinski_harabasz_score(x_org, labels),
                davies_bouldin_score(x_org, labels),
                n_clusters,
                man_f_res,
            ]
            if supervised:
                scores.append(f1_score(truth, labels, average='micro'))
        else:
            scores = [0, 0, 1, 1, 0]
            if supervised:
                scores.append(0)
        partition_measures.append(scores)

    # ``constrain`` is only forwarded when it was actually requested.
    options = dict(method=method, cur_layer=cur_layer, plot=plot,
                   supervised=supervised)
    if constrain != 0:
        options['constrain'] = constrain
    return naive_max_thresh(thresholds_values, partition_measures, tick,
                            **options)
Beispiel #7
0
def compute_manova_cvg(topdir: str, m: int):
    """Fit a MANOVA of ``K`` and ``N`` on tolerance and neighbour count.

    For every (neighbours, tolerance) combination the stored runs in
    ``<topdir>/nn_<tol>_<n>`` are loaded with ``ac.compute_stored_runs`` and
    tagged with the numeric ``TOL`` and ``NNN`` of that case. All case
    tables are concatenated and the model
    ``K + N ~ TOL + NNN + NNN:TOL`` is built from them. The model and its
    multivariate test results are printed.

    :param topdir: directory that contains the per-case sub-directories.
    :param m: forwarded unchanged to ``ac.compute_stored_runs``.
    :return: the ``MANOVA`` model object.
    """
    # Assemble a large experiment table with all data
    neighbors = ["5", "10", "15", "20"]
    tolerances = ['0.0', '0.2', '0.4', '0.6', '0.8', '1.0']
    dfs = []

    for n in neighbors:
        for tol in tolerances:
            casedir = f"{topdir}/nn_{tol}_{n}"
            casetable = ac.compute_stored_runs(casedir, m, None)
            # Scalar assignment broadcasts to every row, so the table no
            # longer has to contain exactly five runs.
            # assumes casetable is a pandas DataFrame -- TODO confirm
            casetable['TOL'] = float(tol)
            casetable['NNN'] = float(n)
            dfs.append(casetable)

    df = pd.concat(dfs).reset_index(drop=True)

    # Perform a regression with the data
    mod = MANOVA.from_formula('K + N ~ TOL + NNN + NNN:TOL', data=df)
    print(mod)
    result = mod.mv_test()
    print(result)
    return mod
 def run_manova(self):
     """Run a two-factor MANOVA on ``self.data.feature_df`` and print it.

     The responses ``cpt1``, ``dept1`` and ``jelt1`` are modelled on the
     categorical factors ``a01`` and ``a08`` and their interaction. The
     type of the test result and its summary are printed; nothing is
     returned.
     """
     # https://stackoverflow.com/questions/51553355/how-to-get-pvalue-from-statsmodels-manova
     model = MANOVA.from_formula(
         'cpt1 + dept1 + jelt1 ~ C(a01) + C(a08) + C(a01) * C(a08)',
         self.data.feature_df)
     test_results = model.mv_test()
     print(type(test_results))
     print(test_results.summary())
 def fit_linear_reg2(self,X,y):
     """Run a MANOVA of ``X`` on ``y`` and return headline statistics.

     :param X: object exposing ``.values``; used as the left-hand side.
     :param y: object exposing ``.values``; used as the right-hand side.
     :return: ``(cell [0, 0], cell [0, 3], cell [0, 4], table)`` of the
         ``stat`` table for the ``y.values`` term. NOTE(review): presumably
         Wilks' lambda, its F value and its p-value -- confirm the table
         layout.
     """
     combined = pd.concat([X, y], axis=1)
     # The formula refers to the local names ``X`` and ``y``; they must stay
     # bound under exactly these names in this frame.
     model = MANOVA.from_formula('X.values~ y.values', data=combined)
     table = model.mv_test().results['y.values']['stat']
     first_row = table.iloc[0]
     return first_row.iloc[0], first_row.iloc[3], first_row.iloc[4], table
Beispiel #10
0
def global_threshold_consensus(partition_list_in, x_org, constrain=0, thresh_abs_mag=1, method=1, res=40, plot=False, supervised=False, truth=[]):
    """Find a global consensus threshold and return the consensus it gives.

    Candidate thresholds are the multiples ``i / res`` (``1 <= i < res``)
    inside the central band of width ``thresh_abs_mag``, rounded to three
    decimals. Each candidate is scored exactly as in the per-layer search:
    silhouette, Calinski-Harabasz, Davies-Bouldin, cluster count and the
    Hotelling-Lawley F value of a MANOVA (0 on ``ValueError``), plus a
    micro-averaged F1 score when ``supervised``. ``naive_max_thresh`` picks
    the threshold, and the consensus is recomputed at that value.

    :return: the result of ``BGC.basic_consensus`` at the chosen threshold.
    """
    tick = 1 / res
    margin = (1 - thresh_abs_mag) / 2
    grid = (tick * step for step in range(1, res))
    thresholds_values = np.array(
        [value for value in grid if margin <= value <= 1 - margin]
    ).round(decimals=3)

    print("Beginning Analysis of ideal global threshold value...")
    partition_measures = []
    for thresh in thresholds_values:
        consensus, _unused = BGC.basic_consensus(partition_list_in, thresh)
        labels = BGC.output_to_array(consensus, x_org.shape[0])

        # MANOVA can reject degenerate labellings; score those as 0.
        try:
            mv_results = MANOVA(endog=x_org, exog=labels).mv_test().results
            man_f_res = mv_results['x0']['stat']['F Value']['Hotelling-Lawley trace']
        except ValueError:
            man_f_res = 0

        n_clusters = len(set(labels))
        if n_clusters > 1:
            scores = [
                silhouette_score(x_org, labels),
                calinski_harabasz_score(x_org, labels),
                davies_bouldin_score(x_org, labels),
                n_clusters,
                man_f_res,
            ]
            if supervised:
                scores.append(f1_score(truth, labels, average='micro'))
        else:
            scores = [0, 0, 1, 1, 0]
            if supervised:
                scores.append(0)
        partition_measures.append(scores)

    best = naive_max_thresh(thresholds_values, partition_measures, tick,
                            method=method, constrain=constrain, plot=plot,
                            supervised=supervised)
    out = BGC.basic_consensus(partition_list_in, best)
    final_labels = BGC.output_to_array(out[0], x_org.shape[0])
    print("Threshold:", best, ": Number of Clusters", len(set(final_labels)))
    print("Consensus Achieved")
    return out
 def fit_linear_reg(self,X,y):
     """Run a MANOVA of ``X`` (plus a constant column) on ``y``.

     A column of ones named ``constant`` is appended to ``X`` before the
     model is built.

     :param X: frame-like with ``.shape`` and ``.values``.
     :param y: object exposing ``.values``; used as the right-hand side.
     :return: ``(cell [0, 0], cell [0, 3], cell [0, 4], table)`` of the
         ``stat`` table for the ``y.values`` term. NOTE(review): presumably
         Wilks' lambda, its F value and its p-value -- confirm the table
         layout.
     """
     ones = pd.DataFrame({'constant': np.ones(X.shape[0])})
     # ``X`` is rebound on purpose: the formula below resolves the local
     # names ``X`` and ``y`` from this frame.
     X = pd.concat([X, ones], axis=1)
     combined = pd.concat([X, y], axis=1)
     model = MANOVA.from_formula('X.values~ y.values', data=combined)
     table = model.mv_test().results['y.values']['stat']
     first_row = table.iloc[0]
     return first_row.iloc[0], first_row.iloc[3], first_row.iloc[4], table
Beispiel #12
0
def mvsExp(exps):
    """Print a MANOVA and three per-response regressions for ``exps``.

    First the multivariate test of ``rise_times``, ``errors`` and ``energy``
    against ``ce`` is printed, then one OLS summary per response using the
    predictors ``cr``, ``ce``, ``cs`` and ``cg``.

    :param exps: data passed as ``data=`` to both formula interfaces.
    """
    # MANOVA
    multivariate = MANOVA.from_formula(
        'rise_times + errors + energy ~ ce', data=exps)
    print(multivariate.mv_test())

    # Multiple linear regression, one model per response
    for response in ('rise_times', 'errors', 'energy'):
        fitted = ols(formula=response + ' ~ cr + ce + cs + cg',
                     data=exps).fit()
        print(fitted.summary())
Beispiel #13
0
def test_manova_test_input_validation():
    """Check that ``mv_test`` validates the shapes of the hypothesis matrices.

    A contrast matrix L with three columns is accepted while one with two
    columns raises ``ValueError``; likewise a transform matrix M with three
    rows is accepted while one with two rows raises ``ValueError``.
    """
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)

    # Contrast matrix L: 3 columns pass, 2 columns fail.
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    # Disabled stricter check on the error message:
    # assert_raises_regex(ValueError,
    #                     ('Contrast matrix L should have the same number of '
    #                      'columns as exog! 2 != 3'),
    #                     mod.mv_test, hypothesis)

    # Transform matrix M: 3 rows pass, 2 rows fail.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
Beispiel #14
0
def test_manova_sas_example():
    """Compare the ``Loc`` statistics of the formula model with SAS output.

    Every cell of the ``stat`` table for the ``Loc`` term is checked against
    a reference value.
    """
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    stat = mod.mv_test()['Loc']['stat']

    wilks = "Wilks' lambda"
    pillai = "Pillai's trace"
    hotelling = "Hotelling-Lawley trace"
    roy = "Roy's greatest root"

    # (row label, column label, reference value, decimals compared)
    expected = [
        (wilks, 'Value', 0.60143661, 8),
        (pillai, 'Value', 0.44702843, 8),
        (hotelling, 'Value', 0.58210348, 8),
        (roy, 'Value', 0.35530890, 8),
        (wilks, 'F Value', 0.77, 2),
        (pillai, 'F Value', 0.86, 2),
        (hotelling, 'F Value', 0.75, 2),
        (roy, 'F Value', 1.07, 2),
        (wilks, 'Num DF', 6, 3),
        (pillai, 'Num DF', 6, 3),
        (hotelling, 'Num DF', 6, 3),
        (roy, 'Num DF', 3, 3),
        (wilks, 'Den DF', 16, 3),
        (pillai, 'Den DF', 18, 3),
        (hotelling, 'Den DF', 9.0909, 4),
        (roy, 'Den DF', 9, 3),
        (wilks, 'Pr > F', 0.6032, 4),
        (pillai, 'Pr > F', 0.5397, 4),
        (hotelling, 'Pr > F', 0.6272, 4),
        (roy, 'Pr > F', 0.4109, 4),
    ]
    for row, column, reference, decimals in expected:
        assert_almost_equal(stat.loc[row, column], reference,
                            decimal=decimals)
Beispiel #15
0
def test_manova_test_input_validation():
    """Check that ``mv_test`` validates the shapes of the hypothesis matrices.

    A contrast matrix L with three columns is accepted while one with two
    columns raises ``ValueError``; likewise a transform matrix M with three
    rows is accepted while one with two rows raises ``ValueError``.
    """
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)

    # Contrast matrix L: 3 columns pass, 2 columns fail.
    hypothesis = [('test', np.array([[1, 1, 1]]), None)]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1]]), None)]
    assert_raises(ValueError, mod.mv_test, hypothesis)
    # Disabled stricter check on the error message:
    # assert_raises_regex(ValueError,
    #                     ('Contrast matrix L should have the same number of '
    #                      'columns as exog! 2 != 3'),
    #                     mod.mv_test, hypothesis)

    # Transform matrix M: 3 rows pass, 2 rows fail.
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1], [1]]))]
    mod.mv_test(hypothesis)
    hypothesis = [('test', np.array([[1, 1, 1]]), np.array([[1], [1]]))]
    assert_raises(ValueError, mod.mv_test, hypothesis)
Beispiel #16
0
def test_manova_sas_example():
    """Compare the ``Loc`` statistics of the formula model with SAS output.

    Every cell of the ``stat`` table for the ``Loc`` term is checked against
    a reference value.
    """
    # Results should be the same as figure 4.5 of
    # https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/
    # viewer.htm#statug_introreg_sect012.htm
    mod = MANOVA.from_formula('Basal + Occ + Max ~ Loc', data=X)
    stat = mod.mv_test()['Loc']['stat']

    wilks = "Wilks' lambda"
    pillai = "Pillai's trace"
    hotelling = "Hotelling-Lawley trace"
    roy = "Roy's greatest root"

    # (row label, column label, reference value, decimals compared)
    expected = [
        (wilks, 'Value', 0.60143661, 8),
        (pillai, 'Value', 0.44702843, 8),
        (hotelling, 'Value', 0.58210348, 8),
        (roy, 'Value', 0.35530890, 8),
        (wilks, 'F Value', 0.77, 2),
        (pillai, 'F Value', 0.86, 2),
        (hotelling, 'F Value', 0.75, 2),
        (roy, 'F Value', 1.07, 2),
        (wilks, 'Num DF', 6, 3),
        (pillai, 'Num DF', 6, 3),
        (hotelling, 'Num DF', 6, 3),
        (roy, 'Num DF', 3, 3),
        (wilks, 'Den DF', 16, 3),
        (pillai, 'Den DF', 18, 3),
        (hotelling, 'Den DF', 9.0909, 4),
        (roy, 'Den DF', 9, 3),
        (wilks, 'Pr > F', 0.6032, 4),
        (pillai, 'Pr > F', 0.5397, 4),
        (hotelling, 'Pr > F', 0.6272, 4),
        (roy, 'Pr > F', 0.4109, 4),
    ]
    for row, column, reference, decimals in expected:
        assert_almost_equal(stat.loc[row, column], reference,
                            decimal=decimals)
Beispiel #17
0
def manova(datacol, label, variable_cols):
    """
    Run a MANOVA with dataset membership as the factor, e.g. to look for
    batch effects: is a significant share of the variance explained by which
    dataset a sample came from?
    See https://www.statsmodels.org/stable/generated/statsmodels.multivariate.manova.MANOVA.html

    :param datacol: A DataCollection object storing the datasets
    :param label: Name of the label column that is created and acts as the factor
    :param variable_cols: Subset of features used as the MANOVA variables
    :return: The object produced by ``mv_test().summary()`` for the fitted model.
    """
    # One frame holding all datasets, tagged by the dataset they came from.
    combined = datacol.combine_dfs(label, variable_cols)

    # Variables on the left-hand side, the label on the right.
    formula = construct_formula(label, variable_cols, label_side="r")

    model = MANOVA.from_formula(formula, combined)
    return model.mv_test().summary()
Beispiel #18
0
def Hypo5():
    """Print several tests relating group membership to comment counts.

    Loads the groups and comment counts from ``Luxury_vs_NonLuxury(False)``,
    then prints a one-way ANOVA, a Kruskal-Wallis test, a MANOVA, an OLS
    summary and a type-2 ANOVA table.

    :return: the DataFrame with the ``Groups`` and ``NbComments`` columns.
    """
    Groups, NbComments = Luxury_vs_NonLuxury(False)
    df = pd.DataFrame({'Groups': Groups, 'NbComments': NbComments})

    # NOTE(review): the labels are spelled 'Luxary' / 'NonLuxuary' -- confirm
    # they match what Luxury_vs_NonLuxury produces.
    counts = df['NbComments']
    luxury_counts = counts[df['Groups'] == 'Luxary']
    other_counts = counts[df['Groups'] == 'NonLuxuary']
    print(stats.f_oneway(luxury_counts, other_counts))

    # NOTE(review): this compares the label sequence against the counts
    # rather than two groups of counts -- confirm this is intended.
    print(stats.kruskal(Groups, NbComments))

    # NOTE(review): only one response is given on the left-hand side.
    maov = MANOVA.from_formula('Groups ~ C(NbComments)', data=df)
    print(maov.mv_test())

    fitted = ols('NbComments ~ Groups', data=df).fit()
    print(fitted.summary())
    print(sm.stats.anova_lm(fitted, typ=2))
    return df
Beispiel #19
0
    def test_statistic(self, matrix_X, matrix_Y):
        """
        Compute the MANOVA test statistic between two datasets using
        statsmodels.multivariate.manova's implementation.

        :param matrix_X: a [n*p] data matrix, n samples in p dimensions, where p >= 2
        :type matrix_X: 2D numpy.array

        :param matrix_Y: a [n*q] data matrix, n samples in q dimensions
        :type matrix_Y: 2D numpy.array

        :return: a pair containing:

            - :test_statistic: the manova test statistic
            - :test_statistic_metadata: a ``dict`` of metadata (always empty here)
        :rtype: float, dict

        **Example:**

        >>> import numpy as np
        >>> from mgcpy.independence_tests.manova import Manova

        >>> X = np.array([0.07487683, -0.18073412, 0.37266440, 0.06074847, 0.76899045,
        ...               0.51862516, -0.13480764, -0.54368083, -0.73812644, 0.54910974]).reshape(-1, 2)
        >>> Y = np.array([-1.31741173, -0.41634224, 2.24021815, 0.88317196, 2.00149312]).reshape(-1, 1)
        >>> manova = Manova()
        >>> manova_stat = manova.test_statistic(X, Y)
        """
        n_rows_x = matrix_X.shape[0]
        n_rows_y = matrix_Y.shape[0]
        assert n_rows_x == n_rows_y, "Matrices X and Y need to be of dimensions [n, p] and [n, q], respectively, where p can be equal to q"

        # Row 1, column 0 of the stat table; the original author identifies
        # this cell as Pillai's trace.
        stat_table = MANOVA(matrix_X, matrix_Y).mv_test().results['x0']['stat']
        self.test_statistic_ = stat_table.values[1, 0]
        self.test_statistic_metadata_ = {}

        return self.test_statistic_, self.test_statistic_metadata_
Beispiel #20
0
def multivariate_anova():
    """Write the smallest sliding-window MANOVA p-value per BoP to a file.

    For every BoP with more than two columns, a MANOVA of each window of
    three consecutive columns against age is fitted, and the minimum of the
    collected values is written to ``bop_manova.txt`` as
    ``<bop_name>\\t<p_value>``. Progress counters are printed along the way.

    The output file is opened in a ``with`` block so it is closed even when
    a model fit raises.

    :return: ``p_val_dic``, which is never filled and is therefore always an
        empty dict.
    """
    cpg_data = correct_cpg_data()
    print(1)
    cpg_to_bop = cpg_sites_to_bops.get_cpg_to_bop_dictionary(cpg_data)
    print(2)
    column_dict = get_column_dict(cpg_to_bop)
    print(3)

    # Drop the large intermediates before the fitting loop.
    del cpg_data
    del cpg_to_bop

    p_val_dic = {}
    with open('bop_manova.txt', 'w', encoding='utf-8') as file:
        file.write('BoP_name    p_value\n')
        ages = get_ages()
        for j, (bop_name, column_lst) in enumerate(column_dict.items()):
            size = len(column_lst)
            if size > 2:
                p_val_list = []
                for i in range(size - 2):
                    df = DataFrame({
                        'cpg1': column_lst[i],
                        'cpg2': column_lst[i + 1],
                        'cpg3': column_lst[i + 2],
                        'age': ages
                    })
                    model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age', df)
                    test = model.mv_test()
                    # NOTE(review): cell [3, 4] of the stat table --
                    # presumably the p-value of Roy's greatest root; confirm.
                    p_val_list.append(
                        test.results['age']['stat'].values[3, 4])
                minimum = min(p_val_list)
                file.write(bop_name + '\t' + str(minimum) + '\n')
            print(j)

    return p_val_dic
Beispiel #21
0
 def randomization(self):
     """Repeat the fit ``self.ni`` times on shuffled data and collect results.

     Each round shuffles the response column and the ``self.nc`` columns
     after it, rebuilds the derived frame with the ``cal_<self.no>`` method,
     keeps the ``Sub_train`` rows, runs a MANOVA of the descriptors on the
     shuffled response, refits ``self.model`` and scores its accuracy on the
     same rows.

     The per-round results are appended inside the loop, so the returned
     lists have one entry per round. Previously only the last round was
     recorded and ``self.ni == 0`` failed with a NameError.

     :return: ``(C, A)`` -- cell ``[0, 0]`` of each round's MANOVA stat
         table and each round's accuracy in percent.
     :raises ValueError: if ``self.no`` is not 1, 2, 3 or 4.
     """
     C, A = [], []
     for _ in range(0, self.ni):
         inc = self.df.iloc[:, 0:1]
         yr = self.shuffling(self.df.iloc[:, 1:2])
         c = self.nc
         cr = self.shuffling(self.df.iloc[:, 2:c + 2])
         xr = self.df.iloc[:, c + 2:]
         ndr = pd.concat([inc, yr, cr, xr], axis=1)
         if self.no == 1:
             dfbjr = self.cal_1(ndr.iloc[:, 0:-1], c)
         elif self.no == 2:
             dfbjr = self.cal_2(ndr.iloc[:, 0:-1], c)
         elif self.no == 3:
             dfbjr = self.cal_3(ndr.iloc[:, 0:-1], c)
         elif self.no == 4:
             dfbjr = self.cal_4(ndr.iloc[:, 0:-1], c)
         else:
             raise ValueError(
                 'self.no must be 1, 2, 3 or 4, got {!r}'.format(self.no))
         # Re-attach the last column of the original frame, which is
         # filtered on as 'Set' below.
         s = self.df.iloc[:, -1:]
         dfbjr = pd.concat([dfbjr, s], axis=1)
         dfbjtr = dfbjr[dfbjr['Set'] == 'Sub_train']
         # The formula below resolves the local names ``xrd`` and ``yr``;
         # they must stay bound under exactly these names.
         xrd = dfbjtr[self.desc]
         yr = dfbjtr[yr.columns]
         table = MANOVA.from_formula(
             'xrd.values~ yr.values',
             data=dfbjtr).mv_test().results['yr.values']['stat']
         self.model.fit(xrd, yr)
         ypr = self.model.predict(xrd)
         acc = accuracy_score(yr, ypr) * 100
         C.append(table.iloc[0, 0])
         A.append(np.mean(acc))
     return C, A
Beispiel #22
0
def mmr_with_fig(endog, exog, dataset, basepath):
    """Run a MANOVA plus per-response OLS fits and save a coefficient heatmap.

    The multivariate summary frame is written to
    ``<basepath>/multivariate_results.csv``. For every term the partial eta
    squared of its first stat row is printed. One OLS model per response is
    then fitted on ``dataset`` with Bonferroni-adjusted p-values, and a
    heatmap of the coefficients (annotated with the adjusted p-values,
    intercept excluded) is saved to ``<basepath>/univeriate.png``.

    The multivariate test is computed once and reused, and the per-response
    rows are collected in lists instead of using ``DataFrame.append``.

    :param endog: responses passed to ``MANOVA``.
    :param exog: predictors passed to ``MANOVA``; must expose ``.columns``.
    :param dataset: data used for the per-response OLS formulas.
    :param basepath: directory prefix for the two output files.
    :return: DataFrame of OLS coefficients, one row per response.
    """
    manova = MANOVA(endog=endog, exog=exog)
    mv_results = manova.mv_test()

    mv_results.summary_frame.to_csv(
        f"{basepath}/multivariate_results.csv")
    results = mv_results.results
    sig_key = []

    for key, (_, output) in zip(mv_results.exog_names, results.items()):
        stat = output["stat"]
        # First row of the stat table, selected by position.
        p_val = stat["Pr > F"].iloc[0]
        key = (" ").join(key.split("_"))
        if p_val < 0.05:
            sig_key.append((key, p_val))
        # partial eta square
        f_val = stat["F Value"].iloc[0]
        den_df = stat["Den DF"].iloc[0]
        num_df = stat["Num DF"].iloc[0]
        par_eta_sqr = num_df * f_val / (num_df * f_val + den_df)
        print("partical eta squared of {}: {}".format(key, par_eta_sqr))

    if not sig_key:
        sig_key.append(("None", "N/A"))

    predictors = exog.columns.tolist()
    term_labels = ["Intercept"] + predictors
    iv_formula = " + ".join(predictors)
    coef_rows = []
    pval_rows = []
    for dv in manova.endog_names:
        univeriate = smf.ols(formula=f"{dv} ~ {iv_formula}",
                             data=dataset).fit()
        print(univeriate.summary())
        p_adjust = multipletests(univeriate.pvalues,
                                 alpha=0.05,
                                 method="bonferroni")
        coef_rows.append(univeriate.params)
        df_p_adjust = pd.DataFrame(
            np.array([p_adjust[0], p_adjust[1]]).T,
            index=term_labels,
            columns=["Sig.", "p_adjusted"],
        )
        pval_rows.append(df_p_adjust.iloc[:, 1])
        print(df_p_adjust)
        # The last element of the multipletests result is the corrected alpha.
        print("Bonferroni corrected alpha (0.05): {}\n".format(p_adjust[-1]))

    df_coef = pd.DataFrame(coef_rows)
    df_pval = pd.DataFrame(pval_rows)

    df_coef.index = manova.endog_names
    df_pval.index = df_coef.index

    df_coef.columns = term_labels

    plt.figure(figsize=(13, 7))
    sns.heatmap(
        df_coef.iloc[:, 1:],
        cmap="PiYG_r",
        square=False,
        center=0,
        annot=df_pval.iloc[:, 1:],
    )
    plt.title("Full univariate results")
    plt.annotate(
        f"""
    * Value in each cell is Bonferroni corrected p-value.
    ** {sig_key[0][0]} is significant at multivatiate level.
       p = {sig_key[0][1]}""",
        (0, 0),
        (0, -70),
        xycoords="axes fraction",
        textcoords="offset points",
        va="top",
    )
    plt.tight_layout()
    plt.savefig(f"{basepath}/univeriate.png", dpi=300, transparent=True)
    return df_coef
def significanceTesting(featureDf2,
                        pairwiseClustersToCompare,
                        confidence=0.05,
                        foldchange=2,
                        responseCutoff=0.1,
                        errorCorrection='bonferroni'):
    """Find features that differ significantly between pairs of clusters.

    As written, the function prints ``featureDf2`` and then calls
    ``sys.exit(0)``, so none of the analysis below that call is reached.

    The code after that call runs a MANOVA of all feature columns on
    ``Cluster``. If its p-value is below ``confidence``, every cluster pair
    is compared feature by feature with a t-test or Mann-Whitney U test,
    and a feature is reported when its p-value passes the (optionally
    corrected) threshold and its absolute log2 fold change is at least
    ``log2(foldchange)``.

    :param featureDf2: frame whose last column is treated as the cluster
        label and whose other columns are features.
    :param pairwiseClustersToCompare: iterable of cluster pairs.
    :param confidence: significance level before correction.
    :param foldchange: minimum fold change for a feature to be reported.
    :param responseCutoff: group mean/median below which a fixed fold change
        (4 or 0.0001) is substituted for the ratio.
    :param errorCorrection: ``'bonferroni'`` divides ``confidence`` by the
        number of feature columns; ``'holm-bonferroni'`` additionally uses a
        rank-dependent threshold; any other value applies no correction.
    :return: ``(dataMatrix, significantArray)``; both are empty lists when
        the MANOVA is not significant.
    """
    # Number of feature columns (all columns except the last one).
    n = len(featureDf2.columns) - 1
    if errorCorrection == 'bonferroni':
        alpha = confidence / n
    else:
        alpha = confidence
    # De-duplicated list of pairs; not used further in this function.
    uniqueClusters = [
        list(x) for x in set(tuple(x) for x in pairwiseClustersToCompare)
    ]

    #Kruskal Wallis is unecessary; one way anova seems to be relatively robust to non-normality: http://www.biostathandbook.com/kruskalwallis.html
    # endog/exog are computed but not used; the model is built from the formula.
    endog = featureDf2.iloc[:, :-1]
    exog = featureDf2.iloc[:, -1]
    modelFormula = " + ".join("Q(\'" + featureDf2.columns[:-1] +
                              "\')") + " ~ Cluster"
    # NOTE(review): the next two lines look like debugging residue; the
    # process exits here and everything below is unreachable. Confirm
    # before removing.
    print(featureDf2)
    sys.exit(0)
    manova = MANOVA.from_formula(modelFormula, data=featureDf2)
    #Pillai's trace is most robust against deviations from assumptions of manova
    manovapval = manova.mv_test().results['Cluster']['stat'].iloc[1, 4]
    print(manovapval)
    #Need to think about how to handle multiple clusters; for now just iterate through all pairs
    if manovapval < confidence:
        allDataMatrices = []
        allSignificantDifferences = []
        for clustersToCompare in pairwiseClustersToCompare:
            comp1 = clustersToCompare[0]
            comp2 = clustersToCompare[1]
            group1 = featureDf2[featureDf2['Cluster'] == str(
                comp1)].iloc[:, :-1]
            group2 = featureDf2[featureDf2['Cluster'] == str(
                comp2)].iloc[:, :-1]
            # Overall Kruskal-Wallis comparison; only used for the print below.
            anova = scipy.stats.kruskal(group1, group2)
            pval2 = anova[1]
            stat = anova[0]
            if pval2 < 0.01:
                print('Different')
            significantArray = []
            allBoxPairs = []
            pvalList = []
            meanFoldChangeList = []
            medianFoldChangeList = []
            foldChangeList = []
            normalityList = []
            tempnormalityList = []

            # First pass: one p-value per feature column.
            for col in range(featureDf2.shape[1] - 1):
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]
                normalitypval = shapiro(group1)[1]
                normalitypval2 = shapiro(group2)[1]
                normalityCondition = False
                # NOTE(review): a Shapiro-Wilk p-value below 0.05 usually
                # indicates non-normal data, so this condition may be
                # inverted -- confirm.
                if normalitypval < 0.05 and normalitypval2 < 0.05:
                    normalityCondition = True
                    try:
                        pval = scipy.stats.ttest_ind(group1, group2)[1]
                    # NOTE(review): bare except hides every error and
                    # substitutes a non-significant p-value.
                    except:
                        pval = 0.5
                else:
                    try:
                        pval = scipy.stats.mannwhitneyu(group1, group2)[1]
                    # NOTE(review): bare except, as above.
                    except:
                        pval = 0.5
                pvalList.append(pval)
                tempnormalityList.append(normalityCondition)

            #For holm bonferroni
            ordered_pval_list = sorted(pvalList)

            # Second pass: apply the threshold and compute fold changes.
            for col in range(featureDf2.shape[1] - 1):
                pvalCondition = False
                foldChangeCondition = False
                group1 = featureDf2[featureDf2['Cluster'] == str(
                    comp1)].iloc[:, col]
                group2 = featureDf2[featureDf2['Cluster'] == str(
                    comp2)].iloc[:, col]

                pval = pvalList[col]
                if errorCorrection != 'holm-bonferroni':
                    if pval < alpha:
                        pvalCondition = True
                else:
                    # Rank of this p-value among all features (1-based);
                    # ties resolve to the first occurrence.
                    rank = ordered_pval_list.index(pval) + 1
                    modifiedAlpha = alpha / (n - rank + 1)
                    if pval < modifiedAlpha:
                        pvalCondition = True

                # Means are used when the normality flag is set, medians
                # otherwise. When a group falls below responseCutoff a fixed
                # fold change is substituted for the ratio.
                normalityCondition = tempnormalityList[col]
                if normalityCondition:
                    if np.nanmean(group1) < responseCutoff:
                        if np.nanmean(group2) >= responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmean(group2) < responseCutoff:
                            meanFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            meanFoldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                            foldChangeList.append(
                                np.nanmean(group1) / np.nanmean(group2))
                else:
                    if np.nanmedian(group1) < responseCutoff:
                        if np.nanmedian(group2) >= responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(0.0001)
                            foldChangeList.append(0.0001)
                    else:
                        if np.nanmedian(group2) < responseCutoff:
                            medianFoldChangeList.append(4)
                            foldChangeList.append(4)
                        else:
                            medianFoldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                            foldChangeList.append(
                                np.nanmedian(group1) / np.nanmedian(group2))
                # A feature is reported only if it passes both the p-value
                # and the fold-change criteria.
                if pvalCondition:
                    if abs(np.log2(foldChangeList[-1])) >= np.log2(foldchange):
                        significantArray.append(
                            featureDf2.columns.get_level_values('Feature')
                            [col])
                        allBoxPairs.append(
                            ((featureDf2.columns.get_level_values('Feature')
                              [col], str(comp1)),
                             (featureDf2.columns.get_level_values('Feature')
                              [col], str(comp2))))
                        normalityList.append(normalityCondition)

            # Two rows per pair: log2 fold change and -log10 p-value.
            foldChangeArray = np.log2(np.array(foldChangeList))
            pvalArray = -np.log10(np.array(pvalList))

            dataMatrix = np.vstack([foldChangeArray, pvalArray])
            allSignificantDifferences.append(significantArray)
            allDataMatrices.append(dataMatrix)
        # Union of significant features over all pairs.
        significantArray = list(set().union(*allSignificantDifferences))
        dataMatrix = np.vstack(allDataMatrices)
    else:
        significantArray = []
        dataMatrix = []

    print(significantArray)
    return dataMatrix, significantArray
# Strip the "s_count_" prefix from the strike-count labels.
df["strike_count"] = df["strike_count"].str.replace("s_count_", "")

# ------------ Check 2nd component ---------------

# Size of each group.
# NOTE(review): ``df`` is used as a grouped object here but as a plain frame
# on the line above -- the definitions are outside this view; confirm.
n = [len(df.get_group(gr)) for gr in groups]

# 1-based index of the embedding component to inspect.
# NOTE(review): the header says "2nd component" but c = 1 selects column 0.
c = 1

y = embeddings[:, c - 1]

# Scatter of group size against the selected component.
plt.scatter(n, y)
plt.show()

# ------------ MANOVA -------------------

# Ten components modelled on umpire plus ball count, strike count and their
# interaction.
manova = MANOVA.from_formula(
    "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+ball_count*strike_count", data=df)

table = manova.mv_test()

# One row per model term, taken from the first row of each term's stat table.
res = pd.DataFrame(
    {term: table.results[term]["stat"].iloc[0]
     for term in table.results}).T

components_names = [
    "Smaller",
    "Uncertain",
    "High inside excluded",
    "Wide bottom",
    "Wide middle",
    "Wide top",
    "NW/SE diagonal",
Beispiel #25
0
# Append the values of ``indice`` to ``Series`` as a new column named 'y'.
Series = pd.concat([
    Series.reset_index(drop=True),
    pd.DataFrame(indice.tolist(), columns=['y'])
],
                   axis=1)

#%%
####################################################
# MANOVA test (different statistics)
####################################################
import pandas as pd
from statsmodels.multivariate.manova import MANOVA

# All ticker columns as responses, with 'y' as the single factor.
maov = MANOVA.from_formula(
    'AA+AAL+AAP+AAPL+AB+ABBV+ABC+ABM+ABMD+ABT+ACAD+ACHN+ACIW+ACN+ACOR+ADBE+ADI+ADM+ADP+ADSK+AEE+AEO+AEP+AES+AFL+AG+AGIO+AGN+AIG+AINV+AIV+AKAM+AKS+ALK+ALL+ALNY+AMAT+AMD+AMGN+AMP+AMTD+AMZN+AN+ANTM+APA+APC+ARCC+ARLP+ARNA+ARR+ASH+ATI+ATVI+AUY+AVB+AVP+AVXL+AVY+AWK+AXP+AZN+BABA+BAC+BDX+BUD+CS+DAL+DD+FNMA+GOOG+GOOGL+LH+LLY+LUV+MO+MT+NAT+NLY+NVO+PAA+T+UA+UBS+WBA~ y',
    data=Series)
#%%
##############################################################################
# MANOVA result; H0: equality of the means given the covariances
##############################################################################
print(maov.mv_test())
#%%
##########################
# Tracy-Widom test
##########################

##########################
# TW distribution F1
##########################
# Smallest value of t_1 among the entries where F1 is at least 0.90.
f90 = t_1[F1 >= .90].min()
Beispiel #26
0
def manova(test_row, data, categorical):
    """Compare Wilks' lambda of two classes with and without a test row.

    The test row is appended to ``data`` so that it is label-encoded and
    standardised together with the reference rows, then removed again.
    The remaining rows are split on column ``10`` (``0`` = "good",
    ``1`` = "bad"). For each class a MANOVA is run with column ``9`` as
    ``exog`` and all other columns (except ``9`` and ``10``) as ``endog``,
    once on the class alone and once with the test row added.

    Args:
        test_row: Row to evaluate; must be assignable as one row of ``data``.
        data: DataFrame with (at least) the column labels ``9`` and ``10``.
            Rows containing NaN are ignored. The caller's frame is not
            modified.
        categorical: Column labels that are label-encoded instead of
            standardised.

    Returns:
        dict with the keys ``"method"``, ``"WL_good"``, ``"WL_test_good"``,
        ``"WL_bad"`` and ``"WL_test_bad"`` (Wilks' lambda values).
    """
    import pandas as pd  # function-scope import: only pd.concat is needed

    # Renumber the rows after dropping NaNs: otherwise the label chosen for
    # the test row could collide with a surviving label and overwrite it.
    data = data.dropna().reset_index(drop=True)
    test_label = len(data)
    data.loc[test_label] = test_row

    le = LabelEncoder()
    for val in categorical:
        data[val] = le.fit_transform(data[val])

    for col in data.columns:
        if col not in categorical:
            data[col] = (data[col] - np.mean(data[col])) / np.std(data[col])

    # Take the transformed test row back out. The result of drop() must be
    # kept, otherwise the test row leaks into the reference classes below.
    test_row = data.loc[test_label]
    data = data.drop([test_label])

    data_good = data[data[10] == 0]
    data_bad = data[data[10] == 1]

    x_good = data_good.drop([10, 9], axis=1)
    y_good = data_good[[9]]
    x_bad = data_bad.drop([10, 9], axis=1)
    y_bad = data_bad[[9]]

    def wilks_lambda(endog, exog):
        # First row / first column of the 'x0' statistics table.
        stat = np.array(MANOVA(endog=endog, exog=exog).mv_test()['x0']['stat'])
        return stat[0][0]

    # One-row frames for the test observation (DataFrame.append was removed
    # in pandas 2.0, so the rows are added with pd.concat).
    x_row = test_row.drop([10, 9]).to_frame().T
    y_row = test_row[[9]].to_frame().T

    WL_good = wilks_lambda(x_good, y_good)
    WL_bad = wilks_lambda(x_bad, y_bad)
    WL_test_good = wilks_lambda(pd.concat([x_good, x_row]),
                                pd.concat([y_good, y_row]))
    WL_test_bad = wilks_lambda(pd.concat([x_bad, x_row]),
                               pd.concat([y_bad, y_row]))

    scorecard = {
        "method": "MANOVA",
        "WL_good": WL_good,
        "WL_test_good": WL_test_good,
        "WL_bad": WL_bad,
        "WL_test_bad": WL_test_bad
    }

    return scorecard
Beispiel #27
0
# Load the stored embeddings and their group keys (2nd and 3rd items of the
# pickled 5-tuple; the other items are ignored).
# NOTE(review): pickle.load executes arbitrary code from the file - only use
# with trusted encoder files.
with open(encoder_path, "rb") as f:
    _, embeddings, groups, _, _ = pickle.load(f)

# Positions (within `groups`) of the group keys that also occur when
# iterating `df` as (key, group) pairs.
ids = [groups.index(gr) for gr, _ in df if gr in groups]

# Keep only the matching embeddings/groups, in iteration order of `df`.
embeddings = embeddings[ids, :]
groups = [groups[i] for i in ids]

# Rebuild `df`: one row per group, the group key split into three leading
# columns followed by ten embedding columns c0..c9.
df = pd.DataFrame(embeddings,
                  index=pd.MultiIndex.from_tuples(groups)).reset_index()
df.columns = ["umpire", "score", "inning", *["c" + str(i) for i in range(10)]]

# ------------ MANOVA -------------------

# Responses: c0..c9. Predictors: umpire plus the score/inning main effects
# and their interaction.
manova = MANOVA.from_formula(
    "c0+c1+c2+c3+c4+c5+c6+c7+c8+c9~umpire+score*inning", data=df)
table = manova.mv_test()

# One row per model term, built from the first row of that term's "stat" table.
res = pd.DataFrame(
    {term: table.results[term]["stat"].iloc[0]
     for term in table.results}).T

# Human-readable labels for the embedding components (list continues beyond
# this fragment).
components_names = [
    "Smaller",
    "Uncertain",
    "High inside excluded",
    "Wide bottom",
    "Wide middle",
    "Wide top",
    "NW/SE diagonal",
    "Irregular 1",
Beispiel #28
0
def save_top_manova(config, attributes_types, attribute_target, num_top=500, window=3, test=MANOVATest.pillai_bartlett):
    """Rank BoPs by sliding-window MANOVA p-value and save the top ones.

    For every BoP, each run of ``window`` consecutive CpGs that are present
    in the loaded data is tested with a MANOVA (CpG betas as responses, the
    requested attributes as predictors). The smallest p-value of
    ``attribute_target`` over all windows represents the BoP. The p-values
    are corrected with ``multipletests(..., 0.05, method='fdr_bh')`` and the
    ``num_top`` best BoPs are written with ``save_features``, followed by
    the list of distinct genes belonging to them.

    Args:
        config: Project configuration; ``approach_gd`` and ``dt`` are
            modified while saving (see the note near the end).
        attributes_types: ``Attribute`` / ``CellPop`` members used as
            predictors; their ``.value`` is the column name.
        attribute_target: Member whose ``.value`` names the model term whose
            p-value is extracted.
        num_top: Maximum number of BoPs to save.
        window: Number of consecutive CpGs per MANOVA (must be >= 1).
        test: ``MANOVATest`` member selecting the statistic; unknown values
            fall back to the first row (Wilks).

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError('window must be a positive integer')

    dict_bop_cpgs = load_bop_cpg_dict(config)
    dict_bop_genes = get_dict_bop_genes(config, dict_bop_cpgs)
    cpgs, betas = load_cpg_data(config)

    # Position of the first occurrence of every CpG (same result as
    # list.index, but without a linear scan per lookup).
    cpg_position = {}
    for position, cpg in enumerate(cpgs):
        cpg_position.setdefault(cpg, position)

    atr_table = []
    atr_cols = []
    for atr_type in attributes_types:
        if isinstance(atr_type, Attribute):
            atr_table.append(get_attributes(config, atr_type))
        elif isinstance(atr_type, CellPop):
            atr_table.append(get_cell_pop(config, [atr_type]))
        atr_cols.append(atr_type.value)

    # Column names and formula are identical for every window.
    val_cols = ['cpg_' + str(cpg_id) for cpg_id in range(window)]
    cols = atr_cols + val_cols
    formula = ' + '.join(val_cols) + ' ~ ' + ' + '.join(atr_cols)

    # Row of the 'stat' table that holds the requested statistic.
    if test is MANOVATest.pillai_bartlett:
        stat_row = 1
    elif test is MANOVATest.lawley_hotelling:
        stat_row = 2
    elif test is MANOVATest.roy:
        stat_row = 3
    else:
        stat_row = 0

    num_bops = 0
    bops_passed = []
    bops_pvals = []
    for bop in dict_bop_cpgs:
        cpgs_passed = [
            cpg for cpg in dict_bop_cpgs.get(bop) if cpg in cpg_position
        ]
        # The window size decides both which BoPs qualify and how many
        # windows fit; the previous hard-coded "2" was only right for
        # window == 3.
        if len(cpgs_passed) >= window:
            pvals_on_bop = []
            for win_id in range(len(cpgs_passed) - window + 1):
                val_table = [
                    betas[cpg_position[cpg]]
                    for cpg in cpgs_passed[win_id:win_id + window]
                ]
                # Transpose the column-wise lists into rows.
                table = list(map(list, zip(*(atr_table + val_table))))
                x = pd.DataFrame(table, columns=cols)
                manova = MANOVA.from_formula(formula, x)
                mv_test_res = manova.mv_test()
                pvals = mv_test_res.results[attribute_target.value]['stat'].values[0:4, 4]
                pvals_on_bop.append(pvals[stat_row])
            bops_passed.append(bop)
            bops_pvals.append(np.min(pvals_on_bop))
        num_bops += 1
        if num_bops % config.print_rate == 0:
            print('num_bops: ' + str(num_bops))

    _, pvals_corrected, _, _ = multipletests(bops_pvals, 0.05, method='fdr_bh')
    order = np.argsort(pvals_corrected)
    bops_opt = list(np.array(bops_passed)[order])[0:num_top]
    pvals_opt = list(np.array(pvals_corrected)[order])[0:num_top]

    genes_opt = []
    genes_from_bop = []
    seen_genes = set()
    for bop in bops_opt:
        curr_genes = dict_bop_genes.get(bop)
        genes_opt.append(';'.join(curr_genes))
        for gene in curr_genes:
            # Keep first-seen order while avoiding a linear membership scan.
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_from_bop.append(gene)

    fn = 'top.txt'
    fn = get_result_path(config, fn)
    save_features(fn, [bops_opt, genes_opt, pvals_opt])

    # NOTE(review): config.approach_gd is left at from_bop and config.dt is
    # reset to the fixed value DataType.cpg rather than to its previous
    # value - confirm that callers expect this.
    config.approach_gd = GeneDataType.from_bop
    config.dt = DataType.gene
    fn = 'top.txt'
    fn = get_result_path(config, fn)
    save_features(fn, [genes_from_bop])
    config.dt = DataType.cpg
# MANOVA test in statsmodels

import pandas as pd
from statsmodels.multivariate.manova import MANOVA
# Iris data set, downloaded as CSV; the first CSV column is used as the index.
url = 'https://vincentarelbundock.github.io/Rdatasets/csv/datasets/iris.csv'
df = pd.read_csv(url, index_col=0)
# Column names contain dots, which are not valid in formula variable names.
# regex=False makes "." a literal dot on every pandas version instead of
# depending on the version-specific default for the `regex` argument.
df.columns = df.columns.str.replace(".", "_", regex=False)
print(df.head())

# run the manova model: the four measurements are the responses, Species is
# the grouping factor
maov = MANOVA.from_formula('Sepal_Length + Sepal_Width + \
                            Petal_Length + Petal_Width  ~ Species',
                           data=df)

# print out the results
print()  # print a blank line
print(maov.mv_test())

# source
# https://www.marsja.se/python-manova-made-easy-using-statsmodels/
Beispiel #30
0
    def single(self, item, config, configs_child):
        """Run the configured method for one item and record its metrics.

        Dispatches on ``config.experiment.method``; each branch appends its
        results to lists in ``config.metrics``. Most metric keys carry the
        suffix ``_<first 8 characters of config.hash>``. After the
        method-specific branch (or if no branch matches) the item and its
        auxiliary data are appended under the keys ``'item'`` and ``'aux'``.

        Args:
            item: Identifier of the element to process; it is passed to the
                ``self.get_strategy`` accessors and, in the manova branch,
                used as a key of ``config.base_dict``.
            config: Configuration object holding the experiment settings,
                the input dictionaries and the ``metrics`` mapping that is
                filled in place.
            configs_child: Sequence of child configurations; only used by
                the polygon, z_test_linreg, ancova and aggregator branches.

        Raises:
            ValueError: In the formula branch, if a requested cell type or
                observable is not present in the configuration.
            RuntimeError: In the pbc branch, if the target does not take
                exactly two distinct values.
        """

        if config.experiment.method == Method.heteroskedasticity:

            # Target vs. single base series; the helper writes the metrics.
            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)
            process_heteroscedasticity(x, y, config.metrics,
                                       f'_{config.hash[0:8]}')

        elif config.experiment.method == Method.manova:

            # `item` is a key of base_dict whose entry lists CpGs, genes and
            # a class; only CpGs present in target_dict are used.
            bop_data = config.base_dict[item]
            raw_cpgs = bop_data['cpg']
            passed_cpgs = [
                cpg for cpg in raw_cpgs if cpg in config.target_dict
            ]
            genes = list(bop_data['gene'])
            cl = bop_data['class']

            # Flatten all values of method_params into one covariate list.
            method_params = config.experiment.method_params
            covariates = []
            for key, values in method_params.items():
                for val in values:
                    covariates.append(val)

            # Data for the formula interface: observables, optional cell
            # data, and one column 'cpg<i>' per passed CpG.
            manova_dict = {}
            manova_dict.update(config.observables_dict.items())
            if len(config.cells_dict) > 0:
                manova_dict.update(config.cells_dict.items())

            for cpg_id in range(0, len(passed_cpgs)):
                y = self.get_strategy.get_single_base(config,
                                                      passed_cpgs[cpg_id])
                manova_dict[f'cpg{cpg_id}'] = y
            df = pd.DataFrame(manova_dict)

            if len(passed_cpgs) > 0:

                if len(passed_cpgs) > 2:

                    # Every p-value starts at 1 and is lowered to the
                    # minimum found over all windows.
                    p_values = {}
                    for cov in covariates:
                        p_values[cov] = 1
                    p_values_wilks = copy.deepcopy(p_values)
                    p_values_pillai_bartlett = copy.deepcopy(p_values)
                    p_values_lawley_hotelling = copy.deepcopy(p_values)
                    p_values_roy = copy.deepcopy(p_values)

                    # Sliding window of three consecutive CpG columns.
                    for w_id in range(0, len(passed_cpgs) - 2):
                        cpg_keys = []
                        for cpg_id in range(0, 3):
                            cpg_keys.append(f'cpg{w_id + cpg_id}')
                        formula = ' + '.join(cpg_keys) + ' ~ ' + ' + '.join(
                            covariates)
                        manova = MANOVA.from_formula(formula, df)
                        mv_test_res = manova.mv_test()
                        for cov in covariates:
                            # Column 4 of rows 0..3 of the term's stat table;
                            # the rows are stored below as wilks,
                            # pillai_bartlett, lawley_hotelling and roy.
                            pvals = mv_test_res.results[cov]['stat'].values[
                                0:4, 4]

                            p_values_wilks[cov] = min(pvals[0],
                                                      p_values_wilks[cov])
                            p_values_pillai_bartlett[cov] = min(
                                pvals[1], p_values_pillai_bartlett[cov])
                            p_values_lawley_hotelling[cov] = min(
                                pvals[2], p_values_lawley_hotelling[cov])
                            p_values_roy[cov] = min(pvals[3],
                                                    p_values_roy[cov])

                else:

                    # One or two CpGs: fit an OLS model per CpG and take the
                    # minimum ANOVA p-value per covariate instead.
                    p_values = {}
                    for cov in covariates:
                        p_values[cov] = 1

                    for cpg_id in range(0, len(passed_cpgs)):
                        formula = f'cpg{cpg_id}' + ' ~ ' + ' + '.join(
                            covariates)
                        anova = ols(formula, df).fit()
                        anova_table = sm.stats.anova_lm(anova)
                        for cov_id, cov in enumerate(covariates):
                            # Row order of the ANOVA table is taken to match
                            # the order of `covariates`.
                            p_value = anova_table.values[cov_id, 4]
                            p_values[cov] = min(p_values[cov], p_value)

                    # The same ANOVA p-values are reported under all four
                    # statistic names.
                    p_values_wilks = copy.deepcopy(p_values)
                    p_values_pillai_bartlett = copy.deepcopy(p_values)
                    p_values_lawley_hotelling = copy.deepcopy(p_values)
                    p_values_roy = copy.deepcopy(p_values)
            else:
                # No usable CpG: report p-value 1 everywhere.
                p_values = {}
                for cov in covariates:
                    p_values[cov] = 1
                p_values_wilks = copy.deepcopy(p_values)
                p_values_pillai_bartlett = copy.deepcopy(p_values)
                p_values_lawley_hotelling = copy.deepcopy(p_values)
                p_values_roy = copy.deepcopy(p_values)

            suffix = f'_{config.hash[0:8]}'

            config.metrics['class' + suffix].append(cl)
            config.metrics['genes' + suffix].append(';'.join(genes))

            for cov in covariates:
                config.metrics[f'{cov}_p_value_wilks' + suffix].append(
                    p_values_wilks[cov])
                config.metrics[f'{cov}_p_value_pillai_bartlett' +
                               suffix].append(p_values_pillai_bartlett[cov])
                config.metrics[f'{cov}_p_value_lawley_hotelling' +
                               suffix].append(p_values_lawley_hotelling[cov])
                config.metrics[f'{cov}_p_value_roy' + suffix].append(
                    p_values_roy[cov])

        elif config.experiment.method == Method.linreg:

            # Target vs. single base series; the helper writes the metrics.
            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)
            process_linreg(x, y, config.metrics, f'_{config.hash[0:8]}')

        elif config.experiment.method == Method.cluster:

            # Target vs. single base series; the helper writes the metrics.
            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)
            process_cluster(x, y, config.experiment.method_params,
                            config.metrics, f'_{config.hash[0:8]}')

        elif config.experiment.method == Method.formula:

            y = self.get_strategy.get_single_base(config, item)
            method_params = config.experiment.method_params

            # Collect the explanatory series named under the 'cells' and
            # 'observables' keys; unknown names are rejected.
            exog_dict = {}
            for key, values in method_params.items():
                if key == 'cells':
                    for val in values:
                        if val in config.cells_dict:
                            exog_dict[val] = self.get_strategy.get_cell(
                                config, key=val, item=item)
                        else:
                            raise ValueError(
                                f'Wrong cell type in formula: {val}')
                if key == 'observables':
                    for val in values:
                        if val in config.observables_dict:
                            exog_dict[val] = self.get_strategy.get_observalbe(
                                config, key=val, item=item)
                        else:
                            raise ValueError(
                                f'Wrong observable in formula: {val}')

            # Wrap variables flagged as categorical in C(...) for the formula.
            exog_keys = []
            for exog_type, exog_data in exog_dict.items():
                if config.is_observables_categorical.get(exog_type, False):
                    exog_keys.append('C(' + exog_type + ')')
                else:
                    exog_keys.append(exog_type)
            formula = 'cpg ~ ' + ' + '.join(exog_keys)

            exog_dict['cpg'] = y
            data_df = pd.DataFrame(exog_dict)
            reg_res = smf.ols(formula=formula, data=data_df).fit()
            params = dict(reg_res.params)
            bse = dict(reg_res.bse)
            pvalues = dict(reg_res.pvalues)

            suffix = f'_{config.hash[0:8]}'

            config.metrics['mean' + suffix].append(np.mean(y))
            config.metrics['R2' + suffix].append(reg_res.rsquared)
            config.metrics['R2_adj' + suffix].append(reg_res.rsquared_adj)
            # One metric triple (estimate, std error, p-value) per fitted term.
            for key in params:
                config.metrics[key + suffix].append(params[key])
                config.metrics[key + '_std' + suffix].append(bse[key])
                config.metrics[key + '_p_value' + suffix].append(pvalues[key])

        elif config.experiment.method == Method.formula_new:

            # Same regression as the formula branch, but the formula string
            # is supplied directly and all observables/cells are available.
            y = self.get_strategy.get_single_base(config, item)
            method_params = config.experiment.method_params
            formula = method_params['formula']

            dict_global = {}
            dict_global.update(config.observables_dict.items())
            if len(config.cells_dict) > 0:
                dict_global.update(config.cells_dict.items())
            dict_global['cpg'] = y

            data_df = pd.DataFrame(dict_global)
            reg_res = smf.ols(formula=formula, data=data_df).fit()
            params = dict(reg_res.params)
            bse = dict(reg_res.bse)
            pvalues = dict(reg_res.pvalues)

            suffix = f'_{config.hash[0:8]}'

            config.metrics['mean' + suffix].append(np.mean(y))
            config.metrics['R2' + suffix].append(reg_res.rsquared)
            config.metrics['R2_adj' + suffix].append(reg_res.rsquared_adj)
            for key in params:
                config.metrics[key + suffix].append(params[key])
                config.metrics[key + '_std' + suffix].append(bse[key])
                config.metrics[key + '_p_value' + suffix].append(pvalues[key])

        elif config.experiment.method == Method.oma:

            # Pearson correlation for all four combinations of linear and
            # log10 scaling of x and y.
            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)
            lin_x = minmax_scale(x, feature_range=(0.0, 1.0))
            lin_y = minmax_scale(y, feature_range=(0.0, 1.0))
            # Rescale to [1, 10] first so that log10 yields values in [0, 1].
            tmp_x = minmax_scale(x, feature_range=(1.0, 10.0))
            tmp_y = minmax_scale(y, feature_range=(1.0, 10.0))
            log_x = np.log10(tmp_x)
            log_y = np.log10(tmp_y)

            lin_lin_corr_coeff, lin_lin_p_value = pearsonr(lin_x, lin_y)
            config.metrics['lin_lin_corr_coeff' +
                           f'_{config.hash[0:8]}'].append(lin_lin_corr_coeff)
            config.metrics['lin_lin_p_value' +
                           f'_{config.hash[0:8]}'].append(lin_lin_p_value)

            lin_log_corr_coeff, lin_log_p_value = pearsonr(lin_x, log_y)
            config.metrics['lin_log_corr_coeff' +
                           f'_{config.hash[0:8]}'].append(lin_log_corr_coeff)
            config.metrics['lin_log_p_value' +
                           f'_{config.hash[0:8]}'].append(lin_log_p_value)

            log_lin_corr_coeff, log_lin_p_value = pearsonr(log_x, lin_y)
            config.metrics['log_lin_corr_coeff' +
                           f'_{config.hash[0:8]}'].append(log_lin_corr_coeff)
            config.metrics['log_lin_p_value' +
                           f'_{config.hash[0:8]}'].append(log_lin_p_value)

            log_log_corr_coeff, log_log_p_value = pearsonr(log_x, log_y)
            config.metrics['log_log_corr_coeff' +
                           f'_{config.hash[0:8]}'].append(log_log_corr_coeff)
            config.metrics['log_log_p_value' +
                           f'_{config.hash[0:8]}'].append(log_log_p_value)

        elif config.experiment.method == Method.pbc:

            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)

            if len(set(x)) != 2:
                raise RuntimeError('x variable is not binary in pbc')

            # Split y into the two groups defined by the value of x.
            keys = list(set(x))
            d = {k: [] for k in keys}
            for x_id, x_val in enumerate(x):
                d[x_val].append(y[x_id])

            corr_coeff, p_value = pointbiserialr(x, y)

            # A NaN correlation is reported as "no effect" for all tests.
            if np.isnan(corr_coeff) or np.isnan(p_value):
                corr_coeff = 0.0
                p_value = 1.0
                anova_p_value = 1.0
                kw_p_value = 1.0
            else:
                _, anova_p_value = f_oneway(d[keys[0]], d[keys[1]])
                _, kw_p_value = kruskal(d[keys[0]], d[keys[1]])

            config.metrics['pbc_corr_coeff' +
                           f'_{config.hash[0:8]}'].append(corr_coeff)
            config.metrics['pbc_p_value' +
                           f'_{config.hash[0:8]}'].append(p_value)
            config.metrics['anova_p_value' +
                           f'_{config.hash[0:8]}'].append(anova_p_value)
            config.metrics['kw_p_value' +
                           f'_{config.hash[0:8]}'].append(kw_p_value)

        elif config.experiment.method == Method.polygon:

            # Gather target/base series of every child configuration and
            # copy the children's metrics into the parent.
            xs = []
            ys = []
            metrics_keys = get_method_metrics_keys(config)
            for config_child in configs_child:
                update_parent_dict_with_children(metrics_keys, item, config,
                                                 config_child)
                x = self.get_strategy.get_target(config_child, item)
                y = self.get_strategy.get_single_base(config_child, item)
                xs.append(x)
                ys.append(y)

            # Only xs is handed to the helpers; ys is collected but unused here.
            if config.experiment.method_params['method'] == Method.linreg:
                process_linreg_polygon(configs_child, item, xs, config.metrics,
                                       f'_{config.hash[0:8]}')

            elif config.experiment.method_params['method'] == Method.variance:
                process_variance_polygon(configs_child, item, xs,
                                         config.metrics,
                                         f'_{config.hash[0:8]}')

        elif config.experiment.method == Method.z_test_linreg:

            # Collect slope, slope std and sample count from each child's
            # previously computed data, then compare them in the helper.
            slopes = []
            slopes_std = []
            num_subs = []
            metrics_keys = get_method_metrics_keys(config)
            for config_child in configs_child:
                update_parent_dict_with_children(metrics_keys, item, config,
                                                 config_child)
                item_id = config_child.advanced_dict[item]
                slopes.append(config_child.advanced_data[
                    'slope' + f'_{config_child.hash[0:8]}'][item_id])
                slopes_std.append(config_child.advanced_data[
                    'slope_std' + f'_{config_child.hash[0:8]}'][item_id])
                num_subs.append(
                    len(config_child.observables_dict[
                        config_child.attributes.target]))

            process_z_test_slope(slopes, slopes_std, num_subs, config.metrics,
                                 f'_{config.hash[0:8]}')

        elif config.experiment.method == Method.ancova:

            # Pool the data of all children; each child is labelled with a
            # lowercase letter by its position in configs_child (so more
            # than 26 children would raise IndexError).
            x_all = []
            y_all = []
            category_all = []
            metrics_keys = get_method_metrics_keys(config)
            for config_child in configs_child:
                x = self.get_strategy.get_target(config_child,
                                                 item,
                                                 categorical=False)
                y = self.get_strategy.get_single_base(config_child, item)
                x_all += list(x)
                y_all += list(y)
                category_all += [
                    list(string.ascii_lowercase)[configs_child.index(
                        config_child)]
                ] * len(x)

            data = {'x': x_all, 'y': y_all, 'category': category_all}
            df = pd.DataFrame(data)
            formula = 'y ~ x * C(category)'
            lm = ols(formula, df)
            results = lm.fit()

            suffix = f'_{config.hash[0:8]}'

            config.metrics['R2' + suffix].append(results.rsquared)
            config.metrics['R2_adj' + suffix].append(results.rsquared_adj)
            config.metrics['f_stat' + suffix].append(results.fvalue)
            config.metrics['prob(f_stat)' + suffix].append(results.f_pvalue)

            # Parameters are read by position 0..3.
            # NOTE(review): this mapping of positions to names presumably
            # assumes exactly two categories - confirm against the callers.
            config.metrics['intercept' + suffix].append(results.params[0])
            config.metrics['category' + suffix].append(results.params[1])
            config.metrics['x' + suffix].append(results.params[2])
            config.metrics['x:category' + suffix].append(results.params[3])

            config.metrics['intercept_std' + suffix].append(results.bse[0])
            config.metrics['category_std' + suffix].append(results.bse[1])
            config.metrics['x_std' + suffix].append(results.bse[2])
            config.metrics['x:category_std' + suffix].append(results.bse[3])

            config.metrics['intercept_pval' + suffix].append(
                results.pvalues[0])
            config.metrics['category_pval' + suffix].append(results.pvalues[1])
            config.metrics['x_pval' + suffix].append(results.pvalues[2])
            config.metrics['x:category_pval' + suffix].append(
                results.pvalues[3])

        elif config.experiment.method == Method.aggregator:

            # Only copy the children's metrics into the parent.
            metrics_keys = get_method_metrics_keys(config)
            for config_child in configs_child:
                update_parent_dict_with_children(metrics_keys, item, config,
                                                 config_child)

        elif config.experiment.method == Method.variance:

            x = self.get_strategy.get_target(config, item)
            y = self.get_strategy.get_single_base(config, item)

            semi_window = config.experiment.method_params['semi_window']
            box_b = config.experiment.method_params['box_b']
            box_t = config.experiment.method_params['box_t']

            process_variance(x, y, semi_window, box_b, box_t, config.metrics,
                             f'_{config.hash[0:8]}')

            xs = get_box_xs(x)
            ys_b, ys_t = fit_variance(xs, config.metrics,
                                      f'_{config.hash[0:8]}')

            # Gap between the two fitted curves at the first and last point.
            diff_begin = abs(ys_t[0] - ys_b[0])
            diff_end = abs(ys_t[-1] - ys_b[-1])

            # NOTE(review): the ratio divides by the smaller gap, so a gap of
            # exactly 0 would raise ZeroDivisionError (or give inf/nan for
            # NumPy scalars) - confirm this cannot happen.
            config.metrics['increasing_div' + f'_{config.hash[0:8]}'].append(
                max(diff_begin, diff_end) / min(diff_begin, diff_end))
            config.metrics['increasing_sub' + f'_{config.hash[0:8]}'].append(
                abs(diff_begin - diff_end))

            # +1 when the gap grows from start to end, otherwise -1.
            if diff_end > diff_begin:
                config.metrics['increasing_type' +
                               f'_{config.hash[0:8]}'].append(+1)
            else:
                config.metrics['increasing_type' +
                               f'_{config.hash[0:8]}'].append(-1)

        # Recorded for every method (note: these two keys carry no suffix).
        config.metrics['item'].append(item)
        aux = self.get_strategy.get_aux(config, item)
        config.metrics['aux'].append(aux)
def MANOVA_analysis(dict_cpg_bop, dict_bop_cpg):
    """Return the smallest sliding-window MANOVA p-value for every BoP.

    Reads ``average_beta.txt`` (first line is a header; every other line is
    a CpG name followed by whitespace-separated numeric values) and groups
    the value rows by BoP. For each BoP every run of three consecutive rows
    is tested with ``cpg1 + cpg2 + cpg3 ~ age``; entry ``[3, 4]`` of the
    ``age`` statistics table is collected and the minimum over all windows
    is kept.

    Args:
        dict_cpg_bop: Maps a CpG name to its BoP.
        dict_bop_cpg: Maps a BoP to a ``';'``-separated string of its CpGs;
            BoPs listing fewer than three CpGs are ignored.

    Returns:
        dict mapping each BoP that yielded at least one window to its
        minimum p-value.
    """
    age = get_ages()

    # Collect the value rows of every qualifying BoP, in file order.
    rows_by_bop = {}
    with open("average_beta.txt", "r") as beta_file:
        beta_file.readline()  # skip the header line
        for line in beta_file:
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            name_cpg = fields.pop(0)
            if name_cpg not in dict_cpg_bop:
                continue
            bop = dict_cpg_bop[name_cpg]
            if len(dict_bop_cpg[bop].split(";")) < 3:
                continue
            rows_by_bop.setdefault(bop, []).append(fields)

    # NOTE(review): the original opened "DataFrame.txt" for writing (which
    # creates/truncates it) but never wrote to or closed it. The truncation
    # is kept for backward compatibility and the handle is now closed;
    # remove this line if nothing depends on the file.
    open("DataFrame.txt", "w").close()

    print(len(rows_by_bop))
    min_pvalues = {}
    for num, (bop, rows) in enumerate(rows_by_bop.items()):
        print(num)
        if len(rows) < 3:
            # Fewer than three CpGs of this BoP were found in the file, so
            # no window can be formed.
            continue
        # Convert every row once instead of once per window
        # (np.float_ no longer exists in NumPy 2.0).
        values = [np.asarray(row, dtype=np.float64) for row in rows]
        window_pvalues = []
        for i in range(len(values) - 2):
            frame = pd.DataFrame({
                'age': age,
                'cpg1': values[i],
                'cpg2': values[i + 1],
                'cpg3': values[i + 2]
            })
            model = MANOVA.from_formula('cpg1 + cpg2 + cpg3 ~ age',
                                        data=frame)
            test = model.mv_test()
            window_pvalues.append(test.results['age']['stat'].values[3, 4])
        min_pvalues[bop] = min(window_pvalues)
    return min_pvalues
        # Names of all feature sets to analyse.
        all_feat_names = [key for key in PARAMS['all_featName']]
        opFile = PARAMS['opDir'] + '/MANOVA.csv'
        for feat_i in all_feat_names:
            feat_train = All_feature_data[feat_i]['train_data']
            feat_test = All_feature_data[feat_i]['test_data']
            feat_train_label = All_feature_data[feat_i]['train_label']
            print('MANOVA ', feat_i, np.shape(feat_train),
                  np.shape(feat_train_label), np.shape(feat_test))

            PARAMS_temp = PARAMS.copy()
            print('feat_train: ', np.shape(feat_train))

            # endog~dependent variables, exog~independent variables
            try:
                moav = MANOVA(endog=feat_train, exog=feat_train_label)
                test_results = moav.mv_test()
            except Exception:
                # Retry once with tiny random noise added to the features.
                # Catching Exception (not a bare except) keeps
                # KeyboardInterrupt/SystemExit from being swallowed.
                # NOTE(review): `+=` modifies feat_train in place, which
                # presumably also changes the array stored in
                # All_feature_data - confirm this is intended.
                print(feat_i, ' Noise added')
                feat_train += np.random.rand(
                    np.shape(feat_train)[0],
                    np.shape(feat_train)[1]) * 1e-10
                moav = MANOVA(endog=feat_train, exog=feat_train_label)
                test_results = moav.mv_test()

            # 'Value' column of the four statistics reported for term 'x0'.
            WL = test_results.results['x0']['stat']['Value']['Wilks\' lambda']
            PT = test_results.results['x0']['stat']['Value']['Pillai\'s trace']
            HLT = test_results.results['x0']['stat']['Value'][
                'Hotelling-Lawley trace']
            RGR = test_results.results['x0']['stat']['Value'][
                'Roy\'s greatest root']
def get_pca_pvalue_manova(PC1,PC2,Y):
    """Return the MANOVA p-value of ``Y`` for the responses PC1 and PC2.

    Fits ``PC1 + PC2 ~ Y`` and returns the ``'Pr > F'`` entry of the first
    row of the statistics table reported for the term ``Y``.
    """
    frame = pd.DataFrame({'PC1': PC1, 'PC2': PC2, 'Y': Y})
    fitted = MANOVA.from_formula('PC1 + PC2 ~ Y', frame)
    stat_table = fitted.mv_test().results['Y']['stat']
    p_column = stat_table['Pr > F']
    return p_column.iloc[0]
Beispiel #34
0
 def __init__(self, independent_variables, dependent_variables):
     """Build the MANOVA model for the given variables.

     Args:
         independent_variables: Passed to ``MANOVA`` as ``exog``.
         dependent_variables: Passed to ``MANOVA`` as ``endog``.
     """
     # statsmodels' MANOVA has no separate fitting step: its ``fit()``
     # raises NotImplementedError, so calling it here made construction
     # always fail. Use ``self.model.mv_test()`` to obtain the results.
     self.model = MANOVA(dependent_variables, independent_variables)