def plot_main_effects(self, intensity):
    """Plot model means, group means and model-by-group means for one intensity.

    Draws three side-by-side panels on one 12x7 figure: a bar chart of the
    per-``Model`` means, a bar chart of the per-``Group`` means (both scaled
    by 100 and drawn with SD error bars) and a seaborn point plot of every
    Model/Group combination.

    Args:
        intensity: Name of the intensity column in ``self.df_percent``. A
            trailing "%" is appended when it is missing.
    """
    # endswith() also copes with an empty string, where intensity[-1] would raise.
    if not intensity.endswith("%"):
        intensity += "%"

    # One row of three axes; the plt.subplot(1, 3, k) calls below select them.
    # (The previous 3x1 grid did not match the 1x3 layout actually drawn.)
    plt.subplots(1, 3, figsize=(12, 7))
    plt.suptitle("{} Activity".format(intensity.capitalize()))

    # Panel 1: means per model.
    plt.subplot(1, 3, 1)
    model_summary = rp.summary_cont(self.df_percent.groupby(['Model']))[intensity]
    model_means = model_summary["Mean"]
    model_sd = model_summary["SD"]

    plt.bar(list(model_means.index), 100 * model_means.values,
            yerr=100 * model_sd.values, capsize=10, ecolor='black',
            color=["Red", "Blue", "Green", "Purple"],
            edgecolor='black', linewidth=2)
    plt.ylabel("% of Collection")
    plt.title("Model Means")

    # Panel 2: means per group.
    plt.subplot(1, 3, 2)
    group_summary = rp.summary_cont(self.df_percent.groupby(['Group']))[intensity]
    group_means = group_summary["Mean"]
    group_sd = group_summary["SD"]

    plt.bar(list(group_means.index), 100 * group_means.values,
            yerr=100 * group_sd.values, capsize=10, ecolor='black',
            color=["Grey", "White"], edgecolor='black', linewidth=2)
    plt.title("Group Means")

    # Panel 3: every Model x Group combination.
    plt.subplot(1, 3, 3)
    # The data comes from this instance; the former `x.df_percent` referenced
    # a name that is not defined inside the method.
    sns.pointplot(data=self.df_percent, x="Model", y=intensity, hue="Group",
                  dodge=True, markers='o', capsize=.1, errwidth=1,
                  palette='Set1')
    plt.title("All Combination Means")
    plt.ylabel(" ")
def plot_activity_group_means(self):
    """Bar chart of mean Cohen's kappa for the LOW and HIGH groups.

    Error bars are the half-width of the interval returned by
    ``DescrStatsW.tconfint_mean()`` for each group's "Kappa" values in
    ``self.df_kappa_long``.
    """
    labels = ["LOW", "HIGH"]

    # Half-width of the mean confidence interval, in the same order as the bars.
    half_widths = []
    for label in labels:
        kappa_values = self.df_kappa_long.groupby("Group").get_group(label)["Kappa"]
        bounds = sms.DescrStatsW(kappa_values).tconfint_mean()
        half_widths.append((bounds[1] - bounds[0]) / 2)

    kappa_summary = rp.summary_cont(self.df_kappa_long.groupby(['Group']))
    mean_by_group = kappa_summary["Kappa"]["Mean"]

    plt.bar(labels,
            [mean_by_group[label] for label in labels],
            yerr=list(half_widths),
            capsize=8,
            ecolor='black',
            color=["white", "dimgrey"],
            edgecolor='black',
            alpha=0.5,
            linewidth=2)

    plt.title("Cohen's Kappa by Activity Group")
    plt.ylabel("Cohen's Kappa")
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.yticks(fontsize=10)
def two_way_anova(xs: tuple, ys: tuple, values: tuple, replications, stds: tuple = None, log_transform=True):
    """Print a two-way ANOVA (with interaction) for a fully crossed design.

    The factor columns are generated from the level lists: the ``x`` factor
    changes slowest and, within each ``x`` level, every ``y`` level is
    repeated ``replications`` times. ``data`` must therefore be ordered the
    same way.

    Args:
        xs: ``(name, levels)`` of the first factor.
        ys: ``(name, levels)`` of the second factor.
        values: ``(name, data)`` of the dependent variable.
        replications: Number of observations per factor combination.
        stds: Optional ``(name, data)`` column that is only shown alongside
            the original data and removed before the analysis.
        log_transform: If True, the dependent variable is replaced by its
            natural logarithm before fitting.

    Returns:
        None. All results are printed.
    """
    with pd.option_context('display.max_rows', 100):
        xname, xlevels = xs
        yname, ylevels = ys
        dname, data = values

        # One full cycle of y levels per x level.
        y = np.tile(np.repeat(ylevels, replications), len(xlevels))
        x = np.repeat(xlevels, len(ylevels) * replications)

        df = pd.DataFrame({dname: data, xname: x, yname: y})
        if stds:
            df[stds[0]] = stds[1]
            # Rearrange
            df = df[[dname, stds[0], xname, yname]]
        # The builtin float replaces the `np.float` alias, which no longer
        # exists in current NumPy releases.
        df[dname] = df[dname].astype(float)

        print("=" * 30)
        print("Original data")
        print(df)

        # Remove stds again
        if stds:
            del df[stds[0]]

        if log_transform:
            print("=" * 30)
            print("LN Transformed data")
            df[dname] = np.log(df[dname])
            print(df)

        print(rp.summary_cont(df.groupby([xname, yname]))[dname])

        model = ols(f"{dname}~ C({xname})*C({yname})", df).fit()

        # Seeing if the overall model is significant
        print("=" * 30)
        print(
            f"Overall model F({model.df_model:.0f},{model.df_resid:.0f}) = {model.fvalue:.3f}, "
            f"p = {model.f_pvalue:.4f}")
        print(model.summary())

        print("=" * 30)
        print("ANOVA")
        res = sm.stats.anova_lm(model, typ=2)
        print(res)
def plot_mains_effects_kappa(self):
    """Bar chart of mean Cohen's kappa for the LOW and HIGH groups.

    Error bar sizes are read from ``self.df_kappa_ci`` (element 1 for the
    first bar, element 0 for the second).
    """
    error_bars = [self.df_kappa_ci[1], self.df_kappa_ci[0]]

    kappa_summary = rp.summary_cont(self.df_kappa_long.groupby(['Group']))
    mean_by_group = kappa_summary["Kappa"]["Mean"]

    labels = ["LOW", "HIGH"]
    plt.bar(labels,
            [mean_by_group[label] for label in labels],
            yerr=list(error_bars),
            capsize=8,
            ecolor='black',
            color=["white", "dimgrey"],
            edgecolor='black',
            alpha=0.5,
            linewidth=2)

    plt.title("Cohen's Kappa by Activity Group")
    plt.ylabel("Kappa")
    plt.yticks(np.arange(0, 1.1, 0.1))
    plt.yticks(fontsize=10)
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5497 observations (examples) and 6 features (dimensions),
# but one of those dimensions is the unique identifier.
# Notice how we are missing data in the currentsalary column.
# We have to decide what to do with the missing data: we could delete the
# examples/observations or we could impute (estimate) the missing values.
# This decision is not made in isolation by a single individual.
# Let's use the average of the low, medium and high groups to impute the
# missing values.
print(my_df.isnull().sum())
pd.set_option('display.max_columns', 10)
rp.summary_cont(my_df['currentsalary'].groupby(my_df['flightrisk']))

# Each subset is an explicit copy and the filled column is assigned back.
# Calling fillna(..., inplace=True) on `subset['currentsalary']` is chained
# assignment: it may act on a temporary object and leave the subset unfilled.
my_df1 = my_df.query('flightrisk=="high"').copy()
my_avg1 = my_df1['currentsalary'].mean()
my_df1['currentsalary'] = my_df1['currentsalary'].fillna(my_avg1)
print(my_df1)

my_df2 = my_df.query('flightrisk=="medium"').copy()
my_avg2 = my_df2['currentsalary'].mean()
my_df2['currentsalary'] = my_df2['currentsalary'].fillna(my_avg2)
print(my_df2)

my_df3 = my_df.query('flightrisk=="low"').copy()
my_avg3 = my_df3['currentsalary'].mean()
my_df3['currentsalary'] = my_df3['currentsalary'].fillna(my_avg3)
import pandas as pd
import matplotlib.pyplot as plt
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

# Load the plaque counts, discard the bookkeeping columns and shorten the
# names of the three brain-region columns.
data_frame = pd.read_csv('/home/normie/Documents/advent/2020-1-23-HMB312-Lab3-AM-PlaqueCount.csv')
data_frame = data_frame.drop(['Group # ', 'TA'], axis=1)
short_names = {
    'Hippocampus Amyloid Counts': 'hippocampus',
    'Cerebellum Amyloid Counts': 'cerebellum',
    'Cortex Amyloid Counts': 'cortex',
}
data_frame = data_frame.rename(columns=short_names)

# Descriptive statistics of each region, grouped by slide.
hipposummary = rp.summary_cont(
    data_frame['hippocampus'].groupby(data_frame['Slide']))
print(hipposummary)

cortexsummary = rp.summary_cont(
    data_frame['cortex'].groupby(data_frame['Slide']))
print(cortexsummary)

cerebsummary = rp.summary_cont(
    data_frame['cerebellum'].groupby(data_frame['Slide']))
print(cerebsummary)

# One-way ANOVA (type II) of each region against the slide factor.
hipresults = ols('hippocampus ~ C(Slide)', data=data_frame).fit()
hip_table = sm.stats.anova_lm(hipresults, typ=2)

cerresults = ols('cerebellum ~ C(Slide)', data=data_frame).fit()
cer_table = sm.stats.anova_lm(cerresults, typ=2)

cortresults = ols('cortex ~ C(Slide)', data=data_frame).fit()
cort_table = sm.stats.anova_lm(cortresults, typ=2)

print('HIPPOCAMPUS')
""" import pandas as pd import researchpy as rp import seaborn as sns import scipy.stats as stats import statsmodels.api as sm from statsmodels.formula.api import ols import statsmodels.stats.multicomp # maximizer df_m = pandas.read_csv("https://raw.githubusercontent.com/impudding/Maximizer-Saticeficer/master/data/ad-questionaire-maximizer.csv") print('\nMAXIMIZER\n') # ad recognotion ad_recog_data = rp.summary_cont(df_m.groupby(['size', 'RL', 'hashtag']))['ad_recog'] print('ad recognition\n') display(ad_recog_data) #ad_recog_data.to_csv('m_out.csv', sep='\t', encoding='utf-8') model = ols('ad_recog ~ C(size)*C(RL)*C(hashtag)', df_m).fit() # Seeing if the overall model is significant print(f"\nOverall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}") # brand attitude brand_data = rp.summary_cont(df_m.groupby(['size', 'RL', 'hashtag']))['brand'] print('brand attitude\n') display(brand_data) #brand_data.to_csv('m_out.csv', sep='\t', encoding='utf-8')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import researchpy as rp
import warnings

# Silence every warning message for the rest of the run.
warnings.filterwarnings('ignore')

my_df = pd.read_csv('ProjectReturnData.txt', delimiter=';')
print(my_df.head(5))
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5181 observations (examples) and 4 features
# (dimensions) in this dataset. It does not appear that any data is missing.
print(my_df.isnull().sum())
pd.set_option('display.max_columns', 10)

# Summaries of the percent return by each candidate grouping column.
percent_return = my_df['Percent Return']
rp.summary_cont(percent_return.groupby(my_df['Leader Gender']))
rp.summary_cont(percent_return.groupby(my_df['Team Size']))
rp.summary_cont(percent_return.groupby(my_df['Aggregate Sales Experience']))

# Scatter-plot matrix to visualise the data, coloured by leader gender.
sns.pairplot(my_df,
             hue='Leader Gender',
             diag_kind='hist',
             kind='scatter',
             palette='husl')

# Encode the genders numerically: females become 1, males become 0.
gender_codes = {'Male': 0, 'Female': 1}
my_df['Leader Gender'] = my_df['Leader Gender'].map(gender_codes)
print(my_df['Leader Gender'].head(5))
import statsmodels.api as sm
from statsmodels.formula.api import ols
import numpy as np
import pingouin as pg
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import csv

df = pd.read_csv("Datos.csv",
                 index_col=None,
                 usecols=[1, 2, 3, 4, 8],
                 dtype={'generador': 'category',
                        'algoritmo_flujo': 'category',
                        'vertices': 'category',
                        'aristas': 'category',
                        'mediana': np.float64})

# Analyse log(1 + mediana) instead of the raw values.
logX = np.log1p(df['mediana'])
df = df.assign(mediana_log=logX.values)
df.drop(['mediana'], axis=1, inplace=True)

factores = ["vertices", "generador", "aristas", "algoritmo_flujo"]

# For every factor: descriptive summary, one-way ANOVA, box plot and Tukey HSD.
for i in factores:
    print(rp.summary_cont(df['mediana_log'].groupby(df[i])))

    anova = pg.anova(dv='mediana_log', between=i, data=df, detailed=True)
    pg._export_table(anova, ("ANOVA" + i + ".csv"))

    # Start a fresh figure for each factor. With a single figure created
    # before the loop, every box plot after the first was drawn onto the
    # Tukey figure left current by the previous iteration.
    plt.figure(figsize=(8, 6))
    ax = sns.boxplot(x=df["mediana_log"], y=df[i], data=df, palette="Set1")
    plt.savefig("boxplot_" + i + ".png", bbox_inches='tight')
    plt.savefig("boxplot_" + i + ".eps", bbox_inches='tight')

    tukey = pairwise_tukeyhsd(endog=df["mediana_log"], groups=df[i], alpha=0.05)
    tukey.plot_simultaneous(xlabel='Time', ylabel=i)
    plt.vlines(x=49.57, ymin=-0.5, ymax=4.5, color="red")
    plt.savefig("simultaneous_tukey" + i + ".png", bbox_inches='tight')
    plt.savefig("simultaneous_tukey" + i + ".eps", bbox_inches='tight')

    print(tukey.summary())

    # newline='' lets the csv module control line endings itself; without it
    # blank rows can appear on platforms that translate "\n".
    with open("Tukey" + i + ".csv", 'w', newline='') as t_csv:
        writer = csv.writer(t_csv)
        writer.writerows(tukey.summary())
def contentAnalysis(data, outcomeVar, independent):
    """Summarise ``data[outcomeVar]`` for each value of ``data[independent]``.

    Returns:
        Whatever ``rp.summary_cont`` returns for the grouped outcome column.
    """
    outcome_by_level = data[outcomeVar].groupby(data[independent])
    return rp.summary_cont(outcome_by_level)
# These are for exploring data import pandas as pd import researchpy as rp import matplotlib.pyplot as plt # These are for running the model and conducting model diagnostics import statsmodels.formula.api as smf import statsmodels.stats.api as sms from scipy import stats from statsmodels.compat import lzip df = pd.read_csv('insurance.csv') print("============================================") # Let's get more information on the continuous varibles print(rp.summary_cont(df[['charges', 'age', 'children']])) print("\n===========================================") # Let's get more information on the categorical data print(rp.summary_cat(df[['sex', 'smoker', 'region']])) df['sex'].replace({'female': 1, 'male': 0}, inplace=True) df['smoker'].replace({'no': 0, 'yes': 1}, inplace=True) df = pd.get_dummies(df) print("\n===========================================") print(df.head()) print("\n===========================================") model = smf.ols(
def summerize(col, df):
    """Display the ``rp.summary_cont`` table of column ``col`` grouped by ``g``."""
    grouped_column = df.groupby(['g'])[col]
    display(rp.summary_cont(grouped_column))
data_new.boxplot(column='Soil_pH', by='District') import seaborn as sns ax = sns.boxplot(x="District", y="Soil_pH", data=data_new) !pip install researchpy import researchpy as rp import scipy.stats as stats import statsmodels.api as sm sns.violinplot(x="District", y="Soil_pH", data=data_new) rp.summary_cont(data_new['Soil_pH'].groupby(data_new['District'])) stats.f_oneway(data_new['Soil_pH'][data_new['District'] == 'Kannur'], data_new['Soil_pH'][data_new['District'] == 'Kollam'],data_new['Soil_pH'][data_new['District'] == 'Kottayam'],data_new['Soil_pH'][data_new['District'] == 'Thrissur']) from statsmodels.stats.multicomp import pairwise_tukeyhsd tuckey=pairwise_tukeyhsd(endog=data_new['Soil_pH'],groups=data_new['District'],alpha=0.05) tuckey.plot_simultaneous() plt.vlines(x='District',ymin=-0.5,ymax=0.5,color='red') tuckey.summary() ax = sns.boxplot(x="District", y="Soil_Mg", data=data_new) stats.f_oneway(data_new['Soil_Mg'][data_new['District'] == 'Kannur'], data_new['Soil_Mg'][data_new['District'] == 'Kollam'],data_new['Soil_Mg'][data_new['District'] == 'Kottayam'],data_new['Soil_Mg'][data_new['District'] == 'Thrissur'])
def plot_main_effects(self, intensity):
    """Plot model means, group means and model-by-group means for one intensity.

    Draws three side-by-side panels on one 12x7 figure: per-``Model`` means
    with 95% confidence-interval error bars, per-``Group`` means with SD
    error bars (both scaled by 100) and a seaborn point plot of every
    Model/Group combination.

    Args:
        intensity: Name of the intensity column in ``self.df_percent``. A
            trailing "%" is appended when it is missing.
    """
    # endswith() also copes with an empty string, where intensity[-1] would raise.
    if not intensity.endswith("%"):
        intensity += "%"

    # One row of three axes; the plt.subplot(1, 3, k) calls below select them.
    # (The previous 3x1 grid did not match the 1x3 layout actually drawn.)
    plt.subplots(1, 3, figsize=(12, 7))
    plt.suptitle("{} Activity (±95%CI)".format(intensity.capitalize()))

    # MODEL MEANS -------------------------------------------------------------
    plt.subplot(1, 3, 1)

    # A two-sided 95% interval needs the 97.5th percentile of the t
    # distribution; ppf(.95) only yields a 90% two-sided interval.
    # Degrees of freedom: number of distinct IDs minus one.
    n_ids = len(set(self.df_percent["ID"]))
    t_crit = scipy.stats.t.ppf(.975, n_ids - 1)

    model_summary = rp.summary_cont(self.df_percent.groupby(['Model']))[intensity]
    model_means = model_summary["Mean"]
    model_ci = model_summary["SE"] * t_crit

    plt.bar(list(model_means.index), 100 * model_means.values,
            yerr=100 * model_ci.values, capsize=10, ecolor='black',
            color=["White", "silver", "grey", "#404042"],
            edgecolor='black', linewidth=2)
    plt.ylabel("% of Collection")
    plt.title("Model Means")

    # ACTIVITY GROUPS ---------------------------------------------------------
    plt.subplot(1, 3, 2)
    group_summary = rp.summary_cont(self.df_percent.groupby(['Group']))[intensity]
    group_means = group_summary["Mean"]
    # NOTE(review): this panel still uses SD error bars although the figure
    # title announces 95% CIs - confirm which one is intended.
    group_sd = group_summary["SD"]

    plt.bar(list(group_means.index), 100 * group_means.values,
            yerr=100 * group_sd.values, capsize=10, ecolor='black',
            color=["Grey", "White"], edgecolor='black', linewidth=2)
    plt.title("Group Means")

    # ALL COMBINATIONS --------------------------------------------------------
    plt.subplot(1, 3, 3)
    sns.pointplot(data=self.df_percent, x="Model", y=intensity, hue="Group",
                  markers=".", scale=.8, dodge=True, capsize=.1, errwidth=1,
                  palette='Set1')
    plt.title("All Combination Means")
    plt.ylabel(" ")
main_df = main_df.drop('person', axis=1)

# Replace the numeric 'dose' codes with their string labels
dose_labels = {1: 'placebo', 2: 'low', 3: 'high'}
main_df['dose'] = main_df['dose'].map(dose_labels)
display(main_df['dose'])

# In[6]:

display(rp.summary_cat(main_df['dose']))
display(rp.summary_cat(main_df['libido']))

# In[7]:

rp.summary_cont(main_df['libido'].groupby(main_df['dose']))

# In[8]:

# ANOVA example with scipy.stats: one libido sample per dose level,
# passed in the order high, low, placebo
display(
    stats.f_oneway(*[main_df['libido'][main_df['dose'] == level]
                     for level in ('high', 'low', 'placebo')]))

# In[9]:

# ANOVA with statsmodels
# (i.e., make predictions from the model).

# Let's load the data into a Pandas DataFrame using read_csv
my_df = pd.read_csv('EmpData.txt', delimiter='|', index_col=0)
print(my_df.head(5))
print(my_df.info())
print(my_df.describe())
print(my_df.shape)

# It looks like we have 5497 observations (examples) and 6 features
# (dimensions) in this dataset.
# Also notice how we are missing data in the currentsalary column.
print(my_df.isnull().sum())
pd.set_option('display.max_columns', 10)
rp.summary_cont(my_df['currentsalary'].groupby(my_df['flightrisk']))

# For this analysis, fill the missing values with the average salary of each
# flightrisk group.
# Each subset is an explicit copy and the filled column is assigned back.
# Calling fillna(..., inplace=True) on `subset['currentsalary']` is chained
# assignment: it may act on a temporary object and leave the subset unfilled.
my_df1 = my_df.query('flightrisk=="high"').copy()
my_avg1 = my_df1['currentsalary'].mean()
my_df1['currentsalary'] = my_df1['currentsalary'].fillna(my_avg1)
print(my_df1)

my_df2 = my_df.query('flightrisk=="medium"').copy()
my_avg2 = my_df2['currentsalary'].mean()
my_df2['currentsalary'] = my_df2['currentsalary'].fillna(my_avg2)
print(my_df2)

my_df3 = my_df.query('flightrisk=="low"').copy()
my_avg3 = my_df3['currentsalary'].mean()
my_df3['currentsalary'] = my_df3['currentsalary'].fillna(my_avg3)
sum = 0.0 for name in names: sum = sum + float(df[name][index]) avg = sum / nnames diff = avg - df[score][index] newset.append(diff) return newset names = ['İno', 'K1', 'K3', 'K4', 'İ1'] score = 'Skor' score_diff = construct(df, names, score) distance = df['Skor'] dfimp = pd.DataFrame({'Skor': score_diff, 'K2': distance}) summary = rp.summary_cont(dfimp['Skor'].groupby(dfimp['K2'])) print(summary) corr = dfimp.corr() print(corr) y = dfimp['Skor'] x = dfimp['K2'] plt.scatter(x, y) plt.show() few = 30 dfimp_last = dfimp[-few - 1:-1] summary2 = rp.summary_cont(dfimp_last['Skor'].groupby(dfimp_last['K2'])) print(summary2) corr2 = dfimp_last.corr() print(corr2) y = dfimp_last['Skor']
# SUSPECT is 0 for the runs before John's "improvement" and 1 from its start,
# so it stands for the presence of a new unknown factor, probably doping.
# The "difference in performances" is the quantity we want to correlate with
# the suspected doping use.
names = ['JOE', 'BILL', 'JACK']
john = 'JOHN'
john_diff = construct(df, names, john)
unknown = df['SUSPECT']

# Over the whole data set
print("In general, John's performance differs from the average of other athletes...")
dfimp = pd.DataFrame({'JOHN': john_diff, 'UNKNOWN': unknown})
summary = rp.summary_cont(dfimp['JOHN'].groupby(dfimp['UNKNOWN']))
print(summary)
corr = dfimp.corr()
print(corr)

y = dfimp['JOHN']
x = dfimp['UNKNOWN']
plt.scatter(x, y)
plt.show()

# Same question restricted to a window of recent runs
few = 63  # Twice the size of suspected period added
print("Last ", few, " runs, , John's performance differs from the average of other athletes...")
# `few` rows ending just before the final row
dfimp_last = dfimp[-(few + 1):-1]
# !conda install -c researchpy researchpy
import pandas
import researchpy as rp
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.stats.multicomp

df = pandas.read_csv(
    "https://raw.githubusercontent.com/Opensourcefordatascience/Data-sets/master/crop_yield.csv"
)
df.boxplot(column=['Yield'], grid=True)

# Yield summaries: overall, per factor, and per factor combination
rp.summary_cont(df['Yield'])
rp.summary_cont(df.groupby(['Fert']))['Yield']
rp.summary_cont(df.groupby(['Water']))['Yield']
rp.summary_cont(df.groupby(['Fert', 'Water']))['Yield']

# Two-way ANOVA.
# The `*` operator fits the interaction term and automatically adds the main
# effect of each factor.
yield_formula = 'Yield ~ C(Fert)*C(Water)'
model = ols(yield_formula, df).fit()

# Is the model significant as a whole?
print(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}"
)
model.summary()
smmouth = df.loc[df.network == 'smmouth']
ventral = df.loc[df.network == 'ventral']
vis = df.loc[df.network == 'vis']

# Pair every network's rows with the baseline rows.
aud_w_baseline = pd.concat([baseline, aud])
cingulo_w_baseline = pd.concat([baseline, cingulo])
dmn_w_baseline = pd.concat([baseline, dmn])
dorsal_w_baseline = pd.concat([baseline, dorsal])
fronto_w_baseline = pd.concat([baseline, fronto])
retro_w_baseline = pd.concat([baseline, retro])
smhand_w_baseline = pd.concat([baseline, smhand])
smmouth_w_baseline = pd.concat([baseline, smmouth])
ventral_w_baseline = pd.concat([baseline, ventral])
vis_w_baseline = pd.concat([baseline, vis])

# Summary of each network-plus-baseline frame, grouped by network.
summary_aud = rp.summary_cont(aud_w_baseline.groupby('network'))
summary_cingulo = rp.summary_cont(cingulo_w_baseline.groupby('network'))
summary_dmn = rp.summary_cont(dmn_w_baseline.groupby('network'))
summary_dorsal = rp.summary_cont(dorsal_w_baseline.groupby('network'))
summary_fronto = rp.summary_cont(fronto_w_baseline.groupby('network'))
summary_retro = rp.summary_cont(retro_w_baseline.groupby('network'))
summary_smhand = rp.summary_cont(smhand_w_baseline.groupby('network'))
summary_smmouth = rp.summary_cont(smmouth_w_baseline.groupby('network'))
summary_ventral = rp.summary_cont(ventral_w_baseline.groupby('network'))
# Uses the visual-network frame; this previously summarised aud_w_baseline
# again (copy-paste slip), so summary_vis duplicated summary_aud.
summary_vis = rp.summary_cont(vis_w_baseline.groupby('network'))

# Finer-grained summaries, additionally grouped by the four state columns.
summary_aud2 = rp.summary_cont(
    aud_w_baseline.groupby(['network', 'awake', 'mild', 'deep', 'recovery']))
summary_cingulo2 = rp.summary_cont(
    cingulo_w_baseline.groupby(
        ['network', 'awake', 'mild', 'deep', 'recovery']))
def descriptiveStatistics():
    """Compute descriptive statistics of the frame from ``checkEmptyValues()``.

    Returns:
        A 3-tuple ``(continuous_summary, categorical_summary, covariance)``:
        the ``rp.summary_cont`` table of "Üretim" and "Tohum Fiyatı", the
        ``rp.summary_cat`` table of "EkildiğiAy", "Bölge" and "ÜretimSüresi",
        and the covariance matrix of "Üretim" and "EkilenAlan".
        The three results used to be computed and then thrown away; returning
        them is harmless for callers that ignore the return value.
    """
    df = checkEmptyValues()

    continuous_summary = rp.summary_cont(df[["Üretim", "Tohum Fiyatı"]])
    categorical_summary = rp.summary_cat(
        df[["EkildiğiAy", "Bölge", "ÜretimSüresi"]])
    covariance = df[["Üretim", "EkilenAlan"]].cov()

    return continuous_summary, categorical_summary, covariance
print(my_contracts_df.head(5))

# Use the ContractPK column as the index instead of the 0, 1, 2, ... values
# that are entered by default.
my_contracts_df.set_index('ContractPK', inplace=True)
print(my_contracts_df.head(5))
pd.set_option('display.max_columns', 10)

# To describe the data, build a correlation matrix and see how correlated
# the columns are.
corrMatrix = my_contracts_df.corr()
print(corrMatrix)

# For as many features as there are in these data, they don't seem to be
# excessively correlated.
sns.heatmap(corrMatrix, annot=True)
plt.show()

print(my_contracts_df.info())

# Summary of every numeric feature, split by contract status.
contract_status = my_contracts_df['Status']
rp.summary_cont(my_contracts_df['QuotedPrice'].groupby(contract_status))
rp.summary_cont(
    my_contracts_df['NumberofSocialMediaConnections'].groupby(contract_status))
rp.summary_cont(my_contracts_df['SizeOfSalesTeam'].groupby(contract_status))
rp.summary_cont(my_contracts_df['SalesTeamExperience'].groupby(contract_status))
rp.summary_cont(my_contracts_df['NumberPriorPurchases'].groupby(contract_status))
rp.summary_cont(my_contracts_df['CreditPercentage'].groupby(contract_status))
rp.summary_cont(my_contracts_df['InterestRate'].groupby(contract_status))
rp.summary_cont(my_contracts_df['FinanceTermMonths'].groupby(contract_status))
else: df.iat[column, 6] = 3 print(df["Excentricidad"]) df['Excentricidad'].replace({1: "baja", 2: 'media', 3: 'alta'}, inplace=True) print(df["Excentricidad"]) logX = np.log1p(df['Mediana']) df = df.assign(mediana_log=logX.values) df.drop(['Mediana'], axis=1, inplace=True) factores = [ "Grado", "CoefAg", "CentCer", "CentCag", "Excentricidad", "PageRag" ] plt.figure(figsize=(8, 6)) for i in factores: print(rp.summary_cont(df['FlujoMax'].groupby(df[i]))) anova = pg.anova( dv='FlujoMax', between=i, data=df, detailed=True, ) pg._export_table(anova, ("ANOVAsFlujoMax" + i + ".csv")) ax = sns.boxplot(x=df["FlujoMax"], y=df[i], data=df, palette="cubehelix") plt.savefig("boxplot_FlujoMax" + i + ".eps", bbox_inches='tight') tukey = pairwise_tukeyhsd(endog=df["FlujoMax"], groups=df[i], alpha=0.05) tukey.plot_simultaneous(xlabel='Flujo Maximo', ylabel=i)
# Central tendency and spread of the body word count
print(df['body_word_count'].median())
print(df['body_word_count'].mean())

norm_data = df['body_word_count']
print("Standard deviation of body word count is: ", norm_data.std(), sep="")
print("The varianve of the body word count is: ", norm_data.var(), sep="")

# Total vs. unique word counts: probability plot of the difference and t-test
diff = df['body_word_count'] - df['body_unique_words']
stats.probplot(diff, plot=plt)
stats.ttest_ind(df['body_word_count'], df['body_unique_words'])

rp.summary_cont(df.groupby('publish_time')['body_word_count'])
df['avg_word'].mean()

df[['body_word_count', 'numerics']].plot(kind='box')
plt.show()

# Spearman rank correlations between pairs of columns
df['publish_time'].corr(df['body_word_count'], method='spearman')
df['publish_time'].corr(df['numerics'], method='spearman')
df['publish_time'].corr(df['stopwords'], method='spearman')
df['abstract_word_count'].corr(df['body_word_count'], method='spearman')
df['body_word_count'].corr(df['numerics'], method='spearman')
df['body_word_count'].corr(df['stopwords'], method='spearman')

df['stopwords'].mean()
df['numerics'].mean()
df['numerics'].mode()
import scipy.stats as stats
import matplotlib.pyplot as plt
import researchpy as rp
import statsmodels.api as sm
from statsmodels.formula.api import ols
import seaborn as sns
import numpy as np
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

df = pd.read_csv(
    "Parameterized_dataset_OQ7E9S7KML_2019_3_18_5_18_58_44431.csv",
    index_col=None)
df.drop(columns=['run', 'iterations'], inplace=True)

print(rp.summary_cont(df['mean_exec_time']))

# Swap the raw execution time for its base-10 logarithm
logX = np.log10(df['mean_exec_time'])
print(logX)
df = df.assign(mean_exec_time_log=logX.values)
print(df)
df.drop(columns=['mean_exec_time'], inplace=True)
print(df)

print(rp.summary_cont(df['mean_exec_time_log'].groupby(df['algorithm'])))

# One-way ANOVA (type II) of the log execution time against the algorithm
results = ols('mean_exec_time_log ~ C(algorithm)', data=df).fit()
print(results.summary())

aov_table = sm.stats.anova_lm(results, typ=2)
print(aov_table)
def test_significance(df, dependent_var, *independent_vars, formula=None, logit_model=False, correction_method='bonf', anova_type=2):
    """
    Test the significance of independent vars on the dependent var and output
    the complete results of each step.

    This doesn't let us tune as many parameters as we might want to. (Don't
    use this generally)

    The steps are: group summary, model fit, Shapiro-Wilk test on the
    residuals, Bartlett (normal residuals) or Levene (otherwise) test across
    the groups, condition-number check, and finally one follow-up analysis:
    ANOVA on the OLS fit, ANOVA on a robust (RLM) fit, or Kruskal-Wallis.
    With ``logit_model=True`` only the summary and the model fit are
    produced.

    Args:
        df: DataFrame
        dependent_var: The name of the dependent variable column in df
        independent_vars: Names of the independent variable columns in df
        formula (str): A formula relating the vars. If not specified, no
            interactions are assumed
        logit_model (bool): Fit a binomial GLM instead of OLS
        correction_method (str): Passed on to ``test_using_kruskal``
        anova_type (int): Passed on to ``test_using_anova``

    Returns:
        output (str) : A string to print the results of each test
        results (dict) : A dictionary of results corresponding to each test
    """
    ALPHA = 0.05  # Used for diagnostic tests
    group_columns = list(independent_vars)

    output = ''
    results = {
        'multicollinearity': False,
        'homoskedastic': True,
        'normal_distribution': True,
    }

    # First add the summary data
    summary_df = rp.summary_cont(df.groupby(group_columns)[dependent_var])
    summary_df['median'] = df.groupby(group_columns)[dependent_var].median()
    output += f'Summary:\n{summary_df}\n\n'
    results['summary'] = summary_df

    # Get the OLS model formula
    if formula is None:
        formula = f"{dependent_var} ~ {' + '.join([f'C({v})' for v in independent_vars])} "

    # Then create the model and fit the data
    if not logit_model:
        model = smapi.ols(formula, data=df)
    else:
        # model = smapi.logit(formula, data=df)
        model = smapi.glm(formula, data=df, family=sm.families.Binomial())
    model_results = model.fit()
    output += f"{model_results.summary()}\n\n"
    results['initial'] = model_results

    # Check for normality
    if not logit_model:
        w, pvalue = spstats.shapiro(model_results.resid)
        output += f'Shapiro-Wilk test: {w, pvalue}\n\n'
        results['shapiro'] = (w, pvalue, )
        # if pvalue < 1e-4:
        if pvalue < ALPHA:
            output += 'NON NORMAL detected. Do something else\n\n'
            results['normal_distribution'] = False

    # Check for homoskedasticity based on the normality test
    if not logit_model:
        unique_values = df.groupby(group_columns).size().reset_index().rename(
            columns={0: 'count'})

        hs_test_data = []
        # Plain tuples (name=None) are matched to the variables by position,
        # so column names that are not valid identifiers work too.
        for row in unique_values.itertuples(index=False, name=None):
            # One equality mask per independent variable, AND-ed together.
            # Reducing over the whole list handles any number of variables;
            # the old code silently ignored every variable after the third.
            selectors = [df[v] == level
                         for v, level in zip(independent_vars, row)]
            row_selector = np.logical_and.reduce(selectors)
            hs_test_data.append(df.loc[row_selector, dependent_var])
        assert len(hs_test_data) == unique_values.shape[0]

        if results['normal_distribution']:
            w, pvalue = spstats.bartlett(*hs_test_data)
            output += f'Bartlett test: {w, pvalue}\n\n'
            results['bartlett'] = (w, pvalue, )
        else:
            w, pvalue = spstats.levene(*hs_test_data)
            output += f'Levene test: {w, pvalue}\n\n'
            results['levene'] = (w, pvalue, )

        if pvalue < ALPHA:
            output += 'HETEROSKEDASTICITY detected. Do something else\n\n'
            results['homoskedastic'] = False

    # Check that the condition number is reasonable
    # NOTE(review): `diagn` appears to be populated only by the OLS results'
    # summary(); the lookup is guarded so other result types (e.g. the GLM
    # used for logit_model) skip the check instead of raising - confirm.
    diagnostics = getattr(model_results, 'diagn', None)
    if diagnostics is not None and diagnostics['condno'] > 20:
        output += 'MULTICOLLINEARITY detected. Do something else\n\n'
        results['multicollinearity'] = True

    # If we are normal, non-multicollinear, and homoskedastic, perform ANOVA
    # and then multiple comparisons using Tukey's HSD. If heteroskedastic, then
    # we should use robust regression. Else, use a non-parametric test
    # TODO: Perhaps we should look into using the Wald test instead?
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.wald_test.html
    if results['normal_distribution'] and results['homoskedastic'] and not logit_model:
        o, r = test_using_anova(model, model_results, True,
                                df, dependent_var, *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)
    elif results['normal_distribution'] and not logit_model:
        model = smapi.rlm(formula, data=df)
        rlm_results = model.fit()
        output += f"{rlm_results.summary()}\n\n"
        results['rlm'] = rlm_results

        o, r = test_using_anova(model, rlm_results, False,
                                df, dependent_var, *independent_vars,
                                anova_type=anova_type)
        output += o
        results.update(r)
    elif not logit_model:
        o, r = test_using_kruskal(df, dependent_var, *independent_vars,
                                  correction_method=correction_method)
        output += o
        results.update(r)

    # Return the outputs
    return output, results
Date creation: Jan-3-2020 Description: This program runs a two factor ANOVA on the data set contained in the file RI.csv. ''' import pandas import researchpy as rp import statsmodels.api as sm from statsmodels.formula.api import ols import statsmodels.stats.multicomp #Imports data from csv file df = pandas.read_csv('RI.csv') #Summary of the RI (Grand Mean) sum_RI = rp.summary_cont(df['RI']) print('\n--Overall summary:\n') print(sum_RI) #Summary of RI ordered by Genotype and Concentration sum_RI_con = rp.summary_cont(df.groupby(['Genotype', 'Concentration']))['RI'] print('\n--Overall summary by groups:\n') print(sum_RI_con) #Fits the regression model. We include the interaction of Genotype and Concentration model = ols('RI ~ C(Genotype) * C(Concentration) ', df).fit() #Shows if the overall model is statistically significant print(f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}") #Summary of the model
print(" ", "DATASET SCORES BY METHODS:", " ", sep="\n")

# Reshape the raw values to (4, 7, 2), keep element 1 of the last axis and
# store each of the 4 rows as one method column.
dg = pd.DataFrame(index=range(7),
                  columns=['Method A', 'Method B', 'Method C', 'Method D'])
dfe = df.values.reshape(4, 7, 2)
dfee = dfe[:, :, 1]
for j, method in enumerate(dg.columns):
    dg[method] = dfee[j]
print(dg)
#df['Method'].replace({1:"Method A", 2: "Method B",3:"Method C", 4:"Method D"},inplace=True)

print(rp.summary_cont(df['Scores']))
print(" ", "Descritive Statistics for outcome variable DV", " ", sep="\n")
print(rp.summary_cont(df['Scores'].groupby(df['Method'])))

print(" ")
print("ASSUMPTIONS FOR ANOVA TEST")

# INDEPENDENCE
print(" ", "INDEPENDENCE", " ", "It is Assumed due to the statement ", " ",
      sep="\n")

# NORMALITY
df = pd.read_csv("../data/all_annotated.csv", parse_dates=['publish_date'])
tracker = open('../outputs/stats.csv', "w", newline="")

# Summary stats
tracker.write("Statistics\r\n")

# HYPOTHESIS 1: Kashmir over Pakistan
tracker.write(
    "\nHYPOTHESIS 1: Kashmir-related headlines will have more negative sentiment scores on average than non-Kashmir-related in any given year\r\n"
)

# Levene test for equal variances between Kashmir and non-Kashmir headlines
total_scores = df['total_score']
kashmir_scores = total_scores[df['is_kashmir'] == True]
other_scores = total_scores[df['is_kashmir'] == False]
levene = stats.levene(kashmir_scores, other_scores)
tracker.write("Variance is equal: %r, %s\n" % ((levene[1] > .05), levene))

# Score summaries by year, then by Kashmir relation, appended to the tracker
for grouping in (['year'], ['is_kashmir']):
    rp.summary_cont(df.groupby(grouping)['total_score']).to_csv(tracker,
                                                               mode="a")

tracker.write("Summary by year and relation to Kashmir\n")
rp.summary_cont(df.groupby(['is_kashmir', 'year'])['total_score']).to_csv(
    tracker, mode="a")

model = ols('total_score ~ C(year)*C(is_kashmir)', df).fit()

# Is the model significant as a whole?
tracker.write("\nSeeing if overall model is significant\n")
tracker.write(
    f"Overall model F({model.df_model: .0f},{model.df_resid: .0f}) = {model.fvalue: .3f}, p = {model.f_pvalue: .4f}\n"
)
tracker.write(str(model.summary()))

tracker.write("\nTwo-way ANOVA\n")
res = sm.stats.anova_lm(model, typ=2)
import pandas as pd
import researchpy as rp

# From https://www.pythonfordatascience.org/anova-python/

df = pd.read_csv("../datasets/difficile.csv")
df.drop('person', axis=1, inplace=True)

# Recoding value from numeric to string.
# The result is assigned back to the column: an in-place replace() on
# `df['dose']` is chained assignment and may act on a temporary object,
# leaving `df` unchanged.
df['dose'] = df['dose'].replace({1: 'placebo', 2: 'low', 3: 'high'})

df.info()

rp.summary_cont(df['libido'])
rp.summary_cont(df['libido'].groupby(df['dose']))