def descriptiveStatistics():
    """Run the descriptive-statistics calls on the frame from checkEmptyValues().

    Three computations are performed, in this order:

    * ``rp.summary_cont`` on the "Üretim" and "Tohum Fiyatı" columns,
    * ``rp.summary_cat`` on the "EkildiğiAy", "Bölge" and "ÜretimSüresi"
      columns,
    * the covariance matrix of the "Üretim" and "EkilenAlan" columns.

    Returns:
        None. None of the three results is stored, printed or returned by
        this function.
    """
    continuous_cols = ["Üretim", "Tohum Fiyatı"]
    categorical_cols = ["EkildiğiAy", "Bölge", "ÜretimSüresi"]
    covariance_cols = ["Üretim", "EkilenAlan"]

    frame = checkEmptyValues()
    rp.summary_cont(frame[continuous_cols])
    rp.summary_cat(frame[categorical_cols])
    frame[covariance_cols].cov()
###############################################################################################
##  This code implements the Exploratory Data Analysis                                       ##
##                                                                                           ##
###############################################################################################

# Load the "diabetes_clean_data" table from Postgres into a dataframe
df_cleaned = pd.read_sql_table("diabetes_clean_data", engine)
print(df_cleaned.head(5))

# 69,710 records in the dataset
print(df_cleaned.describe())

# Frequency table for each categorical column of interest. Findings recorded
# by the original analysis:
#   admission_source_id - 53.25% of admissions came from Trauma Center followed by ED
#   race                - ~75% of cases were caucasian followed by 18% for African American
#   gender              - 53% cases were female, 47% male
for _column in ('admission_source_id', 'race', 'gender'):
    print(rp.summary_cat(df_cleaned[[_column]]))

# Pie chart of the gender split.
# NOTE(review): the labels are hard-coded while the sizes come from
# value_counts(); this assumes exactly two gender values with 'Female' being
# the more frequent one - confirm against the data.
labels = ['Female', 'Male']
sizes = df_cleaned['gender'].value_counts()
explode = (0, 0.1)
colors = ['pink', 'lightblue']

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90,
    colors=colors,
)
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which printed a stray "None").
df.info()

# We need the four last characters of published in our new variable
df['year'] = df['published'].str[-4:]
df['year'] = df['year'].astype(int)


def yearcat(x):
    """Map a year to the label of its reporting bucket.

    Args:
        x: The year as a number.

    Returns:
        '2005 until 2010' for 2005 <= x < 2010, '2010 until 2015' for
        2010 <= x < 2015, '2015 until now' for x >= 2015, and
        'before 2005' for anything earlier.
    """
    if x >= 2015:
        return '2015 until now'
    if x >= 2010:
        return '2010 until 2015'
    if x >= 2005:
        return '2005 until 2010'
    # Years before 2005 previously fell through every branch and raised
    # UnboundLocalError; give them an explicit bucket instead.
    return 'before 2005'


df['yearcat'] = df['year'].apply(yearcat)

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['yearcat'])
print(catsum)

### BAR CHART
# x-labels come from the 'Outcome' column of catsum, bar heights from 'Percent'
plt.bar(catsum['Outcome'], catsum['Percent'])
plt.ylabel('Percentage')

# Saving the image to a file
plt.savefig('barchart.pdf')
plt.clf()  # Clear figure
# ACCESS_METHOD_ID varibale me aksar values 2 dafa repeat hwi hen, magar kuch values 1 dafa i hen, jo 1 dafa hen un k lye 2nd line # add karni h, jis me access_method_id call to same ho, or IsWeekDay me oposite ho(agar exist me ho to new me 1 and vise versa) # or baqi columns me 0 def add_row(row): global df df.loc[-1] = row df.index = df.index + 1 counts = df.ACCESS_METHOD_ID_.value_counts() for i in counts[counts == 1].index: m = 0 if df[df.ACCESS_METHOD_ID_ == i].IsWeekDay_.values == 1 else 1 add_row([i, m, 0, 0, 0, 0, 0, 0, 0, 0]) ------------------------------------------------- count and proportion of values in a column: import researchpy as rp rp.summary_cat(df) ------------------------------------------------- Add a prefix to all of your column names: df.add_prefix('X_') Add a suffix to all of your column names: df.add_suffix('_Y') ------------------------------------------------- # continues variables to catagorical with bins... df['age_groups'] = pd.cut(df.age, bins=[0, 18, 65, 99], labels=['child', 'adult', 'elderly']) ------------------------------------------------- df = df.replace(r'^\s*$', " ", regex=True) ------------------------------------------------- # fill diffirent variables with distinct value df1.fillna({"column_x":0.5, "column_n":0}) -------------------------------------------------
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

df = pd.read_csv('imdb_data_clean.csv', delimiter=';')

# Set the maximum number of rows to show, in/decrease to needs.
# The fully qualified option name is used because the abbreviated key
# 'max_rows' is ambiguous in newer pandas releases (it also matches
# 'styler.render.max_rows') and then raises OptionError.
pd.set_option('display.max_rows', 9999)

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['productionlocation'])
print(catsum)
# Mode is value with highest count: 'USA'
# Counts are absolute frequencies
# Percentages are relative frequencies

### BAR CHART (absolute frequencies)
# x-labels based on outcome strings of catsum
# bar height based on count figures of catsum
plt.bar(catsum['Outcome'], catsum['Count'])

# Saving the image to a file
plt.savefig('productionlocation-absolute-barchart.pdf')
plt.clf()  # Clear figure

### BAR CHART (relative frequencies)
# x-labels based on outcome strings of catsum
# bar height based on percentage figures of catsum
plt.bar(catsum['Outcome'], catsum['Percent'])

# Saving the image to a file
plt.savefig('productionlocation-relative-barchart.pdf')
# Initial work on raw_df main_df = raw_df.copy() # Drop 'person' column - we don't need it main_df = main_df.drop('person', axis=1) # Map 'dose' column values with string analogues main_df['dose'] = main_df['dose'].map({1: 'placebo', 2: 'low', 3: 'high'}) display(main_df['dose']) # In[6]: display(rp.summary_cat(main_df['dose'])) display(rp.summary_cat(main_df['libido'])) # In[7]: rp.summary_cont(main_df['libido'].groupby(main_df['dose'])) # In[8]: # ANOVA example with scipy.stats display( stats.f_oneway( main_df['libido'][main_df['dose'] == 'high'], # sample1 main_df['libido'][main_df['dose'] == 'low'], # sample2
# These are for running the model and conducting model diagnostics
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from statsmodels.compat import lzip

df = pd.read_csv('insurance.csv')

print("============================================")
# Let's get more information on the continuous variables
print(rp.summary_cont(df[['charges', 'age', 'children']]))

print("\n===========================================")
# Let's get more information on the categorical data
print(rp.summary_cat(df[['sex', 'smoker', 'region']]))

# Encode the two binary columns as 0/1 by assigning the mapped column back.
# The previous `df[col].replace({...}, inplace=True)` form mutates a column
# selection in place, which pandas has deprecated and which leaves `df`
# unchanged under Copy-on-Write; it also relied on the deprecated implicit
# object -> int downcast. Series.map yields a numeric column directly (a value
# outside the mapping becomes NaN), so get_dummies below leaves it alone.
df['sex'] = df['sex'].map({'female': 1, 'male': 0})
df['smoker'] = df['smoker'].map({'no': 0, 'yes': 1})

# One-hot encode what is still non-numeric (the formula below expects the
# resulting region_* columns)
df = pd.get_dummies(df)

print("\n===========================================")
print(df.head())

print("\n===========================================")
# region_northeast is left out of the formula, so it acts as the baseline
model = smf.ols(
    "charges ~ age + bmi + sex + smoker + children + region_northwest + region_southeast + region_southwest",
    data=df).fit()
print(model.summary())
jitter=True, edgecolor='none', alpha=.40)
# Strip plot of charges per smoker category
smoker_charg = sns.stripplot(x='smoker', y='charges', data=data,
                             jitter=True, edgecolor='none', alpha=.40)
# Hexbin joint plots of charges against age and against bmi
age_charg = sns.jointplot(x='age', y='charges', data=data, kind='hex')
bmi_charg = sns.jointplot(x='bmi', y='charges', data=data, kind='hex')

# overview of data
# NOTE(review): the two summary tables are not printed or stored here
rp.summary_cont(data[['charges', 'age', 'children']])
rp.summary_cat(data[['sex', 'smoker', 'region']])

# creating dummy columns
data = pd.get_dummies(data)

# generating model
# The formula uses one dummy per binary column (sex_female, smoker_yes) and
# three of the region_* dummies
model = smf.ols(
    "charges ~ age + bmi + sex_female + smoker_yes + children + region_northwest + region_southeast + region_southwest",
    data=data).fit()

# checking for normality
# Probability plot of the model residuals, drawn through pyplot
qqplot = stats.probplot(model.resid, plot=plt)

# checking for heteroscedasticity
# `name` holds the labels for the values returned by the Breusch-Pagan test
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
#print(df.info())
#print(df.head())
#print(rp.summary_cat(df[['Do you currently have a mental health disorder?', 'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?' ]]))


def drop_maybe(series):
    """Keep a yes/no answer and discard everything else.

    Despite its name, `series` is a single cell value: the function is
    applied element-wise through ``Series.apply``.

    Args:
        series: One answer from the survey column.

    Returns:
        The answer unchanged when it is a string equal to 'yes' or 'no'
        (case-insensitive); otherwise None, which also covers missing or
        non-string values that previously raised AttributeError on
        ``.lower()``.
    """
    if isinstance(series, str) and series.lower() in ('yes', 'no'):
        return series
    return None


df['current_mental_disorder'] = df['Do you currently have a mental health disorder?'].apply(drop_maybe)
df['willing_discuss_mh_supervisor'] = df['Would you have been willing to discuss a mental health issue with your direct supervisor(s)?']

print(rp.summary_cat(df[['current_mental_disorder', 'willing_discuss_mh_supervisor']]))

print("\n **************************** \n")
# DataFrame.info() prints its own report and returns None, so it is called
# directly rather than through print() (which added a stray "None" line).
df[['current_mental_disorder', 'willing_discuss_mh_supervisor']].info()

print("\n **************************** \n")
# Build the contingency table once and reuse it for display and for the test
crosstab = pd.crosstab(df['willing_discuss_mh_supervisor'], df['current_mental_disorder'])
print(crosstab)

print("\n **************************** \n")
print(stats.chi2_contingency(crosstab))

# The H0 (Null Hypothesis): There is no relationship between variable one and variable two
# The H1 (Alternative Hypothesis): There is a relationship between variable 1 and variable 2
# If the p-value is significant, you can reject the null hypothesis and claim that the findings support the alternative hypothesis

# ******** Assumptions **********
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

df = pd.read_csv(
    'https://www.digitalanalytics.id.au/static/files/youtube_vevo_clean.csv',
    delimiter=',')

# Set max of rows to show, in/decrease to needs.
# The fully qualified option name is used because the abbreviated key
# 'max_rows' is ambiguous in newer pandas releases (it also matches
# 'styler.render.max_rows') and then raises OptionError.
pd.set_option('display.max_rows', 9999)

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['view_cat'])
print(catsum)
# Mode is the outcome with the highest count
# Counts are absolute frequencies
# Percentages are relative frequencies

### BAR CHART
# x-labels based on outcome strings of catsum
# bar height based on count figures of catsum
plt.bar(catsum['Outcome'], catsum['Count'])

# Adjust the layout before the figure is written out
plt.tight_layout()

# Saving the image to a file
plt.savefig('view_cat-absolute-barchart.pdf')

# Clear figure
plt.clf()

### BAR CHART