def descriptiveStatistics():
    # Summary statistics for the cleaned data: continuous, categorical, and covariance
    df = checkEmptyValues()
    print(rp.summary_cont(df[["Üretim", "Tohum Fiyatı"]]))
    print(rp.summary_cat(df[["EkildiğiAy", "Bölge", "ÜretimSüresi"]]))
    print(df[["Üretim", "EkilenAlan"]].cov())
###############################################################################################
## This code implements the Exploratory Data Analysis                                        ##
##                                                                                           ##
############################################################################################### 

# Read the cleaned data from Postgres into a DataFrame
df_cleaned = pd.read_sql_table("diabetes_clean_data", engine)
print(df_cleaned.head(5))


# 69,710 records in the dataset 
print(df_cleaned.describe())


# 53.25% of admissions came from the Trauma Center, followed by the ED
print(rp.summary_cat(df_cleaned[['admission_source_id']]))


# ~75% of cases were Caucasian, followed by ~18% African American
print(rp.summary_cat(df_cleaned[['race']]))


# 53% of cases were female, 47% male
print(rp.summary_cat(df_cleaned[['gender']]))
sizes = df_cleaned.gender.value_counts()   # Female is the larger group, so it comes first
labels = ['Female', 'Male']                # must match the value_counts() order above
explode = (0, 0.1)
colors = ['pink', 'lightblue']
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90, colors=colors)
Example #3
df.info()  # info() prints its output directly and returns None

# We need the last four characters of 'published' (the year) in a new variable
df['year'] = df['published'].str[-4:]
df['year'] = df['year'].astype(int)


def yearcat(x):
    # Bin the publication year into period categories
    if 2005 <= x < 2010:
        y = '2005 until 2010'
    elif 2010 <= x < 2015:
        y = '2010 until 2015'
    elif x >= 2015:
        y = '2015 until now'
    else:
        y = 'before 2005'   # avoids an UnboundLocalError for years before 2005
    return y


df['yearcat'] = df['year'].apply(yearcat)
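# A sketch (not in the original script): the same binning can be done with pd.cut, assuming
# the identical cut-off years; here the result goes into a hypothetical 'yearcat_cut' column.
df['yearcat_cut'] = pd.cut(df['year'],
                           bins=[2005, 2010, 2015, df['year'].max() + 1],
                           labels=['2005 until 2010', '2010 until 2015', '2015 until now'],
                           right=False)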

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['yearcat'])
print(catsum)

### BAR CHART
plt.bar(catsum['Outcome'], catsum['Percent'])
plt.ylabel('Percentage')

# Saving the image to a file
plt.savefig('barchart.pdf')
plt.clf()  # Clear figure
Example #4
# In the ACCESS_METHOD_ID variable most values appear twice, but some appear only once.
# For each value that appears only once, add a second row with the same access_method_id,
# the opposite IsWeekDay value (1 if the existing row has 0 and vice versa), and 0 in the
# remaining columns.
def add_row(row):
    # Append `row` to the global df: write it at index -1, then shift the whole
    # index up by one so the new row ends up at index 0.
    global df
    df.loc[-1] = row
    df.index = df.index + 1
    
counts = df.ACCESS_METHOD_ID_.value_counts()
for i in counts[counts == 1].index:
    # Flip IsWeekDay_ relative to the single existing row for this id
    m = 0 if df[df.ACCESS_METHOD_ID_ == i].IsWeekDay_.values[0] == 1 else 1
    add_row([i, m, 0, 0, 0, 0, 0, 0, 0, 0])
-------------------------------------------------
Count and proportion of values in a column:
import researchpy as rp
rp.summary_cat(df)
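A plain-pandas sketch of the same idea (the column name 'col' is only a placeholder):
df['col'].value_counts()                  # counts
df['col'].value_counts(normalize=True)    # proportions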
-------------------------------------------------
Add a prefix to all of your column names (returns a new DataFrame, so assign the result):
df = df.add_prefix('X_')
Add a suffix to all of your column names:
df = df.add_suffix('_Y')
-------------------------------------------------
# Continuous variables to categorical with bins
df['age_groups'] = pd.cut(df.age, bins=[0, 18, 65, 99], labels=['child', 'adult', 'elderly'])
-------------------------------------------------
# Replace empty or whitespace-only strings with a single space
df = df.replace(r'^\s*$', " ", regex=True)
-------------------------------------------------
# Fill different columns with distinct values
df1 = df1.fillna({"column_x": 0.5,
                  "column_n": 0})
-------------------------------------------------
Example #5
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

df = pd.read_csv('imdb_data_clean.csv', delimiter = ';') 

pd.set_option('display.max_rows', 9999)  # Set the maximum number of rows to show; adjust to needs

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['productionlocation'])
print(catsum)
# Mode is value with highest count: 'USA'
# Counts are absolute frequencies
# Percentages are relative frequencies
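# A sketch (not in the original script): the mode can also be read off catsum directly
# by taking the Outcome with the largest Count.
print(catsum.loc[catsum['Count'].idxmax(), 'Outcome'])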

### BAR CHART
plt.bar(catsum['Outcome'],catsum['Count'])
# x-labels based on outcome strings of catsum
# bar height based on count figures of catsum

# Saving the image to a file
plt.savefig('productionlocation-absolute-barchart.pdf')
plt.clf() # Clear figure

### BAR CHART
plt.bar(catsum['Outcome'],catsum['Percent'])
# x-labels based on outcome strings of catsum
# bar height based on percentage figures of catsum

# Saving the image to a file
plt.savefig('productionlocation-relative-barchart.pdf')
Example #6
# Initial work on raw_df

main_df = raw_df.copy()

# Drop 'person' column - we don't need it
main_df = main_df.drop('person', axis=1)

# Map 'dose' column values with string analogues
main_df['dose'] = main_df['dose'].map({1: 'placebo', 2: 'low', 3: 'high'})

display(main_df['dose'])

# In[6]:

display(rp.summary_cat(main_df['dose']))

display(rp.summary_cat(main_df['libido']))

# In[7]:

display(rp.summary_cont(main_df['libido'].groupby(main_df['dose'])))

# In[8]:

# ANOVA example with scipy.stats

display(
    stats.f_oneway(
        main_df['libido'][main_df['dose'] == 'high'],     # sample1
        main_df['libido'][main_df['dose'] == 'low'],      # sample2
        main_df['libido'][main_df['dose'] == 'placebo'])  # sample3
)
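# A sketch (not in the original notebook): f_oneway returns (statistic, pvalue), so the result
# can also be unpacked and compared against a significance level such as 0.05.
f_stat, p_val = stats.f_oneway(main_df['libido'][main_df['dose'] == 'placebo'],
                               main_df['libido'][main_df['dose'] == 'low'],
                               main_df['libido'][main_df['dose'] == 'high'])
print('F = %.3f, p = %.3f (significant at 0.05: %s)' % (f_stat, p_val, p_val < 0.05))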
Example #7
import pandas as pd
import researchpy as rp

# These are for running the model and conducting model diagnostics
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
from scipy import stats
from statsmodels.compat import lzip

df = pd.read_csv('insurance.csv')

print("============================================")
# Let's get more information on the continuous variables
print(rp.summary_cont(df[['charges', 'age', 'children']]))

print("\n===========================================")
# Let's get more information on the categorical data
print(rp.summary_cat(df[['sex', 'smoker', 'region']]))

df['sex'] = df['sex'].replace({'female': 1, 'male': 0})
df['smoker'] = df['smoker'].replace({'no': 0, 'yes': 1})

df = pd.get_dummies(df)

print("\n===========================================")
print(df.head())

print("\n===========================================")
model = smf.ols(
    "charges ~ age + bmi + sex + smoker + children + region_northwest + region_southeast + region_southwest",
    data=df).fit()

print(model.summary())
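# A sketch (not in the original script): individual results can also be read off the
# fitted model, e.g. the coefficients and the R-squared.
print(model.params)
print('R-squared: %.3f' % model.rsquared)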
# The start of this snippet was truncated; assumed here: load the insurance data as `data`
# and draw a strip plot of sex against charges, mirroring the smoker plot below.
data = pd.read_csv('insurance.csv')
sex_charg = sns.stripplot(x='sex', y='charges', data=data,
                          jitter=True, edgecolor='none', alpha=.40)
smoker_charg = sns.stripplot(x='smoker',
                             y='charges',
                             data=data,
                             jitter=True,
                             edgecolor='none',
                             alpha=.40)
age_charg = sns.jointplot(x='age', y='charges', data=data, kind='hex')
bmi_charg = sns.jointplot(x='bmi', y='charges', data=data, kind='hex')

# Overview of the data
print(rp.summary_cont(data[['charges', 'age', 'children']]))

print(rp.summary_cat(data[['sex', 'smoker', 'region']]))

#creating dummy columns
data = pd.get_dummies(data)

#generating model
model = smf.ols(
    "charges ~ age + bmi + sex_female + smoker_yes + children + region_northwest + region_southeast + region_southwest",
    data=data).fit()

#checking for normality
qqplot = stats.probplot(model.resid, plot=plt)

#checking for heteroscedasticity (Breusch-Pagan test)
name = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
test = sms.het_breuschpagan(model.resid, model.model.exog)
print(list(zip(name, test)))  # a small p-value suggests heteroscedastic residuals
#print(df.info())
#print(df.head())

#print(rp.summary_cat(df[['Do you currently have a mental health disorder?', 'Would you have been willing to discuss a mental health issue with your direct supervisor(s)?' ]]))

def drop_maybe(value):
    # Keep only clear 'yes'/'no' answers; anything else (e.g. 'Maybe') becomes missing
    if value.lower() == 'yes' or value.lower() == 'no':
        return value
    else:
        return None

df['current_mental_disorder'] = df['Do you currently have a mental health disorder?'].apply(drop_maybe)
df['willing_discuss_mh_supervisor'] = df['Would you have been willing to discuss a mental health issue with your direct supervisor(s)?']

print(rp.summary_cat(df[['current_mental_disorder', 'willing_discuss_mh_supervisor']]))
print("\n **************************** \n")
print(df[['current_mental_disorder', 'willing_discuss_mh_supervisor']].info())
print("\n **************************** \n")
print(pd.crosstab(df['willing_discuss_mh_supervisor'], df['current_mental_disorder']))

crosstab = pd.crosstab(df['willing_discuss_mh_supervisor'], df['current_mental_disorder'])

print("\n **************************** \n")
print(stats.chi2_contingency(crosstab))

# The H0 (Null Hypothesis): There is no relationship between variable one and variable two
# The H1 (Alternative Hypothesis): There is a relationship between variable 1 and variable 2
# If the p-value is below the chosen significance level (e.g. 0.05), reject the null hypothesis: the findings support the alternative hypothesis
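# A sketch (not in the original script): unpack the test output and compare the p-value
# against a 0.05 significance level.
chi2, p, dof, expected = stats.chi2_contingency(crosstab)
print('chi2 = %.3f, p = %.4f, dof = %d' % (chi2, p, dof))
if p < 0.05:
    print('Reject H0: the two variables appear to be related.')
else:
    print('Fail to reject H0: no evidence of a relationship.')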

# ******** Assumptions **********
Example #10
import pandas as pd
import researchpy as rp
import matplotlib.pyplot as plt

df = pd.read_csv(
    'https://www.digitalanalytics.id.au/static/files/youtube_vevo_clean.csv',
    delimiter=',')

# Set the maximum number of rows to show; adjust to needs
pd.set_option('display.max_rows', 9999)

### MODE, FREQUENCIES, AND COUNTS
catsum = rp.summary_cat(df['view_cat'])
print(catsum)
# Mode is the value with the highest count
# Counts are absolute frequencies
# Percentages are relative frequencies

### BAR CHART
plt.bar(catsum['Outcome'], catsum['Count'])
# x-labels based on outcome strings of catsum
# bar height based on count figures of catsum

plt.tight_layout()
# Saving the image to a file
plt.savefig('view_cat-absolute-barchart.pdf')
# Clear figure
plt.clf()

### BAR CHART