Example #1
0
 def run(self):
     """Write every EDA report for ``self.df`` to an HTML file.

     Three dataprep outputs (overview, correlation, missing values) are
     saved with ``.save(...)``, followed by a pandas-profiling report
     written with ``.to_file(...)``. All files use fixed names relative to
     the current working directory.
     """
     frame = self.df

     # (renderer, output file) pairs, processed in the listed order.
     dataprep_outputs = (
         (plot, 'dataprep_plot.html'),
         (plot_correlation, 'dataprep_correlation.html'),
         (plot_missing, 'dataprep_missing.html'),
     )
     for render, target in dataprep_outputs:
         render(frame).save(target)

     profile = ProfileReport(frame, title='Pandas Profiling Report')
     profile.to_file('pandas_profiling_report.html')
# Load the stroke dataset from the working directory.
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data

data.describe()

# drop id
data.drop(columns=['id'], inplace=True)

# checking missing values (boolean mask: True where a cell is NaN)
data.isna()

# getting the count of null values in each column
data.isna().sum()

# checking if we have missing data
plot_missing(data)

# Impute missing BMI values with the BMI mean. Only the 'bmi' column is
# filled: calling DataFrame.fillna with a scalar would write the BMI mean into
# the NaNs of every other column as well.
data['bmi'] = data['bmi'].fillna(data['bmi'].mean())
data.info()

# Overview of every column.
plot(data)

# Per-column views.
plot(data, 'stroke')

plot(data, 'smoking_status')

plot(data, 'bmi')

plot(data, 'heart_disease')

# Correlations between columns.
plot_correlation(data)
Example #3
0
    # Print the row count, then one line per column showing the column name,
    # how many distinct values it holds, and one sample value.
    # NOTE(review): df[r][0] looks the sample up by the key 0, so this
    # presumably assumes a default 0-based index — confirm, or it may raise
    # on a frame whose index has no 0.
    print('#rows:', df.shape[0])  # number of rows
    for r in df.columns:
        print(
            r,
            ':',  # column name
            df[r].unique().shape[0],  # number of unique elements in the column
            '| example:',
            df[r][0])  # example of the first element in the column


# Summarise df with the helper.
vp_summ(df)

import dataprep.eda as eda

# dataprep views: 'country' distribution, correlations, missing-value impact.
eda.plot(df, 'country')
eda.plot_correlation(df, 'numeric-column')
eda.plot_missing(df, 'country')

# Summarizing: distinct show_id values per country, computed once and reused.
show_ids_by_country = df.groupby('country').nunique()[['show_id']]
show_ids_by_country.sort_values(by='show_id', ascending=False)
show_ids_by_country.sum()
7280 - 923

# Plotting
import plotly.graph_objects as go

# Category names and their sizes for the figure built below; 6357 equals the
# `7280 - 923` difference evaluated earlier in this snippet.
# NOTE(review): presumably the two lists are positionally aligned
# ('Indian movies' -> 923) — confirm against the chart call.
labels = ['All other movies', 'Indian movies']
values = [6357, 923]

# pull is given as a fraction of the pie radius
fig = go.Figure(
Example #4
0
 def missing_data_analysis(data):
     """Run ``plot_missing`` on *data* and return its result unchanged."""
     missing_report = plot_missing(data)
     return missing_report
Example #5
0
# cleaning the dataset
# select the columns we need (Quantity, InvoiceNo, InvoiceDate, UnitPrice,
# CustomerID) and derive Total Sales (Quantity * UnitPrice).
# .copy() makes df2 independent of df, so the column assignment below writes
# to df2 itself rather than to a slice of df (avoids SettingWithCopyWarning).
df2 = df[['Quantity', 'InvoiceNo', 'InvoiceDate', 'UnitPrice', 'CustomerID']].copy()
df2['TotalSales'] = df2.Quantity * df2.UnitPrice
df2.shape

# review descriptive statistics
df2.describe()

# drop non-positive sales (negative totals are due to returns)
# NOTE(review): df3 is taken before the null-CustomerID rows are removed from
# df2 below, so df3 still contains them — confirm this is intended.
df3 = df2[df2.TotalSales > 0]
df3.shape

# check how many CustomerID's are missing
dp.plot_missing(df2, 'CustomerID')
# alternate approach: per-column null count and null proportion
pd.DataFrame(zip(df2.isnull().sum(), df2.isnull().sum()/len(df2)), columns=['Count', 'Proportion'], index=df2.columns)

# drop rows with null CustomerID
df2 = df2[pd.notnull(df2.CustomerID)]

##############################################################

# aggregate model
# assumes a constant average spend and churn rate for all customers, and produces a single CLV value at the overall level
# downside - unrealistic estimates if some of the customers transacted in high value and high volume

'''
CLV = ((Average Sales X Purchase Frequency) / Churn) X Profit Margin
Where,
Average Sales = TotalSales/Total no. of orders
# Exploratory Data Analysis
# Each function below is called three ways: on the whole frame, on one column
# ('tip'), and on a pair of columns ('tip', 'total_bill').
from dataprep.eda import plot
plot(df) # distribution of each column, plus dataset statistics
plot(df,'tip') # distribution of the 'tip' column in various ways, plus column statistics
plot(df, 'tip', 'total_bill') # relationship between the 'tip' and 'total_bill' columns

# Plot correlations
from dataprep.eda import plot_correlation
plot_correlation(df) # correlation matrices (correlations between all pairs of columns)
plot_correlation(df, 'tip') # the columns most correlated with 'tip'
plot_correlation(df, 'tip', 'total_bill') # joint distribution of 'tip' and 'total_bill' with a regression line

# Plot missing data
from dataprep.eda import plot_missing
plot_missing(df) # amount and position of missing values, and their relationship between columns
plot_missing(df, 'tip') # impact of the missing values in 'tip' on all other columns
plot_missing(df, 'tip', 'total_bill') # impact of the missing values in 'tip' on 'total_bill', in various ways

# Report
'''
Overview: detect the types of columns in a dataframe
Variables: variable type, unique values, distinct count, missing values
Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
Text analysis for length, sample and letter
Correlations: highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices
Missing Values: bar chart, heatmap and spectrum of missing values
'''
# Build the report for df with the title 'My Report'.
# NOTE(review): the return value is not stored or saved here — presumably this
# relies on notebook display of the last expression; confirm.
from dataprep.eda import create_report
create_report(df, title='My Report')