Esempio n. 1
0
 def run(self):
     """Build every EDA report for ``self.df`` and write each one to an HTML file.

     Three reports come from ``plot``, ``plot_correlation`` and
     ``plot_missing``; a fourth is a ``ProfileReport``. All output file
     names are relative paths.
     """
     frame = self.df
     # (report builder, output file) pairs, processed in this order.
     saved_reports = (
         (plot, 'dataprep_plot.html'),
         (plot_correlation, 'dataprep_correlation.html'),
         (plot_missing, 'dataprep_missing.html'),
     )
     for build_report, filename in saved_reports:
         build_report(frame).save(filename)
     profile = ProfileReport(frame, title='Pandas Profiling Report')
     profile.to_file('pandas_profiling_report.html')
# Inspect where values are missing before imputing anything.
plot_missing(data)

# Impute missing BMI values with the column mean. The dict form restricts the
# fill to 'bmi'; a bare scalar would also overwrite NaNs in every other column
# with the BMI mean.
data = data.fillna({'bmi': data['bmi'].mean()})
data.info()

# Overview of every column, then a closer look at selected ones.
plot(data)

plot(data, 'stroke')

plot(data, 'smoking_status')

plot(data, 'bmi')

plot(data, 'heart_disease')

plot_correlation(data)

# Convert marital status, residence and gender into 0s and 1s.
# Any value other than the one named in each lambda maps to 0.
data['gender'] = data['gender'].apply(lambda x: 1 if x == 'Female' else 0)
data["Residence_type"] = data["Residence_type"].apply(lambda x: 1 if x == "Urban" else 0)
data["ever_married"] = data["ever_married"].apply(lambda x: 1 if x == "Yes" else 0)

# Remove the observations whose smoking_status is 'Unknown'.
data = data[data['smoking_status'] != 'Unknown']

# Notebook-style display expressions (no effect when run as a script).
data.head(12)

data

# Columns to one-hot encode: smoking_status and work_type.
data_dummies = data[['smoking_status', 'work_type']]
Esempio n. 3
0
    print('#columns:', df.shape[1])  # number of columns
    print('#rows:', df.shape[0])  # number of rows
    for r in df.columns:
        print(
            r,
            ':',  # column name
            df[r].unique().shape[0],  # number of unique elements in the column
            '| example:',
            df[r][0])  # example of the first element in the column


# Print the structural summary of the module-level DataFrame.
vp_summ(df)

import dataprep.eda as eda
eda.plot(df, 'country')
# NOTE(review): 'numeric-column' looks like a placeholder, not a real column
# name - confirm which column was intended.
eda.plot_correlation(df, 'numeric-column')
eda.plot_missing(df, 'country')

# Summarizing: number of distinct show_id values per country, largest first,
# followed by the total over all countries.
df.groupby('country')[['show_id']].nunique().sort_values(
    by='show_id', ascending=False)
df.groupby('country')[['show_id']].nunique().sum()
7280 - 923

# Plotting
import plotly.graph_objects as go

labels = ['All other movies', 'Indian movies']
values = [6357, 923]

# pull is given as a fraction of the pie radius
import numpy as np
import pandas as pd
from matplotlib.pyplot import xticks
import dataprep.eda as eda

broadband = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband.csv')
# Short column names replace the CSV headers; everything below must use them.
broadband.columns = ['ST', 'COUNTY_ID', 'COUNTY_NAME', 'AVAILABILITY', 'USAGE']

eda.create_report(broadband).show_browser()

# Use the renamed columns: the original CSV headers no longer exist on the
# frame once `columns` has been reassigned above.
eda.plot(broadband, 'ST', 'AVAILABILITY').show_browser()
eda.plot(broadband, 'ST', 'USAGE').show_browser()

eda.plot_correlation(broadband).show_browser()

broadband_zip = pd.read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-11/broadband_zip.csv')

broadband.loc[50:55,]  # there is a - in data
# Drop the rows where either measure is the '-' placeholder so that both
# columns can be converted to numbers.
broadband = broadband.drop(broadband[broadband['AVAILABILITY'] == '-'].index)
broadband = broadband.drop(broadband[broadband['USAGE'] == '-'].index)

broadband.info()
broadband['AVAILABILITY'] = pd.to_numeric(broadband['AVAILABILITY'])
broadband['USAGE'] = pd.to_numeric(broadband['USAGE'])

# Per-state averages. Select the two numeric columns before aggregating:
# taking the mean over the whole frame would include the text column
# COUNTY_NAME, which pandas >= 2.0 refuses to average.
br = broadband.groupby('ST')[['AVAILABILITY', 'USAGE']].mean().reset_index()

import plotly.express as px
fig = px.scatter(br, x='AVAILABILITY', 
            y='USAGE', color='ST', opacity=0,
Esempio n. 5
0
  arrowwidth=1,arrowcolor='orange')
# Label '5550, Chhukung Ri', positioned at (0.095, 0.05) with xref/yref='paper'.
# NOTE(review): `showarrow` is passed twice - False inside the dict and True as
# a keyword argument. Which one plotly honours is not visible here; confirm and
# keep only one.
fig.add_annotation(dict(xref='paper',yref='paper',x=0.095,y=0.05,xanchor='center',yanchor='top',
  font=dict(family='Arial', size=12, color='cornflowerblue'),showarrow=False,
  text='5550, Chhukung Ri'),
  showarrow=True,align='left',arrowhead=1,arrowsize=1,
  arrowwidth=1,arrowcolor='cornflowerblue')
# Label '8850, Everest', positioned at (0.94, 0.05) with xref/yref='paper';
# the same duplicated `showarrow` applies here.
fig.add_annotation(dict(xref='paper',yref='paper',x=0.94,y=0.05,xanchor='center',yanchor='top',
  font=dict(family='Arial', size=12, color='cornflowerblue'),showarrow=False,
  text='8850, Everest'),
  showarrow=True,align='left',arrowhead=1,arrowsize=1,
  arrowwidth=1,arrowcolor='cornflowerblue')
# Render the figure with all annotations applied.
fig.show()


###################### needs to be changed ##########################
# EDA
def vp_summ(df):
    """Print a quick structural summary of a DataFrame.

    Prints the number of columns and rows, then one line per column with
    the column name, its number of unique values and its first value as an
    example.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The summary is written to standard output.
    """
    print('#columns:', df.shape[1])  # number of columns
    print('#rows:', df.shape[0])  # number of rows
    for r in df.columns:
        column = df[r]
        # Positional access: `column[0]` is a label lookup and raises
        # KeyError whenever the index has no label 0 (e.g. after filtering
        # rows or setting a custom index). An empty frame has no first
        # value at all, so report None instead of failing.
        example = column.iloc[0] if len(column) else None
        print(r, ':',  # column name
              column.unique().shape[0],  # number of unique elements in the column
              '| example:', example)  # example of the first element in the column
# Print the column/row counts and per-column summary of the module-level `df`.
vp_summ(df)

import dataprep.eda as eda
# Whole-frame overview plot.
eda.plot(df)
# Correlations across the whole frame.
eda.plot_correlation(df) 
# Missing-value analysis focused on the 'country' column.
eda.plot_missing(df, 'country')

Esempio n. 6
0
 def correlation_plot(data):
     """Return the result of ``plot_correlation`` called on ``data``."""
     correlation_report = plot_correlation(data)
     return correlation_report
Esempio n. 7
0
 def bivariate_numerical_scatterplot(data, feature1, feature2):
     """Return ``plot_correlation`` for the pair ``feature1``/``feature2``.

     ``feature1`` is passed as ``x``, ``feature2`` as ``y``, and ``k`` is
     fixed at 5.
     """
     pair_options = {'x': feature1, 'y': feature2, 'k': 5}
     return plot_correlation(data, **pair_options)
# Average FICO score: midpoint of the low and high ends of the reported range.
df1['fico_score'] = df1['fico_range_low'].add(df1['fico_range_high']).div(2)

# Encode the credit grade letter as a number ('A' -> 1, 'B' -> 2, ...).
df1['grade'] = [ord(letter) - ord('A') + 1 for letter in df1['grade']]

# Natural log of annual income.
df1['log_income'] = np.log(df1['annual_inc'])

# Correlations
from dataprep.eda import plot_correlation

cor_features = [
    'emp_length',
    'annual_inc',
    'dti',
    'fico_score',
    'GDP',
    'Unemployment_rate',
    'hc_coverage',
    'education',
    'crime_rate',
]
df_cor = df1[cor_features]
plot_correlation(df_cor)

# Demographic statistics for the accepted dataset: mean, minimum and maximum
# crime rate, printed one per line in that order.
x = df1['crime_rate'].mean()
x2 = df1['crime_rate'].min()
x3 = df1['crime_rate'].max()
print(x, x2, x3, sep='\n')

"""Scaling the demographics for accepted dataset"""

# Scaler and the demographic columns it is meant for.
scaler = StandardScaler()
features = [
    'GDP',
    'Unemployment_rate',
    'hc_coverage',
    'education',
    'crime_rate',
]
# Work on a copy so the original frame keeps its uncapped values.
df2 = df.copy()
# Cap Sales and Discount to fixed lower/upper bounds. `clip` applies both
# bounds in one pass; two successive np.where calls that each read from the
# original `df` would let the second assignment overwrite the first and lose
# the lower cap.
df2["Sales"] = df["Sales"].clip(lower=7.8936, upper=572.949)
df2["Discount"] = df["Discount"].clip(lower=0.0, upper=0.4)

# Relationship of each capped column with Profit.
plot(df2,'Discount','Profit')

plot(df2,'Sales','Profit')



"""##Quantity and sales have high correlation with profit

---



---


"""

# Correlations across the capped frame, then the columns most correlated
# with Profit.
plot_correlation(df2)

plot_correlation(df2,'Profit')



# Load the example "tips" dataset that ships with plotly express.
import plotly.express as px
df = px.data.tips() 

# Exploratory data analysis
from dataprep.eda import plot
plot(df) # plots the distribution of each column and computes dataset statistics
plot(df,'tip') # plots the distribution of column x in various ways and computes column statistics
plot(df, 'tip', 'total_bill') # plots the relationship between columns x and y

# Correlation plots
from dataprep.eda import plot_correlation
plot_correlation(df) # plots correlation matrices (correlations between all pairs of columns)
plot_correlation(df, 'tip') # plots the columns most correlated with column x
plot_correlation(df, 'tip', 'total_bill') # plots the joint distribution of columns x and y and computes a regression line

# Missing-data plots
from dataprep.eda import plot_missing
plot_missing(df) # plots the amount and position of missing values, and their relationship between columns
plot_missing(df, 'tip') # plots the impact of the missing values in column x on all other columns
plot_missing(df, 'tip', 'total_bill') # plots the impact of the missing values in column x on column y in various ways

# Report
'''
Overview: detect the types of columns in a dataframe
Variables: variable type, unique values, distinct count, missing values
Quantile statistics like minimum value, Q1, median, Q3, maximum, range, interquartile range
Descriptive statistics like mean, mode, standard deviation, sum, median absolute deviation, coefficient of variation, kurtosis, skewness
Text analysis for length, sample and letter
Correlations: highlighting of highly correlated variables, Spearman, Pearson and Kendall matrices