Example #1
0
import pandas as pd

from pandas_profiling import ProfileReport

# Load the raw heart dataset; the first CSV column becomes the row index.
heart = pd.read_csv(
    '../data/raw/heart.csv',
    index_col=0,
)

# Build the profile in explorative mode and write it out as an HTML file.
report_options = {'title': 'Heart report', 'explorative': True}
profile = ProfileReport(heart, **report_options)
profile.to_file('../reports/heart_report.html')
Example #2
0
# First, as we normally do, we are going to import pandas and numpy
import numpy as np
import pandas as pd

# This is where we import ProfileReport, the class that generates the report
from pandas_profiling import ProfileReport

# Read the graduate-admission dataset: the admission probability of various
# students alongside their scores in different tests of knowledge.
df = pd.read_csv(
    "Admission_Predict_Ver1.1.csv",
    encoding='unicode_escape',
)

# Build the Pandas-Profiling report, asking the HTML style for full width.
html_options = {'style': {'full_width': True}}
profile = ProfileReport(df, title='Graduate Admission', html=html_options)

# Render the report inline in the notebook.
profile.to_notebook_iframe()

# Hint: for a large dataset, set the ``minimal`` named argument to True, e.g.
# profile = ProfileReport(large_dataset, minimal=True)

# Writing the report to a standalone HTML file is also recommended, so that
# Jupyter Notebook does not become laggy because of one very large cell.
profile.to_file(output_file="largeDatasetProfileReport.html")

# The report can be saved as JSON as well: only the file extension passed to
# to_file() changes.
profile.to_file(output_file="largeDatasetProfileReport.json")
# Continuous features: quantile-transform the 'cont' columns to a normal
# output distribution, pass the result through scale(), and keep the original
# column labels and row index.
cont_raw = X.filter(like='cont')
cont_gaussian = quantile_transform(
    X=cont_raw, output_distribution='normal', random_state=0)
contf = pd.DataFrame(
    data=scale(cont_gaussian), columns=cont_raw.columns, index=X.index)
del cont_raw, cont_gaussian

LOGGER.info('Make raw X and profile')
# X is rebuilt from the categorical and continuous frames, which are then
# released.
X = catf.join(contf)
del catf, contf
raw_profile_path = p.joinpath('reports', 'profiles', 'raw.html')
raw_profile = ProfileReport(
    df=X.join(y),
    title='Data with minimum preprocessing',
    minimal=True,
    progress_bar=False,
)
raw_profile.to_file(raw_profile_path)
del raw_profile_path, raw_profile

LOGGER.info('Create ranking')
ranking = Ranker().rank(X, y)

LOGGER.info('Figure 4: Volcano plot for features')
fig, ax = plt.subplots(figsize=A4_DIMS)
# Statistical significance on the x axis against association strength on y.
ax = sns.scatterplot(
    x=ranking['Statistical Significance'],
    y=ranking['Association Strength'],
    s=100,
)
ax.set(
    title='Figure 4: Volcano plot for features',
def _write_profile_report(frame, out_path):
    """Write a minimal pandas-profiling report for ``frame`` to ``out_path``.

    Samples, correlations, missing-value diagrams, duplicates and
    interactions are all passed as ``None`` and ``minimal=True`` is set.
    """
    ProfileReport(frame,
                  minimal=True,
                  samples=None,
                  reject_variables=False,
                  correlations=None,
                  missing_diagrams=None,
                  duplicates=None,
                  interactions=None).to_file(output_file=str(out_path))


def exec_sql_multiple_sites():
    """Run the quality-check query for several sites and write the results.

    Interactive entry point. Settings come either from a config file (via
    ``read_config``) or from prompts (via ``gather_input``). For every
    selected site the query is executed with ``exec_sql``, the rule-check
    outcome is collected, and the data is profiled either per site
    (``individual != 'N'``) or as one combined report (``individual == 'N'``).

    Everything is written to a timestamped sub-folder of the configured
    output folder:

    * the profile report(s) as ``<site>-<output_file>.html`` or
      ``<output_file>.html``,
    * ``Quality Check Outcome-All Sites-<date>.xlsx``,
    * ``ErrorLog-<date>.csv`` (pipe separated) when at least one site failed,
    * a copy of the input file and, when one was used, of the config file.

    Returns:
        None. Progress and errors are reported on stdout.

    Raises:
        SystemExit: if the settings both ask for all sites and list sites.
    """
    empty_df = pd.DataFrame(data={'': ['No data available.']})
    config_flag = input(
        '\nDo you have a config file you want to use? Please enter Y/N only:')
    sites = []
    if config_flag == 'Y':
        config_path = input(
            '\nPlease enter the full path to your config file: ')
        (usr, pwd, input_f, write_out, output_folder, output_file, all_sites,
         individual, sites, rule_folder) = read_config(config_path)
    else:
        confirm = 'N'
        while confirm != 'Y':
            (usr, pwd, input_f, write_out, output_folder, output_file,
             all_sites, individual, sites, rule_folder) = gather_input()
            confirm = input(
                '\nPlease confirm all your inputs are correct. (Enter Y/N only): '
            )

    if all_sites == 'Y' and sites:
        print(
            'You can\'t answer \'Y\' to run all sites and enter site lists at the same time. Conflicting values. Please review your config file and run again.'
        )
        # quit() is a site-module helper intended for the interactive shell;
        # raising SystemExit ends the program the same way without it.
        raise SystemExit

    location_df, site_lists, site_locations = get_all_sites(usr, pwd)
    run_stamp = dt.datetime.now().strftime("%y-%m-%d %H%M%S")
    output_folder = Path(''.join([output_folder, '/', run_stamp, '/']))
    output_folder.mkdir(parents=True, exist_ok=True)

    if all_sites == 'Y':
        sites = site_lists
    if all_sites == 'N' and not sites:
        # No explicit list was given: ask about every known site in turn.
        sites = []
        for s in site_lists:
            s_name = location_df[location_df['locationid'] ==
                                 s]['reportname'].item()
            confirm_site = input('Do you want to run for ' + s_name +
                                 '? (Y/N only): ')
            if confirm_site == 'Y':
                sites.append(s)
            else:
                print('Skipping ' + s_name)

    # Frames are collected in lists and concatenated once at the end;
    # DataFrame.append is deprecated (removed in pandas 2.0).
    site_frames = []
    outcome_frames = []
    error_rows = []
    for s in sites:
        # Fallback label so that a failed lookup can still be reported below.
        s_name = str(s)
        try:
            s = int(s)
            site_row = location_df[location_df['locationid'] == s]
            s_name = site_row['reportname'].item()
            region = site_row['region'].item()

            df, rule_df = exec_sql(usr, pwd, s, location_df, input_f,
                                   rule_folder)
            convert_date(df, [
                'lot_expiration', 'rx_date_received', 'order_date',
                'date_entered', 'expire_date', 'date_last_adjudicated',
                'fillcalendardate', 'fill_date', 'date_verified', 'delivered'
            ])
            collist = [col for col in df.columns if "_CHECK" in col]
            if len(rule_df) > 0:
                outcome_df = check_column(df, rule_df, collist)
                outcome_df['site'] = s_name
                outcome_df['region'] = region
            else:
                # Build the row explicitly: assigning scalars to an empty
                # DataFrame creates the columns but no rows, so the note
                # would never reach the outcome workbook.
                outcome_df = pd.DataFrame({
                    'site': [s_name],
                    'region': [region],
                    '': ['No rule engine defined'],
                })
            outcome_frames.append(outcome_df)

            # The *_CHECK helper columns are not part of the profiled data.
            df.drop(columns=collist, inplace=True)

            if len(df) == 0:
                # Copy so the shared placeholder frame is never mutated.
                df = empty_df.copy()
            df['site'] = s_name
            df['region'] = region
            if individual == 'N':
                site_frames.append(df)
            else:
                individual_file = s_name + '-' + output_file
                _write_profile_report(
                    df, output_folder / (individual_file + '.html'))

        except Exception as e:
            # Best effort per site: record the failure and carry on.
            print('\nError with ' + s_name +
                  '. Please check the error log at the end of the run.')
            # NOTE(review): the 'Locaton' spelling is kept because it is the
            # column name written to the error log; confirm before renaming.
            error_rows.append({'Locaton': s_name, 'ErrorMsg': str(e)})

    if individual == 'N':
        # One combined report named after the configured output file.
        final_df = pd.concat(site_frames) if site_frames else pd.DataFrame()
        _write_profile_report(final_df,
                              output_folder / (output_file + '.html'))

    today = str(dt.date.today())
    outcome_final_df = (pd.concat(outcome_frames)
                        if outcome_frames else pd.DataFrame())
    outcome_final_df.to_excel(
        str(output_folder /
            ('Quality Check Outcome-All Sites-' + today + '.xlsx')),
        index=False)

    if error_rows:
        pd.DataFrame(error_rows).to_csv(
            str(output_folder / ('ErrorLog-' + today + '.csv')),
            sep='|',
            index=False)

    shutil.copy(src=input_f, dst=str(output_folder))
    if config_flag == 'Y':
        shutil.copy(src=config_path, dst=str(output_folder))

    gc.collect()

    print('The run completed.')
Example #5
0
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import RFMScore
import Aggregating_RFMScore_CAH

#################################################################################
#######################       Import data      ##################################
#################################################################################
df = pd.read_excel(io='./data/HotelCustomersDataset.xlsx')

#################################################################################
#######################       Univariate analysis      ##########################
#################################################################################
# Profile the raw table and save the report as HTML.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_file(output_file="./results/univariate_report.html")

# RFM scoring is delegated to the imported RFMScore module.
rfm = RFMScore.create_RFMScore(df)

# Number of distinct customers (DocIDHash) per RFM score. The final expression
# is not assigned, so it only displays when the code is run interactively.
customers_per_score = rfm.groupby(['RFMScore'])['DocIDHash'].nunique()
customers_per_score.reset_index().rename(columns={'DocIDHash': 'Nb Customers'})

################################################################
# Create segments by aggregating RFM scores
################################################################
# First load the segment names per RFM score from the CSV file.
segments_names = pd.read_csv('./results/segments.csv')
rfm = pd.merge(rfm,
               segments_names,
Example #6
0
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
# Our World in Data COVID-19 table, read straight from GitHub.
# NOTE(review): error_bad_lines is deprecated since pandas 1.3 in favour of
# on_bad_lines -- revisit this call when upgrading pandas.
OWID_CSV_URL = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
df = pd.read_csv(OWID_CSV_URL, error_bad_lines=False)

# Explorative profile of the whole table, saved as a standalone web page.
report_settings = {'title': 'Pandas Profiling Report', 'explorative': True}
profile = ProfileReport(df, **report_settings)
profile.to_file("../webpages/your_report.html")
Example #7
0
# Repair individual misspellings inside the State values (substring replace).
pop["State"] = pop["State"].apply(lambda x: x.replace("?owa", "Iowa"))
pop["State"] = pop["State"].apply(lambda x: x.replace("Louisianaa", "Louisiana"))
pop["State"] = pop["State"].apply(lambda x: x.replace("C0lorado", "Colorado"))

# Strip surrounding whitespace from every object-dtype column; other columns
# are passed through unchanged.
pop = pop.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Replace whole State values whose first letter was lost (shown as '?') and
# the 'COlorado' casing slip with the correct state names (lists are paired
# position by position).
pop['State'] = pop['State'].replace(['?est Virginia','?ew York','?outh Dakota','?orth Carolina','?hode Island','?ew Jersey','?ew Mexico','?ew Hampshire','?orth Dakota','?outh Carolina','COlorado']
                                    ,['West Virginia','New York','South Dakota','North Carolina','Rhode Island','New Jersey','New Mexico','New Hampshire','North Dakota','South Carolina','Colorado'])

# Number of distinct State values left after the clean-up (displayed in a
# notebook; the value is not stored).
pop['State'].nunique()

# Notebook shell escape (not plain Python): installs pandas 0.25 before
# pandas_profiling is imported.
! pip install pandas==0.25

from pandas_profiling import ProfileReport

# Profile the cleaned table with default settings; the bare name displays the
# report in a notebook.
profile = ProfileReport(pop)
profile

# Descriptive statistics of the table (displayed only).
pop.describe()

# Summarize data by City using sum
pop.groupby('City').sum()

# Summarize data by State using the median
pop.groupby('State').median()

# Group data by state and summarize 2019_Population
grp_state = pop.groupby(["State"])[["2019_Population"]].describe()
grp_state

# Drop the outer column level so only the inner level of the describe()
# result remains as column labels.
grp_state.columns = grp_state.columns.droplevel(0)
def profilling(df):
    """Build an explorative profile of ``df`` and pass it to ``st_profile_report``."""
    st_profile_report(ProfileReport(df, explorative=True))
# +
# Load the video statistics export.
videos_csv = os.path.join('..', 'data', 'AIcia_videos_20210410.csv')
df_videos = pd.read_csv(videos_csv)

# '公開日時' (publication date/time) is parsed into datetimes.
df_videos['公開日時'] = pd.to_datetime(df_videos['公開日時'])

# '動画時間' (video duration) is parsed as a timedelta and its ``seconds``
# field is stored in '動画時間_s'; the original column is then dropped.
durations = pd.to_timedelta(df_videos['動画時間'])
df_videos['動画時間_s'] = durations.apply(lambda td: td.seconds)
df_videos = df_videos.drop(['動画時間'], axis=1)

df_videos.head()
# -

# Full-width profiling report; the bare name below displays it in a notebook.
full_width_html = {'style': {'full_width': True}}
profile = ProfileReport(df_videos,
                        title='Pandas Profiling Report',
                        html=full_width_html)

profile

# # Hierarchical Bayesian Modeling

n_videos = len(df_videos)

# +
# define model and sample

with pm.Model() as model:
    # prior to parameters
    alpha_plus = pm.Normal('alpha_plus', mu=-3, sd=2)
# Not every dataset ships with a file index, so one is built here by walking
# everything below ``p`` and keeping regular files only.
files = [entry for entry in p.rglob("*") if entry.is_file()]
# PP only accepts absolute paths, so each entry is made absolute and turned
# into a plain string.
series = pd.Series(files, name="files").apply(lambda f: str(f.absolute()))

df = pd.DataFrame(series)

# Report configuration
report_options = dict(
    title="Example showcasing EXIF data (Kaggle 5 Celebrity Faces Dataset)",
    # Sections outside the focus of this example are disabled
    duplicates=None,
    correlations=None,
    samples=None,
    missing_diagrams=None,
    # File and image analysis is off by default because it is relatively
    # expensive; explorative mode enables it
    explorative=True,
)
profile = ProfileReport(df, **report_options)

# The report can also be configured after construction
variable_descriptions = {
    "files": "The 5 Celebrity Faces Dataset found on Kaggle (dansbecker/5-celebrity-faces-dataset)."
}
profile.set_variable("variables.descriptions", variable_descriptions)

# Save the report
profile.to_file("celebrity-faces.html")
import pandas as pd
from pandas_profiling import ProfileReport

# Every report of this dataset is written below this directory.
REPORT_DIR = "../webpages/COVID-19-in-Spain-Dataset/"

### File 1/9: nacional_covid19.csv ###

PATH_TO_FILE = "nacional_covid19.csv"
FINAL_FILE_NAME = "nacional_covid19"
FINAL_HTML_PATH = REPORT_DIR + FINAL_FILE_NAME + ".html"

# NOTE(review): error_bad_lines is deprecated since pandas 1.3 in favour of
# on_bad_lines -- revisit these calls when upgrading pandas.
df1 = pd.read_csv(PATH_TO_FILE, error_bad_lines=False)
# DataFrame.info() prints its summary itself and returns None, so it is not
# wrapped in print() (that added a stray "None" line to the output).
df1.info()
profile = ProfileReport(df1, title=FINAL_FILE_NAME, explorative=True)
profile.to_file(FINAL_HTML_PATH)

### File 2/9: nacional_covid19_rango_edad.csv ###

PATH_TO_FILE = "nacional_covid19_rango_edad.csv"
FINAL_FILE_NAME = "nacional_covid19_rango_edad"
FINAL_HTML_PATH = REPORT_DIR + FINAL_FILE_NAME + ".html"

df2 = pd.read_csv(PATH_TO_FILE, error_bad_lines=False)
df2.info()
profile = ProfileReport(df2, title=FINAL_FILE_NAME, explorative=True)
profile.to_file(FINAL_HTML_PATH)

### File 3/9: ccaa_camas_uci_2017.csv ###

PATH_TO_FILE = "ccaa_camas_uci_2017.csv"
FINAL_FILE_NAME = "ccaa_camas_uci_2017"
FINAL_HTML_PATH = REPORT_DIR + FINAL_FILE_NAME + ".html"
Example #12
0

# ### Read train and test datasets

# In[2]:


# Absolute, machine-specific locations of the three dataset files.
sample_submission = pd.read_csv(
    "C:/Users/User/Desktop/ML Projects/housing_prices/dataset/sample_submission.csv"
)
test = pd.read_csv(
    "C:/Users/User/Desktop/ML Projects/housing_prices/dataset/test.csv"
)
train = pd.read_csv(
    "C:/Users/User/Desktop/ML Projects/housing_prices/dataset/train.csv"
)


# In[3]:


# Profile the training set; the bare name displays the report in a notebook.
train_profile = ProfileReport(train)
train_profile


# In[4]:


# Size of each dataset
train_rows, train_cols = train.shape
test_rows, test_cols = test.shape
print("Train set has {} rows and {} columns.".format(train_rows, train_cols))
print("Test set has {} rows and {} columns.".format(test_rows, test_cols))

# Summary statistics of the training set, transposed
train_stats = train.describe().T
# Data type of each feature (displayed only)
train.dtypes
# Display no. of numerical and categorical data types
Example #13
0
import pandas as pd 
from pandas_profiling import ProfileReport

# Profile the assignment data with default settings and save it as HTML.
data = pd.read_csv(filepath_or_buffer='Assignment2.csv')
profile = ProfileReport(df=data)
profile.to_file(output_file='report.html')
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold


## Loading the model file
# NOTE(security): unpickling can execute arbitrary code embedded in the file;
# only load 'Best_classifier.pckl' when it comes from a trusted source.
# The context manager closes the file handle, which the previous
# pickle.load(open(...)) form left open.
with open('Best_classifier.pckl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Read the test-set file
data_read_test = pd.read_csv("test_set.csv")
# Quick look at the data (bare expressions only display in a notebook)
data_read_test.head()
# Drop the first column of the file
data_read_test = data_read_test.drop(data_read_test.columns[0], axis='columns')
data_read_test.info()
data_read_test.describe()

# Optional profiling
profile = ProfileReport(data_read_test, title="Pandas Profiling Report")
# profile

# Drop column 'X32'
data_read_test = data_read_test.drop('X32', axis='columns')

# Shape of the dataframe after dropping columns
print("shape of the data:", data_read_test.shape)

# No numerical encoding is needed as all the columns are numerical
X = data_read_test

# Check the variance again: VarianceThreshold with its default threshold
sel_variance_threshold = VarianceThreshold()
X_train_remove_variance = sel_variance_threshold.fit_transform(X)
print(X_train_remove_variance.shape)
Example #15
0
    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Unhashable
    df["unhashable"] = [[1]] * df.shape[0]

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    duplicates_to_add = pd.DataFrame(df.iloc[0:10].copy())

    df = df.append(duplicates_to_add, ignore_index=True)

    logo_string = ""

    profile = ProfileReport(
        df,
        title="NASA Meteorites",
        html={"style": {"logo": logo_string}},
        correlations={"cramers": {"calculate": False}},
        explorative=True,
    )
    profile.to_file(Path("./meteorites_report.html"))
Example #16
0
def test_repr(data):
    """The ``repr`` of a ProfileReport built from ``data`` is an empty string."""
    rendered = repr(ProfileReport(data))
    assert rendered == ""
Example #17
0
    """Display a profile report.

    Parameters
    ----------
    profile_report: pandas_profiling.ProfileReport
        The profile report instance to display.
    key: str or None
        An optional key that uniquely identifies this component. If this is
        None, and the component's arguments are changed, the component will
        be re-mounted in the Streamlit frontend and lose its current state.

    """
    with st.spinner("Generating profile report..."):
        _pandas_profiling(html=profile_report.to_html(), key=key)


if not _RELEASE:
    # Development-mode demo: profile a frame of random numbers and show it
    # through the component.
    import numpy as np
    import pandas as pd
    from pandas_profiling import ProfileReport

    column_names = ['a', 'b', 'c', 'd', 'e']
    random_values = np.random.rand(100, 5)
    df = pd.DataFrame(random_values, columns=column_names)

    pr = ProfileReport(df, explorative=True)
    st.title("Pandas Profiling in Streamlit")
    st.write(df)
    st_profile_report(pr)
Example #18
0
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# Source CSV (Our World in Data COVID-19 dataset) and the derived report path.
PATH_TO_FILE = "https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv"
FINAL_FILE_NAME = "owiddataset"
FINAL_HTML_PATH = "../webpages/" + FINAL_FILE_NAME + ".html"

# NOTE(review): error_bad_lines is deprecated since pandas 1.3 in favour of
# on_bad_lines -- revisit this call when upgrading pandas.
df = pd.read_csv(PATH_TO_FILE, error_bad_lines=False)
profile = ProfileReport(df, title=FINAL_FILE_NAME, explorative=True)
# Write to FINAL_HTML_PATH instead of a second hard-coded copy of the same
# path, so renaming FINAL_FILE_NAME keeps title and file name in step.
profile.to_file(FINAL_HTML_PATH)
Example #19
0
# Load the incident-services workbook (absolute, machine-specific path).
Incidents_service = pd.read_excel(
    "C:\\Users\\dell\\Desktop\\P31_Project\\Extras\\DataSet\\Incident_services.xlsx"
)
# Show every column when a frame is displayed
pd.options.display.max_columns = None
# First look at the data; these bare expressions only display in a notebook.
Incidents_service.head(10)
Incidents_service.dtypes
Incidents_service.shape
Incidents_service.isnull().sum()
Incidents_service.describe()

# Import the profiling package
from pandas_profiling import ProfileReport

# Build the explorative profile report
profile = ProfileReport(
    Incidents_service,
    title='Pandas Profiling Report',
    explorative=True,
)

# Save the report as an HTML file
profile.to_file(output_file="pandas_profiling_P31.html")


def invalid(x):
    """Return the fraction of entries in ``x`` that equal "?"."""
    return sum(x == "?") / len(x)


Incidents_service.apply(invalid)
# "problem_id" and "change request" have the highest share of missing values,
# which are represented by '?'. Hence, those columns can be eliminated.

# Column 'ID_status' has some mis-interpreted values with the entry '-100'
Incidents_service.ID_status.value_counts()

# Those entries are dropped, as that won't affect the model
keep_rows = Incidents_service.ID_status != -100
Incidents_service = Incidents_service[keep_rows]
Example #20
0
def show_analysis(request):
    """Profile the production workbook and render the resulting page.

    Reads 'production/data.xls', writes a latin-1 encoded CSV copy to
    'data.csv', saves a minimal profile report to
    'Templates/analysis_output.html' and returns the rendered
    "analysis_output.html" template for ``request``.
    """
    frame = pd.read_excel('production/data.xls')
    frame.to_csv('data.csv', encoding='latin-1')
    ProfileReport(frame, minimal=True).to_file(
        output_file='Templates/analysis_output.html')
    return render(request, "analysis_output.html")
Example #21
0
def run(st, data):
    """Render the interactive analysis menu and the selected analysis.

    Everything is drawn inside an expanded "Menu" expander. The user picks one
    of four analyses ("Data", "Visualization", "Statistics",
    "Data Profiling") and may first narrow ``data`` down with per-column value
    filters.

    Args:
        st: The Streamlit module, or any object exposing the same widget
            functions used here (``beta_expander``, ``radio``, ...).
        data: ``pandas.DataFrame`` to analyse. It is not modified in place;
            the optional filters rebind a local, filtered frame.

    Returns:
        None. All output goes through ``st``.

    Note:
        The "Customized" chart option passes the text typed by the user to
        ``exec``. That is arbitrary code execution and is only acceptable
        when every user of the app is trusted.
    """
    expander = st.beta_expander("Menu", expanded=True)
    with expander:
        ana_choice = st.radio(
            "Analysis",
            ["Data", "Visualization", "Statistics", "Data Profiling"])
        filters = st.checkbox('Add Filters')
        if filters:
            st.info("Select column and values from below")
            filtered_cols = st.multiselect("Select columns to filter",
                                           data.columns.tolist())
            filtered_sets = []
            # Lay the value pickers out in rows of at most three widgets.
            # A layout failure now surfaces directly instead of being
            # swallowed and resurfacing as an IndexError further down.
            for start in range(0, len(filtered_cols), 3):
                row_cols = filtered_cols[start:start + 3]
                layout = st.beta_columns(len(row_cols))
                for slot, col_name in zip(layout, row_cols):
                    filtered_sets.append(
                        slot.multiselect(col_name,
                                         data[col_name].unique().tolist()))

            # Now filtering the data; a column with no value selected is left
            # unfiltered.
            for col_name, filter_value in zip(filtered_cols, filtered_sets):
                if len(filter_value) > 0:
                    data = data[data[col_name].isin(filter_value)]

        if ana_choice == 'Data':
            data_options = st.selectbox(
                "", ["View Records", "Data Correlation", "Pivot"])
            if data_options == "View Records":
                c1, c2 = st.beta_columns(2)
                top_bottom_options = c1.radio("Records", ["Top", "Bottom"])
                num_rec = c2.number_input("No. of Records:",
                                          min_value=0,
                                          max_value=100,
                                          step=1,
                                          value=10)
                if top_bottom_options == 'Top':
                    st.dataframe(data.head(num_rec))
                else:
                    st.dataframe(data.tail(num_rec))
            elif data_options == "Data Correlation":
                select_columns = st.multiselect("Select Columns",
                                                data.columns.tolist())
                corr_view = st.radio("Correlation View", ["Table", "Chart"])
                # No selection means "use every column".
                corr_source = data[select_columns] if select_columns else data
                corr = corr_source.corr()
                if corr_view == 'Table':
                    st.dataframe(corr)
                else:
                    st.write(sns.heatmap(corr, annot=True))
                    st.pyplot()
            elif data_options == 'Pivot':
                dimensions = st.multiselect("Select X axis columns",
                                            data.columns.tolist())
                measures = st.multiselect("Select Y axis columns",
                                          data.columns.tolist())
                numeric_cols = st.multiselect("Aggregation columns",
                                              data.columns.tolist())
                aggregation_operations = st.selectbox(
                    "Aggregation Operation",
                    ['sum', 'average', 'median', 'count'])
                button = st.button("Execute!!!")
                if button and len(numeric_cols) > 0:
                    aggregators = {
                        'sum': np.sum,
                        'average': np.mean,
                        'median': np.median,
                        'count': np.count_nonzero,
                    }
                    pivot_table = pd.pivot_table(
                        data,
                        values=numeric_cols,
                        index=measures,
                        columns=dimensions,
                        aggfunc=aggregators[aggregation_operations])
                    st.dataframe(pivot_table)
        elif ana_choice == "Visualization":
            chart_options = st.selectbox(
                'Charts', ['Bar', 'Line', 'Heatmap', 'Distplot', 'Customized'])
            if chart_options == 'Bar':
                x_col = st.selectbox('X', data.columns.tolist())
                y_col = st.selectbox('Y', data.columns.tolist())
                hue_color = st.checkbox("Add color column")
                direction = st.radio('chart direction',
                                     ['vertical', 'horizontal'])
                if hue_color:
                    hue_col = st.selectbox('hue', data.columns.tolist())
                button = st.button("Execute!!!")
                if button:
                    plot_kwargs = {
                        'x': x_col,
                        'y': y_col,
                        'data': data,
                        'orient': 'v' if direction == 'vertical' else 'h',
                    }
                    # hue is only passed when the checkbox is ticked and a
                    # column was actually chosen.
                    if hue_color and hue_col:
                        plot_kwargs['hue'] = hue_col
                    st.write(sns.barplot(**plot_kwargs))
                    st.pyplot()
            elif chart_options == 'Line':
                x_col = st.selectbox('X', data.columns.tolist())
                y_col = st.selectbox('Y', data.columns.tolist())
                hue_color = st.checkbox("Add color column")
                if hue_color:
                    hue_col = st.selectbox('hue', data.columns.tolist())
                button = st.button("Execute!!!")
                if button:
                    plot_kwargs = {'x': x_col, 'y': y_col, 'data': data}
                    if hue_color and hue_col:
                        plot_kwargs['hue'] = hue_col
                    st.write(sns.lineplot(**plot_kwargs))
                    st.pyplot()
            elif chart_options == 'Heatmap':
                select_columns = st.multiselect("Select Columns",
                                                data.columns.tolist())
                button = st.button("Execute!!!")
                if button:
                    heat_source = (data[select_columns]
                                   if select_columns else data)
                    st.write(sns.heatmap(heat_source, annot=True))
                    st.pyplot()
            elif chart_options == 'Distplot':
                x_col = st.selectbox('X', data.columns.tolist())
                col = st.selectbox('column', data.columns.tolist())
                row = st.selectbox('row', data.columns.tolist())
                button = st.button("Execute!!!")
                if button:
                    st.write(
                        sns.displot(
                            data,
                            x=x_col,
                            col=col,
                            row=row,
                            binwidth=3,
                            height=3,
                            facet_kws=dict(margin_titles=True),
                        ))
                    st.pyplot()
            elif chart_options == 'Customized':
                code_area = st.text_area(
                    """Enter your chart script, Return result to value.
                e.g. 
                a = 3
                b = 4
                value = a + b!!!, Don't enter data parameter !!!""")

                button = st.button("Execute!!!")
                if button:
                    # SECURITY: this executes whatever the user typed, with
                    # ``data`` exposed as a global. Keep this option away
                    # from untrusted users.
                    loc = {}
                    exec(code_area, {'data': data}, loc)
                    # The script hands its result back through ``value``.
                    return_workaround = loc['value']
                    st.write(return_workaround)
                    st.pyplot()
        elif ana_choice == 'Statistics':
            # Only 'Value Count' and 'Normality Test' are handled below; the
            # remaining categories currently render nothing.
            test_selection = st.selectbox('Category', [
                'Value Count', 'Normality Test', 'Correlation Test',
                'Stationary Test', 'Parametric Test', 'Non Parametric Test'
            ])
            statistics = stats.Statistics(data)
            if test_selection == 'Value Count':
                select_columns = st.selectbox("Select Columns",
                                              data.columns.tolist())
                mode = st.radio('Value Counts', ['Table', 'Chart'])
                value_counts = statistics.__get__stats__(select_columns)
                if mode == 'Table':
                    st.dataframe(value_counts)
                else:
                    # The chart shows the first 20 entries only.
                    st.write(value_counts[:20].plot(kind='barh'))
                    st.pyplot()
            elif test_selection == 'Normality Test':
                st.write("""
                        Tests whether a data sample has a Gaussian distribution. \n
                        H0: the sample has a Gaussian distribution. \n
                        H1: the sample does not have a Gaussian distribution"""
                         )

                select_test = st.selectbox(
                    'Tests', ['ShapiroWilk', 'DAgostino', 'AndersonDarling'])
                col = st.selectbox('Select Column', data.columns.tolist())
                text_option = st.checkbox('Text')
                chart_option = st.checkbox('Chart')
                if text_option:
                    t, p = statistics.normality_tests(data[col],
                                                      test_type=select_test)
                    st.write('#### ' + t + " (" + str(p) + ")")
                if chart_option:
                    st.write(sns.kdeplot(x=col, data=data))
                    st.pyplot()

        elif ana_choice == 'Data Profiling':
            st.markdown("""
            ##### The Data Profiling is done automatically using Pandas Profiling tool.\n \n \n \n
            """)
            limited_records = st.checkbox("Execute on Limited Records!!!")
            select_columns = st.multiselect("Select Columns",
                                            data.columns.tolist())
            if len(select_columns) == 0:
                cols = data.columns.tolist()
            else:
                cols = select_columns
            if limited_records:
                num_rec = st.number_input("No. of Records:",
                                          min_value=0,
                                          max_value=1000000,
                                          step=1,
                                          value=100)
            else:
                num_rec = len(data)
            execute_profiling = st.button('Execute!!!')
            if execute_profiling:
                st.title(f"Pandas Profiling on {num_rec} records")

                # head() takes the first num_rec rows by position. The former
                # .loc[:num_rec, :] sliced by index label and is inclusive,
                # so it returned num_rec + 1 rows on a default index and an
                # arbitrary number of rows once the filters had removed
                # some labels.
                report = ProfileReport(data[cols].head(int(num_rec)),
                                       explorative=True)
                st.write(data)
                st_profile_report(report)
Example #22
0
            "occupation",
            "relationship",
            "race",
            "sex",
            "capital-gain",
            "capital-loss",
            "hours-per-week",
            "native-country",
        ],
    )

    # Prepare missing values
    df = df.replace("\\?", np.nan, regex=True)

    # Initialize the report
    profile = ProfileReport(df, title="Census Dataset", explorative=True)

    # show column definition
    definitions = json.load(open(f"census_column_definition.json"))
    profile.set_variable(
        "dataset",
        {
            "description":
            'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. Extraction was done by Barry Becker from the 1994 Census database. A set of reasonably clean records was extracted using the following conditions: ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)). Prediction task is to determine whether a person makes over 50K a year.',
            "copyright_year": "1996",
            "author": "Ronny Kohavi and Barry Becker",
            "creator": "Barry Becker",
            "url": "https://archive.ics.uci.edu/ml/datasets/adult",
        },
    )
    profile.set_variable("variables.descriptions", definitions)
def main():
	"""Render the Echidna AI Streamlit app.

	Draws the header banner and logo, then shows one page chosen from the
	sidebar menu: Home, Login (plots / prediction / LIME interpretation),
	SignUp, Book An Appointment, Profile Report, About, or Privacy Policy.

	Relies on names defined elsewhere in the file (``components``, ``Image``,
	``create_usertable``, ``generate_hashes``, ``verify_hashes``,
	``login_user``, ``add_userdata``, ``load_model``, ``get_value``,
	``get_fvalue``, ``gender_dict``, ``feature_dict``,
	``prescriptive_message_temp``, ``footer_html``, ``st_profile_report``).
	"""
	html_temp = """
		<div style="background-color:navy;padding:10px;border-radius:10px">
		<h1 style="color:white;text-align:center;">Echidna AI</h1>
		<h5 style="color:white;text-align:center;">Diabetes Prediction WEB APP</h5>
		</div>
		"""

	components.html(html_temp)
	# NOTE(review): absolute, machine-specific path; the app only starts where this file exists.
	image = Image.open('C:/Users/Adila/Documents/hep_app/LOGO.png')
	st.image(image, use_column_width=True)

	menu = ["Home", "Login", "SignUp", "Book An Appointment", "Profile Report", "About", "Privacy Policy"]
	submenu = ["Plot", "Prediction",]

	choice = st.sidebar.selectbox("Menu", menu)
	if choice == "Home":
		st.subheader("---------------WELCOME TO THE DIABETES PREDICTION APP-----------------")

		html_temp2 = """
		<div style="background-color:navy;padding:3px;border-radius:10px">
		<h1 style="color:white;text-align:center;">How to Login?</h1>
		<h5 style="color:white;text-align:center;">press the arrow on the top left corner and choose the LOGIN from menu to get started</h5>
		</div>
		"""
		components.html(html_temp2)

		html_temp3 = """
		<div style="background-color:navy;padding:3px;border-radius:10px">
		<h1 style="color:white;text-align:center;">How to Sign Up?</h1>
		<h5 style="color:white;text-align:center;">press the arrow on the top left corner and choose the SIGN UP from menu to get started</h5>
		</div>
		"""
		components.html(html_temp3)
		st.title("Brief explanation on Diabetes Mellitus")
		st.subheader("------Diabetes mellitus (DM), commonly known as diabetes, is a group of metabolic disorders characterized by a high blood sugar level over a prolonged period. Symptoms often include frequent urination, increased thirst, and increased appetite. If left untreated, diabetes can cause many complications. Acute complications can include diabetic ketoacidosis, hyperosmolar hyperglycemic state, or death. Serious long-term complications include cardiovascular disease, stroke, chronic kidney disease, foot ulcers, damage to the nerves, damage to the eyes and cognitive impairment.")
		st.subheader("------Diabetes is due to either the pancreas not producing enough insulin, or the cells of the body not responding properly to the insulin produced. There are three main types of diabetes mellitus:")
		st.subheader("------Type 1 diabetes results from the pancreas's failure to produce enough insulin due to loss of beta cells. This form was previously referred to as insulin-dependent diabetes mellitus (IDDM) or  juvenile diabetes . The loss of beta cells is caused by an autoimmune response. The cause of this autoimmune response is unknown.")
		st.subheader("------Type 2 diabetes begins with insulin resistance, a condition in which cells fail to respond to insulin properly. As the disease progresses, a lack of insulin may also develop. This form was previously referred to as non-insulin-dependent diabetes mellitus  (NIDDM) or  adult-onset diabetes .  The most common cause is a combination of excessive body weight and insufficient exercise.")
		st.subheader("------Gestational diabetes is the third main form and occurs when pregnant women without a previous history of diabetes develop high blood sugar levels.")
		st.subheader("------Type 1 diabetes must be managed with insulin injections. Prevention and treatment of type 2 diabetes involves maintaining a healthy diet, regular physical exercise, a normal body weight, and avoiding use of tobacco. Type 2 diabetes may be treated with medications such as insulin sensitizers with or without insulin. Control of blood pressure and maintaining proper foot and eye care are important for people with the disease. Insulin and some oral medications can cause low blood sugar.  Weight loss surgery in those with obesity is sometimes an effective measure in those with type 2 diabetes. Gestational diabetes usually resolves after the birth of the baby.")
		st.subheader("------As of 2019, an estimated 463 million people had diabetes worldwide (8.8% of the adult population), with type 2 diabetes making up about 90% of the cases.  Rates are similar in women and men. Trends suggest that rates will continue to rise. Diabetes at least doubles a person's risk of early death. In 2019, diabetes resulted in approximately 4.2 million deaths. It is the 7th leading cause of death globally. The global economic cost of diabetes related health expenditure in 2017 was estimated at US$727 billion. In the United States, diabetes cost nearly US$327 billion in 2017. Average medical expenditures among people with diabetes are about 2.3 times higher.")

	elif choice == "Login":
		username = st.sidebar.text_input("Username")
		password = st.sidebar.text_input("Password",type='password')
		if st.sidebar.checkbox("Login"):
			create_usertable()
			# NOTE(review): the hash is generated from the same password it is
			# then verified against, so the check presumably always succeeds and
			# only login_user decides the outcome - confirm against the helpers.
			hashed_pswd = generate_hashes(password)
			result = login_user(username,verify_hashes(password, hashed_pswd))
			if result:
				st.success("Welcome {}".format(username))

				activity = st.selectbox("Activity", submenu)
				if activity == "Plot":
					st.subheader("Data Vis Plot")
					df = pd.read_csv("data/clean_hepatitis_dataset.csv")
					st.dataframe(df)

					# Class balance of the dataset.
					df['class'].value_counts().plot(kind='bar')
					st.pyplot()

					# Frequency distribution plot.
					freq_df = pd.read_csv("data/freq_df_hepatitis_dataset.csv")
					st.bar_chart(freq_df['count'])

					if st.checkbox("Area Chart"):
						all_columns = df.columns.to_list()
						feat_choices = st.multiselect("Choose a Feature",all_columns)
						new_df = df[feat_choices]
						st.area_chart(new_df)

				elif activity == "Prediction":
					st.subheader("Predictive Analytics")

					age = st.number_input("Age",7,80)
					sex = st.radio("Sex",tuple(gender_dict.keys()))
					steroid = st.radio("Do You Take Steroids?",tuple(feature_dict.keys()))
					antivirals = st.radio("Do You Take Antivirals?",tuple(feature_dict.keys()))
					fatigue = st.radio("Do You Have Fatigue",tuple(feature_dict.keys()))
					spiders = st.radio("Presence of Spider Naeve",tuple(feature_dict.keys()))
					ascites = st.selectbox("Ascities",tuple(feature_dict.keys()))
					varices = st.selectbox("Presence of Varices",tuple(feature_dict.keys()))
					bilirubin = st.number_input("bilirubin Content",0.0,8.0)
					alk_phosphate = st.number_input("Alkaline Phosphate Content",0.0,296.0)
					sgot = st.number_input("Sgot",0.0,648.0)
					albumin = st.number_input("Albumin",0.0,6.4)
					protime = st.number_input("Prothrombin Time",0.0,100.0)
					histology = st.selectbox("Histology",tuple(feature_dict.keys()))
					# Encoded inputs, in the column order the models are queried with.
					feature_list = [
						age,
						get_value(sex,gender_dict),
						get_fvalue(steroid),
						get_fvalue(antivirals),
						get_fvalue(fatigue),
						get_fvalue(spiders),
						get_fvalue(ascites),
						get_fvalue(varices),
						bilirubin,
						alk_phosphate,
						sgot,
						albumin,
						int(protime),
						get_fvalue(histology),
					]

					st.write(feature_list)
					pretty_result = {"age":age,"sex":sex,"steroid":steroid,"antivirals":antivirals,"fatigue":fatigue,"spiders":spiders,"ascites":ascites,"varices":varices,"bilirubin":bilirubin,"alk_phosphate":alk_phosphate,"sgot":sgot,"albumin":albumin,"protime":protime,"histolog":histology}
					st.json(pretty_result)
					single_sample = np.array(feature_list).reshape(1,-1)

					# One lookup table shared by "Predict" and "Interpret" instead
					# of two copies of the same if/elif/else chain.
					model_files = {
						"KNN": "models/knn_hepB_model.pkl",
						"DecisionTree": "models/decision_tree_clf_hepB_model.pkl",
						"LR": "models/logistic_regression_hepB_model.pkl",
					}
					model_choice = st.selectbox("Select Model", ["KNN", "DecisionTree", "LR"])
					if st.button("Predict"):
						loaded_model = load_model(model_files[model_choice])
						prediction = loaded_model.predict(single_sample)
						pred_prob = loaded_model.predict_proba(single_sample)

						if prediction == 1:
							st.warning("Patient has a risk to have Diabetes")
							pred_probability_score = {"Dibetes":pred_prob[0][0]*100,"No Diabetes":pred_prob[0][1]*100}
							st.subheader("Prediction Probability Score using Neural network with {}".format(model_choice))
							st.json(pred_probability_score)
							st.subheader("Prescriptive Analytics")
							st.markdown(prescriptive_message_temp,unsafe_allow_html=True)

						else:
							st.success("Patient don't have a risk of having Diabetes")
							pred_probability_score = {"Has a risk":pred_prob[0][0]*100,"No Risk":pred_prob[0][1]*100}
							st.subheader("Prediction Probability Score using Neural network with {}".format(model_choice))
							st.json(pred_probability_score)

					if st.checkbox("Interpret"):
						loaded_model = load_model(model_files[model_choice])

						# The training data gives LIME the feature distributions
						# it perturbs around the single sample.
						df = pd.read_csv("data/clean_hepatitis_dataset.csv")
						x = df[['age', 'sex', 'steroid', 'antivirals','fatigue','spiders', 'ascites','varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime','histology']]
						feature_names = ['age', 'sex', 'steroid', 'antivirals','fatigue','spiders', 'ascites','varices', 'bilirubin', 'alk_phosphate', 'sgot', 'albumin', 'protime','histology']
						class_names = ['Die(1)','Live(2)']
						explainer = lime.lime_tabular.LimeTabularExplainer(x.values,feature_names=feature_names, class_names=class_names,discretize_continuous=True)
						# Explain the prediction for the values entered above.
						exp = explainer.explain_instance(np.array(feature_list), loaded_model.predict_proba,num_features=13, top_labels=13)
						exp.show_in_notebook(show_table=True, show_all=True)
						st.write(exp.as_list())
						new_exp = exp.as_list()
						label_limits = [i[0] for i in new_exp]
						label_scores = [i[1] for i in new_exp]
						plt.barh(label_limits,label_scores)
						st.pyplot()
						plt.figure(figsize=(20,10))
						fig = exp.as_pyplot_figure()
						st.pyplot()

			else:
				st.warning("Incorrect Username or Password")
	elif choice == "SignUp":
		new_username = st.text_input("User name")
		new_password = st.text_input("Password", type='password')

		confirm_password = st.text_input("Confirm Password",type='password')
		passwords_match = new_password == confirm_password
		if passwords_match:
			st.success("Password Confirmed")
		else:
			st.warning("Passwords not the same")

		if st.button("Submit"):
			# Only create the account once the form is valid; previously the
			# account was stored even when the confirmation did not match.
			if not new_username or not new_password:
				st.error("Username and password must not be empty")
			elif not passwords_match:
				st.error("Passwords not the same")
			else:
				create_usertable()
				hashed_new_password = generate_hashes(new_password)
				add_userdata(new_username,hashed_new_password)
				st.success("You have created a new account")
				st.info("Login to get started")

	elif choice == "Profile Report":
		st.title("What is Profile Report?")
		st.subheader("------This technology can be used to upload datasets that you may have as analyst or if you are a doctor who wants to get a clear idea about percentage of patients who has this or that problem. And then the technology itself will analyze it.")
		st.title("How to start to analyze the data?")
		st.subheader("------To analyze the data you just simply need to upload a CSV file")
		st.title("What kind of files this technology accepts?")
		st.subheader("------Basically a CSV files")
		st.title("How is it analyzing the data?")
		st.subheader("------Mainly, technology itself uses a neural networks to analyze the data, and then it will represent the data as a charts")

		# Set before creating the uploader so the option is already in effect
		# when the widget is rendered.
		st.set_option('deprecation.showfileUploaderEncoding', False)
		data_file = st.file_uploader("UPLOAD CSV",type=['csv'])
		if data_file is not None:
			df = pd.read_csv(data_file)
			st.dataframe(df.head())
			profile = ProfileReport(df)
			st_profile_report(profile)

	elif choice == "Book An Appointment":
		st.title("Book An Appointments")
		st.title("Integration with Nilai medical center website")
		st.subheader("------Developer integrated this WebApp with existed website to make sure that patients can book an appointment to a real medical")
		components.iframe('https://nmc.encoremed.io/',width=700,height=2000)

	elif choice == "About":
		st.title("About App")
		st.title("F.A.Q.")
		st.title("What is Echidna AI?")
		st.subheader("------Basically, its an a WEB APP that can help people to predict Diabetes")
		st.title("What kind of functions do Echidna Have?")
		st.subheader("------The main purpose of the Echidna AI is to provide a solution for people to predict diabetes and to help analysts to analyze the data in a better way. And the data itself can be stored inside this WEB APP because it has a neural network that can store data inside nodes")
		st.title("Is it Open source Alghorithm?")
		st.subheader("------The Echidna AI® algorithm")
		st.subheader("------This web app was released as open source software under the GNU Affero General Public Licence, version 3. This ensures that academics and others interested in the core of the algorithms at least start with a working implementation. The terms of the licence ensure that implementations are transparent and open, and are, in turn, open for others to use and/or modify under the same licence.")
		st.title("Is Echidna AI can be recommended for clinical use?")
		st.subheader("------It can be recommended for clinical use, software developers can use this professionally supported software development kits.")
		st.title("Would Echidna AI be supported in future?")
		st.subheader("------Echidna AI®-2020 will be released to licencees of our Echidna AI® software development kit in the new year, for deployment from August. Which means that it will be suported")
		st.title("Do Echidna AI patented or is it has a copyright?")
		st.subheader("Yes, Echidna AI has  a copyright, but it is an open source software that can be modified")
		st.subheader("------Copyright ©Echidna 2020. ALL RIGHTS RESERVED.")
		st.subheader("------Materials on this web site are protected by copyright law. Access to the materials on this web site for the sole purpose of personal educational and research use only. Where appropriate a single print out of a reasonable proportion of these materials may be made for personal education, research and private study. Materials should not be further copied, photocopied or reproduced, or distributed in electronic form. Any unauthorised use or distribution for commercial purposes is expressly forbidden. Any other unauthorised use or distribution of the materials may constitute an infringement of ClinRisk's copyright and may lead to legal action.")
		st.subheader("------For avoidance of doubt, any use of this site as a web service to obtain a Echidna AI® for any purpose is expressly forbidden. Similarly, use of this website for developing or testing software of any sort is forbidden unless permission has been explicitly granted.")
		st.subheader("------BMI predictor algorithm © 2020 Echidna Inc.")
		st.subheader("------WebApp and risk engine built by Adilan Akhramovich WebApp design ©Echidna 2020.")
		components.html(footer_html,height=500)

	elif choice == "Privacy Policy":
		st.title("Privacy Policy of Echidna Inc.")
		st.subheader("------At ECHIDNA AI, one of our main priorities is the privacy of our visitors. This Privacy Policy document contains types of information that is collected and recorded by ECHIDNA AI and how we use it.")

		st.subheader("------If you have additional questions or require more information about our Privacy Policy, do not hesitate to contact us.")

		st.subheader("------This Privacy Policy applies only to our online activities and is valid for visitors to our webapp with regards to the information that they shared and/or collect in ECHIDNA AI. This policy is not applicable to any information collected offline or via channels other than this webapp.")

		st.title("Consent")
		st.subheader("------By using our webapp, you hereby consent to our Privacy Policy and agree to its terms.")

		st.title("Information we collect")
		st.subheader("------The personal information that you are asked to provide, and the reasons why you are asked to provide it, will be made clear to you at the point we ask you to provide your personal information.")

		st.subheader("------If you contact us directly, we may receive additional information about you such as your name, email address, phone number, the contents of the message and/or attachments you may send us, and any other information you may choose to provide.")

		st.title("How we use your information?")
		st.subheader("We use the information we collect in various ways, including to:")

		st.subheader("------Provide, operate, and maintain our webapp")
		st.subheader("------Improve, personalize, and expand our webapp")
		st.subheader("------Understand and analyze how you use our webapp")
		st.subheader("------Develop new products, services, features, and functionality")
		st.subheader("------Communicate with you, either directly or through one of our partners, including for customer service, to provide you with updates and other information relating to the webapp, and for marketing and promotional purposes")
		st.subheader("------Send you emails")
		st.subheader("------Find and prevent fraud")

		st.title("Log Files")
		st.subheader("------ECHIDNA AI follows a standard procedure of using log files. These files log visitors when they visit websites. All hosting companies do this and a part of hosting services' analytics. The information collected by log files include internet protocol (IP) addresses, browser type, Internet Service Provider (ISP), date and time stamp, referring/exit pages, and possibly the number of clicks. These are not linked to any information that is personally identifiable. The purpose of the information is for analyzing trends, administering the site, tracking users' movement on the website, and gathering demographic information.")

		st.title("Advertising Partners Privacy Policies")
		st.subheader("------You may consult this list to find the Privacy Policy for each of the advertising partners of ECHIDNA AI.")

		st.subheader("------Third-party ad servers or ad networks uses technologies like cookies, JavaScript, or Web Beacons that are used in their respective advertisements and links that appear on ECHIDNA AI, which are sent directly to users' browser. They automatically receive your IP address when this occurs. These technologies are used to measure the effectiveness of their advertising campaigns and/or to personalize the advertising content that you see on websites that you visit.")

		st.subheader("------Note that ECHIDNA AI has no access to or control over these cookies that are used by third-party advertisers.")

		st.title("Third Party Privacy Policies")
		st.subheader("------ECHIDNA AI's Privacy Policy does not apply to other advertisers or websites. Thus, we are advising you to consult the respective Privacy Policies of these third-party ad servers for more detailed information. It may include their practices and instructions about how to opt-out of certain options.")

		st.subheader("------You can choose to disable cookies through your individual browser options. To know more detailed information about cookie management with specific web browsers, it can be found at the browsers' respective websites.")

		st.title("MCPA Privacy Rights (Do Not Sell My Personal Information)")
		st.subheader("Under the MCPA, among other rights, consumers have the right to:")

		st.subheader("------Request that a business that collects a consumer's personal data disclose the categories and specific pieces of personal data that a business has collected about consumers.")

		st.subheader("------Request that a business delete any personal data about the consumer that a business has collected.")

		st.subheader("------Request that a business that sells a consumer's personal data, not sell the consumer's personal data.")

		st.subheader("------If you make a request, we have one month to respond to you. If you would like to exercise any of these rights, please contact us.")

		st.title("GDPR Data Protection Rights")
		st.subheader("We would like to make sure you are fully aware of all of your data protection rights. Every user is entitled to the following:")

		st.subheader("------The right to access – You have the right to request copies of your personal data. We may charge you a small fee for this service.")

		st.subheader("------The right to rectification – You have the right to request that we correct any information you believe is inaccurate. You also have the right to request that we complete the information you believe is incomplete.")

		st.subheader("------The right to erasure – You have the right to request that we erase your personal data, under certain conditions.")

		st.subheader("------The right to restrict processing – You have the right to request that we restrict the processing of your personal data, under certain conditions.")

		st.subheader("------The right to object to processing – You have the right to object to our processing of your personal data, under certain conditions.")

		st.subheader("------The right to data portability – You have the right to request that we transfer the data that we have collected to another organization, or directly to you, under certain conditions.")

		st.subheader("------If you make a request, we have one month to respond to you. If you would like to exercise any of these rights, please contact us.")

		st.title("Children's Information")
		st.subheader("------Another part of our priority is adding protection for children while using the internet. We encourage parents and guardians to observe, participate in, and/or monitor and guide their online activity.")

		st.subheader("------ECHIDNA AI does not knowingly collect any Personal Identifiable Information from children under the age of 13. If you think that your child provided this kind of information on our website, we strongly encourage you to contact us immediately and we will do our best efforts to promptly remove such information from our records.")
Example #24
0
# Quick look at the data: columns, dimensions, dtypes, summary statistics,
# first/last rows, missing values and pairwise correlations.
df.columns.tolist()
df.shape
df.info()
df.describe().T
df.head(20)
df.tail()
df.isna()
df.isna().sum()
df.corr()

# In[ ]:

# Build the pandas_profiling report for the whole frame.
ProfileReport(df)

# In[3]:

# Target variable Purchase_CH: 1 when the purchase was "CH", otherwise 0.

df['Purchase_CH'] = np.where(df['Purchase'].eq("CH"), 1, 0)

# In[4]:

# Recode Store7 from "Yes"/other to 1/0.
df['Store7'] = np.where(df['Store7'].eq("Yes"), 1, 0)

# In[5]:

# Check distribution for StoreID
# Setup (VS Code terminal):
#   pip install pandas
#   pip install pandas-profiling

import pandas as pd
from pandas_profiling import ProfileReport

# Load the dataset and echo it to the console.
df = pd.read_csv('RCOM.csv')
print(df)
# If a "DLL load failed" error occurs, reinstall pandas from the terminal:
#   pip uninstall pandas
#   pip install pandas==1.0.1

# ---- Generate the report ----
# For a very large dataset, build a reduced report instead with:
#   profile = ProfileReport(df, minimal=True)
profile = ProfileReport(df)

# Write the report as HTML; this can take a while. Afterwards open the file
# with a live server.
profile.to_file('RCOM_Report.html')
 def _get_profile_report(self, title, minimal):
     """Return a ProfileReport of ``self.dataframe`` built with ``title`` and ``minimal``."""
     report_options = {"minimal": minimal, "title": title}
     return ProfileReport(self.dataframe, **report_options)
Example #27
0
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 28 08:49:08 2020

@author: PC-4
"""

# %% import
from pandas_profiling import ProfileReport

import numpy as np
import pandas as pd

# from warnings import filterwarnings
# filterwarnings("ignore")

# %% data
# Index the rows by the "datetime" column and expose "count" under the name "demand".
df = pd.read_csv("train.csv", index_col="datetime").rename(
    columns={"count": "demand"}
)

# %% Overall
# Full (non-minimal) report, without a progress bar, written to HTML.
profile = ProfileReport(
    df,
    title="Overall",
    minimal=False,
    progress_bar=False,
)
profile.to_file("Overall.html")
# Keep only the columns that have no missing values, then confirm none remain.
consolF = consolF[consolF.columns[consolF.isna().sum() == 0]]
consolF.isna().sum()

consolF
consolF.to_csv('/content/gdrive/MyDrive/shp/DatosFinales.csv')

# Split by the "tipo_data" marker and drop the marker column.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# pandas 2.0 no longer accepts.
data_train = consolF[consolF.tipo_data == "train"].drop(columns=["tipo_data"])
data_test = consolF[consolF.tipo_data == "test"].drop(columns=["tipo_data"])

data_train

data_test.columns

# # Descriptive statistics

ProfileReport(consolF, title="Datos", minimal=True)

# A quick way to visualise the data for this challenge is a scatter plot:
# for our variable of interest $y$ we look at the point cloud against some
# variables that seemed relevant to us.

pd.plotting.scatter_matrix(data_test[['area', 'estrato', 'banos', 'piso', 'valoradministracion', 'y',
                                     'latitud', 'longitud']], figsize=(16,16))

# Map with one marker per property, each with a popup summarising it.
mapHouses = folium.Map(location=[4.624335, -74.063644], zoom_start=10)
# NOTE(review): named "Train" but bound to data_test - confirm which split is intended.
dfSamplesTrain = data_test
locations = dfSamplesTrain[['latitud', 'longitud']]
housinfo = [
    f'area: {area}, banos: {bano}, estrato: {estrato}, valor venta: {valorventa}'
    for area, bano, estrato, valorventa in zip(
        dfSamplesTrain['area'],
        dfSamplesTrain['banos'],
        dfSamplesTrain['estrato'],
        dfSamplesTrain['valorventa'],
    )
]
locationlist = locations.values.tolist()
for location, info in zip(locationlist, housinfo):
    folium.Marker(location, popup=info).add_to(mapHouses)
mapHouses
Example #29
0
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

# Profile the training dataset and save the report as HTML.
df = pd.read_csv("algar-dataset-treino.csv")
prof = ProfileReport(df)
prof.to_file('output.html')
from matplotlib import pyplot as plt
from pandas_profiling import ProfileReport
from sklearn.metrics import mean_squared_error
from sklearn.metrics import average_precision_score

# Read the data from the Excel file.
data_set = pd.read_excel("teste_smarkio_lbs.xls")
print(data_set)

# Open the same workbook with xlrd to access the sheet cells directly.
book = xlrd.open_workbook_xls("teste_smarkio_lbs.xls")
page1 = book.sheet_by_index(0)

# Export the one-dimensional descriptive analysis report to an .html file.
# The original call, `to_notebook_iframe((output_file=...))`, was a syntax
# error; `to_file` is the method that writes the report to disk.
profile = ProfileReport(data_set, title = "Análise Exploratória dos Dados")
profile.to_file(output_file="analise_exploratoria_dos_dados.html")

# Put the data into lists.
# `limit` is the length of column 0 minus one (the lists below start at row 1).
limit = len(page1.col_values(colx=0)) - 1
print(limit)
# Both lists already start at sheet row 1, so their index 0 is the first row
# to process; starting at 1 would leave that row unfilled.
i = 0

list_true_class = page1.col_values(start_rowx=1, colx=3)
list_pred_class = page1.col_values(start_rowx=1, colx=0)
print(list_true_class)

# Where the true class is empty, fall back to the predicted class.
while i < limit:
    if list_true_class[i] == "":
        list_true_class[i] = list_pred_class[i]
    i = i + 1