def render_eda():
    """Render the Streamlit EDA page: upload a dataset and profile it.

    Shows a file uploader accepting ``csv``/``txt`` files. Once a file is
    supplied it is parsed with ``pd.read_csv``, displayed, profiled with
    ``ProfileReport`` and the report is written to ``Output.html``.
    """
    st.title("Create a Complete Report of your data.")
    st.subheader("Exploratory Data Analysis using pandas profiling.")
    st.write(
        """All you need to do is upload a dataset and get a quick sense of your data."""
    )

    uploaded = st.file_uploader("Upload Dataset", type=["csv", "txt"])
    if uploaded is None:
        # Nothing uploaded yet, so there is nothing to profile.
        return

    frame = pd.read_csv(uploaded)

    # To adjust the profile report see:
    # https://pandas-profiling.github.io/pandas-profiling/docs/master/rtd/index.html
    # Use the (minimal=True) setting for large datasets.
    report = ProfileReport(frame, explorative=True)

    st.title("Pandas Profiling Report in Streamlit")
    st.write(frame)
    st_profile_report(report)

    # Save the report to disk.
    report.to_file("Output.html")
    st.write("Your report has been saved!")
def run_training(experiment_name, data_profiling) -> None:
    """Train the model.

    Parameters
    ----------
    experiment_name
        Name used for the model sub-folder and as the report title. Any
        falsy value (``False``, ``None``, ``""``) makes the function
        generate a timestamped name of the form
        ``model_experiment_YYYYmmdd_HHMMSS``.
    data_profiling
        When truthy, a ``ProfileReport`` of the training data is written to
        ``config.REPORT_DIR / "data_train_report.html"``.
    """
    _logger.info(f'Working on: {os.getcwd()}')

    # Truthiness check instead of ``== False`` so that None or an empty
    # string also fall back to a generated name rather than reaching the
    # path join below.
    if not experiment_name:
        experiment_name = datetime.now().strftime(
            "model_experiment_%Y%m%d_%H%M%S")
    _logger.info(
        "The run training name was fixed in {}".format(experiment_name))

    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)
    _logger.info("The dataset contains {} rows and {} columns".format(
        data.shape[0], data.shape[1]))
    _logger.info("Dataset info: \n{}".format(
        data.describe(percentiles=[], include="all").T.to_string()))

    model_subfloder = config.TRAINED_MODEL_DIR / experiment_name
    # NOTE(review): only the message is emitted here; this function does not
    # create the folder itself - confirm save_pipeline takes care of it.
    _logger.info("Creating model folder in {}".format(str(model_subfloder)))

    if data_profiling:
        _logger.info("Creating a data report for data training")
        profile = ProfileReport(data, title=experiment_name, explorative=True)
        profile.to_file(config.REPORT_DIR / "data_train_report.html")
        _logger.info(
            "A report in html was saved in {}".format(config.REPORT_DIR))

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0,  # we are setting the seed here
    )

    # transform the target
    y_train = np.log(y_train)
    y_test = np.log(y_test)

    pipeline.price_pipe.fit(X_train[config.FEATURES], y_train)

    _logger.info(f'saving model version: {_version}')
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)
    _logger.info(f'Logs saved on: {config.LOG_DIR}')
def test_modular_present(tdf):
    """Every optional report section shows up in the HTML when enabled."""
    correlation_settings = {
        name: {"calculate": True}
        for name in ("pearson", "spearman", "kendall", "phi_k", "cramers")
    }
    diagram_settings = dict.fromkeys(
        ("matrix", "bar", "dendrogram", "heatmap"), True)

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates={"head": 10},
        samples={"head": 10, "tail": 10},
        interactions={"targets": ["mass (g)"], "continuous": True},
        correlations=correlation_settings,
        missing_diagrams=diagram_settings,
        pool_size=1,
    )

    html = profile.to_html()
    expected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in expected_headings:
        assert heading in html
async def inspect_data(request: web.Request):
    """Profile one CSV file of a project and return the report as HTML.

    The ``project`` and ``file`` route parameters select
    ``<PROJECT_DIR>/<project>/files/<file>``. The report is written next to
    the CSV with an ``.html`` extension and its content is sent back as the
    response body.

    Raises
    ------
    web.HTTPBadRequest
        If ``project`` or ``file`` is not a single plain path component.
    """
    filename = request.match_info['file']
    project = request.match_info['project']

    # Both values come from the request URL: refuse anything that could make
    # the concatenated path leave the project's files directory.
    for component in (project, filename):
        if (component in ("", ".", "..")
                or "/" in component or "\\" in component):
            raise web.HTTPBadRequest(text="Invalid project or file name")

    files_dir = request.app['settings'].PROJECT_DIR + "/" + project + "/files/"
    df = pd.read_csv(files_dir + filename)
    profile = ProfileReport(df,
                            title='Pandas Profiling Report',
                            html={'style': {'full_width': True}})

    # Swap only a trailing ".csv" for ".html". A blanket str.replace would
    # rewrite every ".csv" in the name and, for names without ".csv", would
    # leave the path unchanged so the report overwrote the data file.
    if filename.endswith(".csv"):
        html_name = filename[:-len(".csv")] + ".html"
    else:
        html_name = filename + ".html"
    path = files_dir + html_name

    # NOTE(review): read_csv, to_file and the file read below are blocking
    # calls inside a coroutine - consider moving them to an executor.
    profile.to_file(output_file=path)
    with open(path, "r", encoding='utf-8') as f:
        text = f.read()

    print("responding now")
    return web.Response(text=text, content_type='text/html')
def test_example(get_data_file, test_output_dir):
    """End-to-end example: profile the NASA meteorites data set.

    Adds a handful of synthetic columns and duplicated rows, builds a minimal
    report, writes it to ``test_output_dir / "profile.html"`` and checks the
    description and the rendered HTML.
    """
    file_name = get_data_file(
        "meteorites.csv",
        "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
    )

    # For reproducibility
    np.random.seed(7331)

    df = pd.read_csv(file_name)

    # Values that cannot be converted to a pandas timestamp become NaT
    # (errors="coerce") and are thereby ignored for this analysis.
    df["year"] = pd.to_datetime(df["year"], errors="coerce")

    # Example: Constant variable
    df["source"] = "NASA"

    # Example: Boolean variable
    df["boolean"] = np.random.choice([True, False], df.shape[0])

    # Example: Mixed with base types
    df["mixed"] = np.random.choice([1, "A"], df.shape[0])

    # Example: Highly correlated variables
    df["reclat_city"] = df["reclat"] + np.random.normal(scale=5, size=(len(df)))

    # Example: Duplicate observations
    # Copy the slice so that renaming does not touch (or warn about) ``df``.
    duplicates_to_add = df.iloc[0:10].copy()
    duplicates_to_add["name"] += " copy"

    # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
    df = pd.concat([df, duplicates_to_add], ignore_index=True)

    output_file = test_output_dir / "profile.html"
    profile = ProfileReport(
        df, title="NASA Meteorites", samples={"head": 5, "tail": 5}, minimal=True
    )
    profile.to_file(output_file=output_file)
    assert output_file.exists(), "Output file does not exist"

    description = profile.get_description()
    assert (
        isinstance(description, dict) and len(description.items()) == 7
    ), "Unexpected result"
    assert "<span class=badge>10</span>" in profile.to_html()
def report(
    self,
    dataset: str = "dataset",
    n_rows: Optional[Union[int, float]] = None,  # float for 1e3...
    filename: Optional[str] = None,
):
    """Create an extensive profile analysis report of the data.

    The profile report is rendered in HTML5 and CSS3. Note that this
    method can be slow for n_rows>10k.

    Parameters
    ----------
    dataset: str, optional (default="dataset")
        Name of the attribute holding the data set to get the report from.

    n_rows: int, float or None, optional (default=None)
        Number of (randomly picked) rows to process. None for all rows.
        Values larger than the data set are capped at its number of rows.

    filename: str or None, optional (default=None)
        Name to save the file with (as .html). None to not save anything.

    Returns
    -------
    profile: ProfileReport
        Created report object.

    """
    self.log("Creating profile report...", 1)

    data = getattr(self, dataset)
    total_rows = data.shape[0]

    # Cap the request: sampling without replacement raises when asked for
    # more rows than the data set contains.
    n_rows = total_rows if n_rows is None else min(int(n_rows), total_rows)
    profile = ProfileReport(data.sample(n_rows))

    if filename:
        if not filename.endswith(".html"):
            filename = filename + ".html"
        profile.to_file(filename)
        self.log("Report saved successfully!", 1)

    return profile
def test_modular_absent(tdf):
    """No optional report section is rendered when all of them are disabled."""
    disabled_sections = {
        "duplicates": {"head": 0},
        "samples": {"head": 0, "tail": 0},
        "interactions": None,
        "correlations": None,
        "missing_diagrams": None,
    }
    profile = ProfileReport(tdf, title="Modular test", **disabled_sections)

    html = profile.to_html()
    unexpected_headings = (
        "Correlations</h1>",
        "Duplicate rows</h1>",
        "Sample</h1>",
        "Missing values</h1>",
    )
    for heading in unexpected_headings:
        assert heading not in html
def test_html_export_png(test_output_dir):
    """A non-inline export with PNG plots writes the report and its assets."""
    shape = (10, 10)
    column_names = [f"column_{idx}" for idx in range(shape[1])]
    values = np.random.randint(0, 1000, size=shape)
    frame = pd.DataFrame(values, columns=column_names)

    profile = ProfileReport(
        frame,
        minimal=True,
        html={"inline": False},
        plot={"image_format": "png"},
    )

    target = test_output_dir / "export_png.html"
    profile.to_file(target)
    assert target.exists()

    check_assets(test_output_dir / "export_png_assets", "png", n_css=3, n_js=3)
def test_issue_120(get_data_file):
    """Regression test for issue 120: the report renders to an HTML string."""
    source = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    frame = pd.read_csv(source)

    settings = {
        "correlations": {"cramers": {"calculate": False}},
        "vars": {"cat": {"check_composition": True}},
    }
    html = ProfileReport(frame, **settings).to_html()

    assert type(html) is str
    assert "<p class=h4>Dataset statistics</p>" in html
def test_issue_120(get_data_file):
    """Regression test for issue 120 with correlations switched off."""
    source = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    frame = pd.read_csv(source)

    categorical_settings = {"cat": {"words": True, "characters": True}}
    report = ProfileReport(
        frame,
        correlations=None,
        progress_bar=False,
        pool_size=1,
        vars=categorical_settings,
    )

    # Accessing ``report`` is what the test exercises; the value is unused.
    _ = report.report
    assert report.description_set is not None
def test_subdir(test_output_dir):
    """Exporting into a sub-directory keeps files that already live there."""
    shape = (10, 10)
    column_names = [f"column_{idx}" for idx in range(shape[1])]
    frame = pd.DataFrame(
        np.random.randint(0, 1000, size=shape), columns=column_names)
    profile = ProfileReport(frame, minimal=True, html={"inline": False})

    # Prepare a sub-directory that already contains an unrelated file.
    subdir_path = test_output_dir / "subdir"
    subdir_path.mkdir()
    existing_file = subdir_path.joinpath("test.py")
    existing_file.touch()

    target = subdir_path / "subdir.html"
    profile.to_file(target)
    assert target.exists()

    check_assets(subdir_path / "subdir_assets", "svg", n_css=3, n_js=3)

    # The pre-existing file must still be there after the export.
    assert subdir_path.joinpath("test.py").exists()
def main(dir_main, make_profile=False):
    """Load the true/fake news corpus and vectorize the article titles.

    :param dir_main: base directory containing the corpus folder
    :param make_profile: when true, also write a pandas-profiling report of
        the loaded data to ``dir_main / PROFILE_REPORT_HTML``
    """
    root = Path(dir_main)
    true_news_csv = Path(root / CORPUS_DIR / TRUE_CSV)
    fake_news_csv = Path(root / CORPUS_DIR / FAKE_CSV)
    profile_html = Path(root / PROFILE_REPORT_HTML)

    # load and format data
    df_all = import_data(true_news_csv, fake_news_csv)

    if make_profile:
        # Imported lazily, only when a report is requested.
        from pandas_profiling import ProfileReport

        # takes forever
        ProfileReport(df_all).to_file(output_file=profile_html)

    # vectorize title only
    df_all = vectorize_content(df_all, label_col='label', text_col='title')
    df_all.describe()
def test_interactions_target():
    """Restricting interaction targets yields n_targets * n_columns scatters."""
    n_rows, n_columns, n_targets = 10, 50, 2

    column_names = [f"column_{c}" for c in range(n_columns)]
    frame = pd.DataFrame(
        np.random.randint(0, 1000, size=(n_rows, n_columns)),
        columns=column_names,
    )
    targets = [f"column_{target}" for target in range(0, n_targets)]

    profile = ProfileReport(
        frame,
        minimal=True,
        interactions={"continuous": True, "targets": targets},
    )

    # Count the inner entries across all outer keys of the scatter section.
    scatter = profile.get_description()["scatter"]
    total = 0
    for pairs in scatter.values():
        total += len(pairs.keys())

    assert total == n_targets * n_columns
def test_issue523():
    """A nullable Int64 column containing pd.NA values can be described."""
    # https://github.com/pandas-dev/pandas/issues/33803
    values = (
        [1871248, 12522551, 1489260, 6657093]
        + [pd.NA] * 5
        + [1489260, pd.NA, 2468576]
    )
    frame = pd.DataFrame({"col": values}, dtype=pd.Int64Dtype())

    profile_report = ProfileReport(frame, title="Test Report", progress_bar=False)
    description = profile_report.get_description()
    assert len(description) > 0
def profiling_report(df, minimal_mode=False, dark_mode=True):
    """
    Use pandas_profiling for a quick visual exploration of the data.

    Parameters
    ----------
    df : dataframe
        dataframe with data to analyse.
    minimal_mode : bool, optional
        Passed to ``ProfileReport`` as ``minimal``. When True the output file
        name is prefixed with "no-expensive-computations".
        The default is False.
    dark_mode : bool, optional
        Whether the report is rendered in dark mode. The default is True.

    Returns
    -------
    None. The report is written to
    ``results/exploratory-analysis/{mode}-eda.html``.

    """
    # This only decides how the output file is named.
    mode = "no-expensive-computations" if minimal_mode else ""

    title = "Exploratory Data Analysis: Floating Data"

    # The flag is forwarded as ``dark_mode``; it was previously handed to
    # ``orange_mode``, which selected a different theme than requested.
    prof = ProfileReport(df,
                         title=title,
                         explorative=False,
                         minimal=minimal_mode,
                         dark_mode=dark_mode)

    # Save the html.
    # NOTE(review): the target directory is assumed to exist already.
    path_output = f'results/exploratory-analysis/{mode}-eda.html'
    prof.to_file(output_file=path_output)
def test_modular_description_set(tdf):
    """A description is still produced when the optional sections are off."""
    no_diagrams = dict.fromkeys(
        ("matrix", "bar", "dendrogram", "heatmap"), False)

    profile = ProfileReport(
        tdf,
        title="Modular test",
        duplicates=None,
        samples={"head": 0, "tail": 0},
        correlations=None,
        interactions=None,
        missing_diagrams=no_diagrams,
        pool_size=1,
    )

    description = profile.get_description()
    assert len(description) > 0
def test_issue_120(get_data_file):
    """Regression test for issue 120 with every correlation disabled."""
    source = get_data_file(
        "pandas_profiling_bug.txt",
        "https://github.com/pandas-profiling/pandas-profiling/files/2386812/pandas_profiling_bug.txt",
    )
    frame = pd.read_csv(source)

    correlation_names = (
        "cramers", "phi_k", "kendall", "spearman", "pearson", "recoded")
    no_correlations = {name: {"calculate": False} for name in correlation_names}

    report = ProfileReport(
        frame,
        correlations=no_correlations,
        progress_bar=False,
        pool_size=0,
        vars={"cat": {"words": True, "characters": True}},
    )

    # Accessing ``report`` is what the test exercises; the value is unused.
    _ = report.report
    assert report.description_set is not None
def test_html_export_cdn(test_output_dir):
    """A non-inline export without local assets writes the expected files."""
    shape = (10, 10)
    column_names = [f"column_{idx}" for idx in range(shape[1])]
    frame = pd.DataFrame(
        np.random.randint(0, 1000, size=shape), columns=column_names)

    html_settings = {"inline": False, "use_local_assets": False}
    profile = ProfileReport(frame, minimal=True, html=html_settings)

    target = test_output_dir / "cdn.html"
    profile.to_file(target)
    assert target.exists()

    check_assets(test_output_dir / "cdn_assets", "svg", n_css=1, n_js=1)
def print_basic_details(df, file_name, report=False, open_html=False):
    """Print a summary of ``df`` and optionally build/open a profile report.

    :param df: data frame to summarise
    :param file_name: label used in the printed text and as the stem of the
        ``<file_name>.html`` report
    :param report: when true, write a pandas-profiling report to
        ``<file_name>.html``
    :param open_html: when true, open ``<file_name>.html`` in the web browser
    """
    from pandas_profiling import ProfileReport

    (n_rows, n_cols, column_names,
     n_duplicates, column_types) = get_basic_details(df)

    summary_template = ''' The number of raws in {0} DF are: {1} The number of columns in {0} DF are: {2} The names of columns in {0} DF are: \n {3} The count of duplicated raws in {0} DF is: {4} Columns types in {0} DF are: \n {5} '''
    print(summary_template.format(file_name, n_rows, n_cols, column_names,
                                  n_duplicates, column_types))

    if report:
        report_title = 'Pandas Profiling Report - {} Data Frame'.format(file_name)
        profile = ProfileReport(df,
                                title=report_title,
                                html={'style': {'full_width': True}})
        profile.to_file(output_file="{}.html".format(file_name))

    if open_html:
        import webbrowser

        webbrowser.open('{}.html'.format(file_name))
def get_profile_results(data):
    """Profile a pandas DataFrame and return the report as an HTML string.

    Only the Pearson correlation is calculated; the other correlation
    measures are switched off.

    Raises:
        TypeError: if ``data`` is not a ``pd.DataFrame``.
    """
    if not isinstance(data, pd.DataFrame):
        raise TypeError('This is not a pandas dataframe.')

    correlation_settings = {
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    }
    report = ProfileReport(
        data,
        title='Snowflake Data Profiler from Hashmap',
        progress_bar=False,
        explorative=True,
        correlations=correlation_settings,
    )

    # This step sometimes fails with matplotlib errors about threads. So far
    # it has only been fixed by pinning specific library versions in
    # requirements.txt; pyarrow seems to have an impact on this.
    html = report.to_html()
    return html
def profile(self,
            title: str = 'Dataset profile report',
            html_path: str = None,
            show_report_in_notebook: bool = False):
    """Generates a pandas-profiling report of the dataset to be displayed in a jupyter notebook.
    Optionally saves the report as an html file.

    If ``html_path`` points to an existing file, no new report is generated
    and that file is displayed in an IFrame instead.

    :param title: Title of the generated report
    :param html_path: If provided, the pandas-profiling report will be saved to disk
    :param show_report_in_notebook: Whether or not to show report in jupyter notebook
        (NOTE(review): this flag is currently not consulted - the report is
        always displayed; confirm the intended behaviour before wiring it in)
    :return: None
    """
    # ``html_path`` defaults to None and os.path.exists(None) raises a
    # TypeError, so the "no path given" case is checked first.
    if html_path is None or not os.path.exists(html_path):
        logger.info('Generating the profiling report')
        profile_report = ProfileReport(self.data, title=title)

        if html_path is not None:
            profile_report.to_file(html_path)
            logger.info(
                f'Saved the pandas-profiling report to ``{html_path}``')

        profile_report.to_notebook_iframe()
    else:
        logger.info(
            f'A profiling report was already generated and will be loaded from ``{html_path}``'
        )
        display(IFrame(src=html_path, width=10**3, height=10**3))
def test_issue864():
    """The extreme-value tab titles follow the ``n_extreme_obs`` setting."""
    values = [random.randrange(0, 100) for _ in range(0, 30)]
    profile = ProfileReport(pd.DataFrame({"a": values}))

    def check_tab_titles(n_extreme_obs):
        """Generate HTML and validate the tabs contain the proper tab titles."""
        profile.config.n_extreme_obs = n_extreme_obs
        profile.invalidate_cache()

        html = profile.to_html()
        for label in ("Minimum", "Maximum"):
            pattern = f"*<a href=* aria-controls=* role=tab data-toggle=tab>{label} {n_extreme_obs} values</a>*"
            assert fnmatch.fnmatch(html, pattern)

    for count in (5, 10, 12):
        check_tab_titles(count)
def profiling_page():
    """Render the "Profiling Tables" Streamlit page.

    Lets the user enter an SQL statement, runs it against the database named
    in the session state when the form is submitted, and shows a
    pandas-profiling report of the query result.
    """
    logger.info({"message": "Loading profiling page."})
    st.title("Profiling Tables")

    # Select table
    database = Database(file_name=st.session_state.db_name)
    tables = database.show_tables()
    if len(tables) == 0:
        st.warning("The database has no tables available.")
        logger.warning({"message": "The database has no tables available."})
        st.stop()

    st.write(
        "You can select an entire table or create your custom SQL-statement.")

    result = None
    with st.form(key="profiling_form"):
        query = st.text_area("SQL-statement",
                             value="SELECT * FROM table",
                             height=300,
                             help="SQL-statement based on SQLite syntax.")
        st.write(' ')
        submitted = st.form_submit_button(label='Profiling')
        if submitted:
            logger.info({"message": "Profiling Table."})
            result = database.query(query)

    if result is not None:
        report = ProfileReport(result, explorative=True, dark_mode=True)
        st_profile_report(report)

    logger.info({"message": "Profiling page loaded."})
"""
Read the data and prepare it for model building.

Read the data from the given .txt* files; values that were not recorded are
filled in as NaN.
Select the important parameters; possible ones are ("Gear", "amb", "HCnt", ...).
Create an analysis report with pandas_profiling to get an overview of the data.
Visualise the data, including the progression over time and a linear
regression for each time interval (90 days here).
"""
import datetime
import os
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ml_dataprocessing_txttocsv import ml_data_processing_txt2cav
from visualisation import interval_reg, normal_draw
from pandas_profiling import ProfileReport

# whether the data has to be converted (1: yes, 0: no)
data_convert = 1

# choose the location where the data is stored
path_data = 'E:\\0000_Daten\\0000_Daten'

# save the figures directly without displaying them
matplotlib.use('agg')

# convert the data to csv files
if data_convert == 1:
    ml_data_processing_txt2cav(path_data)

# start: process each work-part folder in turn
folder = ['Workpart1_WTG', 'Workpart2_WTG']
for folder_name in folder:
    file_path = 'E:\\csv_data_group_8\\' + folder_name
    print('working on ' + folder_name)
    # despite the singular name this holds every entry of the folder
    file_name = os.listdir(file_path)
# Derive a binary label from the quality column: 1 when quality >= 6.5,
# otherwise 0.
# NOTE(review): Series.iteritems() was removed in pandas 2.0 (items() is the
# replacement) - confirm the pandas version this script is pinned to.
for i in df.quality.iteritems():
    value = (i[1])  # i is an (index, value) pair
    if value >= 6.5:
        quality_bool.append(1)
    else:
        quality_bool.append(0)
df['quality_bool'] = quality_bool  # Put the new column into the data frame
df.quality_bool.value_counts()

# Understanding the behaviour of the data
# Import ProfileReport to help with the descriptive analysis
from pandas_profiling import ProfileReport
profile = ProfileReport(df,
                        title='Relatory of Red Wine Quality',
                        html={'style': {
                            'full_width': True
                        }})
profile
profile.to_file(output_file="redwine_quality.html")  # Save the report as HTML

# Cleaning Data
df.duplicated()  # Find duplicated rows
df_without_duplicates = df.drop_duplicates(
)  # Create another data frame without duplicated rows

# Comparing Data Frames
df.info()
df_without_duplicates.info()
df_without_duplicates.mean()
df_without_duplicates = df_without_duplicates.rename(
#adding the column t and t_squared in the data air['t'] = t air['t_square'] = air['t'] * air['t'] #performing the log operation on the Passenger data and then adding in to in the main data . log_pass = np.log(air['Passengers']) air['log_pass'] = log_pass # In[7]: air # In[8]: #EDA report = ProfileReport(air, title="Profile Report of the Airlines data", explorative=True) # In[29]: report.to_widgets() # In[ ]: #REPORT SAYS THAT: #1) THERE ARE NO MISSING VALUES #2) NO DUPLICATE ROWS #3) NO MULTICOLINEARITY # In[9]:
from statsmodels.nonparametric.smoothers_lowess import lowess
import phik
from helpers import encode_dates, loguniform, similarity_encode

# Load the data set, parsing the two release-date columns as dates
df = pd.read_csv(
    r"data\appstore_games.csv",
    parse_dates=["Original Release Date", "Current Version Release Date"],
    index_col=[],
    delimiter=",",
    low_memory=False,
)

# Switch: set to True to write a pandas-profiling report of the raw data
PROFILE = False
if PROFILE:
    profile = ProfileReport(df)
    profile.to_file("pandas_profiling_report.html")

# Per-column overview: dtype and the share of unique values, sorted by both
print(
    pd.concat([df.dtypes, df.nunique() / len(df)], axis=1).rename({
        0: "dtype",
        1: "proportion unique"
    }, axis=1).sort_values(["dtype", "proportion unique"]))

# Name of the target column
TARGET = "Average User Rating"

# Report how many rows have no target value, as a count and as a percentage
print(f"Missing targets: {df[TARGET].isnull().sum()}")
print(f"% missing: {df[TARGET].isnull().sum() / len(df):.0%}")

DROP_MISSING = False
if DROP_MISSING:
baysis_selected.drop('Bes2', axis='columns', inplace=True) ############### ### Scatter ### ############### ########### ### Box ### ########### ############## ### Report ### ############## if generate_report: report = ProfileReport(baysis_selected, title='BAYSIS Selected Dataset Report') report.to_file(work_path + file_prefix + '_report.html') ################### ### Encoding ### ################### # define column types nominal_columns = [ "Str", "Kat", "Typ", "UArt1", "UArt2", "AUrs1", "AUrs2", "AufHi", "Char1", "Char2", "Bes1", "Bes2", "Lich1", "Lich2", "Zust1", "Zust2", "WoTag", 'Month' ] dichotomous_columns = ["Alkoh"] ordinal_columns = ["Betei", "Fstf", "FeiTag"]
uploaded_file = st.sidebar.file_uploader("Upload your input CSV file", type=["csv"]) # st.sidebar.markdown(""" # [Example CSV input file](https://raw.githubusercontent.com/NewHarmony/100_Days_of_Code/data/master/Data_Science/Streamlit/percent_bachelors_degrees_women_usa.csv) # """) #Pandas Profiling Report if uploaded_file is not None: @st.cache def load_csv(): csv = pd.read_csv(uploaded_file) return csv df = load_csv() pr = ProfileReport(df, explorative=True) st.header('**Input DataFrame**') st.write(df) st.write('--') st.header('**Pandas Profiling Report**') st_profile_report(pr) else: st.info('Waiting for CSV file to be uploaded.') if st.button('Click to use Example Dataset'): #Example data @st.cache def load_data(): csv = pd.read_csv("percent_bachelors_degrees_women_usa.csv") return csv df = load_data()
def get_data(autophrase_params, data_in, false_positive_phrases,
             false_positive_substrings):
    """Build the movie data set, run AutoPhrase on it and export the result.

    Reads ``movie.metadata.tsv`` and ``plot_summaries.txt`` from ``data_in``,
    merges them, runs the AutoPhrase shell scripts on the plot summaries,
    attaches the extracted phrases as a ``phrases`` column and writes
    ``data/out/data.pkl`` plus a profile report ``data/out/report.html``.

    Parameters
    ----------
    autophrase_params : mapping
        ``name -> value`` pairs, rendered as ``name=value`` assignments in
        front of the AutoPhrase shell commands.
    data_in : str
        Directory containing the raw input files.
    false_positive_phrases : container of str
        Lower-cased phrases to discard (tested with ``in``).
    false_positive_substrings : iterable of str
        A phrase containing any of these substrings is discarded.
    """
    # Local import so the block does not depend on a new file-level import.
    import ast

    # Make data directories
    os.makedirs('data/temp', exist_ok=True)
    os.makedirs('data/out', exist_ok=True)

    # Genre names that are replaced with a more common genre name
    genre_aliases = {
        'Animal Picture': 'Animals',
        'Biographical film': 'Biography',
        'Biopic [feature]': 'Biography',
        'Buddy Picture': 'Buddy',
        'Comdedy': 'Comedy',
        'Coming of age': 'Coming-of-age',
        'Detective fiction': 'Detective',
        'Education': 'Educational',
        'Gay Interest': 'Gay',
        'Gay Themed': 'Gay',
        'Gross out': 'Gross-out',
        'Pornography': 'Pornographic',
        'Social issues': 'Social problem',
    }

    def parse_mapping(x):
        # The raw columns hold dict literals. literal_eval parses them
        # without executing arbitrary code, unlike eval.
        return ast.literal_eval(x)

    # Read in raw data
    def normalize_languages(x):
        def is_utf8(value):
            try:
                value.encode()
            except UnicodeEncodeError:
                return False
            return True

        def sub(value):
            return re.sub(r' [Ll]anguages?', '', value)

        return list(
            np.unique([
                sub(value) for value in parse_mapping(x).values()
                if is_utf8(value)
            ]))

    def normalize_countries(x):
        return sorted(parse_mapping(x).values())

    def normalize_genres(x):
        def sub(value):
            # Replace with a more common genre name
            if value in genre_aliases:
                return genre_aliases[value]
            return re.sub(' [Ff]ilms?| [Mm]ovies?', '', value)

        return list(np.unique([sub(value) for value in parse_mapping(x).values()]))

    def clean_summary(summary):
        # regex=True is explicit: since pandas 2.0 Series.str.replace treats
        # the pattern as a literal string by default.
        return (
            summary.str.replace(r'{{.*?}}', '', regex=True)  # Remove Wikipedia tags
            .str.replace(r'http\S+', '', regex=True)  # Remove URLs
            .str.replace(r'\s+', ' ', regex=True)  # Combine whitespace
            .str.strip()  # Strip whitespace
            .replace('', pd.NA)  # Replace empty strings with NA
        )

    movies = pd.read_csv(
        f'{data_in}/movie.metadata.tsv',
        converters={
            'languages': normalize_languages,
            'countries': normalize_countries,
            'genres': normalize_genres
        },
        delimiter='\t',
        header=None,
        index_col='id',
        names='id name date revenue runtime languages countries genres'.split(),
        usecols=[0, 2, 3, 4, 5, 6, 7, 8],
    ).assign(date=lambda x: pd.to_datetime(x.date, errors='coerce'))

    summaries = pd.read_csv(
        f'{data_in}/plot_summaries.txt',
        delimiter='\t',
        header=None,
        index_col='id',
        names='id summary'.split(),
    ).assign(summary=lambda x: clean_summary(x.summary)).dropna()

    # Combine movie metadata and plot summaries into df
    df = movies.merge(summaries,
                      on='id').sort_values('date').reset_index(drop=True)

    # Run AutoPhrase on plot summaries
    with open('data/temp/summaries.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(df.summary))

    env_prefix = ' '.join(
        [f'{param}={value}' for param, value in autophrase_params.items()])
    # NOTE(review): this is a shell string built from the parameter values
    # and its exit status is ignored - only pass trusted parameters, and
    # consider checking the return code.
    os.system(
        f'cd AutoPhrase && {env_prefix} ./auto_phrase.sh && {env_prefix} ./phrasal_segmentation.sh'
    )

    # Add phrases to df
    def extract_highlighted_phrases(segmentation):
        def is_false_positive(s):
            s = s.lower()
            if len(s) == 1:  # Only 1 character
                return True
            if s in false_positive_phrases:
                return True
            return any(substring in s
                       for substring in false_positive_substrings)

        return (segmentation.str.findall(r'<phrase>(.+?)</phrase>').apply(
            lambda x: [s.lower() for s in x if not is_false_positive(s)]).apply(
                np.unique).apply(list).values)

    # read_csv(squeeze=True) was removed in pandas 2.0; squeezing the
    # single-column frame afterwards is the documented replacement.
    # NOTE(review): some pandas versions may reject '\n' as a separator with
    # the python engine - confirm against the pinned pandas version.
    segmentation = pd.read_csv('model/autophrase/segmentation.txt',
                               delimiter=r'\n',
                               engine='python',
                               header=None).squeeze("columns")
    df['phrases'] = extract_highlighted_phrases(segmentation)

    # Export df
    df.to_pickle('data/out/data.pkl')
    ProfileReport(df).to_file('data/out/report.html')