def generate_profiling_reports(version=None):
    """
    Build pandas-profiling reports for the training and test sets and write
    them as HTML files into the reports directory of the requested version.

    :param version: Version of the data to profile. A falsy value (the
                    default ``None``) means the last version is used.
    """
    log = logging.getLogger(__name__)
    log.info("Requested profiling report generation")

    # Project-local helpers are imported lazily, after the first log line.
    from src.common_paths import get_reports_version_path
    from src.file_loaders import load_numerai_data

    if version:
        log.info(f"Using data version: {version}")
    else:
        log.info("Using last version of the data")

    reports_dir = get_reports_version_path(version)

    log.info("Loading data...")
    train_df, test_df = load_numerai_data(version)
    log.info("Data loaded successfully!")

    log.info("Generating pandas profiling report for training set...")
    train_report = pandas_profiling.ProfileReport(train_df)
    log.info("Generating pandas profiling report for test set...")
    test_report = pandas_profiling.ProfileReport(test_df)

    log.info("Reports generated successfully. Storing them.")
    outputs = (
        (train_report, "profiling_report_train.html"),
        (test_report, "profiling_report_test.html"),
    )
    for built_report, html_name in outputs:
        built_report.to_file(os.path.join(reports_dir, html_name))
    log.info(f"Reports stored in {reports_dir}")
def info_report(self, directory: Union[Path, str, None] = None):
    """Generate an html-report for a DataFrame.

    To install with conda: conda install -n <env_name> pandas-profiling

    Args:
        directory (Union[Path, str, None], optional): the directory to create
            the report in. Defaults to None, in which case no file is written
            and the report object is returned so the caller (for example a
            Jupyter Notebook cell) can display it.

    Returns:
        The ``ProfileReport`` when ``directory`` is None; otherwise None
        (the report is written to ``pandas_profiler_result.html`` inside
        ``directory``). None is also returned when pandas_profiling cannot be
        imported.
    """
    # Only the import is guarded, so an ImportError raised while building the
    # report is not misreported as "pandas_profiling is not installed".
    try:
        import pandas_profiling as pp
    except ImportError:
        print(
            "pandas_profiling is not installed. Please install through conda or pip."
        )
        return None

    report = pp.ProfileReport(self.__dataframe)

    if directory is None:
        # A bare expression inside a function is never rendered by a
        # notebook; hand the report back so the caller can show it.
        return report

    # Write profile report to html file (from script)
    target_dir = Path(directory)
    # exist_ok avoids failing when the directory appears between check and
    # creation, and makes the explicit exists() test unnecessary.
    target_dir.mkdir(parents=True, exist_ok=True)
    filename = target_dir.joinpath("pandas_profiler_result.html")
    report.to_file(outputfile=filename)
    return None
def main():
    """Run the pipeline end to end.

    When the last command-line argument is ``--collect`` the POI and city
    data are extracted first. Afterwards the raw CSV is profiled, the data is
    preprocessed and profiled again, the recommendation steps are run and the
    ontology is written out.
    """
    if sys.argv[-1] == '--collect':
        pois_url = 'https://pt.foursquare.com/explore?mode=url&ne=-29.358988%2C-50.837817&q=Sele%C3%A7%C3%B5es%20principais&sw=-29.41889%2C-50.887942'
        city_url = 'http://www.dataviva.info/pt/location/5rs020102'
        extractor = Extraction(pois_url, city_url)
        extractor.poi_data_extraction()
        extractor.city_data_extraction()

    # Generate the report for the raw dataset
    raw_csv = 'foursquare_data.csv'
    raw_frame = pd.read_csv(raw_csv, parse_dates=True, encoding='UTF-8')
    raw_report = pandas_profiling.ProfileReport(raw_frame)
    raw_report.to_file(outputfile='dataset_report.html')

    # Preprocess; the result is what the recommender consumes below.
    preprocessor = Process(raw_csv)
    processed = preprocessor.method()

    # Profile the preprocessed CSV that is read back from disk.
    preprocessed_frame = pd.read_csv(
        'preprocessed.csv', parse_dates=True, encoding='UTF-8'
    )
    preprocessed_report = pandas_profiling.ProfileReport(preprocessed_frame)
    preprocessed_report.to_file(outputfile='preprocessed_dataset_report.html')

    recommender = Recommendation(processed)
    recommender.pattern_recommendation()
    recommender.new_recommendation()
    recommender.compare()
    # recommender.test_rec()

    ontology = Ontology()
    ontology.write_owl()
def get_RawProfile(df):
    """Profile ``df``, save the report as HTML, open it in Chrome, return it.

    The report is written to ``myoutputfile.html`` (relative to the current
    working directory) and opened through the browser command
    ``/usr/bin/google-chrome``.

    :param df: data passed to ``prof.ProfileReport``.
    :return: the ``ProfileReport`` that was written to disk.
    """
    profile = prof.ProfileReport(df)
    profile.to_file(outputfile="myoutputfile.html")

    report_path = os.path.join(os.getcwd(), "myoutputfile.html")
    chrome_path = '/usr/bin/google-chrome %s'
    webbrowser.get(chrome_path).open(report_path)

    # Return the report built above instead of profiling the whole frame a
    # second time just to produce the return value.
    return profile
def feature_selection(subjects):
    """Profile every subject and drop its rejected variables in place.

    For each value of ``subjects`` a profiling report is written to
    ``{DATA_REPORTS}/subject<N>.html`` (N counts from 1), then the variables
    the report rejects at threshold 0.9 are dropped from that subject.
    """
    for number, frame in enumerate(subjects.values(), start=1):
        subject_report = pandas_profiling.ProfileReport(frame)
        subject_report.to_file(outputfile=f"{DATA_REPORTS}/subject{number}.html")
        to_drop = subject_report.get_rejected_variables(threshold=0.9)
        frame.drop(to_drop, axis=1, inplace=True)
    print("4.Features was selected\n")
def generate_report(in_data, is_csv=False):
    """Build a profiling report from a dataframe or from a csv source.

    When ``is_csv`` is true, ``in_data`` is first passed through
    ``import_csv``; otherwise it is profiled as given. The returned report
    has its title set to 'Your data profile'.
    """
    frame = import_csv(in_data) if is_csv else in_data
    profile = pp.ProfileReport(frame)
    profile.title = 'Your data profile'
    return profile
def profile(data_url):
    """Load the CSV at ``data_url``, profile it and export the converted data.

    The frame returned by ``getCSV`` is passed through ``convert_date``, a
    profiling report is written to the module-level ``html`` target, and the
    converted frame is handed to ``export_csv``.
    """
    frame = convert_date(getCSV(data_url))
    report = pandas_profiling.ProfileReport(frame)
    report.to_file(outputfile=html)
    export_csv(frame)
def eda(data):
    """Run a quick exploratory pass over ``data`` and plot its sensor columns.

    Expects the columns AccX/AccY/AccZ and GyroX/GyroY/GyroZ to be present,
    since they are indexed directly for plotting.
    """
    # Notebook-style inspection calls; their results are discarded here
    # (``info()`` still prints its summary).
    data.head()
    data.shape
    data.size
    data.sample()
    data.tail()
    data.info()
    data.describe()
    pp.ProfileReport(data)

    # One entry per sensor: (column, line colour) triples in plotting order.
    sensor_groups = (
        (("AccX", 'red'), ("AccY", 'blue'), ("AccZ", 'purple')),
        (("GyroX", 'red'), ("GyroY", 'blue'), ("GyroZ", 'green')),
    )
    for axes_of_sensor in sensor_groups:
        plt.rcParams["figure.figsize"] = (15, 5)
        plt.title('Sensor Verileri')
        for column, colour in axes_of_sensor:
            plt.plot(data[column], label=column, color=colour)
        plt.legend()

    sns.pairplot(data)
def test_issue397():
    """Regression test: infinite values are counted per numeric variable.

    Builds a small frame mixing +/-inf, NaN, integers and strings, profiles
    it with ``low_categorical_threshold`` set to 0, and checks the
    ``n_infinite`` / ``p_infinite`` statistics of each variable.
    """
    # Note: warnings are expected with np.inf values
    df = pd.DataFrame.from_dict(
        {
            # -np.inf instead of np.NINF: the NINF alias was removed in
            # NumPy 2.0, while -np.inf works on every NumPy version.
            "float-inf": pd.Series([np.inf, 3.0, 4.0, -np.inf], dtype="float"),
            "integer": pd.Series([3, 4, 5, 6], dtype="int"),
            "float": pd.Series([3.0, 4.0, np.nan, 6], dtype="float"),
            "integer-inf": pd.Series([3, np.inf, 5, 7]),
            "cat": ["Foo", "Bar", "Great", "Var"],
        }
    )

    report = pandas_profiling.ProfileReport(
        df, vars={"num": {"low_categorical_threshold": 0}}
    )
    assert report.config.vars.num.low_categorical_threshold == 0

    description = report.description_set
    assert description["table"]["types"] == {"Categorical": 1, "Numeric": 4}

    assert description["variables"]["float-inf"]["p_infinite"] == 0.5
    assert description["variables"]["float-inf"]["n_infinite"] == 2

    assert description["variables"]["integer-inf"]["p_infinite"] == 0.25
    assert description["variables"]["integer-inf"]["n_infinite"] == 1

    assert description["variables"]["integer"]["p_infinite"] == 0
    assert description["variables"]["integer"]["n_infinite"] == 0

    assert description["variables"]["float"]["p_infinite"] == 0
    assert description["variables"]["float"]["n_infinite"] == 0

    # Non-numeric variables carry no infinity statistics at all.
    assert "p_infinite" not in description["variables"]["cat"]
    assert "n_infinite" not in description["variables"]["cat"]
def output_profiling_report(self):
    """Profile the CSV at ``self._csv_path`` and write the report as HTML.

    The output file is ``house_prices_outputfile.html`` appended to whatever
    ``house_prices.output_dir_path()`` returns; the target is logged at
    debug level before writing.
    """
    frame = pd.read_csv(self._csv_path, parse_dates=True, encoding='UTF-8')
    report = pdp.ProfileReport(frame)

    target = house_prices.output_dir_path() + 'house_prices_outputfile.html'
    logger.debug(f'output_profiling_report : {target}')
    report.to_file(outputfile=target)
def test_export_to_file(self):
    """Exporting a report to HTML produces a file larger than 200 bytes."""
    report = pandas_profiling.ProfileReport(self.df)
    # hash(self) is used to build the file name for this test instance.
    html_path = os.path.join(self.test_dir, f"profile_{hash(self)}.html")
    report.to_file(outputfile=html_path)
    self.assertLess(200, os.path.getsize(html_path))
def run(self):
    """
    Analyses the VOTable file containing the GACS-dev query results.

    Reads the VOTable at ``self.input().path``, keeps a fixed subset of
    columns, profiles that subset and dumps a JSON context (an HTML
    ``describe()`` table plus the profiling HTML) to ``self.output().path``.
    """
    logger.info('Input VOTable file: %s', self.input().path)
    t = Table.read(self.input().path, format='votable')
    # Masked entries are filled so the table can back a plain DataFrame.
    df = pd.DataFrame(np.ma.filled(t.as_array()), columns=t.colnames)

    gaiamagcols = [
        'dec',
        'dec_error',
        'dist',
        'phot_g_mean_flux',
        'phot_g_mean_mag',
        'ra',
        'source_id',
    ]
    gaiadf = df[gaiamagcols]

    profile = pandas_profiling.ProfileReport(gaiadf)

    # JSON will be the context used for the template
    analysis_context = {
        'gacs_dfdescription': gaiadf.describe().to_html(
            classes='table table-striped table-bordered table-hover'),
        'pandas_profiling': profile.html,
    }

    # json.dump emits str, so the target must be opened in text mode; with
    # 'wb' the write fails with TypeError on Python 3.
    with open(self.output().path, 'w') as out:
        json.dump(analysis_context, out)
def show_dataframe(self, minimal=True):
    """Render a table picker, a DataFrame preview and its profile report.

    The table names are read from ``sqlite_master`` on ``self.connection``.
    The left column shows ``DataFrame.info()`` output and a HiPlot view of
    the first ``self.limit_rows`` rows; the right column embeds the
    pandas-profiling HTML computed on the full frame.

    :param minimal: forwarded to ``ProfileReport(minimal=...)``.
    """
    with st.beta_container():
        # Options
        cursor = self.connection.cursor()
        rows = cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table';").fetchall()
        table_names = [row[0] for row in rows]
        table = st.selectbox(self.text, table_names, index=3)

        st.info(f"Note: due to limited output size, the displayed DataFrame is limited to the first "
                f"{self.limit_rows} rows only.\n\nHowever, the Pandas Profiling Report "
                f"calculates on the full DataFrame.")

        left, right = st.beta_columns(2)

        with left:
            frame = _load_df(table, self.connection)
            self.show_df = frame.head(self.limit_rows)  # Only shows limited rows
            self.profile_df = frame

            # Show DataFrame's info
            info_text = io.StringIO()
            frame.info(buf=info_text)
            st.text(info_text.getvalue())

            # Show HiPlot
            experiment = hip.Experiment.from_dataframe(self.show_df)
            experiment.display_st(key="hip")

        with right:
            # Show Pandas Profile Report
            self.profile_report = pp.ProfileReport(
                self.profile_df, minimal=minimal, progress_bar=False)
            with st.spinner("Generating profile report..."):
                report_html = self.profile_report.to_html()
                components.html(report_html, height=1500, scrolling=True)
def test_issue437():
    """Regression test: columns holding inf / None / NaN / pd.NA stay numeric.

    The test is a no-op on pandas versions that do not provide ``pd.NA``.
    """
    # pd.NA does not exist in some pandas versions. Probe for it explicitly
    # rather than with a bare ``except``, which would also hide unrelated
    # errors (including KeyboardInterrupt).
    if not hasattr(pd, "NA"):
        return

    tmp_list = [
        0.15416284237967237,
        0.7400496965154048,
        0.26331501518513467,
        0.5337393933802977,
        0.014574962485419674,
        0.918747008099885,
        0.9007148541170122,
        0.03342142762634459,
        0.9569493362751168,
        0.13720932135607644,
    ]

    # If exist, we should handle it properly
    df = pd.DataFrame(
        {
            "a": tmp_list + [np.inf, -np.inf],
            "b": tmp_list + [None, np.nan],
            "c": tmp_list + [0, pd.NA],
        }
    )

    report = pandas_profiling.ProfileReport(df)
    description_set = report.description_set

    assert description_set["variables"]["a"]["type"] == Variable.TYPE_NUM
    assert description_set["variables"]["b"]["type"] == Variable.TYPE_NUM
    assert description_set["variables"]["c"]["type"] == Variable.TYPE_NUM
def column_search(Filter=''):
    """Filter the feature table and, for a single match, display a profile.

    The filtered frame is always returned. When exactly one row matches, the
    training set for that feature is fetched and profiled: with pandas when
    ``pandas_profile`` is truthy and the measured size is below 5e8,
    otherwise with Spark.
    """
    print(
        'Filter on Feature name, tags, attributes, or feature set name. Search multiple values '
        'with "&" and "|" Enter a single Feature name for a detailed report. '
    )
    matches = __filter_df(pdf, Filter)
    if len(matches) != 1:
        return matches

    print("Generating Report...")
    feature_name = matches['name'].values[0]
    print(feature_name)
    data = fs.get_training_set([feature_name], current_values_only=True).cache()
    print('Gathering data')
    df_size = spark_df_size(data)
    print('Profiling Data')

    fits_in_pandas = pandas_profile and not df_size >= 5e8
    if fits_in_pandas:
        display(
            pandas_profiling.ProfileReport(data.toPandas(), explorative=True))
    else:
        if pandas_profile:
            # It's too big for pandas
            print("Dataset is too large. Profiling with Spark instead")
        display(
            spark_df_profiling.ProfileReport(data.cache(), explorative=True))
    return matches
def create_report(self, dataset):
    """
    Create a profile report with the open-source tool ``pandas_profiling``.

    **Args**

    =========== ======================================================= ==========================================
    Parameter   Data Type                                               Description
    =========== ======================================================= ==========================================
    dataset     object exposing ``x`` and ``y`` (pandas objects)        ``x`` and ``y`` are concatenated
                                                                        column-wise and profiled together
    =========== ======================================================= ==========================================

    **Returns**

    ================ =================================== ========================
    Parameter        Data Type                           Description
    ================ =================================== ========================
    profile_report   ``pandas_profiling.ProfileReport``  Reporting Result
    ================ =================================== ========================

    The report is also stored on the instance as ``__profile_report``.
    """
    features_and_target = pd.concat([dataset.x, dataset.y], axis=1)
    self.__profile_report = pp.ProfileReport(features_and_target)
    return self.__profile_report
def generate_report(dataframe, name="PumpItUp-EDA"):
    """Write an explorative pandas-profiling report to ``../<name>.html``."""
    import pandas_profiling as pdp

    report = pdp.ProfileReport(
        dataframe,
        title="Pandas Profiling Report",
        explorative=True,
    )
    destination = f"../{name}.html"
    report.to_file(output_file=destination)
def main():
    """Streamlit page: upload a CSV/Excel file and offer its profile report.

    Once a file is uploaded its first rows are shown, a minimal
    pandas-profiling report is written to ``profile_report.html`` and a
    download link for that file is rendered.
    """
    data_file = st.file_uploader("Upload CSV or Excel File", type=['csv', 'xlsx'])
    if data_file is None:
        return

    try:
        df = pd.read_csv(data_file)
    except Exception:
        # Not parseable as CSV: fall back to Excel. ``Exception`` (not a bare
        # except) so KeyboardInterrupt/SystemExit are not swallowed. The
        # failed CSV attempt may have consumed the buffer, so rewind first.
        data_file.seek(0)
        df = pd.read_excel(data_file, engine='openpyxl')

    st.subheader("Sample Data from File")
    st.dataframe(df.head())

    profile = pp.ProfileReport(df, title="Data Quality Profile Report", minimal=True)
    st.text("Please wait for Report to generate...")
    profile.to_file('profile_report.html')

    st.markdown(
        get_binary_file_downloader_html(
            'profile_report.html', 'Data Quality Profile Report'),
        unsafe_allow_html=True,
    )
def correlation_report(df):
    """
    Write a profiling report and optionally drop the rejected features.

    The report is saved to ``CorrelationReport.html``. The user is then asked
    on stdin whether to remove correlated features; on ``y`` every variable
    the report rejects at threshold 0.9 is dropped.

    Parameters
    ----------
    df: dataframe
        features

    Returns
    -------
    df: dataframe
        the input frame, without the rejected features if removal was chosen
    """
    # TODO use another package
    # Works around a bug in the pandas_profiling package: the matplotlib
    # backend is captured before the import and re-applied after it
    # (presumably the import changes the backend; verify).
    saved_backend = matplotlib.get_backend()
    import pandas_profiling
    matplotlib.use(saved_backend)

    report = pandas_profiling.ProfileReport(df)
    report.to_file(outputfile="CorrelationReport.html")

    answer = input('Do you wish to remove correlated features? Enter y/n: ')
    if answer != 'y':
        return df

    rejected = report.get_rejected_variables(threshold=0.9)
    if not list(rejected):
        print('No features to remove')
    for column in rejected:
        print(f'Removing {column}')
        df = df.drop(column, axis=1)
    return df
def generate_report(self, df=None, file_name="output.html"):
    """Generate the report using pandas profiler.

    :param df: Pandas dataframe to profile; ``self.df`` is used when None.
    :param file_name: Name of the report file inside ``self.report_dir``.
        ``.html`` is appended (with a warning) when the name does not
        already end with it.
    :return: Report file path
    :rtype: String

    .. note::
        Usage of the function

        >>> generate_report(df)
        >>> generate_report(df=<your dataframe>)
    """
    if df is None:
        df = self.df
    profile = pp.ProfileReport(df)

    if not file_name.endswith('.html'):
        # ``warning`` replaces the deprecated ``Logger.warn`` alias.
        self.logger.warning(
            f"The file name {file_name} not ends with .html. Renaming the filename")
        file_name = f"{file_name}.html"

    report_file = os.path.join(self.report_dir, file_name)
    profile.to_file(report_file)
    return report_file
def webbroser():
    """Profile three columns of ``data`` and open the HTML report in a browser.

    The report is written to ``data/report.html`` under the current working
    directory and then opened with the default web browser.
    """
    # 2 - Webbroser
    # 2.1
    df = pd.DataFrame(data, columns=['Nationality', 'Overall', 'Potential'])
    # Build the absolute path once. The previous
    # ``os.getcwd() + "./data/report.html"`` produced "<cwd>./data/report.html",
    # which points at a non-existent location.
    report_path = os.path.join(os.getcwd(), 'data', 'report.html')
    pandas_profiling.ProfileReport(df).to_file(report_path)

    # 2.2
    webbrowser.open(report_path)
def make_pandas_profiling_report(df):
    """Profile ``df`` with the optimal config and save the HTML report.

    The config is read from ``pandas_profiling_dir / "config_optimal.yaml"``
    and the report is written to ``reports_dir / "PandasProfile_train.html"``.
    """
    print("\nPandas profiling report start...")

    # You can choose config between: "config_default." \"config_minimal." \"config_optimal."
    chosen_config = pandas_profiling_dir / "config_optimal.yaml"

    # Make: Pandas Profile report
    train_report = pp.ProfileReport(df, config_file=chosen_config)
    destination = reports_dir / "PandasProfile_train.html"
    train_report.to_file(destination)
def form_reporter_table(filename):
    """Profile ``<cwd>/<filename>.csv`` and write ``./<filename>.html``.

    The loaded table is printed before the report is generated.
    """
    csv_path = os.getcwd() + f'/{filename}.csv'
    table = pd.read_csv(csv_path)
    print(table)

    report = pandas_profiling.ProfileReport(table)
    report.to_file(f'./{filename}.html')
def profiling_output(self, obj_elab):
    """Profile ``obj_elab.dataset`` and render the result as a template.

    The report is titled with ``obj_elab.selected_file``, written to
    ``./templates/profiling.html`` and returned via ``render_template``.
    """
    # profile = data.profile_report(title=flask.session["selected_file"])
    report = pandas_profiling.ProfileReport(obj_elab.dataset)
    print(dir(report))  # lists the report object's attributes on stdout
    report.title = obj_elab.selected_file
    report.to_file(output_file="./templates/profiling.html")
    return render_template("profiling.html")
def profile():
    """Integer-encode the ``colcat`` columns of ``df`` and profile the frame.

    Each column named in ``colcat`` is replaced in place by its
    ``factorize()`` codes; the report is written to
    ``profile/raw_report.html`` (the directory is created if missing).
    """
    os.makedirs("profile/", exist_ok=True)

    for column in colcat:
        codes = df[column].factorize()[0]
        df[column] = codes

    ##### Pandas Profile ###################################
    report = pandas_profiling.ProfileReport(df)
    report.to_file(output_file="profile/raw_report.html")
def statistic_info(FGL, path):
    """
    Write statistical information about the given DataFrame (such as
    correlations between features and missing values) as an HTML report.

    :param FGL: DataFrame
    :param path: directory prefix; the report goes to ``<path>/ana.html``
    :return: nothing (a statistical data profile is created)
    """
    report_path = path + "/ana.html"
    report = ppf.ProfileReport(FGL)
    report.to_file(outputfile=report_path)
def feature_report():
    """Profile the train and test HDF5 datasets named in the config.

    Both frames are read from key ``'data'`` of the configured H5 files, then
    one report per frame is written to the configured report paths. Every
    stage is wrapped in a ``timer`` context.
    """
    settings = config.Config()

    print('Reading train.h5...')
    with timer('Reading train.h5'):
        train_frame = pd.read_hdf(settings.TRAIN_H5_PATH, key='data')

    print('Reading test.h5...')
    with timer('Reading test.h5'):
        test_frame = pd.read_hdf(settings.TEST_H5_PATH, key='data')

    with timer('Train report'):
        report_for_train = pdf.ProfileReport(train_frame)
        report_for_train.to_file(settings.TRAIN_REPORT_PATH)

    with timer('Test report'):
        report_for_test = pdf.ProfileReport(test_frame)
        report_for_test.to_file(settings.TEST_REPORT_PATH)
def profiling(df):
    """Profile ``df``, print the elapsed seconds, save ``profile_target.html``."""
    print('Start Profiling')
    started_at = time.time()
    report = pdp.ProfileReport(df)
    elapsed = time.time() - started_at
    print("Profile", elapsed, "s")
    report.to_file("profile_target.html")
def eda(report=False, read_local=False):
    """
    Main entry point for exploratory data analysis.

    Args:
        report: whether to produce the analysis report. The report takes a
            long time on large datasets, so it is not generated by default.
        read_local: whether to read the data from a local file.

    Produces the following four files:
        1. Data object file: "data/train.pickle"
        2. Variable type configuration file: "config/variable_type.csv"
        3. Sample data: "data/sample.xlsx"
        4. Result report: "result/report.html" (only when ``report`` is true)
    """
    # Load the data and save it locally
    print(">>> 全量数据集加载")
    data = load_data(mode="train", read_local=read_local)

    # Handle columns that share a name: a repeated name gets ".<position>"
    # appended. The duplicate mask is computed once instead of once per
    # column.
    is_duplicate = data.columns.duplicated()
    data.columns = [
        name + f'.{i}' if dup else name
        for i, (name, dup) in enumerate(zip(data.columns, is_duplicate))
    ]
    data.to_pickle('data/train.pickle')

    # Generate the variable-type configuration file
    # Keep the variable for feature engineering (isSave): 0 - drop; 1 - keep
    # Variable type (Type): numeric; category; datetime; prediction (target
    # column); identifier (business identifier); text (string column)
    print(">>> 生成配置表config/variable_type.csv,请完善配置")
    n_columns = len(data.columns)
    res = pd.DataFrame(
        data={
            'Variable': data.columns,
            'isSave:[0/1]': [1] * n_columns,
            'Type:[identifier/numeric/category/datetime/text/prediction]':
            ['numeric'] * n_columns,
            'Default': [''] * n_columns,
            'Comment': [''] * n_columns
        })
    res.to_csv("config/variable_type.csv", index=False, encoding="utf_8_sig")

    # Exploration on a sample
    print(">>> 生成探索性分析抽样data/sample.xlsx")
    sample = data.sample(min(len(data), 500))
    sample.reset_index(drop=True, inplace=True)
    with pd.ExcelWriter('data/sample.xlsx') as writer:
        sample.to_excel(writer, sheet_name="抽样数据500条", index=False)
        sample.corr().to_excel(writer, sheet_name="相关系数", index=False)
        sample.describe().to_excel(writer, sheet_name="数值数据汇总")
        try:
            sample.select_dtypes('object').describe().to_excel(
                writer, sheet_name="分类数据汇总")
        except (ValueError, TypeError):
            # Best effort: the categorical summary sheet is skipped when
            # describe() cannot summarise the object columns (for example,
            # when there are none). Unrelated errors are no longer hidden.
            pass

    # Save the analysis report
    if report:
        assert not sample.empty, "采样集为空,请确认配置文件"
        print(">>> 生成数据分析报告result/report.html")
        # Separate name so the boolean ``report`` argument is not rebound.
        profile = pp.ProfileReport(sample)
        profile.to_file('result/report.html')
def create_analysis(file_name):
    """Profile ``uploads/<file_name>`` and write the report into ``outputs/``.

    The report file is named after the part of ``file_name`` before its
    first dot, with ``.html`` appended.

    :param file_name: name of the CSV file inside the ``uploads`` directory.
    """
    import pandas as pd
    import pandas_profiling as pp

    df = pd.read_csv("uploads/" + file_name)
    report = pp.ProfileReport(df)

    # Write through a joined path instead of chdir-ing into outputs/ and
    # back: the working directory is process-wide state and was left changed
    # whenever to_file raised.
    html_name = file_name.split(".")[0] + ".html"
    report.to_file(os.path.join("outputs", html_name))