Ejemplo n.º 1
0
def generate_profiling_reports(version=None):
    """
    Build pandas-profiling reports for the training and test sets and save them as HTML.

    Both reports are written into the directory returned by
    ``get_reports_version_path(version)`` as ``profiling_report_train.html`` and
    ``profiling_report_test.html``.

    :param version: Version of the data to profile. A falsy value is logged as
        "last version" and is passed unchanged to the path and loader helpers.
    """
    log = logging.getLogger(__name__)
    log.info("Requested profiling report generation")
    # Project-local helpers are imported at call time.
    from src.common_paths import get_reports_version_path
    from src.file_loaders import load_numerai_data

    if version:
        log.info("Using data version: {0}".format(version))
    else:
        log.info("Using last version of the data")

    output_dir = get_reports_version_path(version)

    log.info("Loading data...")
    train_frame, test_frame = load_numerai_data(version)
    log.info("Data loaded successfully!")

    # Both reports are built before either one is written to disk.
    log.info("Generating pandas profiling report for training set...")
    train_report = pandas_profiling.ProfileReport(train_frame)
    log.info("Generating pandas profiling report for test set...")
    test_report = pandas_profiling.ProfileReport(test_frame)
    log.info("Reports generated successfully. Storing them.")

    outputs = (
        (train_report, "profiling_report_train.html"),
        (test_report, "profiling_report_test.html"),
    )
    for built_report, html_name in outputs:
        built_report.to_file(os.path.join(output_dir, html_name))
    log.info("Reports stored in {0}".format(output_dir))
Ejemplo n.º 2
0
    def info_report(self, directory: Union[Path, str, None] = None):
        """Generate a pandas-profiling report for the wrapped DataFrame.

        To install with conda:
           conda install -n <env_name> pandas-profiling

        Args:
            directory (Union[Path, str, None], optional): the directory to
                create the report in (created if it does not exist). Defaults
                to None, in which case nothing is written and the report
                object is returned so a Jupyter Notebook can render it.

        Returns:
            The ``ProfileReport`` when ``directory`` is None. None when the
            report was written to ``pandas_profiler_result.html`` inside
            ``directory``, or when pandas_profiling is not installed.
        """
        # Only the import is guarded, so an ImportError raised while building
        # or writing the report is not misreported as a missing package.
        try:
            import pandas_profiling as pp
        except ImportError:
            print(
                "pandas_profiling is not installed. Please install through conda or pip."
            )
            return None

        report = pp.ProfileReport(self.__dataframe)
        if directory is None:
            # A bare expression inside a function is discarded; returning the
            # report lets the notebook display it as the cell's result.
            return report

        # Write profile report to html file (from script)
        target_dir = Path(directory)
        target_dir.mkdir(parents=True, exist_ok=True)
        # Passed positionally because the keyword name differs between
        # pandas-profiling releases (outputfile vs output_file).
        report.to_file(target_dir.joinpath("pandas_profiler_result.html"))
        return None
Ejemplo n.º 3
0
def main():
    """Run the whole pipeline: optional collection, reports, recommendation, ontology.

    When the last command-line argument is ``--collect``, POI and city data are
    extracted first. Afterwards a profiling report is written for the raw CSV,
    the data is processed, a second report is written for ``preprocessed.csv``,
    the recommendation steps run on the processed frame, and the ontology is
    written out.
    """

    def _write_report(csv_path, html_path):
        # Load a CSV and store its pandas-profiling report as HTML.
        frame = pd.read_csv(csv_path, parse_dates=True, encoding='UTF-8')
        pandas_profiling.ProfileReport(frame).to_file(outputfile=html_path)

    collect_requested = sys.argv[-1] == '--collect'
    if collect_requested:
        extractor = Extraction(
            'https://pt.foursquare.com/explore?mode=url&ne=-29.358988%2C-50.837817&q=Sele%C3%A7%C3%B5es%20principais&sw=-29.41889%2C-50.887942',
            'http://www.dataviva.info/pt/location/5rs020102',
        )
        extractor.poi_data_extraction()
        extractor.city_data_extraction()

    # Generate the report for the raw dataset
    raw_csv = 'foursquare_data.csv'
    _write_report(raw_csv, 'dataset_report.html')

    processed = Process(raw_csv).method()

    # Generate the report for the preprocessed dataset
    _write_report('preprocessed.csv', 'preprocessed_dataset_report.html')

    recommender = Recommendation(processed)
    recommender.pattern_recommendation()
    recommender.new_recommendation()
    recommender.compare()

    Ontology().write_owl()
Ejemplo n.º 4
0
def get_RawProfile(df):
    """Profile ``df``, save the report as HTML, open it in Chrome and return it.

    The report is written to ``myoutputfile.html`` in the current working
    directory and opened with the browser command ``/usr/bin/google-chrome``.

    Args:
        df: data handed to ``prof.ProfileReport``.

    Returns:
        The ``ProfileReport`` built for ``df``. It is the same object that was
        written to disk, so the profile is computed only once.
    """
    profile = prof.ProfileReport(df)
    report_path = os.path.join(os.getcwd(), "myoutputfile.html")
    # Passed positionally because the keyword name differs between
    # pandas-profiling releases (outputfile vs output_file).
    profile.to_file(report_path)
    chrome_path = '/usr/bin/google-chrome %s'
    webbrowser.get(chrome_path).open(report_path)
    return profile
Ejemplo n.º 5
0
def feature_selection(subjects):
    """Profile every subject, save its report and drop its rejected variables.

    For each value of ``subjects`` (visited in mapping order) a report is
    written to ``{DATA_REPORTS}/subject<k>.html`` with ``k`` counted from 1.
    The variables rejected at threshold 0.9 are then dropped from that
    subject in place.

    Args:
        subjects: mapping whose values are the frames to profile.
    """
    for position, frame in enumerate(subjects.values()):
        profile = pandas_profiling.ProfileReport(frame)
        target = f"{DATA_REPORTS}/subject{position + 1}.html"
        profile.to_file(outputfile=target)
        to_drop = profile.get_rejected_variables(threshold=0.9)
        frame.drop(to_drop, axis=1, inplace=True)
    print("4.Features was selected\n")
Ejemplo n.º 6
0
def generate_report(in_data, is_csv=False):
    """Build a profile report titled 'Your data profile'.

    Args:
        in_data: the data to profile. When ``is_csv`` is true it is first
            passed through ``import_csv`` and the result is profiled.
        is_csv: whether ``in_data`` must be loaded with ``import_csv``.

    Returns:
        The ``ProfileReport`` with its ``title`` attribute set.
    """
    frame = import_csv(in_data) if is_csv else in_data
    profile = pp.ProfileReport(frame)
    profile.title = 'Your data profile'
    return profile
Ejemplo n.º 7
0
def profile(data_url):
    """Fetch the CSV at ``data_url``, profile it and export the converted data.

    The frame returned by ``getCSV`` is passed through ``convert_date``, its
    profiling report is written to the module-level ``html`` target, and the
    converted frame is handed to ``export_csv``.
    """
    frame = convert_date(getCSV(data_url))
    report = pandas_profiling.ProfileReport(frame)
    report.to_file(outputfile=html)

    export_csv(frame)
def eda(data):
    """Run a quick exploratory pass over sensor data.

    Calls the usual DataFrame inspection helpers and builds a profile report
    (all return values are discarded). It then plots the accelerometer columns
    and the gyroscope columns and finishes with a seaborn pair plot.

    Args:
        data: frame with the columns ``AccX``, ``AccY``, ``AccZ``, ``GyroX``,
            ``GyroY`` and ``GyroZ``.
    """
    data.head()
    data.shape
    data.size
    data.sample()
    data.tail()
    data.info()
    data.describe()
    pp.ProfileReport(data)

    # One (column, colour) series list per sensor; each list is drawn with the
    # same figure size, title and legend calls.
    sensor_groups = (
        (('AccX', 'red'), ('AccY', 'blue'), ('AccZ', 'purple')),
        (('GyroX', 'red'), ('GyroY', 'blue'), ('GyroZ', 'green')),
    )
    for series_specs in sensor_groups:
        plt.rcParams["figure.figsize"] = (15, 5)
        plt.title('Sensor Verileri')
        for column, colour in series_specs:
            plt.plot(data[column], label=column, color=colour)
        plt.legend()

    sns.pairplot(data)
Ejemplo n.º 9
0
def test_issue397():
    """Infinite values are counted per numeric column and absent for categoricals."""
    # Note: warnings are expected with np.inf values
    df = pd.DataFrame.from_dict(
        {
            # -np.inf instead of the np.NINF alias, which NumPy 2.0 removed.
            "float-inf": pd.Series([np.inf, 3.0, 4.0, -np.inf], dtype="float"),
            "integer": pd.Series([3, 4, 5, 6], dtype="int"),
            "float": pd.Series([3.0, 4.0, np.nan, 6], dtype="float"),
            "integer-inf": pd.Series([3, np.inf, 5, 7]),
            "cat": ["Foo", "Bar", "Great", "Var"],
        }
    )

    report = pandas_profiling.ProfileReport(
        df, vars={"num": {"low_categorical_threshold": 0}}
    )
    assert report.config.vars.num.low_categorical_threshold == 0

    description = report.description_set

    assert description["table"]["types"] == {"Categorical": 1, "Numeric": 4}

    assert description["variables"]["float-inf"]["p_infinite"] == 0.5
    assert description["variables"]["float-inf"]["n_infinite"] == 2

    assert description["variables"]["integer-inf"]["p_infinite"] == 0.25
    assert description["variables"]["integer-inf"]["n_infinite"] == 1

    assert description["variables"]["integer"]["p_infinite"] == 0
    assert description["variables"]["integer"]["n_infinite"] == 0

    assert description["variables"]["float"]["p_infinite"] == 0
    assert description["variables"]["float"]["n_infinite"] == 0

    assert "p_infinite" not in description["variables"]["cat"]
    assert "n_infinite" not in description["variables"]["cat"]
 def output_profiling_report(self):
     """Profile the CSV at ``self._csv_path`` and write the HTML report.

     The target is ``house_prices.output_dir_path()`` with
     ``house_prices_outputfile.html`` appended by plain string concatenation.
     The resulting path is logged at debug level before the file is written.
     """
     frame = pd.read_csv(self._csv_path, parse_dates=True, encoding='UTF-8')
     report = pdp.ProfileReport(frame)
     output_dir = house_prices.output_dir_path()
     target = output_dir + 'house_prices_outputfile.html'
     logger.debug('output_profiling_report : ' + target)
     report.to_file(outputfile=target)
Ejemplo n.º 11
0
    def test_export_to_file(self):
        """Exporting a profile to HTML produces a file larger than 200 bytes."""
        target = os.path.join(self.test_dir, "profile_%s.html" % hash(self))
        report = pandas_profiling.ProfileReport(self.df)
        report.to_file(outputfile=target)

        written_bytes = os.path.getsize(target)
        self.assertLess(200, written_bytes)
Ejemplo n.º 12
0
    def run(self):
        """
        Analyses the VOTable file containing the GACS-dev query results

        Reads the input VOTable into a DataFrame, keeps a fixed set of columns,
        builds a ``describe()`` HTML table and a pandas-profiling report for
        them, and dumps both as a JSON object to the task's output path.
        """
        logger.info('Input VOTable file: %s', self.input().path)
        t = Table.read(self.input().path, format='votable')
        df = pd.DataFrame(np.ma.filled(t.as_array()), columns=t.colnames)

        gaiamagcols = [
            'dec', 'dec_error', 'dist', 'phot_g_mean_flux', 'phot_g_mean_mag',
            'ra', 'source_id'
        ]
        gaiadf = df[gaiamagcols]

        profile = pandas_profiling.ProfileReport(gaiadf)

        analysis_context = {
            'gacs_dfdescription':
            gaiadf.describe().to_html(
                classes='table table-striped table-bordered table-hover'),
            'pandas_profiling':
            profile.html
        }

        # JSON will be the context used for the template.
        # json.dump writes text, so the file is opened in text mode; writing
        # str to a file opened in binary mode raises TypeError on Python 3.
        with open(self.output().path, 'w') as out:
            json.dump(analysis_context, out)
Ejemplo n.º 13
0
    def show_dataframe(self, minimal=True):
        """Render a table picker, a preview of the selected table and its profile.

        The left column shows the DataFrame's ``info()`` output and a HiPlot
        view of the first ``self.limit_rows`` rows. The right column embeds a
        pandas-profiling report computed on the full table.

        Args:
            minimal: forwarded to ``pp.ProfileReport`` as ``minimal``.
        """
        with st.beta_container():
            # Options
            cursor = self.connection.cursor()
            rows = cursor.execute(
                "SELECT name FROM sqlite_master WHERE type='table';").fetchall()
            options = [row[0] for row in rows]
            # Preselect the fourth table as before, but clamp the index so a
            # database with fewer tables does not trip selectbox's range check.
            default_index = min(3, max(len(options) - 1, 0))
            table = st.selectbox(self.text, options, index=default_index)
            st.info(f"Note: due to limited output size, the displayed DataFrame is limited to the first "
                    f"{self.limit_rows} rows only.\n\nHowever, the Pandas Profiling Report "
                    f"calculates on the full DataFrame.")

            col1, col2 = st.beta_columns(2)
            with col1:
                df = _load_df(table, self.connection)
                self.show_df = df.head(self.limit_rows)  # Only shows limited rows
                self.profile_df = df

                # Show DataFrame's info
                buffer = io.StringIO()
                df.info(buf=buffer)
                st.text(buffer.getvalue())

                # Show HiPlot
                xp = hip.Experiment.from_dataframe(self.show_df)
                xp.display_st(key="hip")

            with col2:
                # Show Pandas Profile Report
                self.profile_report = pp.ProfileReport(self.profile_df, minimal=minimal, progress_bar=False)
                with st.spinner("Generating profile report..."):
                    components.html(self.profile_report.to_html(), height=1500, scrolling=True)
Ejemplo n.º 14
0
def test_issue437():
    """Columns containing inf, None/NaN or pd.NA are still typed as numeric."""
    # pd.NA does not exist in some pandas versions; there is nothing to check
    # there. hasattr replaces a bare ``except:`` used as a feature probe.
    if not hasattr(pd, "NA"):
        return

    tmp_list = [
        0.15416284237967237,
        0.7400496965154048,
        0.26331501518513467,
        0.5337393933802977,
        0.014574962485419674,
        0.918747008099885,
        0.9007148541170122,
        0.03342142762634459,
        0.9569493362751168,
        0.13720932135607644,
    ]
    # If it exists, we should handle it properly
    df = pd.DataFrame(
        {
            "a": tmp_list + [np.inf, -np.inf],
            "b": tmp_list + [None, np.nan],
            "c": tmp_list + [0, pd.NA],
        }
    )

    report = pandas_profiling.ProfileReport(df)
    description_set = report.description_set

    assert description_set["variables"]["a"]["type"] == Variable.TYPE_NUM
    assert description_set["variables"]["b"]["type"] == Variable.TYPE_NUM
    assert description_set["variables"]["c"]["type"] == Variable.TYPE_NUM
Ejemplo n.º 15
0
    def column_search(Filter=''):
        """Filter the feature table and profile the feature if exactly one matches.

        Prints a usage hint, filters ``pdf`` with ``Filter`` and returns the
        filtered frame. When exactly one row matches, its training set is
        fetched and a profile report is displayed. pandas-profiling is used
        when ``pandas_profile`` is truthy and the measured size is not
        ``>= 5e8``; otherwise the Spark profiler is used.

        Args:
            Filter: filter expression handed to the filtering helper.

        Returns:
            The filtered frame.
        """
        print(
            'Filter on Feature name, tags, attributes, or feature set name. Search multiple values '
            'with "&" and "|" Enter a single Feature name for a detailed report. '
        )

        matches = __filter_df(pdf, Filter)
        if len(matches) != 1:
            return matches

        print("Generating Report...")
        feature_name = matches['name'].values[0]
        print(feature_name)
        training_set = fs.get_training_set([feature_name],
                                           current_values_only=True).cache()
        print('Gathering data')
        size = spark_df_size(training_set)
        print('Profiling Data')
        if pandas_profile and not (size >= 5e8):
            report = pandas_profiling.ProfileReport(training_set.toPandas(),
                                                    explorative=True)
        else:
            if pandas_profile:  # It's too big for pandas
                print("Dataset is too large. Profiling with Spark instead")
            report = spark_df_profiling.ProfileReport(training_set.cache(),
                                                      explorative=True)
        display(report)
        return matches
Ejemplo n.º 16
0
    def create_report(self, dataset):
        """
        Build a profile report with the open-source tool ``pandas_profiling``.

        ``dataset.x`` and ``dataset.y`` are concatenated column-wise and the
        combined frame is profiled. The report is also stored on the instance.

        **Args**
            =============== ============================================ ====================================
            Parameter       Data Type                                    Description
            =============== ============================================ ====================================
            dataset         object exposing ``x`` and ``y``              Features (``x``) and target (``y``)
            =============== ============================================ ====================================

        **Returns**
            ================ =================================== ========================
            Parameter        Data Type                           Description
            ================ =================================== ========================
            profile_report   ``pandas_profiling.ProfileReport``  Reporting Result
            ================ =================================== ========================

        """
        combined = pd.concat([dataset.x, dataset.y], axis=1)
        report = pp.ProfileReport(combined)
        self.__profile_report = report

        return self.__profile_report
Ejemplo n.º 17
0
def generate_report(dataframe, name="PumpItUp-EDA"):
    """Profile ``dataframe`` with pandas-profiling and write ``../<name>.html``.

    Args:
        dataframe: data to profile (explorative mode).
        name: base name of the HTML file created in the parent directory.
    """
    import pandas_profiling as pdp

    report = pdp.ProfileReport(
        dataframe,
        title="Pandas Profiling Report",
        explorative=True,
    )
    target = f"../{name}.html"
    report.to_file(output_file=target)
Ejemplo n.º 18
0
def main():
    """Streamlit page: upload a CSV/Excel file, preview it and offer its profile.

    The upload is parsed as CSV first and as an Excel workbook (openpyxl) if
    that fails. The first rows are shown, a minimal profile report is written
    to ``profile_report.html`` and a download link for that file is rendered.
    Nothing happens until a file has been uploaded.
    """
    data_file = st.file_uploader("Upload CSV or Excel File",
                                 type=['csv', 'xlsx'])
    if data_file is None:
        return

    try:
        df = pd.read_csv(data_file)
    except Exception:
        # Not parseable as CSV: treat the upload as an Excel workbook. Rewind
        # first, since the failed CSV attempt may have consumed part of the
        # stream. ``Exception`` (not a bare except) keeps the fallback while
        # letting KeyboardInterrupt/SystemExit propagate.
        data_file.seek(0)
        df = pd.read_excel(data_file, engine='openpyxl')

    st.subheader("Sample Data from File")
    st.dataframe(df.head())
    profile = pp.ProfileReport(df,
                               title="Data Quality Profile Report",
                               minimal=True)
    st.text("Please wait for Report to generate...")
    profile.to_file('profile_report.html')
    st.markdown(get_binary_file_downloader_html(
        'profile_report.html', 'Data Quality Profile Report'),
                unsafe_allow_html=True)
Ejemplo n.º 19
0
def correlation_report(df):
    """ Writes a correlation report and optionally removes highly correlated features.

    The report is saved as ``CorrelationReport.html``. The user is then asked
    on stdin whether the variables rejected at threshold 0.9 should be dropped.
    Only the exact answer ``y`` triggers the removal.

    Parameters
    ----------
    df: dataframe
      features
    Returns
    -------
    df: feature dataframe, without the rejected features if removal was confirmed
    """
    # TODO use another package
    # Workaround for a pandas_profiling bug: remember the matplotlib backend
    # active before the import and re-apply it afterwards.
    previous_backend = matplotlib.get_backend()
    import pandas_profiling
    matplotlib.use(previous_backend)

    report = pandas_profiling.ProfileReport(df)
    report.to_file(outputfile="CorrelationReport.html")

    answer = str(input('Do you wish to remove correlated features? Enter y/n: '))
    if answer != 'y':
        return df

    rejected = report.get_rejected_variables(threshold=0.9)
    if not list(rejected):
        print('No features to remove')
    for column in rejected:
        print('Removing ' + str(column))
        df = df.drop(column, axis=1)
    return df
Ejemplo n.º 20
0
    def generate_report(self, df=None, file_name="output.html"):
        # type: (pd.DataFrame, str) -> str
        """Generating the report using pandas profiler

        :param df: Pandas dataframe; defaults to ``self.df`` when None
        :param file_name: Name of the report file inside ``self.report_dir``;
            ``.html`` is appended (with a warning) when it is missing

        :return: Report file path
        :rtype: String

        .. note::
            Usage of the function
            
            >>> generate_report(df)
            >>> generate_report(df=<your dataframe>)

        """
        if df is None:
            df = self.df
        profile = pp.ProfileReport(df)

        if not file_name.endswith('.html'):
            # Logger.warn is a deprecated alias of Logger.warning.
            self.logger.warning(f"The file name {file_name} not ends with .html. Renaming the filename")
            file_name = f"{file_name}.html"

        report_file = os.path.join(self.report_dir, file_name)
        profile.to_file(report_file)
        return report_file
Ejemplo n.º 21
0
def webbroser():
    """Write a profile of three columns of ``data`` and open it in the browser.

    The report goes to ``data/report.html`` under the current working
    directory. The same absolute path is handed to ``webbrowser.open``.
    """
    # 2 - Webbroser
    # 2.1
    df = pd.DataFrame(data, columns=['Nationality', 'Overall', 'Potential'])
    # Built with os.path.join: concatenating os.getcwd() with "./data/..."
    # produced "<cwd>./data/report.html", which is not the file just written.
    report_path = os.path.join(os.getcwd(), 'data', 'report.html')
    pandas_profiling.ProfileReport(df).to_file(report_path)

    # 2.2
    webbrowser.open(report_path)
Ejemplo n.º 22
0
def make_pandas_profiling_report(df):
    """Profile ``df`` with the "optimal" config and save the HTML report.

    The config is read from ``pandas_profiling_dir / "config_optimal.yaml"``
    and the report is written to ``reports_dir / "PandasProfile_train.html"``.
    """
    print("\nPandas profiling report start...")
    # Available configs: "config_default.", "config_minimal.", "config_optimal."
    chosen_config = pandas_profiling_dir / "config_optimal.yaml"

    # Build the Pandas Profile report, then store it
    train_profile = pp.ProfileReport(df, config_file=chosen_config)
    output_path = reports_dir / "PandasProfile_train.html"
    train_profile.to_file(output_path)
Ejemplo n.º 23
0
def form_reporter_table(filename):
    """Profile ``<cwd>/<filename>.csv`` and write ``./<filename>.html``.

    The loaded frame is printed before the report is generated.

    Args:
        filename: base name (without extension) of the CSV in the current
            working directory. It is also used for the HTML report name.
    """
    csv_path = os.getcwd() + '/{}.csv'.format(filename)
    frame = pd.read_csv(csv_path)
    print(frame)
    report = pandas_profiling.ProfileReport(frame)
    report.to_file('./{}.html'.format(filename))
Ejemplo n.º 24
0
 def profiling_output(self, obj_elab):
     """Profile ``obj_elab.dataset`` and render the resulting HTML template.

     The report is titled with ``obj_elab.selected_file`` and written to
     ``./templates/profiling.html``. The ``profiling.html`` template is then
     rendered and returned. The report's attribute names are printed first.
     """
     report = pandas_profiling.ProfileReport(obj_elab.dataset)
     print(dir(report))
     report.title = obj_elab.selected_file
     report.to_file(output_file="./templates/profiling.html")
     return render_template("profiling.html")
Ejemplo n.º 25
0
def profile():
    """Integer-encode the ``colcat`` columns of ``df`` and write its profile.

    Ensures the ``profile/`` directory exists, replaces each column listed in
    ``colcat`` with its ``factorize()`` codes (the module-level ``df`` is
    modified), and stores the report as ``profile/raw_report.html``.
    """
    os.makedirs("profile/", exist_ok=True)
    for column in colcat:
        codes, _uniques = df[column].factorize()
        df[column] = codes

    # Pandas Profile
    report = pandas_profiling.ProfileReport(df)
    report.to_file(output_file="profile/raw_report.html")
Ejemplo n.º 26
0
def statistic_info(FGL, path):
    """
    Write statistical information about the given DataFrame (such as
    correlations between features and missing values) as an HTML report.
    :param FGL: DataFrame
    :param path: directory prefix; the report is written to ``<path>/ana.html``
    :return: None (a statistical data profile is created)
    """
    target = path + "/ana.html"
    report = ppf.ProfileReport(FGL)
    report.to_file(outputfile=target)
Ejemplo n.º 27
0
def feature_report():
    """Load the train and test HDF5 sets and write a profile report for each.

    Paths come from ``config.Config()``. Both frames are read (key ``data``)
    before any report is built, and every step runs inside a ``timer`` block.
    """
    settings = config.Config()

    print('Reading train.h5...')
    with timer('Reading train.h5'):
        train_frame = pd.read_hdf(settings.TRAIN_H5_PATH, key='data')

    print('Reading test.h5...')
    with timer('Reading test.h5'):
        test_frame = pd.read_hdf(settings.TEST_H5_PATH, key='data')

    with timer('Train report'):
        pdf.ProfileReport(train_frame).to_file(settings.TRAIN_REPORT_PATH)

    with timer('Test report'):
        pdf.ProfileReport(test_frame).to_file(settings.TEST_REPORT_PATH)
Ejemplo n.º 28
0
def profiling(df):
    """Profile ``df``, print how long building the report took, and save it.

    The report is written to ``profile_target.html``. The printed duration
    covers only the ``ProfileReport`` construction.
    """
    print('Start Profiling')
    started_at = time.time()
    report = pdp.ProfileReport(df)
    elapsed = time.time() - started_at
    print("Profile", elapsed, "s")
    report.to_file("profile_target.html")
Ejemplo n.º 29
0
def eda(report=False, read_local=False):
    """
    Main entry point for exploratory data analysis.
    Args:
        report: whether to produce the analysis report. It takes a long time on
            large datasets, so it is not generated by default.
        read_local: whether to read the data from local files
    Generates the following four files:
        1. Data object file:            "data/train.pickle"
        2. Variable type config file:   "config/variable_type.csv"
        3. Sample data:                 "data/sample.xlsx"
        4. Result report:               "result/report.html"
    """
    # Load the data and save it locally
    print(">>> 全量数据集加载")
    data = load_data(mode="train", read_local=read_local)
    # Disambiguate duplicated column names by appending their position.
    # The duplicate mask is computed once instead of once per column.
    is_duplicate = data.columns.duplicated()
    data.columns = [
        name + f'.{position}' if is_duplicate[position] else name
        for position, name in enumerate(data.columns)
    ]
    data.to_pickle('data/train.pickle')

    # Generate the variable type configuration file
    # Keep variable for feature engineering (isSave): 0 - drop; 1 - keep
    # Variable type (Type): numeric; category; datetime; prediction (target
    # column); identifier (business identifier); text (string column)
    print(">>> 生成配置表config/variable_type.csv,请完善配置")
    n_columns = len(data.columns)
    res = pd.DataFrame(
        data={
            'Variable':
            data.columns,
            'isSave:[0/1]': [1] * n_columns,
            'Type:[identifier/numeric/category/datetime/text/prediction]':
            ['numeric'] * n_columns,
            'Default': [''] * n_columns,
            'Comment': [''] * n_columns
        })
    res.to_csv("config/variable_type.csv", index=False, encoding="utf_8_sig")

    # Exploration on a sample (at most 500 rows)
    print(">>> 生成探索性分析抽样data/sample.xlsx")
    sample = data.sample(min(len(data), 500))
    sample.reset_index(drop=True, inplace=True)
    with pd.ExcelWriter('data/sample.xlsx') as writer:
        sample.to_excel(writer, sheet_name="抽样数据500条", index=False)
        sample.corr().to_excel(writer, sheet_name="相关系数", index=False)
        sample.describe().to_excel(writer, sheet_name="数值数据汇总")
        try:
            sample.select_dtypes('object').describe().to_excel(
                writer, sheet_name="分类数据汇总")
        except Exception:
            # Best effort: the categorical summary sheet is skipped when it
            # cannot be built (presumably when there are no object columns).
            # Narrowed from a bare except so KeyboardInterrupt and SystemExit
            # are no longer swallowed.
            pass

    # Save the analysis report
    if report:
        assert not sample.empty, "采样集为空,请确认配置文件"
        print(">>> 生成数据分析报告result/report.html")
        # Separate name so the boolean ``report`` parameter is not rebound.
        profile = pp.ProfileReport(sample)
        profile.to_file('result/report.html')
def create_analysis(file_name):
    """Profile ``uploads/<file_name>`` and write the report into ``outputs/``.

    The report is named after the part of ``file_name`` before its first dot,
    with ``.html`` appended.

    Args:
        file_name: name of a CSV file inside ``uploads/``.
    """
    import pandas as pd
    import pandas_profiling as pp

    # NOTE(review): file_name is joined into paths unchecked. If it comes from
    # an upload form, confirm the caller sanitises it.
    df = pd.read_csv("uploads/" + file_name)
    report = pp.ProfileReport(df)
    # Write straight to the target path instead of chdir-ing into outputs/ and
    # back: the working directory is process-wide state, and it was never
    # restored when to_file raised.
    report_name = file_name.split(".")[0] + ".html"
    report.to_file(os.path.join("outputs", report_name))