Ejemplo n.º 1
0
def test_get_categorical_2():
    """Compare the categorical part of ``msleep_ggplot.csv`` with a fixture.

    The test passes only when ``equals`` reports the categorical result and
    the table loaded from ``msleep_get_categorical.csv`` as different.
    """
    fixtures_dir = os.path.join(os.path.dirname(__file__), 'data')
    source_table = Table(os.path.join(fixtures_dir, 'msleep_ggplot.csv'))
    reference_table = Table(
        os.path.join(fixtures_dir, 'msleep_get_categorical.csv'))
    categorical = source_table.get_categorical()
    # NOTE(review): asserting *inequality* against a fixture named after
    # get_categorical looks surprising -- confirm this is the intended check.
    assert categorical.equals(reference_table) == False
Ejemplo n.º 2
0
def test_get_categorical_1():
    """Compare the categorical part of ``Freedman.csv`` with a fixture.

    The test passes only when ``equals`` reports the categorical result and
    the table loaded from ``Freedman_get_categorical.csv`` as different.
    """
    fixtures_dir = os.path.join(os.path.dirname(__file__), 'data')
    source_table = Table(os.path.join(fixtures_dir, 'Freedman.csv'))
    reference_table = Table(
        os.path.join(fixtures_dir, 'Freedman_get_categorical.csv'))
    categorical = source_table.get_categorical()
    # NOTE(review): asserting *inequality* against a fixture named after
    # get_categorical looks surprising -- confirm this is the intended check.
    assert categorical.equals(reference_table) == False
Ejemplo n.º 3
0
def test_table_get_categorical_3():
    """Check the categorical column names reported for ``cereal.csv``."""
    csv_path = os.path.join(os.path.dirname(__file__), 'data', 'cereal.csv')
    expected_columns = ['name', 'mfr', 'type']
    categorical_columns = list(Table(csv_path).get_categorical().columns)
    assert categorical_columns == expected_columns
Ejemplo n.º 4
0
def test_table_get_categorical_2():
    """Check the categorical column names reported for ``msleep_ggplot.csv``."""
    csv_path = os.path.join(
        os.path.dirname(__file__), 'data', 'msleep_ggplot.csv')
    expected_columns = ['name', 'genus', 'vore', 'order', 'conservation']
    categorical_columns = list(Table(csv_path).get_categorical().columns)
    assert categorical_columns == expected_columns
Ejemplo n.º 5
0
def test_table_get_categorical_1():
    """Check the categorical column names reported for ``Freedman.csv``."""
    csv_path = os.path.join(os.path.dirname(__file__), 'data', 'Freedman.csv')
    expected_columns = ['Location']
    categorical_columns = list(Table(csv_path).get_categorical().columns)
    assert categorical_columns == expected_columns
Ejemplo n.º 6
0
def audit(inp, save):
    """Print an audit report for a dataset and optionally save it as HTML.

    Loads ``inp`` through ``Table``, builds a dataset-level summary table and
    a per-column table, and echoes both with ``click.echo``.  Nothing is done
    when ``inp`` is falsy.

    Args:
        inp: Input handed to ``Table``; presumably a path to a data file
            (``FileNotFoundError`` is handled below) -- confirm against
            ``Table``.
        save: Base name of the HTML report.  When truthy, the report is
            rendered from ``templates/index.html`` located next to this
            module and written to ``save + '.html'``.

    Returns:
        None.

    Note:
        The HTML template is rendered with ``**locals()``, so every local
        variable of this function is visible to the template.  Do not rename
        or remove locals without checking ``index.html``.
    """

    if inp:
        start_time = datetime.now()
        description = PrettyTable(['Name', 'Values'])
        description.align['Name'] = "l"

        table = PrettyTable([
            'Features', 'Type', 'Value Type', 'Outliers', 'Missing',
            '(%) Missing', 'Distinct Count', 'Min', 'Mean', 'Max', 'Zeros',
            '(%) Zeros', 'Memory Size'
        ])
        try:
            df_raw = Table(inp)
            df = df_raw.data
            features = df.columns
            table.align["Features"] = "l"
            table.align["Value Type"] = "l"
            table.sortby = "Distinct Count"
            table.reversesort = True

            # Generate dynamic analytical stories
            stories = get_interesting_stories(df_raw)

            # Group identical rows; 'count' is how often each distinct row
            # occurs.
            # NOTE(review): if df is a pandas DataFrame, groupby drops keys
            # containing NaN by default, so rows with missing values are not
            # counted as duplicates -- confirm that is acceptable.
            dups = df.groupby(df.columns.tolist()).size().reset_index().rename(
                columns={0: 'count'})

            n_rows = len(df)
            total_cells = df.shape[0] * df.shape[1]
            missing_cells = df.isnull().sum().sum()
            # Every occurrence beyond the first of a distinct row is a
            # duplicate.
            duplicate_rows = dups['count'].sum() - dups.shape[0]

            description.add_row(['Total variables', df.shape[1]])
            description.add_row(['Total Observations', df.shape[0]])
            description.add_row(['Missing Cells', missing_cells])
            # Ratios are reported as fractions (0..1), relative to the
            # quantity named in the label: cells for missing cells, rows for
            # duplicate rows.  Empty inputs yield 0 instead of dividing by 0.
            description.add_row([
                '(%) Missing Cells',
                missing_cells / total_cells if total_cells else 0
            ])
            description.add_row(['Duplicate Rows', duplicate_rows])
            description.add_row([
                '(%) Duplicate Rows',
                duplicate_rows / n_rows if n_rows else 0
            ])
            # NOTE(review): the value is bytes / 1000 but labelled 'KiB'
            # (which would be bytes / 1024) -- confirm the intended unit.
            description.add_row([
                'Total Size of Memory',
                str(df.memory_usage().sum() / 1000) + 'KiB'
            ])
            description.add_row(['🍋 Total Categorical', count_categorical(df)])
            description.add_row(
                ['🔟 Total Continuous',
                 len(df_raw.get_numerical().columns)])

            for col in tqdm(features, ascii=True, desc="Auditing.. : "):
                zero_count = df[col].isin([0]).sum()
                table.add_row([
                    col.strip(), df[col].dtypes,
                    type_of_variable(df[col]),
                    check_outlier(df[col]),
                    check_missing(df[col]),
                    column_missing_percentage(df[col]),
                    distinct_count(df[col]),
                    count_min(df[col]),
                    count_mean(df[col]),
                    count_max(df[col]), zero_count,
                    # Share of zeros among the column's rows (not among the
                    # number of columns).
                    round(zero_count / n_rows, 2) if n_rows else 0,
                    str(df[col].memory_usage() / 1000) + 'KiB'
                ])

            end_time = datetime.now()
            description.add_row(
                ['Time Elapsed', (end_time - start_time).total_seconds()])

            # Save the report into HTML
            if save:

                desc_html = description.get_html_string()
                description_summary = desc_html.replace(
                    '<table>', '<table class="table is-bordered">')

                report_html = table.get_html_string()
                report_html = report_html.replace(
                    '<table>', '<table class="table is-bordered">')

                stories_json = [{
                    'question': row['question'],
                    'answer': row['answer']
                } for row in stories]
                loader = template.Loader(
                    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'templates'))

                total_categorical = list(df_raw.get_categorical())
                total_continuous = list(df_raw.get_numerical())

                # Render first so a template failure does not leave an empty
                # report file behind, then write through a context manager so
                # the file handle is always closed.
                rendered = loader.load("index.html").generate(**locals())
                with open(save + '.html', 'wb') as report_file:
                    report_file.write(rendered)
                click.echo('Report generated with name ' + save + '.html')

            click.echo(description)
            click.echo(table)

        except FileNotFoundError:
            logging.warning("Given input file doesn't exists")
            # The template directory is printed as well; presumably because a
            # missing index.html would also end up in this handler.
            print(
                os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             'templates'))