Beispiel #1
0
def generate_layout():
    """Assemble the 'Portal Totals' section of the page.

    The returned ``html.Div`` contains, in order: the section header with
    its tooltip glossary, a horizontal rule, four LED displays fed by
    ``CkanApi`` totals, and the datasets-in-portal pie chart.
    """
    portal_api = CkanApi()

    # (term, explanation) pairs rendered inside the header tooltip
    glossary = (
        ("Datasets In Portal", " - Total number of datasets in portal."),
        ("Scraped Datasets", " - Total number of scraped datasets."),
        ("Datasets Amended",
         " - Total number of scraped datasets which have been amended by a user."
         ),
        ("Datasets Manually Added",
         " - Total number of datasets manually added by a user."),
    )

    tooltip_items = []
    for term, explanation in glossary:
        if tooltip_items:
            # two line breaks separate consecutive glossary entries
            tooltip_items.append(html.Br())
            tooltip_items.append(html.Br())
        tooltip_items.append(html.Span(term, style={'font-weight': 'bold'}))
        tooltip_items.append(html.Span(explanation))

    tooltip = html.Div(id='portal-totals-tooltip-div', children=tooltip_items)
    title_bar = header('Portal Totals', 'portal-totals', tooltip)
    divider = html.Hr()

    # LED displays
    total_led = led_display(portal_api.total_datasets(), "Datasets In Portal")
    scraped_led = led_display(portal_api.total_scraped_datasets(),
                              "Scraped Datasets")
    amended_led = led_display(portal_api.total_amended_datasets(),
                              "Datasets Amended By User")
    manual_led = led_display(portal_api.total_manual_datasets(),
                             "Datasets Manually Added")

    # Total dataset in the portal pie chart
    pie_wrapper = html.Div([datasets_in_portal_pie(ckan_api=portal_api)],
                           style={'vertical-align': 'middle'})

    return html.Div(children=[
        title_bar,
        divider,
        total_led,
        scraped_led,
        amended_led,
        manual_led,
        pie_wrapper,
    ])
Beispiel #2
0
 def __init__(self) -> None:
     """Create a CkanApi instance and keep it on ``self.ckan_api``."""
     self.ckan_api = CkanApi()
Beispiel #3
0
class InsightsPage():
    """Builds the dataframes, tables and charts for the insights page.

    Portal figures are fetched through ``self.ckan_api``; the ``*_stats``
    totals are read from the Excel workbook at ``PATH_TO_EXCEL_SHEET``.
    """

    # Path to the Excel workbook used for creating stats dataframes.
    # The '' default keeps the class importable when ED_OUTPUT_PATH is unset
    # (os.path.join(None, ...) raises TypeError at class-definition time);
    # the path is then relative to the current working directory.
    PATH_TO_EXCEL_SHEET = os.path.join(os.getenv('ED_OUTPUT_PATH', ''),
                                       'tools', 'stats', 'metrics.xlsx')

    def __init__(self):
        """Create the CkanApi client kept on ``self.ckan_api``."""
        self.ckan_api = CkanApi()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _get_df_from_excel_sheet(self, sheet_name):
        """Read sheet ``sheet_name`` of the metrics workbook into a DataFrame."""
        return pd.read_excel(self.PATH_TO_EXCEL_SHEET,
                             sheet_name=sheet_name,
                             engine='openpyxl')

    @staticmethod
    def _counts_df(pairs, label_column, count_column):
        """Turn ``(label, count)`` pairs into a two-column DataFrame.

        The frame is sorted by ``count_column`` in descending order and
        re-indexed from 0.
        """
        pairs = list(pairs)
        df = pd.DataFrame(
            {
                label_column: [label for label, _ in pairs],
                count_column: [count for _, count in pairs],
            },
            columns=[label_column, count_column])
        return df.sort_values(by=count_column,
                              ascending=False,
                              ignore_index=True)

    @staticmethod
    def _append_total_row(df, label_column, count_column):
        """Return ``df`` plus a final 'Total' row summing ``count_column``.

        Uses ``pd.concat`` because ``DataFrame.append`` no longer exists in
        pandas 2.x.
        """
        total_row = pd.DataFrame([['Total', df[count_column].sum()]],
                                 columns=[label_column, count_column])
        return pd.concat([df, total_row], ignore_index=True)

    @staticmethod
    def _counts_table(df, table_id, label_column, label_name, count_column,
                      count_name, overflow_y='scroll'):
        """Render a label/count DataFrame as a natively sortable DataTable.

        The label column takes 70% of the width (right aligned) and the
        count column 30%; the table is capped at 300px height and uses
        ``overflow_y`` as its vertical overflow mode.
        """
        return dash_table.DataTable(
            id=table_id,
            columns=[{
                "id": label_column,
                "name": label_name
            }, {
                "id": count_column,
                "name": count_name
            }],
            data=df.to_dict('records'),
            sort_action='native',
            style_cell={
                'textAlign': 'left',
                'whiteSpace': 'normal'
            },
            style_cell_conditional=[
                {
                    'if': {
                        'column_id': label_column
                    },
                    'width': '70%',
                    'textAlign': 'right'
                },
                {
                    'if': {
                        'column_id': count_column
                    },
                    'width': '30%'
                },
            ],
            style_table={
                'maxHeight': '300px',
                'maxWidth': '100%',
                'overflowY': overflow_y,
                'overflowX': 'hidden',
                'margin': 0,
                'padding': 0,
            },
            style_header={
                'backgroundColor': 'rgb(230, 230, 230)',
                'fontWeight': 'bold',
                'textAlign': 'center',
            })

    @staticmethod
    def _resource_pie(df, label_column, graph_id):
        """Render the 'resource count' column of ``df`` as a pie chart.

        Labels are wrapped at 20 characters and joined with ``<br>`` so
        long names break across lines inside the slices.
        """
        df[label_column] = [
            '<br>'.join(textwrap.wrap(label, width=20))
            for label in df[label_column]
        ]

        pie_figure = go.Figure(data=[
            go.Pie(labels=df[label_column],
                   values=df['resource count'],
                   title={
                       'font': {
                           'size': 16
                       },
                       'position': 'bottom right'
                   })
        ])
        pie_figure.update_traces(textposition='inside', textinfo='value+label')

        return dcc.Graph(id=graph_id,
                         figure=pie_figure,
                         config={'modeBarButtonsToRemove': buttonsToRemove})

    # ------------------------------------------------------------------
    # portal dataframes
    # ------------------------------------------------------------------

    def dataset_by_domain_portal_df(self):
        """Return a 'domain' / 'page count' frame from
        ``ckan_api.datasets_by_domain()``, largest count first."""
        return self._counts_df(self.ckan_api.datasets_by_domain(), 'domain',
                               'page count')

    def resource_by_domain_portal_df(self):
        """Return a 'domain' / 'resource count' frame from
        ``ckan_api.resources_by_domain()``, largest count first."""
        return self._counts_df(self.ckan_api.resources_by_domain(), 'domain',
                               'resource count')

    def resources_by_publisher_portal_df(self):
        """Return a 'publisher' / 'resource count' frame from
        ``ckan_api.resources_by_publisher()``, largest count first."""
        return self._counts_df(self.ckan_api.resources_by_publisher(),
                               'publisher', 'resource count')

    # ------------------------------------------------------------------
    # datasets by domain
    # ------------------------------------------------------------------

    def dataset_by_domain_table(self):
        """Create the DataTable listing the dataset (page) count per domain,
        followed by a 'Total' row."""
        df = self._append_total_row(self.dataset_by_domain_portal_df(),
                                    'domain', 'page count')
        return self._counts_table(df,
                                  'dataset_by_domain_table',
                                  'domain',
                                  'Domain',
                                  'page count',
                                  'Dataset Count',
                                  overflow_y='auto')

    def dataset_by_domain_bar(self):
        """Create a bar chart of the dataset (page) count per domain."""
        df = self.dataset_by_domain_portal_df()
        return dcc.Graph(
            id='dataset_by_domain_graph',
            figure={
                'data': [{
                    'x': df['domain'],
                    'y': df['page count'],
                    'type': 'bar'
                }],
                'layout': {}
            },
            config={'modeBarButtonsToRemove': buttonsToRemove})

    # ------------------------------------------------------------------
    # resources by domain
    # ------------------------------------------------------------------

    def resources_by_domain_table(self):
        """Create the DataTable listing the resource count per domain,
        followed by a 'Total' row."""
        # resource_by_domain_portal_df() already returns the rows sorted by
        # resource count, so no second sort is needed here.
        df = self._append_total_row(self.resource_by_domain_portal_df(),
                                    'domain', 'resource count')
        return self._counts_table(df, 'resource_by_domain_table', 'domain',
                                  'Domain', 'resource count', 'Resource Count')

    def resources_by_domain_pie(self):
        """Create a pie chart showing the number of resources per domain."""
        return self._resource_pie(self.resource_by_domain_portal_df(),
                                  'domain', 'resources_by_domain_pie')

    # ------------------------------------------------------------------
    # totals read from the metrics workbook
    # ------------------------------------------------------------------

    def total_domains_count_stats(self):
        """Count the non-null 'domain' entries of the
        'RESOURCE COUNT PER DOMAIN' sheet."""
        df = self._get_df_from_excel_sheet('RESOURCE COUNT PER DOMAIN')
        return df['domain'].count()

    def total_page_count_stats(self):
        """Sum the 'page count' column of the 'PAGE COUNT PER DOMAIN' sheet."""
        df = self._get_df_from_excel_sheet('PAGE COUNT PER DOMAIN')
        return df['page count'].sum()

    def total_resource_count_stats(self):
        """Sum the 'resource count' column of the
        'RESOURCE COUNT PER DOMAIN' sheet."""
        df = self._get_df_from_excel_sheet('RESOURCE COUNT PER DOMAIN')
        return df['resource count'].sum()

    def total_datasets_count_stats(self):
        """Sum the 'dataset count' column of the
        'DATASET COUNT PER SCRAPER' sheet."""
        df = self._get_df_from_excel_sheet('DATASET COUNT PER SCRAPER')
        return df['dataset count'].sum()

    # ------------------------------------------------------------------
    # resources by publisher
    # ------------------------------------------------------------------

    def resources_by_publisher_table(self):
        """Create the DataTable listing the resource count per publisher,
        followed by a 'Total' row."""
        df = self._append_total_row(self.resources_by_publisher_portal_df(),
                                    'publisher', 'resource count')
        return self._counts_table(df, 'resource_by_publisher_table',
                                  'publisher', 'Publisher', 'resource count',
                                  'Resource Count')

    def resources_by_publisher_pie(self):
        """Create a pie chart showing the number of resources per publisher."""
        return self._resource_pie(self.resources_by_publisher_portal_df(),
                                  'publisher', 'resources_by_publisher_pie')

    # ------------------------------------------------------------------
    # datasets by publisher
    # ------------------------------------------------------------------

    def dataset_by_office_portal_data(self):
        """Return one ``{'s': name, 'datopian': count}`` row per publisher.

        Rows are ordered by count, largest first, and a final row named
        'Total' carries the sum of all counts.
        """
        rows = [{
            's': name,
            'datopian': count
        } for name, count in self.ckan_api.datasets_by_publisher()]
        total = sum(row['datopian'] for row in rows)

        rows.sort(key=lambda row: row['datopian'], reverse=True)
        rows.append({'s': 'Total', 'datopian': total})
        return rows

    def get_datasets_bars_portal_data(self):
        """Return one bar-trace dict per publisher, largest count first.

        Each trace has a single bar at x='Datasets'; the legend name is the
        publisher name wrapped at 20 characters with ``<br>`` separators.
        """
        traces = [{
            'x': ['Datasets'],
            'y': [count],
            'type': 'bar',
            'name': '<br>'.join(textwrap.wrap(name, width=20))
        } for name, count in self.ckan_api.datasets_by_publisher()]

        traces.sort(key=lambda trace: trace['y'][0], reverse=True)
        return traces