    def post_process_report(self):
        # import data
        df = pd.read_csv(filepath_or_buffer='data/non_html_page_report.csv')

        # explode so we have one attachment for each row
        df['attachment_path'] = df['attachment_path'].apply(ast.literal_eval)
        df_long = df.explode(column='attachment_path').copy()

        # extract links
        df_long['attachment_ext'] = df_long['attachment_path'].apply(
            lambda x: extract_from_path(data=x, part='ext'))
        # un-nest so can easily replace blanks
        df_long['attachment_ext'] = df_long['attachment_ext'].apply(
            lambda x: ''.join(x))
        df_long['attachment_ext'] = df_long['attachment_ext'].replace(
            to_replace='', value=np.nan)

        # remove non-attachment and empty rows
        df_long = df_long.dropna(subset=['attachment_path', 'attachment_ext'],
                                 how='any',
                                 axis=0)

        # filter for after Sep 2018 for Specialist and Travel Advice publishers
        df_long['first_published_at'] = df_long['first_published_at'].astype(
            'datetime64[ns]')
        cond_one = (df_long['publishing_app'] == 'specialist-publisher') & (
            df_long['first_published_at'] > '2018-09-30')
        cond_two = (df_long['publishing_app'] == 'travel-advice-publisher') & (
            df_long['first_published_at'] > '2018-09-30')
        cond_three = df_long['publishing_app'].isin(
            ['publisher', 'service-manual-publisher'])
        df_long = df_long[cond_one | cond_two | cond_three].copy()

        # export three sets of files
        #   i. all data in one file
        #   ii. sample data in one file (for viewing purposes)
        #   iii. all data but split by primary publishing organisation (for viewing purposes)

        # i.
        df_long.to_csv(
            path_or_buf='data/inaccessible_nonhtml_reports/full.csv',
            index=False)

        # ii.
        df_long.sample(n=10000, random_state=42).to_csv(
            path_or_buf='data/inaccessible_nonhtml_reports/sample.csv',
            index=False)

        # iii.
        df_long = df_long.set_index('publishing_app')
        for key in df_long.index.unique():
            # select with a list so a publisher with only one page still
            # yields a DataFrame rather than a Series
            df_long.loc[[key]].to_csv(
                'data/inaccessible_nonhtml_reports/{}.csv'.format(key),
                index=False,
                header=True)
Example 2
    def count_attachment_from_html(text: str) -> dict:
        """
        Extracts attachments, as identified by links on a GOV.UK webpage, by looking at the href attributes of anchor tags.
        Very similar to extract_links_from_html() but returns more results.
        Example: government/publications/measles-mumps-and-rubella-lab-confirmed-cases-in-england-2019
        Reference:
            - `src/helpers/preprocess_text.py`

        :param text: String of the HTML code to extract attachments from.
        :return: Dictionary of count of attachment extensions.
        """
        try:
            soup = BeautifulSoup(text, 'html5lib')
            links = [
                link.get('href') for link in soup.find_all(name='a', href=True)
            ]
            # extract extension
            attachments = extract_from_path(data=links, part='ext')
            # take valid attachments only
            attachments = [x for x in attachments if x in ATTACHMENTS]
            # take unique html attachments
            attachments_html = [html for html in links if html.startswith('/')]
            attachments_html = list(set(attachments_html))
            # count repeated attachment elements in list
            attachment_counts = dict(Counter(attachments))
            # add html counts
            html_count = len(attachments_html)
            # cast 0s to None to be consistent with other attachments
            if html_count == 0:
                html_count = None

            attachment_counts.update({'.html': html_count})

            return attachment_counts

        except Exception as e:
            print("error @count_attachment_from_html", e)
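A minimal illustration of calling this helper; a sketch which assumes the module-level imports (BeautifulSoup, Counter) and the project helpers extract_from_path and ATTACHMENTS are in scope, that extract_from_path returns the file extension of each path, and that '.pdf' and '.csv' appear in ATTACHMENTS:

sample_html = (
    '<p><a href="/government/uploads/report.pdf">Report</a>'
    '<a href="/government/uploads/data.csv">Data</a>'
    '<a href="/guidance/some-page">Guidance</a></p>'
)
# expected to give {'.pdf': 1, '.csv': 1, '.html': 3}: every relative link,
# including the two attachment links, counts towards the '.html' total
print(count_attachment_from_html(text=sample_html))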
Example 3
    def process_page(self, content_item, html):

        content_item['primary_publishing_organisation'] = extract_subtext(
            text=content_item['organisations'],
            key='primary_publishing_organisation',
            index=1)

        # ignore pages from publishing apps we do not want to report on
        publishers = [
            "publisher", "service-manual-publisher", "specialist-publisher",
            "travel-advice-publisher"
        ]
        if content_item['publishing_app'] not in publishers:
            return []

        # ignore pages with no details before running any regex over them
        if pd.isna(content_item['details']):
            return []

        # quick pre-filter: keep only pages whose details mention an attachment
        # extension (dots are escaped so e.g. '.doc' cannot match 'xdoc')
        attachments = (r"\.chm|\.csv|\.diff|\.doc|\.docx|\.dot|\.dxf|\.eps|"
                       r"\.gif|\.gml|\.ics|\.jpg|\.kml|\.odp|\.ods|\.odt|\.pdf|"
                       r"\.png|\.ppt|\.pptx|\.ps|\.rdf|\.ris|\.rtf|\.sch|\.txt|"
                       r"\.vcf|\.wsdl|\.xls|\.xlsm|\.xlsx|\.xlt|\.xml|\.xsd|\.xslt|"
                       r"\.zip")
        if not any(
                re.findall(pattern=attachments,
                           string=content_item['details'])):
            return []

        # extract attachment urls
        # each method gives different results
        # need all three methods to capture the different ways attachments can appear on a webpage
        content_item['attachment_url_one'] = extract_links_from_html(
            text=content_item['details'])
        content_item['attachment_url_two'] = self.extract_attachment(
            text=content_item['details'], element='url')
        content_item['attachment_url_three'] = self.extract_attachment_smart(
            text=content_item['details'])

        # combine the three lists
        content_item['attachment_path'] = content_item['attachment_url_one'] \
                                          + content_item['attachment_url_two'] \
                                          + content_item['attachment_url_three']
        # remove duplicates
        content_item['attachment_path'] = list(
            dict.fromkeys(content_item['attachment_path']))

        # extract file extension from attachment url
        content_item['attachment_ext'] = extract_from_path(
            data=content_item['attachment_path'], part='ext')

        # return only pages with attachments by ignoring empty lists
        if not content_item['attachment_ext']:
            return []
        else:
            return [
                content_item['base_path'],
                content_item['primary_publishing_organisation'],
                content_item['publishing_app'], content_item['document_type'],
                content_item['first_published_at'],
                content_item['attachment_path']
            ]
Example 4
# read the content store export: the beginning of this pd.read_csv call is
# cut off in the original snippet, so the file argument is not shown here
df = pd.read_csv(...,
                 names=list(CONTENT_STORE_HEADER.keys()),
                 dtype=CONTENT_STORE_HEADER,
                 parse_dates=CONTENT_STORE_DATE)

# drop rows with missing details
df_process = df.dropna(subset=['details'])

# take one page (two candidate pages; only the second assignment is used)
test = df[df["base_path"] == "/government/publications/success-profiles"]
test = df[df["base_path"] ==
          "/government/publications/screening-tests-for-you-and-your-baby"]
test = test['details'].iloc[0]
test = BeautifulSoup(test, features='lxml')
# get page links
page_links = [link.get('href') for link in test.find_all('a', href=True)]
page_attachments = extract_from_path(data=page_links, part='ext')
page_html = [html for html in page_links if html.startswith('/')]
# get valid attachments only
page_attachments = [x for x in page_attachments if x in ATTACHMENTS]
# get unique elements
page_html = list(set(page_html))
# add html links
page_attachments.extend(page_html)
# count repeated attachment elements in list
attachment_counts = dict(Counter(page_attachments))
# add html counts
attachment_counts.update({'html': len(page_html)})

# try using existing function
test = df[df["base_path"] == "/government/publications/success-profiles"]
test = test['details'].iloc[0]
from src.helpers.preprocess_text import extract_from_path

import pandas as pd
import numpy as np
import ast

# import data
df = pd.read_csv(filepath_or_buffer='data/non_html_page_report.csv')

# explode so we have one attachment for each row
df['attachment_path'] = df['attachment_path'].apply(ast.literal_eval)
df_long = df.explode(column='attachment_path').copy()

# extract links
df_long['attachment_ext'] = df_long['attachment_path'].apply(lambda x: extract_from_path(data=x,
                                                                                         part='ext'))
# un-nest so can easily replace blanks
df_long['attachment_ext'] = df_long['attachment_ext'].apply(lambda x: ''.join(x))
df_long['attachment_ext'] = df_long['attachment_ext'].replace(to_replace='',
                                                              value=np.nan)

# remove non-attachment and empty rows
df_long = df_long.dropna(subset=['attachment_path', 'attachment_ext'], how='any', axis=0)

# filter for after Sep 2018 for Specialist and Travel Advice publishers
df_long['first_published_at'] = df_long['first_published_at'].astype('datetime64[ns]')
cond_one = (df_long['publishing_app'] == 'specialist-publisher') & (df_long['first_published_at'] > '2018-09-30')
cond_two = (df_long['publishing_app'] == 'travel-advice-publisher') & (df_long['first_published_at'] > '2018-09-30')
cond_three = df_long['publishing_app'].isin(['publisher', 'service-manual-publisher'])
df_long = df_long[cond_one | cond_two | cond_three].copy()
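
To sanity-check the publisher and date filter above, the same three conditions can be exercised on a small hand-built frame; a sketch, independent of the report data:

check = pd.DataFrame({
    'publishing_app': ['specialist-publisher', 'specialist-publisher',
                       'travel-advice-publisher', 'publisher', 'whitehall'],
    'first_published_at': pd.to_datetime(
        ['2018-01-15', '2019-03-01', '2020-06-30', '2015-05-05', '2019-07-07'])
})
keep = (
    ((check['publishing_app'] == 'specialist-publisher') &
     (check['first_published_at'] > '2018-09-30')) |
    ((check['publishing_app'] == 'travel-advice-publisher') &
     (check['first_published_at'] > '2018-09-30')) |
    check['publishing_app'].isin(['publisher', 'service-manual-publisher'])
)
# rows 1, 2 and 3 survive; the pre-October-2018 specialist page and the
# whitehall page are dropped
print(check[keep])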