Example #1
def eth_facts_service(scraper, tree):

    scraper.dataset.publisher = GOV['department-for-education']
    scraper.dataset.title = tree.xpath('//*[@id="title"]/text()')[0].strip()
    scraper.dataset.contactPoint = tree.xpath(
        '//*[@id="footer"]/div/div[4]/a[2]/@href')
    scraper.dataset.issued = parse(
        tree.xpath('//*[@id="history"]/p[1]/span/text()')[0]).date()
    scraper.dataset.modified = parse(
        tree.xpath('//*[@id="history"]/p[2]/span/text()')[0]).date()

    for node in tree.xpath(
            "//*//*[@itemtype='http://schema.org/DataDownload']/a"):
        distribution = Distribution(scraper)
        distribution.title = node.attrib['data-event-label']
        distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
        distribution.issued = scraper.dataset.issued
        distribution.modified = scraper.dataset.modified
        # Extract the file type from the bracketed text of this link,
        # e.g. "Download (csv, 2KB)" -> "csv". Use a raw string for the
        # regex and search this node's own text, rather than the
        # stringified text of every download link on the page.
        fileType = re.search(r'\(([^)]+)', node.xpath('string()')).group(1)

        distribution.mediaType = {
            'csv': CSV,
            'excel': Excel
        }.get(fileType,
              mimetypes.guess_type(distribution.downloadURL)[0])
        if distribution.mediaType in ACCEPTED_MIMETYPES:
            scraper.distributions.append(distribution)
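
This function leans on names defined elsewhere in its module. Below is a minimal context sketch assuming lxml, python-dateutil, and the shared scraper utilities; the gssutils import path and the constant values are assumptions, not the module's actual header:

import re
import mimetypes
from urllib.parse import urljoin

from dateutil.parser import parse       # python-dateutil
from gssutils import Distribution, GOV  # assumed import path

# Plausible values for the media-type constants used above; the real
# definitions live in the shared scraper utilities.
CSV = 'text/csv'
Excel = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
ACCEPTED_MIMETYPES = {CSV, Excel}
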
Example #2
def scrape(scraper, tree):
    # It's not clear whether the pages are collections of datasets or datasets with distributions.
    # Assume the latter for simplicity for now.
    scraper.dataset.publisher = GOV['welsh-government']
    # OGLv3 license is quoted for the whole site on https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/"
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/"
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        meta_div = article.xpath("div[@class = 'index-list__meta']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"
        ):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            # Use relative paths (".//") so the dates come from this
            # details block, not the first matching element on the page.
            distribution.issued = isoparse(
                details.xpath(
                    ".//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
                )[0])
            distribution.modified = isoparse(
                details.xpath(
                    ".//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
                )[0])
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()
            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    distribution.byteSize = int(size * 1000)
            else:
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
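
FILE_TYPE_AND_SIZE_RE is referenced above but defined elsewhere. A plausible reconstruction from how its groups are used, together with the imports the function depends on (the exact pattern is an assumption):

import re
import mimetypes
from functools import lru_cache              # used by fetch_page
from dateutil.parser import parse, isoparse  # python-dateutil
from lxml import html

# Reconstructed from usage: group(1) is the file type (e.g. "ODS"),
# group(2) the numeric size, group(3) the unit ("KB" or "kB").
FILE_TYPE_AND_SIZE_RE = re.compile(r'([A-Z]+),\s*([0-9.]+)\s*([kKMG]B)')
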
Example #3
def scrape(scraper, tree):

    # A quick safety in case people are using this scraper incorrectly
    if "?search=" not in scraper.uri:
        raise Exception(
            """Aborting. This scraper is intended to run off the DCNI seach page.
        Please modify your url to use the site search.

        If in doubt, work from this page, change the quoted search text and capture the url
        https://www.communities-ni.gov.uk/publications/topic/8182?search=%22Northern+Ireland+Housing+Bulletin%22&Search-exposed-form=Go&sort_by=field_published_date
        """)

    scraper.dataset.publisher = GOV[
        'department-for-communities-northern-ireland']
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/' \
        'open-government-licence/version/3/'

    # We're taking each search result as a distribution
    search_result_urls = []
    for linkObj in tree.xpath("//h3/a"):

        # Take the href attribute directly from the link element
        href = linkObj.get("href")

        # Add to distributions url list, get the root from the original url
        search_result_urls.append(
            scraper.uri.split("/publications/topic")[0] + href)

    # keep track of dates issued so we can find the latest
    last_issued = None

    for url in search_result_urls:

        # Get the distribution page
        page = scraper.session.get(url)
        distro_tree = html.fromstring(page.text)

        # Get any spreadsheets (ods or excel) linked on the page
        spreadsheet_files = [
            x for x in distro_tree.xpath('//a/@href')
            if x.lower().endswith(".ods") or x.lower().endswith(".xlsx")
        ]

        # Now attach the supporting info to each relevant download
        # TODO - make better, kinda nasty
        for spreadsheet_file in spreadsheet_files:

            # Create our new distribution object
            this_distribution = Distribution(scraper)

            # Identify the correct title
            this_distribution.title = distro_tree.xpath(
                "//a[@href='" + spreadsheet_file + "']/text()")[0]
            this_distribution.downloadURL = spreadsheet_file

            if this_distribution.downloadURL.lower().endswith(".xlsx"):
                media_type = Excel
            elif this_distribution.downloadURL.lower().endswith(".ods"):
                media_type = ODS
            else:
                raise Exception(
                    "Aborting. Unexpected media type for url: '{}'".format(
                        this_distribution.downloadURL))
            this_distribution.mediaType = media_type

            # Published and modified time
            this_distribution.issued = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:published_time']/@content")
                [0]).date()
            this_distribution.modified = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:modified_time']/@content")
                [0]).date()
            this_distribution.description = distro_tree.xpath(
                "//*[@class='field-summary']/p/text()")[0]

            if last_issued is None or this_distribution.issued > last_issued:
                last_issued = this_distribution.issued

            scraper.distributions.append(this_distribution)

    # The issued date of the most recently issued distribution serves as the
    # issued date for this "dataset" as a whole
    scraper.dataset.issued = last_issued
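
The last_issued bookkeeping above can equivalently be collapsed into a single max() over the collected distributions once the loops finish; a sketch of that alternative:

# Equivalent to the last_issued tracking, run after all distributions are collected
issued_dates = [d.issued for d in scraper.distributions]
scraper.dataset.issued = max(issued_dates) if issued_dates else None
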
Example #4
def content_api_guidance(scraper, metadata):

    title = metadata.get("title", None)
    if title is None:
        logging.warning(
            f'The title for dataset {scraper.url} is not set; the title field is missing from the content API'
        )
    else:
        scraper.dataset.title = title

    description = metadata.get("description", None)
    if description is None:
        logging.warning(
            f'The description for dataset {scraper.url} is not set; the description field is missing from the content API'
        )
    else:
        scraper.dataset.description = description

    first_published_at = metadata.get("first_published_at", None)
    if first_published_at is None:
        logging.warning(
            f'The issued date for dataset {scraper.url} is not set; the first_published_at field is missing from the content API'
        )
    else:
        scraper.dataset.issued = first_published_at

    public_updated_at = metadata.get("public_updated_at", None)
    if public_updated_at is None:
        logging.warning(
            f'The modified date for dataset {scraper.url} is not set; the public_updated_at field is missing from the content API'
        )
    else:
        scraper.dataset.modified = public_updated_at

    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        else:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            scraper.dataset.publisher = orgs[0]["web_url"]

    for attachment in metadata['details']['attachments']:
        try:
            distro = Distribution(scraper)

            distro.title = attachment.get('title')

            # attachment['url'] raises KeyError if absent; handled below
            distro.downloadURL = attachment['url']

            distro.mediaType, _ = mimetypes.guess_type(distro.downloadURL)

            distro.issued = scraper.dataset.issued

            distro.modified = scraper.dataset.modified

            scraper.distributions.append(distro)

        except KeyError:
            logging.warning(
                f'Failed to extract attachment {json.dumps(attachment, indent=2)}'
            )
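
For reference, the shape of the content-API payload this function reads, reconstructed purely from the fields accessed above (all values are illustrative placeholders; the function also relies on json, logging, and mimetypes from the standard library):

# Illustrative payload shape only; field names come from the accesses above.
metadata = {
    "title": "Example guidance title",
    "description": "Short summary of the guidance.",
    "first_published_at": "2020-01-01T00:00:00Z",
    "public_updated_at": "2020-06-01T09:30:00Z",
    "links": {
        "organisations": [
            {"web_url": "https://www.gov.uk/government/organisations/example"}
        ]
    },
    "details": {
        "attachments": [
            {"title": "Data tables", "url": "https://example.org/data.ods"}
        ]
    },
}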