Code Example #1
def scrape_pages(scraper, tree):
    scraper.catalog.title = tree.xpath("//title/text()")[0].strip()
    scraper.catalog.dataset = []
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.publisher = GOV['hm-revenue-customs']
    scraper.catalog.rights = "https://www.uktradeinfo.com/AboutUs/Pages/TermsAndConditions.aspx"
    # from above terms, link to crown copyright at the National Archives says default license is OGL
    scraper.catalog.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"
    # just scrape the first table for now; others are archives or factsheets.
    tables = tree.xpath("//table[contains(concat(' ', @class, ' '), ' hmrc ') or (contains(@summary, 'Tax & Duty bulletins'))]")
    for table in tables:
        header = True
        columns = []
        for row in table.xpath("tbody/tr"):
            if header:
                columns = [t.strip() for t in row.xpath("th/text()")]
                header = False
            else:
                dataset = Dataset(scraper.uri)
                dataset.publisher = scraper.catalog.publisher
                dataset.license = scraper.catalog.license
                dataset.distribution = []
                bulletin_date = None
                for k, v in zip(columns, row.xpath("td")):
                    if k == 'Bulletin Title' or k == 'Title' or k == 'Factsheet Title':
                        dataset.title = v.text
                    elif k == 'Publication Source' or k == 'Source':
                        pass
                    elif k == 'Release Date' or k == 'Released':
                        dataset.issued = parse(v.text.strip(), dayfirst=True)
                    elif k == 'Bulletin Date' or k == 'Period':
                        bulletin_date = v.text
                    elif k == 'View' or k == 'View Archive':
                        href = v.xpath("a/@href")[0]
                        view_url = urljoin(scraper.uri, href)
                        if '?viewname' in view_url:
                            # this is a link off to a separate "archive" page with links to the
                            # actual dataset releases
                            archive_page = scraper.session.get(view_url)
                            archive_tree = html.fromstring(archive_page.text)
                            for release_row in archive_tree.xpath("//table[@class='hmrc']//tr")[1:]:
                                dist = Distribution(scraper)
                                cols = release_row.xpath("td")
                                dist.downloadURL = urljoin(view_url, cols[1].xpath("a/@href")[0].replace(' ', '%20'))
                                archive_date = cols[0].text
                                dist.issued = parse(archive_date.strip(), dayfirst=True)
                                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                                dist.title = dataset.title + ' ' + archive_date
                                dataset.distribution.append(dist)
                        else:
                            dist = Distribution(scraper)
                            dist.downloadURL = urljoin(scraper.uri, href)
                            if dist.downloadURL.endswith('.xls') or dist.downloadURL.endswith('.xlsx'):
                                dist.mediaType = Excel
                            dist.title = dataset.title + ((' ' + bulletin_date) if bulletin_date else '')
                            dataset.distribution.append(dist)
                scraper.catalog.dataset.append(dataset)
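
Both this example and Code Example #16 pair the header row's <th> text with each body row's <td> cells using zip(). As a hedged aside (not part of gss-utils), that pattern could be pulled out into a small helper that takes an lxml table element and yields one dict per data row:

def table_rows_as_dicts(table):
    """Yield each body row of an lxml table as a {header text: <td> element} dict."""
    rows = table.xpath("tbody/tr")
    if not rows:
        return
    columns = [t.strip() for t in rows[0].xpath("th/text()")]
    for row in rows[1:]:
        yield dict(zip(columns, row.xpath("td")))

A caller could then look cells up by name, e.g. row.get('Bulletin Title'), instead of tracking a header flag and a column list by hand.
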
Code Example #2
File: govscot.py Project: GSS-Cogs/gss-utils
def collections(scraper, tree):

    scraper.dataset.title = tree.xpath(
        '.// *[@id="page-content"] / header / div / div[1] / h1')[0].text

    scraper.dataset.description = tree.xpath(
        '.// *[@id="page-content"] / header / div / div[3] / p')[0].text

    pubPages = tree.findall('.//ul[@class="collections-list"]')
    for page in pubPages:
        pages = page.findall('.//a')
        for publication in pages:
            url = urljoin("https://www.gov.scot", publication.attrib['href'])
            r = scraper.session.get(url)
            if r.status_code != 200:
                raise Exception(
                    'Failed to get url {url}, with status code "{status_code}".'
                    .format(url=url, status_code=r.status_code))
            pubTree = html.fromstring(r.text)
            try:
                pubDate = pubTree.xpath(
                    '// *[@id="page-content"] / div[1] / div / header / div[2] / div[1] / section / div[1] / span[2] / strong'
                )[0].text
                scraper.dataset.issued = parse(pubDate).date()
            except Exception:
                scraper.dataset.issued = parse('January 1 1900').date()
                logging.warning(
                    "No issued date found, placeholder date added (1900-01-01)")

            # Distributions as embedded in page body
            dists = pubTree.xpath(
                '//*[@id="page-content"]/div[3]/div/div/div[2]/section/div/div[2]/h3/a'
            )
            for element in dists:
                dist = Distribution(scraper)
                dist.title = element.text
                dist.downloadURL = urljoin("https://www.gov.scot",
                                           element.attrib['href'])
                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                if dist.mediaType in ACCEPTED_MIMETYPES:
                    scraper.distributions.append(dist)

            # Distributions as supporting files (attached top-right of page) of the principal dataset
            dists = pubTree.xpath(
                '//a[contains(@class, "supporting-file__link")]')
            for element in dists:
                dist = Distribution(scraper)
                dist.title = assert_get_one(
                    pubTree.xpath('//h1'),
                    'title for distribution').text.strip()
                dist.downloadURL = urljoin("https://www.gov.scot",
                                           element.attrib['href'])
                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                if dist.mediaType in ACCEPTED_MIMETYPES:
                    scraper.distributions.append(dist)
Code Example #3
def scrape_rts(scraper, metadata_tree):
    """
        HMRC RTS is a special case, where the main page is:
          https://www.uktradeinfo.com/Statistics/RTS/Pages/default.aspx
        the RTS dataset metadata is available from:
          https://www.uktradeinfo.com/Lists/HMRC%20%20Metadata/DispForm.aspx?ID=3&ContentTypeId=0x0100E95984F4DBD401488EB2E5697A7B38EF
        and the zipped data files are linked from:
          https://www.uktradeinfo.com/Statistics/RTS/Pages/RTS-Downloads.aspx
    """
    METADATA_URL = 'https://www.uktradeinfo.com/Lists/HMRC%20%20Metadata/DispForm.aspx?ID=3&ContentTypeId=0x0100E95984F4DBD401488EB2E5697A7B38EF'
    DISTRIBUTION_URL = 'https://www.uktradeinfo.com/Statistics/RTS/Pages/RTS-Downloads.aspx'
    # from above terms, link to crown copyright at the National Archives says default license is OGL
    scraper.dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"
    # Ideally, as this looks like a Snarepoint site, we should be able to fetch the metadata as JSON via the Snarepoint
    # "REST" (where MS still wrongly say REST means CRUD) interface. But we can't, so let's just scrape.
    metadata_page = scraper.session.get(METADATA_URL)
    metadata_tree = html.fromstring(metadata_page.text)

    def metadata_value(prop):
        return ' '.join(metadata_tree.xpath(f"//tr[td/h3/text() = '{prop}']/td[2]/text()")).strip()

    scraper.dataset.title = metadata_value('Title')
    scraper.dataset.description = metadata_value('Identification:Abstract')
    assert metadata_value('Organisation:Responsible Organisation') == 'HM Revenue & Customs – Trade Statistics.', \
        "Expecting org to be 'HM Revenue & Customs – Trade Statistics.', got '" + \
        metadata_value('Organisation:Responsible Organisation') + "'."
    scraper.dataset.publisher = GOV['hm-revenue-customs']
    scraper.dataset.rights = "https://www.uktradeinfo.com/AboutUs/Pages/TermsAndConditions.aspx"
    scraper.dataset.contactPoint = metadata_tree.xpath("//tr[td/h3/text() = 'Organisation:Email Address']/td[2]/a/@href")[0]
    scraper.dataset.keyword = [
        keyword.strip().rstrip('.') for keyword in metadata_value('Classification:Keyword').split(',')]
    assert metadata_value('Classification:National Statistics Theme') == 'Business and Energy', \
        "Expecting National Statistics Theme to be 'Business and Energy'."
    assert metadata_value('Classification:Sub-theme') == 'International Trade', \
        "Expecting sub-theme to be 'International Trade'."
    scraper.dataset.theme = THEME['business-industry-trade-energy']

    # now fetch list of distributions
    distributions_page = scraper.session.get(DISTRIBUTION_URL)
    distributions_tree = html.fromstring(distributions_page.text)
    for anchor in distributions_tree.xpath("//div[h1[text()='closed periods']]/ul/li/a"):
        dist = Distribution(scraper)
        dist.title = anchor.text.strip()
        dist.downloadURL = urljoin(scraper.uri, anchor.get('href'))
        dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
        scraper.distributions.append(dist)
        
    for anchor in distributions_tree.xpath("//div[h1[text()='Open periods']]/ul/li/a"):
        dist2 = Distribution(scraper)
        dist2.title = anchor.text.strip()
        dist2.downloadURL = urljoin(scraper.uri, anchor.get('href'))
        dist2.mediaType, encoding = mimetypes.guess_type(dist2.downloadURL)
        scraper.distributions.append(dist2)
Code Example #4
def eth_facts_service(scraper, tree):

    scraper.dataset.publisher = GOV['department-for-education']
    scraper.dataset.title = tree.xpath('//*[@id="title"]/text()')[0].strip()
    scraper.dataset.contactPoint = tree.xpath(
        '//*[@id="footer"]/div/div[4]/a[2]/@href')
    scraper.dataset.issued = parse(
        tree.xpath('//*[@id="history"]/p[1]/span/text()')[0]).date()
    scraper.dataset.modified = parse(
        tree.xpath('//*[@id="history"]/p[2]/span/text()')[0]).date()

    for node in tree.xpath(
            "//*//*[@itemtype='http://schema.org/DataDownload']/a"):
        distribution = Distribution(scraper)
        distribution.title = node.attrib['data-event-label']
        distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
        distribution.issued = scraper.dataset.issued
        distribution.modified = scraper.dataset.modified
        # Extract the file type from this link's text, e.g. "Download the data (csv)".
        file_type_match = re.search(r'\(([^)]+)', ' '.join(node.xpath('text()')))
        fileType = file_type_match.group(1) if file_type_match else ''

        distribution.mediaType = {
            'csv': CSV,
            'excel': Excel
        }.get(fileType,
              mimetypes.guess_type(distribution.downloadURL)[0])
        if distribution.mediaType in ACCEPTED_MIMETYPES:
            scraper.distributions.append(distribution)
        else:
            pass
Code Example #5
File: govscot.py Project: GSS-Cogs/gss-utils
def scrape_old(scraper, tree):
    """Deprecated scraper for 'https://www2.gov.scot/Topics/Statistics/Browse/' links."""

    logging.warning(
        "This scraper has been depreciated. Please use the more recent version if viable"
    )

    scraper.dataset.publisher = GOV['the-scottish-government']
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath(
        "//div[@id='body2']//h2/text()")[0].strip()
    scraper.dataset.description = scraper.to_markdown(
        tree.xpath("//div[@id='body2']//h2/following-sibling::div/child::div"))
    doctable = tree.xpath(
        "//table[contains(concat(' ', @class, ' '), ' dg file ')]")[0]
    for row in doctable.xpath('tr'):
        try:
            if row.xpath('th/text()')[0].strip() == 'File:':
                cell = row.xpath('td')[0]
                dist = Distribution(scraper)
                title_size_date = re.compile(
                    r'(.*)\[([^,]+),\s+([0-9.]+)\s+([^:]+):\s([^\]]+)')
                match = title_size_date.match(cell.text_content())
                if match:
                    dist.title = match.group(1)
                    scraper.dataset.issued = parse(match.group(5),
                                                   dayfirst=True).date()
                    dist.downloadURL = urljoin(scraper.uri,
                                               cell.xpath('a/@href')[0])
                    dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                scraper.distributions.append(dist)
        except Exception:
            break
Code Example #6
def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    scraper.dataset.issued = parse(tree.xpath(
        "//p[contains(concat(' ', @class, ' '), ' date-pub ')]/span[@class='date-display-single']/text()")[0],
                                   dayfirst=True).date()
    scraper.dataset.publisher = GOV['department-of-health-northern-ireland']
    for doc_link in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]"
            "//div[contains(concat(' ', @class, ' '), ' nigovfile ')]/a"):
        dist = Distribution(scraper)
        dist.downloadURL = doc_link.get('href')
        dist.title = doc_link.xpath("text()")[0].strip()
        type_size = doc_link.xpath("span[@class='meta']/text()")[0].strip()
        match = type_size_re.match(type_size)
        if match:
            if match.group(1) == 'PDF':
                dist.mediaType = PDF
            else:
                dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
            size = float(match.group(2))
            if match.group(3) == 'KB':  # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                dist.byteSize = int(size * 1024)
            elif match.group(3) == 'MB':  # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                dist.byteSize = int(size * 1000000)
            elif match.group(3) == 'GB':  # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes
                dist.byteSize = int(size * 1000000000)
        scraper.distributions.append(dist)
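
Several of these scrapers repeat the same size arithmetic (kB = 1000 bytes, KB = 1024, MB = 10^6, GB = 10^9, per the comments above). A hedged sketch of a shared helper, not part of gss-utils, that centralises the conversion:

_UNIT_FACTORS = {
    'kB': 1000,
    'KB': 1024,
    'MB': 1000 ** 2,
    'GB': 1000 ** 3,
}

def parse_byte_size(size, unit):
    """Convert a (size, unit) pair such as (1.2, 'MB') into bytes, or None if the unit is unknown."""
    factor = _UNIT_FACTORS.get(unit)
    if factor is None:
        return None
    return int(size * factor)

With such a helper, the branches above would reduce to dist.byteSize = parse_byte_size(float(match.group(2)), match.group(3)).
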
Code Example #7
File: govscot.py Project: GSS-Cogs/gss-utils
def publications(scraper, tree):

    scraper.dataset.title = tree.xpath(
        '.// *[@id="page-content"] / div[1] / div / header / div[1] / div / h1'
    )[0].text

    scraper.dataset.description = \
    tree.xpath('.// *[@id="page-content"] / div[1] / div / header / div[2] / div[2] / div / p')[0].text

    try:
        pubDate = tree.xpath(
            '// *[@id="page-content"] / div[1] / div / header / div[2] / div[1] / section / div[1] / span[2] / strong'
        )[0].text
        scraper.dataset.issued = parse(pubDate).date()
    except Exception:
        scraper.dataset.issued = parse('January 1 1900').date()
        logging.warning(
            "No issued date found, placeholder date added (1900-01-01)")

    dists = tree.findall(
        './/*[@id="page-content"]/div[3]/div/div/div[2]/section')

    for entry in dists:
        distributions = entry.findall('.//*[@class="no-icon"]')
        for element in distributions:
            dist = Distribution(scraper)
            dist.title = element.text
            dist.downloadURL = urljoin("https://www.gov.scot",
                                       element.attrib['href'])
            dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
            if dist.mediaType in ACCEPTED_MIMETYPES:
                scraper.distributions.append(dist)
Code Example #8
def extract_distributions(distributions, link_tree, scraper):
    div_attach = next(iter(link_tree.xpath("div[@class='attachment-details']")), None)
    if div_attach is not None:
        div_metadata = next(iter(div_attach.xpath("p[@class='metadata']")), None)
        if div_metadata is not None:
            span_type = next(iter(div_metadata.xpath("span[@class='type']")), None)
            if span_type is not None:
                span_size = next(iter(div_metadata.xpath("span[@class='file-size']/text()")), None)
                if span_size is not None:
                    dist = Distribution(scraper)
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    if span_size.endswith('KB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1024)
                    elif span_size.endswith('kB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000)
                    elif span_size.endswith('MB'):
                        dist.byteSize = int(float(span_size[:-2]) * 1000000)
                    anchor = next(iter(div_attach.xpath("h2/a")), None)
                    if anchor is not None:
                        url = anchor.get('href')
                        if url is not None:
                            dist.downloadURL = urljoin('https://www.gov.uk/', url)
                        if hasattr(anchor, 'text'):
                            dist.title = anchor.text.strip()
                    dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
                    abbr_type = next(iter(span_type.xpath("abbr/text()")), None)
                    if abbr_type is not None:
                        if abbr_type.upper() == 'PDF':
                            dist.mediaType = PDF
                    distributions.append(dist)
Code Example #9
def covid_handler(scraper, tree):
    scraper.dataset.publisher = GOV['national-records-of-scotland']
    scraper.dataset.title = tree.xpath('//*[@id="node-stats-home-page-3315"]/div[1]/div/div/h2/text()')[0].strip()
    scraper.dataset.description = tree.xpath('//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[4]/text()')[0].strip() #TEMP as no description on page is more applicable

    pubDate = tree.xpath('//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[1]/text()')[0]
    nextDue = tree.xpath('//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[1]/text()')[2]
    scraper.dataset.issued = parse(pubDate).date()
    scraper.dataset.updateDueOn = parse(nextDue).date()

    contact = tree.xpath('//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[11]/a')
    for i in contact:
        scraper.dataset.contactPoint = i.attrib['href']

    dataNodes = tree.findall('.//*[@id="node-stats-home-page-3315"]/div[2]/div/div/table/tbody/tr[1]/td[4]/p[2]/a')

    for node in dataNodes:
        file_type = node.text.lower()
        if file_type in ['excel', 'csv']:
            distribution = Distribution(scraper)
            distribution.title = scraper.dataset.title + ' ' + node.text
            distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
            distribution.issued = scraper.dataset.issued
            distribution.mediaType = {
                'csv': 'text/csv',
                'excel': 'application/vnd.ms-excel'
            }.get(
                file_type,
                mimetypes.guess_type(distribution.downloadURL)[0]
            )
            scraper.distributions.append(distribution)
Code Example #10
def handler_dataset_landing_page_fallback(scraper, this_dataset_page, tree):
    """
    At time of writing there's an issue with the latest version of datasets 404'ing on the
    versions page.
    
    This function will create what the latest version should be, using the information on the
    base dataset landing page.
    """

    logging.warning(
        "Using fallback logic to scrape latest distribution from dataset landing page (rather "
        "than previous page). This scrape will only have a single distribution of xls."
    )

    this_distribution = Distribution(scraper)

    release_date = this_dataset_page["description"]["releaseDate"]
    this_distribution.issued = parse(release_date.strip()).date()

    # gonna have to go via html ...
    download_url = tree.xpath("//a[text()='xls']/@href")[0]
    this_distribution.downloadURL = download_url

    media_type = Excel
    this_distribution.mediaType = media_type

    this_distribution.title = scraper.dataset.title
    this_distribution.description = scraper.dataset.description
    this_distribution.contactPoint = scraper.dataset.contactPoint

    logging.debug(
        "Created distribution for download '{}'.".format(download_url))
    scraper.distributions.append(this_distribution)
Code Example #11
File: lcc.py Project: GSS-Cogs/gss-utils
def scrape(scraper, tree):
    """
    Scraper for https://www.lowcarboncontracts.uk/data-portal/dataset/*

    Example: https://www.lowcarboncontracts.uk/data-portal/dataset/actual-ilr-income
    """

    article = assert_get_one(tree.xpath('//article'), "article element")

    title_element = assert_get_one(article.xpath('./div/h1'), 'title element')
    scraper.dataset.title = title_element.text.strip()

    description_elements = article.xpath('./div/div/p')
    scraper.dataset.description = "\n\n".join(
        [x.text.strip() for x in description_elements])

    issued_element = assert_get_one(
        article.xpath('./div/section/table/tbody/tr[1]/td/span'),
        "issued element")
    scraper.dataset.issued = parse(issued_element.text.split("(")[0].strip())

    scraper.dataset.license = "http://reference.data.gov.uk/id/open-government-licence"

    for resource in assert_get_one(article.xpath('./div/section[1]/ul[1]'),
                                   "resource list").xpath('./li/a'):

        distro = Distribution(scraper)

        url = f'https://www.lowcarboncontracts.uk/{resource.get("href")}'
        resp = scraper.session.get(url)
        if resp.status_code != 200:
            raise Exception(f'Failed to get url resource {url}')

        distro_tree = html.fromstring(resp.text)
        section = assert_get_one(
            distro_tree.xpath(
                '/html[1]/body[1]/div[3]/div[1]/div[3]/section[1]'),
            "section of distro")

        distro_title_element = assert_get_one(section.xpath('./div/h1'),
                                              "distro title")
        distro.title = distro_title_element.text

        distro_description_element = assert_get_one(
            section.xpath('./div/div/blockquote[1]'), "distro description")
        distro.description = distro_description_element.text

        distro_download_url_element = assert_get_one(
            section.xpath('./div/p/a'), "download url")
        distro.downloadURL = distro_download_url_element.text

        # Note: issued is the one thing not in the "section" element, so xpathing the whole distro tree
        distro_issued_element = assert_get_one(
            distro_tree.xpath('//table[1]/tbody[1]/tr[2]/td[1]'), "issued")
        distro.issued = parse(distro_issued_element.text)

        media_type, _ = mimetypes.guess_type(distro.downloadURL)
        distro.mediaType = media_type if media_type is not None else CSV  # the default/not-specified offering is csv

        scraper.distributions.append(distro)
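
assert_get_one is used in several of these examples but its definition is not shown. Based purely on how it is called (an XPath result list plus a human-readable label), a hypothetical reconstruction might look like the following; treat it as an assumption, not the gss-utils implementation:

def assert_get_one(elements, label):
    """Return the single element of an XPath result list, failing loudly otherwise."""
    assert len(elements) == 1, f"Expected exactly one {label}, got {len(elements)}."
    return elements[0]
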
Code Example #12
File: nrscotland.py Project: jwestw/gss-utils
def scrape(scraper, tree):
    scraper.dataset.publisher = GOV['national-records-of-scotland']
    scraper.dataset.title = tree.xpath(
        '//div[@property = "dc:title"]/h2/text()')[0].strip()
    after_background = tree.xpath(
        '//h3[contains(descendant-or-self::*[text()], "Background")]/following-sibling::*'
    )
    description_nodes = []
    for node in after_background:
        if node.tag != 'h3':
            description_nodes.append(node)
        else:
            break
    scraper.dataset.description = scraper.to_markdown(description_nodes)
    data_nodes = tree.xpath(
        '//h3[contains(descendant-or-self::*[text()], "Data")]/following-sibling::*/child::strong'
    )
    for node in data_nodes:
        for anchor in node.xpath('following-sibling::a'):
            file_type = anchor.text.strip().lower()
            if file_type in ['excel', 'csv']:
                distribution = Distribution(scraper)
                distribution.downloadURL = urljoin(scraper.uri,
                                                   anchor.get('href'))
                distribution.title = node.text.strip()
                distribution.mediaType = {
                    'csv': 'text/csv',
                    'excel': 'application/vnd.ms-excel'
                }.get(file_type,
                      mimetypes.guess_type(distribution.downloadURL)[0])
                scraper.distributions.append(distribution)
Code Example #13
File: govwales.py Project: jwestw/gss-utils
def scrape(scraper, tree):
    # It's not clear whether the pages are collections of datasets or datasets with distributions.
    # Assume the latter for simplicity for now.
    scraper.dataset.publisher = GOV['welsh-government']
    # OGLv3 license is quoted for the whole site on https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/" + \
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/" + \
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        meta_div = article.xpath("div[@class = 'index-list__meta']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"
        ):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            distribution.issued = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
                )[0])
            distribution.modified = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
                )[0])
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()
            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    distribution.byteSize = int(size * 1000)
            else:
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
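
FILE_TYPE_AND_SIZE_RE is defined at module level and not shown here. From the way its groups are used above (group 1 compared against 'ODS', group 2 a numeric size, group 3 a 'KB'/'kB' qualifier), a hypothetical stand-in could be the following; the exact format of the gov.wales "type, size" span text is an assumption:

import re

FILE_TYPE_AND_SIZE_RE = re.compile(r'(\w+),?\s+([0-9.]+)\s*(kB|KB|MB)')
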
Code Example #14
File: main.py Project: GSS-Cogs/family-covid-19
def opendata_nhs(scraper, tree):

    # TODO - this feels more like a catalogue than a list of distributions, investigate

    # Populate the dataset
    details = tree.xpath('//tr/td[@class="dataset-details"]')

    dates = tree.xpath('//span[@class="automatic-local-datetime"]/text()')
    date_updated = parse(" ".join([
        x.replace("\n", "").replace("(BST)", "").strip()
        for x in dates[0].split(" ")
    ]))
    date_created = parse(" ".join([
        x.replace("\n", "").replace("(BST)", "").strip()
        for x in dates[1].split(" ")
    ]))

    # Populate distributions
    distro_resources = tree.xpath('//li[@class="resource-item"]')
    for dr in distro_resources:

        download = dr.xpath(
            'div/ul/li/a[contains(@class, "resource-url-analytics")]/@href')[0]

        # Need to go to the preview page for full description and title as they've helpfully truncated both...
        preview_url = "https://www.opendata.nhs.scot" + dr.xpath(
            'div/ul[@class="dropdown-menu"]/li/a/@href')[0]
        r = scraper.session.get(preview_url)
        if r.status_code != 200:
            raise Exception(
                "Unable to follow url to get full description, url: '{}', status code '{}'."
                .format(preview_url, r.status_code))

        preview_tree = html.fromstring(r.text)
        description1 = preview_tree.xpath(
            '//div[contains(@class, "prose notes")]/p/text()')[0]
        # Some (but not all) descriptions have some additional italic information
        try:
            description2 = preview_tree.xpath(
                '//div[contains(@class, "prose notes")]/p/em/text()')[0]
        except IndexError:
            description2 = ""

        description = (description1 + "\n\n" + description2).strip("\n")

        title = preview_tree.xpath('//title/text()')[0]
        this_distribution = Distribution(scraper)

        this_distribution.issued = date_updated
        this_distribution.downloadURL = download
        this_distribution.mediaType = CSV

        this_distribution.title = title.strip()
        this_distribution.description = description

        scraper.distributions.append(this_distribution)
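
As the TODO above hints, this page looks like a catalogue. The markup ("dataset-details", "resource-item") suggests a CKAN portal, and if so the untruncated titles and descriptions are normally available as JSON from CKAN's action API rather than by scraping each preview page. A hedged sketch under that assumption (the endpoint shape and payload keys are standard CKAN, not taken from the original scraper; Distribution and CSV come from the surrounding module):

def opendata_nhs_via_api(scraper):
    # Assume the dataset name is the last path segment of the landing page URI.
    dataset_name = scraper.uri.rstrip('/').split('/')[-1]
    r = scraper.session.get(
        "https://www.opendata.nhs.scot/api/3/action/package_show",
        params={"id": dataset_name})
    r.raise_for_status()
    package = r.json()["result"]
    for resource in package.get("resources", []):
        dist = Distribution(scraper)
        dist.title = resource.get("name", "").strip()
        dist.description = resource.get("description", "")
        dist.downloadURL = resource.get("url")
        dist.mediaType = CSV
        scraper.distributions.append(dist)
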
Code Example #15
def content_api_publication(scraper, metadata):
    ds = Dataset(scraper.uri)
    if 'title' in metadata:
        ds.title = metadata['title']
    if 'description' in metadata:
        ds.comment = metadata['description']
    if 'details' in metadata:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(metadata["details"]["body"])
    if 'api_url' in metadata:
        doc_info = scraper.session.get(metadata['api_url']).json()
    else:
        doc_info = metadata
    if 'first_published_at' in doc_info:
        ds.issued = datetime.fromisoformat(doc_info['first_published_at'])
    if 'public_updated_at' in doc_info:
        ds.modified = datetime.fromisoformat(doc_info['public_updated_at'])
    if 'description' in doc_info:
        ds.comment = doc_info['description']
    if 'details' in doc_info:
        # TODO, depends on outcome of https://github.com/GSS-Cogs/gss-utils/issues/308
        ds.description = html2text.html2text(doc_info["details"]["body"])
    if 'links' in doc_info and 'organisations' in doc_info['links']:
        orgs = doc_info['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            ds.publisher = orgs[0]["web_url"]
    if 'details' in doc_info and 'attachments' in doc_info['details']:
        distributions = []
        for attachment in doc_info['details']['attachments']:
            dist = Distribution(scraper)
            if 'url' in attachment:
                dist.downloadURL = urljoin('https://www.gov.uk/',
                                           attachment['url'])
            if 'title' in attachment:
                dist.title = attachment['title']
            if 'file_size' in attachment:
                dist.byteSize = attachment['file_size']
            if 'content_type' in attachment:
                dist.mediaType = attachment['content_type']
            distributions.append(dist)
        ds.distribution = distributions
    elif 'details' in doc_info and 'documents' in doc_info['details']:
        distributions = []
        for link in doc_info['details']['documents']:
            link_tree = html.fromstring(link)
            extract_distributions(distributions, link_tree, scraper)
        ds.distribution = distributions
    return ds
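
For context, a hedged sketch of how a handler like content_api_publication might be driven: GOV.UK pages generally have a JSON content item at https://www.gov.uk/api/content/<path>, so a dispatcher could fetch that and route on its document_type. The routing values below are illustrative assumptions, not taken from the source:

from urllib.parse import urlparse

def scrape_via_content_api(scraper):
    path = urlparse(scraper.uri).path
    metadata = scraper.session.get(f"https://www.gov.uk/api/content{path}").json()
    if metadata.get("document_type") in ("official_statistics", "national_statistics", "statistical_data_set"):
        return content_api_publication(scraper, metadata)
    raise ValueError(f"No handler for document_type {metadata.get('document_type')!r}")
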
Code Example #16
def scrape_ots_reports(scraper, tree):
    scraper.catalog.title = tree.xpath("//title/text()")[0].strip()
    scraper.catalog.dataset = []
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.publisher = GOV['hm-revenue-customs']
    scraper.catalog.rights = "https://www.uktradeinfo.com/AboutUs/Pages/TermsAndConditions.aspx"
    # from above terms, link to crown copyright at the National Archives says default license is OGL
    scraper.catalog.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"
    dataset_titles = {}
    table = tree.xpath("//h1/following-sibling::table")[0]
    header = True
    columns = []
    for row in table.xpath("tbody/tr"):
        if header:
            columns = [t.strip() for t in row.xpath("th/text()")]
            header = False
        else:
            publication_date = None
            for k, v in zip(columns, row.xpath("td")):
                if k == 'Published':
                    try:
                        if v.text is not None:
                            publication_date = parse(v.text.strip().strip(u'\u200B\ufeff'), dayfirst=True)
                    except ValueError as e:
                        logging.warning(f"Unable to parse published date {e}")
                elif k == 'Report':
                    links = v.xpath('a')
                    if len(links) > 0:
                        if links[0].get('href').startswith('https://www.gov.uk/government/statistics/'):
                            logging.warning(f'Dataset is published at gov.uk, see {links[0].get("href")}')
                            continue
                        title = links[0].text.strip().strip(u'\u200B\ufeff')
                        if title not in dataset_titles:
                            dataset = Dataset(scraper.uri)
                            if publication_date is not None:
                                dataset.issued = publication_date
                            dataset.publisher = scraper.catalog.publisher
                            dataset.license = scraper.catalog.license
                            dataset.title = links[0].text.strip().strip(u'\u200B\ufeff')
                            dataset.distribution = []
                            dataset_titles[title] = dataset
                        else:
                            dataset = dataset_titles[title]
                            if publication_date is not None and publication_date > dataset.issued:
                                dataset.issued = publication_date
                        for dist_link in links:
                            dist = Distribution(scraper)
                            dist.title = dist_link.text.strip().strip(u'\u200B\ufeff')
                            dist.downloadURL = urljoin(scraper.uri, dist_link.get('href'))
                            dist.mediaType, encoding = mimetypes.guess_type(dist.downloadURL)
                            dataset.distribution.append(dist)
    if len(dataset_titles) > 0:
        scraper.catalog.dataset = list(dataset_titles.values())
Code Example #17
def handler_static_adhoc(scraper, landing_page, tree):

    # A static adhoc is a one-off unscheduled release
    # These pages should be simpler and should lack the historical distributions

    for download in landing_page["downloads"]:

        title = download["title"]
        file = download["file"]

        # Create an empty Distribution object to represent this distribution
        # from here we're just looking to fill in its fields
        this_distribution = Distribution(scraper)

        # if we can't get the release date, continue but throw a warning.
        try:
            this_distribution.issued = parse(
                landing_page["description"]["releaseDate"]).date()
        except KeyError:
            logging.warning("Unable to acquire or parse release date")

        download_url = ONS_DOWNLOAD_PREFIX + landing_page["uri"] + "/" + file
        this_distribution.downloadURL = download_url

        # TODO - we're doing this in two places, pull it out
        # we've had some issues with type-guessing so we're getting the media type
        # by checking the download url ending
        if download_url.endswith(".csdb"):
            media_type = CSDB
        elif download_url.endswith(".csv"):
            media_type = CSV
        elif download_url.endswith(".xlsx"):
            media_type = Excel
        elif download_url.endswith(".ods"):
            media_type = ODS
        else:
            media_type, _ = mimetypes.guess_type(download_url)
        this_distribution.mediaType = media_type

        this_distribution.title = title

        # inherit metadata from the dataset where it hasn't explicitly been changed
        this_distribution.description = scraper.dataset.description

        logging.debug(
            "Created distribution for download '{}'.".format(download_url))
        scraper.distributions.append(this_distribution)
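
Following up on the TODO above, a hedged sketch of the pulled-out helper for mapping a download URL's extension to a media type (names are illustrative, not from gss-utils; CSDB, CSV, Excel and ODS are the constants already used in this module):

import mimetypes

_EXTENSION_MEDIA_TYPES = {
    ".csdb": CSDB,
    ".csv": CSV,
    ".xlsx": Excel,
    ".ods": ODS,
}

def media_type_from_url(download_url):
    """Return the media type for a download URL, preferring known extensions over guessing."""
    for extension, media_type in _EXTENSION_MEDIA_TYPES.items():
        if download_url.endswith(extension):
            return media_type
    guessed, _ = mimetypes.guess_type(download_url)
    return guessed
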
Code Example #18
File: nisra.py Project: jwestw/gss-utils
def scrape(scraper, tree):
    scraper.dataset.title = tree.xpath("//h1/text()")[0].strip()
    scraper.dataset.issued = parse(
        tree.xpath(
            "//span[text() = 'Date published: ']/following-sibling::span/text()"
        )[0].strip(),
        dayfirst=True).date()
    scraper.dataset.keyword = ', '.join(
        tree.xpath(
            "//div[text()='Statistics: ']/following-sibling::ul/li/a/text()"))
    scraper.dataset.description = scraper.to_markdown(
        tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDetails ')]/div[@class='summary']/div/*"
        ))
    scraper.dataset.publisher = str(
        GOV["northern-ireland-statistics-and-research-agency"])
    for anchor in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' publicationDocs ')]/div[@class='summary']/div//a"
    ):
        dist = Distribution(scraper)
        dist.title = anchor.xpath('text()')[0].strip()
        dist.downloadURL = anchor.get('href')
        type_size_re = re.compile(r'(.*?)\s*\(([^)]+)\)')
        m = type_size_re.match(anchor.xpath('span/text()')[0].strip())
        if m:
            if m.group(1) == 'Excel':
                dist.mediaType = Excel
            else:
                dist.mediaType, encoding = mimetypes.guess_type(
                    dist.downloadURL)
            size = m.group(2)
            if size.strip() != '':
                if size.upper().endswith(
                        ' KB'
                ):  # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size[:-3]) * 1024)
                elif size.upper().endswith(
                        ' MB'
                ):  # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size[:-3]) * 1000000)
        scraper.distributions.append(dist)
Code Example #19
def scrape(scraper, tree):
    page = StringIO(scraper.session.get(scraper.uri).text)
    pageGraph = _parse_rdfa_to_graph(page)
    # pageGraph.parse(page, format="html")
    dataset = pageGraph.value(predicate=RDF.type,
                              object=DCAT.Dataset,
                              any=False)
    scraper.dataset.title = pageGraph.value(dataset,
                                            DCTERMS.title).value.strip()
    scraper.dataset.comment = pageGraph.value(
        dataset, DCTERMS.description).value.strip()
    license = str(pageGraph.value(dataset, DCTERMS.license))
    if license == "http://www.nationalarchives.gov.uk/doc/open-government-licence":
        scraper.dataset.license = (
            "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"
        )
    else:
        scraper.dataset.license = license
    publisher = pageGraph.value(dataset, DCTERMS.publisher)
    if publisher == URIRef("http://www.gov.wales/"):
        scraper.dataset.publisher = GOV["welsh-government"]
    else:
        scraper.dataset.publisher = publisher
    scraper.dataset.issued = parse(pageGraph.value(dataset, DCTERMS.created),
                                   dayfirst=True).date()
    scraper.dataset.modified = parse(pageGraph.value(dataset,
                                                     DCTERMS.modified),
                                     dayfirst=True).date()
    for pageDist in pageGraph.subjects(RDF.type, DCAT.Distribution):
        dist = Distribution(scraper)
        dist.title = pageGraph.value(pageDist, DCTERMS.title).value.strip()
        # Access URLs seem to have spaces in their query strings
        url_parts = urlparse(pageGraph.value(pageDist, DCAT.accessURL))
        dist.downloadURL = url_parts._replace(
            query=url_parts.query.replace(" ", "+")).geturl()
        dist.mediaType = pageGraph.value(pageDist,
                                         DCAT.mediaType).value.strip()
        scraper.distributions.append(dist)
Code Example #20
File: isd_scotland.py Project: jwestw/gss-utils
def scrape(scraper, tree):
    size_re = re.compile(r'\[([0-9]+)(kb|Mb)\]')
    scraper.catalog.title = tree.xpath('//h2/text()')[0].strip()
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.rights = 'http://www.isdscotland.org/Copyright.asp'
    scraper.catalog.publisher = GOV['information-services-division-scotland']
    title2dataset = {}

    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for record in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' pubtitlel ')]"):
        dataset_title = record.text.strip()
        if dataset_title not in title2dataset:
            dataset = Dataset(scraper.uri)
            dataset.title = dataset_title
            dataset.publisher = scraper.catalog.publisher
            dataset.rights = scraper.catalog.rights
            dataset.distribution = []
            title2dataset[dataset_title] = dataset
        else:
            dataset = title2dataset[dataset_title]

        datatables_urls = record.xpath(
            "following-sibling::table/descendant::tr[td["
            "contains(text(), 'Data Tables')]]/td["
            "contains(concat(' ', @class, ' '), 'pubcontentr')]/a/@href")
        if len(datatables_urls) == 0:
            continue
        doc_url, frag = urldefrag(urljoin(scraper.uri, datatables_urls[0]))
        # pages appear to have a redundant query parameter that duplicates the fragment id
        doc_url_bits = urlparse(doc_url)
        if doc_url_bits.query is not None and doc_url_bits.query == f'id={frag}':
            doc_url = doc_url_bits._replace(query=None).geturl()
        doc_tree = fetch_page(doc_url)
        anchors = doc_tree.xpath(f"//a[@id='{frag}' or @name='{frag}']")
        if len(anchors) == 0:
            logging.warning(f"Broken link to dataset {datatables_urls[0]}")
            continue

        # publication date is in paragraph before!
        # this is actually the issued date of the distribution
        published = anchors[0].xpath(
            "../preceding-sibling::p[1]/child::*/text()")
        dist_issued = None
        if len(published) > 0 and published[0].startswith('Published '):
            dist_issued = parse(published[0][len('Published '):],
                                dayfirst=True)
            # we'll use the latest publication date for the dataset
            if not (hasattr(dataset, 'issued')
                    and dist_issued <= dataset.issued):
                dataset.issued = dist_issued
        dist_rows = anchors[0].xpath(
            "../following-sibling::table[1]/descendant::tr")
        for row in dist_rows:
            distribution = Distribution(scraper)
            cells = row.xpath('td')
            if len(cells) == 4:
                title_node, download_node, type_node, size_node = cells
            elif len(cells) == 3:
                title_node, download_node, type_node = cells
                size_node = None
            else:
                break
            distribution.title = title_node.text
            if dist_issued is not None:
                distribution.issued = dist_issued
            distribution.downloadURL = download_node[0].get('href')
            type_image = type_node[0].get('src').lower()
            if 'excel' in type_image:
                distribution.mediaType = Excel
            elif 'swf' in type_image:
                distribution.mediaType = 'application/vnd.adobe.flash.movie'
            else:
                distribution.mediaType, encoding = mimetypes.guess_type(
                    distribution.downloadURL)
            if size_node is not None and size_node.text is not None:
                size_match = size_re.match(size_node.text)
                if size_match:
                    if size_match.group(2) == 'Mb':  # should be MB
                        distribution.byteSize = int(
                            size_match.group(1)
                        ) * 1000000  # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    elif size_match.group(
                            2
                    ) == 'kb':  # should be either kB or KB    https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                        distribution.byteSize = int(size_match.group(1)) * 1000
            dataset.distribution.append(distribution)

    scraper.catalog.dataset = list(title2dataset.values())
Code Example #21
def statistics_handler(scraper, tree):
    scraper.dataset.publisher = GOV['national-records-of-scotland']
    scraper.dataset.title = tree.xpath('//div[@property = "dc:title"]/h2/text()')[0].strip()
    scraper.dataset.description = tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[2]/text()')[0].strip()

    contact = tree.xpath('//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p[10]/a')
    
    for i in contact:
        scraper.dataset.contactPoint = i.attrib['href']

    if tree.xpath(".//a[text()='Excel']") or tree.xpath(".//a[text()='CSV']"):

        nodes = tree.xpath(".//a[text()='Excel']") + tree.xpath(".//a[text()='CSV']")
        
        for node in nodes:
            file_type = node.text.lower()
            if file_type in ['excel', 'csv']:
                distribution = Distribution(scraper)
                try:
                    distribution.title = node.getparent().xpath('.//strong/text()')[0].strip()
                except Exception:
                    distribution.title = scraper.dataset.title + ' ' + node.text
                distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
                if 'Last update' in tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()'):
                    distribution.issued = parse(
                        tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()')[0]).date()
                else:
                    try:
                        distribution.issued = parse(tree.xpath("//*[contains(text(),'Updated')]/text()")[0].lower().replace('last updated:', '')).date()
                    except Exception:
                        logging.warning("No Last Issue Date Found. Please update manually")
                        pass

                distribution.mediaType = {
                    'csv': 'text/csv',
                    'excel': mimetypes.guess_type(distribution.downloadURL)[0]
                }.get(
                    file_type,
                    mimetypes.guess_type(distribution.downloadURL)[0]
                )
                if distribution.mediaType in ACCEPTED_MIMETYPES:
                    scraper.distributions.append(distribution)
                else:
                    pass

    elif tree.findall('.//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p/a'):
        for publication in tree.findall('.//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p/a'):
            if publication.attrib['href'].startswith('/statistics-and-data/statistics/'):
                url = urljoin("https://www.nrscotland.gov.uk/", publication.attrib['href'])
                r = scraper.session.get(url)
                if r.status_code != 200:
                    raise Exception(
                        'Failed to get url {url}, with status code "{status_code}".'.format(url=url,
                                                                                            status_code=r.status_code))
                pubTree = html.fromstring(r.text)

                if pubTree.xpath(".//a[text()='Excel']") or pubTree.xpath(".//a[text()='CSV']"):
                    nodes = pubTree.xpath(".//a[text()='Excel']") + pubTree.xpath(".//a[text()='CSV']")

                    for node in nodes:
                        file_type = node.text.lower()
                        if file_type in ['excel', 'csv']:
                            distribution = Distribution(scraper)
                            distribution.title = scraper.dataset.title + ' ' + publication.text + ' ' + node.text
                            distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
                            if 'Last update' in pubTree.xpath(
                                    '//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p/strong/text()')[0]:
                                distribution.issued = parse(pubTree.xpath(
                                    '//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()')[0]).date()
                            else:
                                try:
                                    distribution.issued = parse(re.search(r'\(([^)]+)', publication.getparent().text_content()).group(1)).date()
                                except Exception:
                                    pass
                            distribution.mediaType = {
                                'csv': 'text/csv',
                                'excel': 'application/vnd.ms-excel'
                            }.get(
                                file_type,
                                mimetypes.guess_type(distribution.downloadURL)[0]
                            )
                            if distribution.mediaType in ACCEPTED_MIMETYPES:
                                scraper.distributions.append(distribution)
                            else:
                                pass
                    else:
                        pass
            else:
                pass
    else:

        for dataset in tree.xpath(".//*[@href[contains(.,'/files/statistics/')]]"):

            distribution = Distribution(scraper)
            distribution.title = dataset.text
            distribution.downloadURL = dataset.attrib['href']
            distribution.mediaType, encoding = mimetypes.guess_type(distribution.downloadURL)
            logging.warning("No Last Issue Date Found. Please update manually")
            if distribution.mediaType in ACCEPTED_MIMETYPES:
                scraper.distributions.append(distribution)
            else:
                pass
Code Example #22
def content_api_guidance(scraper, metadata):

    title = metadata.get("title", None)
    if title is None:
        logging.warning(
            f'The title for dataset {scraper.url} not set, title field missing from content api'
        )
    else:
        scraper.dataset.title = title

    description = metadata.get("description", None)
    if description is None:
        logging.warning(
            f'The description for dataset {scraper.url} not set, description field missing from content api'
        )
    else:
        scraper.dataset.description = description

    first_published_at = metadata.get("first_published_at", None)
    if first_published_at is None:
        logging.warning(
            f'The issued date for dataset {scraper.url} not set, issued date field missing from content api'
        )
    else:
        scraper.dataset.issued = first_published_at

    public_updated_at = metadata.get("public_updated_at", None)
    if public_updated_at is None:
        logging.warning(
            f'The modified date for dataset {scraper.url} not set, modified date field missing from content api'
        )
    else:
        scraper.dataset.modified = public_updated_at

    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            scraper.dataset.publisher = orgs[0]["web_url"]

    for attachment in metadata['details']['attachments']:
        try:
            distro = Distribution(scraper)

            dist_title = attachment.get('title')
            distro.title = dist_title

            distro.downloadURL = attachment['url']

            distro.mediaType, _ = mimetypes.guess_type(distro.downloadURL)

            distro.issued = scraper.dataset.issued

            distro.modified = scraper.dataset.modified

            scraper.distributions.append(distro)

        except KeyError:
            logging.warning(
                f'Failed to extract attachment {json.dumps(attachment, indent=2)}'
            )
Code Example #23
def scrape_dataset(scraper, dataset_uri: str, contact_point: str, identifier: str) -> Dataset:
    """
    Populate a single dataset using a single dataset page.
    Example page: https://oifdata.defra.gov.uk/2-1-1/
    """

    dataset = Dataset(scraper.uri)

    r: Response = scraper.session.get(dataset_uri)
    if not r.ok:
        logging.warning(f'Failed to get dataset {dataset_uri} with status code {r.status_code}')
        return None

    tree: HtmlElement = html.fromstring(r.text)

    title_element: HtmlElement = assert_get_one(tree.xpath('//h1'), 'title of dataset')
    dataset.title = title_element.text_content().strip()

    # To create the description, starting with the first <div> of the page content,
    # we want the text from all the paragraph <p> elements
    # between the first and second headings <h2> elements.
    page_content_elements: HtmlElement = assert_get_one(tree.xpath("//div[@id='page-content']/div"), 
        'element containing bulk of page written content')

    heading_count = 0
    description_text = ""
    for element in page_content_elements:

        if element.tag.startswith("h"):
            heading_count +=1
        elif element.tag == "p":
            description_text += element.text_content() + "\n"
        
        if heading_count == 2:
            break

    dataset.description = description_text

    try:
        dataset.license = assert_get_one(tree.xpath("//div[@id='oglLicense']/a"), "licence in use").get("href")
    except Exception:
        dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"

    # we want the text from a table row <tr> that contains a table header <th> of text "Date last updated"
    issued_row_element = assert_get_one(tree.xpath("//tr/th[contains(text(),'Date last updated')]/parent::*"),
        'table row that contains header text of "Date last updated"')
    time_as_text = assert_get_one(issued_row_element.xpath('./td[1]'), 'Time from row text').text_content()
    dataset.issued = parse(time_as_text)

    dataset.contactPoint = "mailto:"+contact_point

    dataset.publisher = GOV["department-for-environment-food-rural-affairs"]

    # There's only one distribution of data and that's the source csv.
    distribution = Distribution(scraper)

    distribution.title = " ".join(dataset.title.split(" ")[1:])
    distribution.downloadURL = urljoin(scraper.uri, f'/en/data/{identifier}.csv')
    distribution.issued = dataset.issued

    distribution.mediaType, _ = mimetypes.guess_type(distribution.downloadURL)

    dataset.distribution = [distribution]

    return dataset
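
assert_get_one is a project helper not shown in this snippet; a minimal sketch consistent with how it is called above (return the single element of an xpath result, fail loudly otherwise) might look like this, though the real helper may differ:

def assert_get_one(elements, label: str):
    """Sketch only: take an xpath result that should hold exactly one element
    and return it, with a readable failure message otherwise."""
    assert len(elements) == 1, \
        "Expected exactly one element for {}, got {}".format(label, len(elements))
    return elements[0]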
Code example #24
def handler_dataset_landing_page(scraper, landing_page, tree):

    # A dataset landing page has URIs to one or more datasets via its "datasets" field.
    # We need to look at each in turn; this is an example one as json:
    # https://www.ons.gov.uk//businessindustryandtrade/internationaltrade/datasets/uktradeingoodsbyclassificationofproductbyactivity/current/data
    for dataset_page_url in landing_page["datasets"]:

        # Get the page as json. Throw an informative error if we fail for whatever reason
        dataset_page_json_url = ONS_PREFIX + dataset_page_url["uri"] + "/data"
        r = scraper.session.get(dataset_page_json_url)
        if r.status_code != 200:
            raise ValueError("Scrape of url '{}' failed with status code {}." \
                             .format(dataset_page_json_url, r.status_code))

        # get the response json into a python dict
        this_dataset_page = r.json()

        # start a list of dataset versions (to hold current + all previous) as a list
        # we'll start with just the current/latest version
        versions_list = [ONS_PREFIX + this_dataset_page["uri"] + "/data"]

        # If there are older versions of this dataset available,
        # iterate and add their URIs to the versions list
        try:
            for version_as_dict in this_dataset_page["versions"]:
                versions_list.append(ONS_PREFIX + version_as_dict["uri"] +
                                     "/data")
        except KeyError:
            logging.debug(
                "No older versions found for {}.".format(dataset_page_url))

        # NOTE - we've had an issue with the very latest dataset not being updated on the previous versions
        # page (the page we're getting the distributions from) so we're taking the details for it from
        # the landing page to use as a fallback in that scenario.

        # iterate through the lot, we're aiming to create at least one distribution object for each
        for i, version_url in enumerate(versions_list):
            logging.debug(
                "Identified distribution url, building distribution object for: "
                + version_url)

            r = scraper.session.get(version_url)
            if r.status_code != 200:

                # If we've got a 404 on the latest, fallback on using the details from the
                # landing page instead
                if r.status_code == 404 and i == len(versions_list) - 1:
                    handler_dataset_landing_page_fallback(
                        scraper, this_dataset_page, tree)
                    continue
                else:
                    raise Exception("Scraper unable to acquire the page: {} with http code {}." \
                                .format(version_url, r.status_code))

            # get the response json into a python dict
            this_page = r.json()

            # Get the download urls, if there's more than 1 format of this version of the dataset
            # each forms a separate distribution
            distribution_formats = this_page["downloads"]
            for dl in distribution_formats:

                # Create an empty Distribution object to represent this distribution;
                # from here we're just looking to fill in its fields
                this_distribution = Distribution(scraper)

                # Every distribution SHOULD have a release date, but it seems they're not
                # always included. If it happens continue but throw a warning.
                try:
                    release_date = this_page["description"]["releaseDate"]
                    this_distribution.issued = parse_as_local_date(
                        release_date.strip())
                except KeyError:
                    logging.warning(
                        "Download {}. Of datasset versions {} of dataset {} does not have "
                        "a release date".format(distribution_formats,
                                                version_url, dataset_page_url))

                # I don't trust dicts with one constant field (they don't make sense), so just in case...
                try:
                    download_url = ONS_DOWNLOAD_PREFIX + this_page[
                        "uri"] + "/" + dl["file"].strip()
                    this_distribution.downloadURL = download_url
                except Exception:
                    # raise up this time. If we don't have a downloadURL it's not much use
                    raise ValueError("Unable to create complete download url for {} on page {}" \
                                     .format(dl, version_url))

                # we've had some issues with type-guessing so we're getting the media type
                # by checking the download url ending
                if download_url.endswith(".csdb"):
                    media_type = CSDB
                elif download_url.endswith(".csv"):
                    media_type = CSV
                elif download_url.endswith(".xlsx"):
                    media_type = Excel
                elif download_url.endswith(".ods"):
                    media_type = ODS
                else:
                    media_type, _ = mimetypes.guess_type(download_url)

                this_distribution.mediaType = media_type

                # inherit metadata from the dataset where it hasn't explicitly been changed
                this_distribution.title = scraper.dataset.title
                this_distribution.description = scraper.dataset.description

                logging.debug("Created distribution for download '{}'.".format(
                    download_url))
                scraper.distributions.append(this_distribution)
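
parse_as_local_date is another project helper not shown here; a plausible sketch (assuming ONS timestamps should be read as dates in UK local time, with UTC assumed when no offset is given) would be the following, though the real helper may differ in detail:

from dateutil.parser import parse
from dateutil.tz import gettz

def parse_as_local_date(text: str):
    """Sketch only: read an ONS timestamp string and return the date as seen in the UK."""
    dt = parse(text.strip())
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=gettz("UTC"))   # assumption: treat naive timestamps as UTC
    return dt.astimezone(gettz("Europe/London")).date()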
Code example #25
File: onscmd.py  Project: jwestw/gss-utils
def scrape(scraper, tree):
    """
    This is a scraper intended to use the ONS cmd (customise my data) functionality.

    :param scraper:         the Scraper object
    :param tree:            lxml tree
    :return:
    """

    dataset_document = request_json_data(scraper, scraper.uri)

    scraper.dataset.title = dataset_document["id"]
    scraper.dataset.description = dataset_document["description"]

    # Need to get issued from the associated publication
    publication_document = request_json_data(
        scraper, dataset_document["publications"][0]["href"] + "/data")
    scraper.dataset.issued = parse(
        publication_document["description"]["releaseDate"])

    # Only take next_release if it's a date
    try:
        next_release = parse(dataset_document["next_release"])
        scraper.dataset.updateDueOn = next_release
    except Exception:
        pass  # it's fine, "unknown" etc

    # Theoretically you can have more than one contact, but I'm just taking the first
    scraper.dataset.contactPoint = "mailto:" + dataset_document["contacts"][0][
        "email"].strip()

    scraper.dataset.publisher = 'https://www.gov.uk/government/organisations/office-for-national-statistics'
    scraper.dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"

    edition_documents = request_json_data(scraper, scraper.uri + "/editions")

    for edition_document in edition_documents["items"]:

        edition_name = edition_document["edition"]

        version_documents = request_json_data(
            scraper, edition_document["links"]["versions"]["href"])

        for version_document in version_documents["items"]:

            version_name = str(version_document["version"])

            this_distribution = Distribution(scraper)

            this_distribution.issued = version_document["release_date"]
            this_distribution.downloadURL = version_document["downloads"][
                "csv"]["href"]
            this_distribution.mediaType = CSV

            this_distribution.title = scraper.dataset.title + ", {}, version {}".format(
                edition_name, version_name)
            this_distribution.description = scraper.dataset.description
            this_distribution.contactPoint = scraper.dataset.contactPoint

            logging.debug("Created distribution for download '{}'.".format(
                this_distribution.downloadURL))
            scraper.distributions.append(this_distribution)
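
request_json_data wraps the session GET calls used throughout this scraper but isn't shown in the snippet; a minimal sketch of what it presumably does (fetch, fail loudly on a non-200, return the decoded JSON) is:

def request_json_data(scraper, url: str) -> dict:
    """Sketch of the helper assumed above: GET a url via the scraper's session
    and return the parsed JSON body. The real gss-utils helper may differ
    (e.g. retries or caching)."""
    response = scraper.session.get(url)
    if response.status_code != 200:
        raise ValueError(
            "Scrape of url '{}' failed with status code {}.".format(url, response.status_code))
    return response.json()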
Code example #26
def content_api_sds(scraper, metadata):
    # publications are in the details/body HTML
    # they look to be a collection of datasets

    if 'title' in metadata:
        scraper.catalog.title = metadata['title']
    if 'description' in metadata:
        scraper.catalog.description = metadata['description']
    if 'first_published_at' in metadata:
        scraper.catalog.issued = datetime.fromisoformat(metadata['first_published_at'])
    if 'public_updated_at' in metadata:
        scraper.catalog.modified = datetime.fromisoformat(metadata['public_updated_at'])
    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning('More than one organisation listed, taking the first.')
            scraper.catalog.publisher = orgs[0]["web_url"]
    scraper.catalog.dataset = []
    if 'details' in metadata and 'body' in metadata['details']:
        body_tree = html.fromstring(metadata['details']['body'])
        # look for the same HTML as is used in content_api_publication, but here
        # joined into one blob
        sections = body_tree.xpath("//section[contains(concat(' ', @class, ' '), ' attachment ')]")
        if len(sections) > 0:
            ds = Dataset(scraper.uri)
            ds.title = scraper.catalog.title
            ds.description = scraper.catalog.description
            ds.publisher = scraper.catalog.publisher
            ds.issued = scraper.catalog.issued
            ds.modified = scraper.catalog.modified
            email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
            if email_anchor is not None:
                ds.contactPoint = email_anchor.get('href')
            ds.distribution = []
            for link_tree in sections:
                extract_distributions(ds.distribution, link_tree, scraper)
            scraper.catalog.dataset.append(ds)
            scraper.select_dataset(latest=True)
        else:
            for heading in body_tree.xpath("//h2[following-sibling::p/descendant::span[@class='attachment-inline']]"):
                id = heading.get('id')
                ds = Dataset(scraper.uri)
                ds.title = heading.text
                ds.description = scraper.catalog.description
                ds.publisher = scraper.catalog.publisher
                ds.issued = scraper.catalog.issued
                ds.modified = scraper.catalog.modified
                email_anchor = next(iter(body_tree.xpath("//a[@class='email']")), None)
                if email_anchor is not None:
                    ds.contactPoint = email_anchor.get('href')
                ds.distribution = []
                for attachment in body_tree.xpath(f"//h2[@id='{id}']/" + \
                                                  f"following-sibling::p[preceding-sibling::h2[1][@id='{id}']]/" + \
                                                  "span[@class='attachment-inline']"):
                    dist = Distribution(scraper)
                    dist.title = next(iter(attachment.xpath("a/text()")), None)
                    dist.downloadURL = next(iter(attachment.xpath("a/@href")), None)
                    dist.mediaType, _ = mimetypes.guess_type(dist.downloadURL)
                    abbr = next(iter(attachment.xpath("descendant::abbr/text()")), None)
                    if abbr is not None:
                        if abbr.upper() == 'PDF':
                            dist.mediaType = PDF
                        elif abbr.upper() == 'ODS':
                            dist.mediaType = ODS
                    size = next(iter(attachment.xpath("descendant::span[@class='file-size']/text()")), None)
                    if size is not None:
                        if size.endswith('KB'):
                            dist.byteSize = int(float(size[:-2]) * 1024)
                        elif size.endswith('kB'):
                            dist.byteSize = int(float(size[:-2]) * 1000)
                        elif size.endswith('MB'):
                            dist.byteSize = int(float(size[:-2]) * 1000000)
                    ds.distribution.append(dist)
                scraper.catalog.dataset.append(ds)
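
The size handling above deliberately reads 'KB' as 1024 bytes but 'kB' as 1000 bytes; restated on its own with made-up sizes, the arithmetic is:

def approx_byte_size(size: str) -> int:
    """Standalone restatement of the branches above, for illustration only."""
    if size.endswith('KB'):
        return int(float(size[:-2]) * 1024)
    elif size.endswith('kB'):
        return int(float(size[:-2]) * 1000)
    elif size.endswith('MB'):
        return int(float(size[:-2]) * 1000000)
    raise ValueError("Unrecognised size suffix: {}".format(size))

assert approx_byte_size('152KB') == 155648
assert approx_byte_size('152kB') == 152000
assert approx_byte_size('4.5MB') == 4500000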
Code example #27
File: ons.py  Project: GSS-Cogs/gss-utils
def handler_dataset_landing_page(scraper, landing_page, tree):
    # A dataset landing page has URIs to one or more datasets via its "datasets" field.
    # We need to look at each in turn; this is an example one as json:
    # https://www.ons.gov.uk//businessindustryandtrade/internationaltrade/datasets/uktradeingoodsbyclassificationofproductbyactivity/current/data
    for dataset_page_url in landing_page["datasets"]:

        this_dataset_page = get_dict_from_json_url(ONS_PREFIX + dataset_page_url["uri"] + "/data", scraper)

        # create a list, with each entry a dict of a versions url and update date
        versions_dict_list = []

        # Where the dataset is versioned, use the versions as the distributions
        try:
            all_versions = this_dataset_page["versions"]
        except KeyError:
            all_versions = []

        # Release dates:
        # --------------
        # ONS does an odd thing where each version on the /data api
        # has an updateDate field which is actually the date THE DATA
        # WAS SUPERSEDED (i.e. the release date of the NEXT version of the data).
        # ...this takes a bit of unpicking.

        # If no initial release date for the dataset has been provided,
        # we're just going to ignore v1: we don't have a use for it,
        # and with no provided release date there's not a lot to be done.
        initial_release = this_dataset_page["description"].get("releaseDate", None)

        next_release = None
        # Where there's multiple versions, iterate all and populate a list
        if len(all_versions) != 0:
            try:
                for version_as_dict in all_versions:
                    if next_release is None:
                        release_date = initial_release
                    else:
                        release_date = next_release

                    if release_date is not None:
                        versions_dict_list.append({
                            "url": ONS_PREFIX + version_as_dict["uri"] + "/data",
                            "issued": release_date
                        })
                    next_release = version_as_dict["updateDate"]
            except KeyError:
                logging.debug("No older versions found for {}.".format(dataset_page_url))

        # Add the current release
        versions_dict_list.append({
            "url": ONS_PREFIX + this_dataset_page["uri"] + "/data",
            "issued": initial_release if next_release is None else next_release
        })

        # NOTE - we've had an issue with the very latest dataset not being updated on the previous versions
        # page (the page we're getting the distributions from) so we're taking the details for it from
        # the landing page to use as a fallback in that scenario.

        # iterate through the lot, we're aiming to create at least one distribution object for each
        for i, version_dict in enumerate(versions_dict_list):

            version_url = version_dict["url"]
            issued = version_dict["issued"]

            logging.debug("Identified distribution url, building distribution object for: " + version_url)

            # get the response json into a python dict
            this_page = get_dict_from_json_url(version_url, scraper)

            # Get the download urls, if there's more than 1 format of this version of the dataset
            # each forms a separate distribution
            distribution_formats = this_page["downloads"]
            for dl in distribution_formats:

                # Create an empty Distribution object to represent this distribution;
                # from here we're just looking to fill in its fields
                this_distribution = Distribution(scraper)
                this_distribution.issued = parse_as_local_date(issued)

                # I don't trust dicts with one constant field (they don't make sense), so just in case...
                try:
                    download_url = ONS_DOWNLOAD_PREFIX + this_page["uri"] + "/" + dl["file"].strip()
                    this_distribution.downloadURL = download_url
                except Exception:
                    # Throw a warning and abandon this distribution; if we don't have a downloadURL it's not much use
                    logging.warning("Unable to create complete download url for {} on page {}"
                                    .format(dl, version_url))
                    continue

                # we've had some issues with type-guessing so we're getting the media type
                # by checking the download url ending
                if download_url.endswith(".csdb"):
                    media_type = CSDB
                else:
                    media_type, _ = mimetypes.guess_type(download_url)

                this_distribution.mediaType = media_type
                
                # inherit metadata from the dataset where it hasn't explicitly been changed
                this_distribution.title = scraper.dataset.title
                this_distribution.description = scraper.dataset.description

                logging.debug("Created distribution for download '{}'.".format(download_url))
                scraper.distributions.append(this_distribution)
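
The release-date shuffle above is easier to follow with made-up data: each version's updateDate is the date it was superseded, so it becomes the issued date of the NEXT version, while the first version takes the dataset-level releaseDate and the current release takes the last updateDate.

# Invented /data payload fragments, trimmed to the fields the loop reads.
initial_release = "2020-01-10"                   # description.releaseDate
all_versions = [
    {"uri": "/v1", "updateDate": "2020-04-10"},  # v1 was superseded on 2020-04-10
    {"uri": "/v2", "updateDate": "2020-07-10"},  # v2 was superseded on 2020-07-10
]

issued_dates = []
next_release = None
for version_as_dict in all_versions:
    issued_dates.append(initial_release if next_release is None else next_release)
    next_release = version_as_dict["updateDate"]
issued_dates.append(next_release)                # the current (latest) release

assert issued_dates == ["2020-01-10", "2020-04-10", "2020-07-10"]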
Code example #28
def scrape(scraper, tree):
    page_type = tree.xpath(
        "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
    )[0]

    if page_type.strip() == 'Series / Collection':
        scraper.catalog.title = tree.xpath("//h1/text()")[0]
        scraper.catalog.uri = scraper.uri + '#catalog'
        scraper.catalog.publisher = GOV['nhs-digital']
        scraper.catalog.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
        scraper.catalog.rights = 'https://digital.nhs.uk/about-nhs-digital/terms-and-conditions'
        scraper.catalog.comment = ' '.join(
            tree.xpath(
                "//div[@id='section-summary']/div[@itemprop='description']/*/text()"
            ))
        scraper.catalog.dataset = []

        articles = tree.xpath("//article[@class='cta']")
        for article in articles:
            dataset = Dataset(scraper.uri)
            dataset.distribution = []
            dataset.publisher = scraper.catalog.publisher
            dataset.license = scraper.catalog.license
            article_link = article.xpath('descendant::a')[0]
            dataset.title = article_link.get('title')
            href = article_link.get('href')
            dataset.landingPage = urljoin(scraper.uri, href)
            article_tree = html.fromstring(
                scraper.session.get(dataset.landingPage).text)
            article_type = article_tree.xpath(
                "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
            )[0]

            assert article_type.startswith(
                'Publication'), 'Expecting publication'

            details_node = article_tree.xpath("//dl[@class='detail-list']")[0]
            details = {}

            for node in details_node:
                if node.tag == 'dt' and node.get(
                        'class') == 'detail-list__key':
                    key = node.text.strip().lower()
                    if key.endswith(':'):
                        key = key[:-1].strip()
                elif node.tag == 'dd' and node.get(
                        'class') == 'detail-list__value':
                    value = node.text.strip()
                    if key not in details:
                        details[key] = [value]
                    else:
                        details[key].append(value)

            if 'publication date' in details:
                dataset.issued = parse(details['publication date'][0],
                                       dayfirst=True)

            # Todo: spatiotemporal coverage and resolution/granularity
            # Todo: national statistics / official statistics badges

            resources = article_tree.xpath(
                "//ul[@data-uipath='ps.publication.resources-attachments']/li/a"
            )

            for link in resources:
                dist = Distribution(scraper)
                dist.title = link.get('title')

                if hasattr(dataset, 'issued'):
                    dist.issued = dataset.issued

                dist.downloadURL = urljoin(dataset.landingPage,
                                           link.get('href'))
                file_data = link.xpath("div[@class='block-link__body']")[0]
                dist.mediaType = str(file_data.xpath("meta/@content")[0])
                size = file_data.xpath(
                    "span/span[@class='fileSize']/span[@itemprop='contentSize']/text()"
                )[0]
                size_match = re.match(r'([0-9]+(\.[0-9]*)?)\s*(kB|MB|GB)',
                                      size)

                if size_match and size_match.group(3) == 'kB':
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size_match.group(1)) * 1000)
                elif size_match and size_match.group(3) == 'MB':
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size_match.group(1)) * 1000000)
                elif size_match and size_match.group(3) == 'GB':
                    # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes, GiB = 2^30 bytes
                    dist.byteSize = int(
                        float(size_match.group(1)) * 1000000000)
                dataset.distribution.append(dist)

            scraper.catalog.dataset.append(dataset)
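
The dt/dd walk above pairs each 'detail-list__key' with the 'detail-list__value' entries that follow it; on a small invented fragment the result looks like this:

from lxml import html

fragment = html.fromstring("""
<dl class='detail-list'>
  <dt class='detail-list__key'>Publication date:</dt>
  <dd class='detail-list__value'>28 June 2018</dd>
  <dt class='detail-list__key'>Geographic coverage:</dt>
  <dd class='detail-list__value'>England</dd>
</dl>
""")

details = {}
for node in fragment:
    if node.tag == 'dt' and node.get('class') == 'detail-list__key':
        key = node.text.strip().lower().rstrip(':').strip()
    elif node.tag == 'dd' and node.get('class') == 'detail-list__value':
        details.setdefault(key, []).append(node.text.strip())

assert details == {'publication date': ['28 June 2018'],
                   'geographic coverage': ['England']}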
Code example #29
def scrape(scraper, tree):

    # A quick safety in case people are using this scraper incorrectly
    if "?search=" not in scraper.uri:
        raise Exception(
            """Aborting. This scraper is intended to run off the DCNI seach page.
        Please modify your url to use the site search.

        If in doubt, work from this page, change the quoted search text and capture the url
        https://www.communities-ni.gov.uk/publications/topic/8182?search=%22Northern+Ireland+Housing+Bulletin%22&Search-exposed-form=Go&sort_by=field_published_date
        """)

    scraper.dataset.publisher = GOV[
        'department-for-communities-northern-ireland']
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-' \
        'government-licence/version/3/'

    # We're taking each search result as a distribution
    search_result_urls = []
    for linkObj in tree.xpath("//h3/a"):

        # linkObj.items() is eg ("href", "www.foo.com") where we want a url
        href = [x[1] for x in linkObj.items() if x[0] == "href"][0]

        # Add to distributions url list, get the root from the original url
        search_result_urls.append(
            scraper.uri.split("/publications/topic")[0] + href)

    # keep track of dates issued so we can find the latest
    last_issued = None

    for url in search_result_urls:

        # Get the distribution page
        page = scraper.session.get(url)
        distro_tree = html.fromstring(page.text)

        # Get any spreadsheets (ods or excel) linked on the page
        spreadsheet_files = [
            x for x in distro_tree.xpath('//a/@href')
            if x.lower().endswith(".ods") or x.lower().endswith(".xlsx")
        ]

        # Now build a distribution for each relevant download, taking the
        # supporting info from the page it was found on
        # TODO - make better, kinda nasty
        for spreadsheet_file in spreadsheet_files:

            # Create our new distribution object
            this_distribution = Distribution(scraper)

            # Identify the correct title
            this_distribution.title = distro_tree.xpath(
                "//a[@href='" + spreadsheet_file + "']/text()")[0]
            this_distribution.downloadURL = spreadsheet_file

            if this_distribution.downloadURL.lower().endswith(".xlsx"):
                media_type = Excel
            elif this_distribution.downloadURL.lower().endswith(".ods"):
                media_type = ODS
            else:
                raise Exception(
                    "Aborting. Unexpected media type for url: '{}'".format(
                        this_distribution.downloadURL))
            this_distribution.mediaType = media_type

            # Published and modified time
            this_distribution.issued = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:published_time']/@content")
                [0]).date()
            this_distribution.modified = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:modified_time']/@content")
                [0]).date()
            this_distribution.description = distro_tree.xpath(
                "//*[@class='field-summary']/p/text()")[0]

            if last_issued is None:
                last_issued = this_distribution.issued
            elif this_distribution.issued > last_issued:
                last_issued = this_distribution.issued

            scraper.distributions.append(this_distribution)

    # Whatever date the latest distribution was issued is the issued date for this "dataset"
    scraper.dataset.issued = last_issued