def eth_facts_service(scraper, tree):
    """
    Populate scraper.dataset and its distributions from a DfE
    ethnicity-facts-and-figures style page.

    :param scraper: shared scraper object holding dataset/distribution state
    :param tree:    lxml HTML tree of the page being scraped
    """
    scraper.dataset.publisher = GOV['department-for-education']
    scraper.dataset.title = tree.xpath('//*[@id="title"]/text()')[0].strip()
    scraper.dataset.contactPoint = tree.xpath(
        '//*[@id="footer"]/div/div[4]/a[2]/@href')
    scraper.dataset.issued = parse(
        tree.xpath('//*[@id="history"]/p[1]/span/text()')[0]).date()
    scraper.dataset.modified = parse(
        tree.xpath('//*[@id="history"]/p[2]/span/text()')[0]).date()

    for node in tree.xpath(
            "//*//*[@itemtype='http://schema.org/DataDownload']/a"):
        distribution = Distribution(scraper)
        distribution.title = node.attrib['data-event-label']
        distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
        distribution.issued = scraper.dataset.issued
        distribution.modified = scraper.dataset.modified
        # BUG FIX: previously the file type was regex-searched out of the
        # stringified list of *all* DataDownload anchor texts, so every
        # distribution inherited whatever was bracketed in the first link.
        # Search this node's own text instead, and don't crash when the
        # pattern (e.g. "(csv, 2Mb)") isn't present.
        type_match = re.search(r'\(([^)]+)', str(node.xpath('text()')))
        file_type = type_match.group(1) if type_match else ''
        distribution.mediaType = {
            'csv': CSV,
            'excel': Excel
        }.get(file_type, mimetypes.guess_type(distribution.downloadURL)[0])
        # Only keep distributions in formats we can process downstream.
        if distribution.mediaType in ACCEPTED_MIMETYPES:
            scraper.distributions.append(distribution)
def scrape(scraper, tree):
    """
    Scraper for https://www.lowcarboncontracts.uk/data-portal/dataset/*

    Example: https://www.lowcarboncontracts.uk/data-portal/dataset/actual-ilr-income

    Fills in scraper.dataset from the landing page, then follows each listed
    resource link and appends one Distribution per resource page.
    """
    article = assert_get_one(tree.xpath('//article'), "article element")

    title_element = assert_get_one(article.xpath('./div/h1'), 'title element')
    scraper.dataset.title = title_element.text.strip()

    # Description is spread over several <p> elements; join with blank lines
    # to keep paragraph separation.
    description_elements = article.xpath('./div/div/p')
    scraper.dataset.description = "\n\n".join(
        [x.text.strip() for x in description_elements])

    issued_element = assert_get_one(
        article.xpath('./div/section/table/tbody/tr[1]/td/span'),
        "issued element")
    # Cell text looks like "<date> (<extra>)" - keep only the date part.
    scraper.dataset.issued = parse(issued_element.text.split("(")[0].strip())

    scraper.dataset.license = "http://reference.data.gov.uk/id/open-government-licence"

    for resource in assert_get_one(article.xpath('./div/section[1]/ul[1]'),
                                   "resource list").xpath('./li/a'):
        distro = Distribution(scraper)

        # Each resource has its own page with the full metadata.
        url = f'https://www.lowcarboncontracts.uk/{resource.get("href")}'
        resp = scraper.session.get(url)
        if resp.status_code != 200:
            raise Exception(f'Failed to get url resource {url}')
        distro_tree = html.fromstring(resp.text)

        section = assert_get_one(
            distro_tree.xpath(
                '/html[1]/body[1]/div[3]/div[1]/div[3]/section[1]'),
            "section of distro")

        distro_title_element = assert_get_one(section.xpath('./div/h1'),
                                              "distro title")
        distro.title = distro_title_element.text

        distro_description_element = assert_get_one(
            section.xpath('./div/div/blockquote[1]'), "distro description")
        distro.description = distro_description_element.text

        distro_download_url_element = assert_get_one(
            section.xpath('./div/p/a'), "download url")
        # NOTE(review): this takes the anchor's *text* rather than its href -
        # presumably the link text is the full download url on these pages;
        # confirm against a live resource page.
        distro.downloadURL = distro_download_url_element.text

        # Note: issued is the one thing not in the "section" element, so
        # xpathing the whole distro tree
        distro_issued_element = assert_get_one(
            distro_tree.xpath('//table[1]/tbody[1]/tr[2]/td[1]'), "issued")
        distro.issued = parse(distro_issued_element.text)

        media_type, _ = mimetypes.guess_type(distro.downloadURL)
        distro.mediaType = media_type if media_type is not None else CSV  # the default/not-specified offering is csv

        scraper.distributions.append(distro)
def covid_handler(scraper, tree):
    """
    Scrape the National Records of Scotland covid statistics home page,
    filling in dataset metadata and one distribution per Excel/CSV link.
    """
    scraper.dataset.publisher = GOV['national-records-of-scotland']
    scraper.dataset.title = tree.xpath(
        '//*[@id="node-stats-home-page-3315"]/div[1]/div/div/h2/text()'
    )[0].strip()
    # TEMP as no description on page is more applicable
    scraper.dataset.description = tree.xpath(
        '//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[4]/text()'
    )[0].strip()

    # The first paragraph carries both the publication date and the
    # next-due date as separate text nodes.
    date_texts = tree.xpath(
        '//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[1]/text()')
    scraper.dataset.issued = parse(date_texts[0]).date()
    scraper.dataset.updateDueOn = parse(date_texts[2]).date()

    # If several contact links are present, the last one wins.
    for anchor in tree.xpath(
            '//*[@id="node-stats-home-page-3315"]/div[2]/div/div/p[11]/a'):
        scraper.dataset.contactPoint = anchor.attrib['href']

    known_media_types = {
        'csv': 'text/csv',
        'excel': 'application/vnd.ms-excel'
    }
    data_links = tree.findall(
        './/*[@id="node-stats-home-page-3315"]/div[2]/div/div/table/tbody/tr[1]/td[4]/p[2]/a')
    for link in data_links:
        link_kind = link.text.lower()
        if link_kind not in ('excel', 'csv'):
            continue
        dist = Distribution(scraper)
        dist.title = scraper.dataset.title + ' ' + link.text
        dist.downloadURL = urljoin(scraper.uri, link.attrib['href'])
        dist.issued = scraper.dataset.issued
        dist.mediaType = known_media_types.get(
            link_kind, mimetypes.guess_type(dist.downloadURL)[0])
        scraper.distributions.append(dist)
def handler_dataset_landing_page_fallback(scraper, this_dataset_page, tree):
    """
    At time of writing there's an issue with the latest version of datasets
    404'ing on the versions page.

    This function will create what the latest version should be, using the
    information on the base dataset landing page.

    :param scraper:           shared scraper object
    :param this_dataset_page: dict of the dataset page json
    :param tree:              lxml HTML tree of the landing page
    :raises ValueError:       if no 'xls' download link is on the page
    """
    logging.warning(
        "Using fallback logic to scrape latest distribution from dataset landing page (rather "
        "than previous page). This scrape will only have a single distribution of xls."
    )

    this_distribution = Distribution(scraper)

    release_date = this_dataset_page["description"]["releaseDate"]
    this_distribution.issued = parse(release_date.strip()).date()

    # gonna have to go via html ...
    # BUG FIX: xpath('...@href') returns a *list* of hrefs; previously the
    # list itself was stored as downloadURL. Take the first match, and fail
    # loudly if the expected 'xls' link isn't there.
    download_urls = tree.xpath("//a[text()='xls']/@href")
    if not download_urls:
        raise ValueError(
            "Unable to find an 'xls' download link on the landing page")
    download_url = download_urls[0]
    this_distribution.downloadURL = download_url

    media_type = Excel
    this_distribution.mediaType = media_type

    # inherit remaining metadata from the dataset
    this_distribution.title = scraper.dataset.title
    this_distribution.description = scraper.dataset.description
    this_distribution.contactPoint = scraper.dataset.contactPoint

    logging.debug(
        "Created distribution for download '{}'.".format(download_url))
    scraper.distributions.append(this_distribution)
def scrape(scraper, tree):
    """
    Scrape a gov.wales statistics release listing into one dataset whose
    distributions come from each linked release page.
    """
    # It's not clear whether the pages are collections of datasets or
    # datasets with distributions. Assume the latter for simplicity for now.
    scraper.dataset.publisher = GOV['welsh-government']
    # OGLv3 license is quoted for the whole site on
    # https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/" +
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/" +
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    # Cache page fetches - several articles may link to the same release page.
    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        # NOTE(review): meta_div is never used below.
        meta_div = article.xpath("div[@class = 'index-list__meta']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"
        ):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            # NOTE(review): these two xpaths start with '//' so they search
            # the *whole* release page, not this 'details' element - every
            # distribution on a page presumably gets the first released/
            # updated dates found. Confirm whether './/' was intended.
            distribution.issued = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
                )[0])
            distribution.modified = isoparse(
                details.xpath(
                    "//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
                )[0])
            # Anchor's <span> holds "<TYPE> <size> <unit>"-style metadata.
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()

            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    # KB = 1024 bytes (binary)
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    # kB = 1000 bytes (decimal)
                    distribution.byteSize = int(size * 1000)
            else:
                # Fall back to guessing the media type from the url.
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
def scrape_pages(scraper, tree):
    """
    Scrape the uktradeinfo (HMRC) bulletins listing into a catalog: one
    Dataset per table row, with a Distribution per linked release (following
    "archive" pages where present).

    :param scraper: shared scraper object; catalog and http session live on it
    :param tree:    lxml HTML tree of the listing page
    """
    scraper.catalog.title = tree.xpath("//title/text()")[0].strip()
    scraper.catalog.dataset = []
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.publisher = GOV['hm-revenue-customs']
    scraper.catalog.rights = "https://www.uktradeinfo.com/AboutUs/Pages/TermsAndConditions.aspx"
    # from above terms, link to crown copyright at the National Archives says
    # default license is OGL
    scraper.catalog.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"

    # just scrape the first table for now; others are archives or factsheets.
    tables = tree.xpath("//table[contains(concat(' ', @class, ' '), ' hmrc ') or (contains(@summary, 'Tax & Duty bulletins'))]")
    for table in tables:
        header = True
        columns = []
        for row in table.xpath("tbody/tr"):
            if header:
                # First row holds the headings used to key the cells below.
                columns = [t.strip() for t in row.xpath("th/text()")]
                header = False
                continue

            dataset = Dataset(scraper.uri)
            dataset.publisher = scraper.catalog.publisher
            dataset.license = scraper.catalog.license
            dataset.distribution = []
            bulletin_date = None

            for k, v in zip(columns, row.xpath("td")):
                if k in ('Bulletin Title', 'Title', 'Factsheet Title'):
                    dataset.title = v.text
                elif k in ('Publication Source', 'Source'):
                    pass  # deliberately ignored
                elif k in ('Release Date', 'Released'):
                    dataset.issued = parse(v.text.strip(), dayfirst=True)
                elif k in ('Bulletin Date', 'Period'):
                    bulletin_date = v.text
                elif k in ('View', 'View Archive'):
                    href = v.xpath("a/@href")[0]
                    view_url = urljoin(scraper.uri, href)
                    if '?viewname' in view_url:
                        # this is a link off to a separate "archive" page with
                        # links to the actual dataset releases
                        archive_page = scraper.session.get(view_url)
                        archive_tree = html.fromstring(archive_page.text)
                        for release_row in archive_tree.xpath(
                                "//table[@class='hmrc']//tr")[1:]:
                            dist = Distribution(scraper)
                            cols = release_row.xpath("td")
                            dist.downloadURL = urljoin(
                                view_url,
                                cols[1].xpath("a/@href")[0].replace(' ', '%20'))
                            archive_date = cols[0].text
                            dist.issued = parse(archive_date.strip(),
                                                dayfirst=True)
                            dist.mediaType, _ = mimetypes.guess_type(
                                dist.downloadURL)
                            dist.title = dataset.title + ' ' + archive_date
                            dataset.distribution.append(dist)
                    else:
                        dist = Distribution(scraper)
                        dist.downloadURL = urljoin(scraper.uri, href)
                        if dist.downloadURL.endswith(('.xls', '.xlsx')):
                            dist.mediaType = Excel
                        # BUG FIX: the old expression
                        #   dataset.title + (' ' + bulletin_date) if bulletin_date else ''
                        # parsed as a conditional over the *whole* sum, so
                        # rows without a bulletin date got an empty title.
                        dist.title = dataset.title + (
                            (' ' + bulletin_date) if bulletin_date else '')
                        dataset.distribution.append(dist)

            scraper.catalog.dataset.append(dataset)
def opendata_nhs(scraper, tree):
    """
    Scrape an opendata.nhs.scot dataset page, creating one Distribution per
    listed resource (fetching each resource's preview page for the full,
    untruncated title and description).
    """
    # TODO - this feels more like a catalogue than a list of distributions, investigate

    # The page shows local datetimes; the first is the "updated" date, which
    # we use as the issued date of every distribution.
    dates = tree.xpath('//span[@class="automatic-local-datetime"]/text()')
    date_updated = parse(" ".join([
        x.replace("\n", "").replace("(BST)", "").strip()
        for x in dates[0].split(" ")
    ]))
    # (the previously-parsed "created" date and the dataset-details xpath
    # were never used, so they've been removed)

    # Populate distributions
    distro_resources = tree.xpath('//li[@class="resource-item"]')

    for dr in distro_resources:
        download = dr.xpath(
            'div/ul/li/a[contains(@class, "resource-url-analytics")]/@href')[0]

        # Need to go to the preview page for full description and title as
        # they've helpfully truncated both...
        preview_url = "https://www.opendata.nhs.scot" + dr.xpath(
            'div/ul[@class="dropdown-menu"]/li/a/@href')[0]
        r = scraper.session.get(preview_url)
        if r.status_code != 200:
            raise Exception(
                "Unable to follow url to get full description, url: '{}', status code '{}'."
                .format(preview_url, r.status_code))

        preview_tree = html.fromstring(r.text)
        description1 = preview_tree.xpath(
            '//div[contains(@class, "prose notes")]/p/text()')[0]

        # Some (but not all) descriptions have some additional itallic information
        try:
            description2 = preview_tree.xpath(
                '//div[contains(@class, "prose notes")]/p/em/text()')[0]
        except IndexError:
            description2 = ""

        # BUG FIX: str.strip returns a new string - the old code called
        # description.strip("\n") and discarded the result.
        description = (description1 + "\n\n" + description2).strip("\n")

        title = preview_tree.xpath('//title/text()')[0]

        this_distribution = Distribution(scraper)
        this_distribution.issued = date_updated
        this_distribution.downloadURL = download
        this_distribution.mediaType = CSV  # resources here are CSV downloads
        this_distribution.title = title.strip()
        this_distribution.description = description

        scraper.distributions.append(this_distribution)
def handler_static_adhoc(scraper, landing_page, tree):
    """
    Handle a static adhoc page: a one-off, unscheduled release.

    These pages should be simpler and should lack the historical
    distributions, so each entry in the page's "downloads" list becomes
    exactly one Distribution.
    """
    # TODO - we're doing this in two place, pull it out
    # we've had some issues with type-guessing so we're getting the media
    # type by checking the download url ending
    known_suffixes = {
        ".csdb": CSDB,
        ".csv": CSV,
        ".xlsx": Excel,
        ".ods": ODS,
    }

    for download in landing_page["downloads"]:
        dist = Distribution(scraper)

        # if we can't get the release date, continue but throw a warning.
        try:
            dist.issued = parse(
                landing_page["description"]["releaseDate"]).date()
        except KeyError:
            logging.warning("Unable to acquire or parse release date")

        download_url = (ONS_DOWNLOAD_PREFIX + landing_page["uri"] + "/" +
                        download["file"])
        dist.downloadURL = download_url

        for suffix, known_type in known_suffixes.items():
            if download_url.endswith(suffix):
                dist.mediaType = known_type
                break
        else:
            dist.mediaType, _ = mimetypes.guess_type(download_url)

        dist.title = download["title"]
        # inherit metadata from the dataset where it hasn't explicitly been changed
        dist.description = scraper.dataset.description

        logging.debug(
            "Created distribution for download '{}'.".format(download_url))
        scraper.distributions.append(dist)
def handler_dataset_landing_page(scraper, landing_page, tree):
    """
    Build distributions for every dataset (and every version of it) linked
    from an ONS dataset landing page.
    """
    # A dataset landing page has uri's to one or more datasets via it's "datasets" field.
    # We need to look at each in turn, this is an example one as json:
    # https://www.ons.gov.uk//businessindustryandtrade/internationaltrade/datasets/uktradeingoodsbyclassificationofproductbyactivity/current/data
    for dataset_page_url in landing_page["datasets"]:

        # Get the page as json. Throw an informative error if we fail for whatever reason
        dataset_page_json_url = ONS_PREFIX + dataset_page_url["uri"] + "/data"
        r = scraper.session.get(dataset_page_json_url)
        if r.status_code != 200:
            raise ValueError("Scrape of url '{}' failed with status code {}." \
                             .format(dataset_page_json_url, r.status_code))

        # get the response json into a python dict
        this_dataset_page = r.json()

        # start a list of dataset versions (to hold current + all previous) as a list
        # we'll start with just the current/latest version
        versions_list = [ONS_PREFIX + this_dataset_page["uri"] + "/data"]

        # if there are older versions of this datasets availible.
        # iterate and add their uri's to the versions list
        try:
            for version_as_dict in this_dataset_page["versions"]:
                versions_list.append(ONS_PREFIX + version_as_dict["uri"] +
                                     "/data")
        except KeyError:
            logging.debug(
                "No older versions found for {}.".format(dataset_page_url))

        # NOTE - we've had an issue with the very latest dataset not being updated on the previous versions
        # page (the page we're getting the distributions from) so we're taking the details for it from
        # the landing page to use as a fallback in that scenario.

        # iterate through the lot, we're aiming to create at least one distribution object for each
        for i, version_url in enumerate(versions_list):
            logging.debug(
                "Identified distribution url, building distribution object for: "
                + version_url)

            r = scraper.session.get(version_url)
            if r.status_code != 200:

                # If we've got a 404 on the latest, fallback on using the details from the
                # landing page instead
                if r.status_code == 404 and i == len(versions_list) - 1:
                    handler_dataset_landing_page_fallback(
                        scraper, this_dataset_page, tree)
                    continue
                else:
                    raise Exception("Scraper unable to acquire the page: {} with http code {}." \
                                    .format(version_url, r.status_code))

            # get the response json into a python dict
            this_page = r.json()

            # Get the download urls, if there's more than 1 format of this version of the dataset
            # each forms a separate distribution
            distribution_formats = this_page["downloads"]
            for dl in distribution_formats:

                # Create an empty Distribution object to represent this distribution
                # from here we're just looking to fill in it's fields
                this_distribution = Distribution(scraper)

                # Every distribution SHOULD have a release date, but it seems they're not
                # always included. If it happens continue but throw a warning.
                try:
                    release_date = this_page["description"]["releaseDate"]
                    this_distribution.issued = parse_as_local_date(
                        release_date.strip())
                except KeyError:
                    logging.warning(
                        "Download {}. Of datasset versions {} of dataset {} does not have "
                        "a release date".format(distribution_formats,
                                                version_url,
                                                dataset_page_url))

                # I don't trust dicts with one constant field (they don't make sense), so just in case...
                try:
                    download_url = ONS_DOWNLOAD_PREFIX + this_page[
                        "uri"] + "/" + dl["file"].strip()
                    this_distribution.downloadURL = download_url
                # NOTE(review): bare except - any failure (not just KeyError)
                # lands here and is re-raised as ValueError.
                except:
                    # raise up this time. If we don't have a downloadURL it's not much use
                    raise ValueError("Unable to create complete download url for {} on page {}" \
                                     .format(dl, version_url))

                # we've had some issues with type-guessing so we're getting the media type
                # by checking the download url ending
                if download_url.endswith(".csdb"):
                    media_type = CSDB
                elif download_url.endswith(".csv"):
                    media_type = CSV
                elif download_url.endswith(".xlsx"):
                    media_type = Excel
                elif download_url.endswith(".ods"):
                    media_type = ODS
                else:
                    media_type, _ = mimetypes.guess_type(download_url)

                this_distribution.mediaType = media_type

                # inherit metadata from the dataset where it hasn't explicitly been changed
                this_distribution.title = scraper.dataset.title
                this_distribution.description = scraper.dataset.description

                logging.debug("Created distribution for download '{}'.".format(
                    download_url))
                scraper.distributions.append(this_distribution)
def scrape(scraper, tree):
    """
    Scrape an NHS Digital 'Series / Collection' page into a catalog: one
    Dataset per linked article, with a Distribution per attached resource.
    """
    page_type = tree.xpath(
        "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
    )[0]
    # Only 'Series / Collection' pages are handled; anything else is a no-op.
    if page_type.strip() == 'Series / Collection':
        scraper.catalog.title = tree.xpath("//h1/text()")[0]
        scraper.catalog.uri = scraper.uri + '#catalog'
        scraper.catalog.publisher = GOV['nhs-digital']
        scraper.catalog.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
        scraper.catalog.rights = 'https://digital.nhs.uk/about-nhs-digital/terms-and-conditions'
        scraper.catalog.comment = ' '.join(
            tree.xpath(
                "//div[@id='section-summary']/div[@itemprop='description']/*/text()"
            ))
        scraper.catalog.dataset = []
        articles = tree.xpath("//article[@class='cta']")
        for article in articles:
            dataset = Dataset(scraper.uri)
            dataset.distribution = []
            dataset.publisher = scraper.catalog.publisher
            dataset.license = scraper.catalog.license
            article_link = article.xpath('descendant::a')[0]
            dataset.title = article_link.get('title')
            href = article_link.get('href')
            dataset.landingPage = urljoin(scraper.uri, href)
            # Fetch the linked publication page for this dataset's details.
            article_tree = html.fromstring(
                scraper.session.get(dataset.landingPage).text)
            article_type = article_tree.xpath(
                "//span[contains(concat(' ', @class, ' '), ' article-header__label ')]/text()"
            )[0]
            assert article_type.startswith(
                'Publication'), 'Expecting publication'
            # The <dl class='detail-list'> is a sequence of dt (key) /
            # dd (value) pairs; collect values grouped by their key.
            details_node = article_tree.xpath("//dl[@class='detail-list']")[0]
            details = {}
            for node in details_node:
                if node.tag == 'dt' and node.get(
                        'class') == 'detail-list__key':
                    key = node.text.strip().lower()
                    if key.endswith(':'):
                        key = key[:-1].strip()
                elif node.tag == 'dd' and node.get(
                        'class') == 'detail-list__value':
                    # NOTE(review): 'key' is unbound if a dd precedes the
                    # first dt - assumes the list always starts with a dt.
                    value = node.text.strip()
                    if key not in details:
                        details[key] = [value]
                    else:
                        details[key].append(value)
            if 'publication date' in details:
                dataset.issued = parse(details['publication date'][0],
                                       dayfirst=True)
            # Todo: spatiotemporal coverage and resolution/granularity
            # Todo: national statistics / official statistics badges
            resources = article_tree.xpath(
                "//ul[@data-uipath='ps.publication.resources-attachments']/li/a"
            )
            for link in resources:
                dist = Distribution(scraper)
                dist.title = link.get('title')
                if hasattr(dataset, 'issued'):
                    dist.issued = dataset.issued
                dist.downloadURL = urljoin(dataset.landingPage,
                                           link.get('href'))
                file_data = link.xpath("div[@class='block-link__body']")[0]
                dist.mediaType = str(file_data.xpath("meta/@content")[0])
                size = file_data.xpath(
                    "span/span[@class='fileSize']/span[@itemprop='contentSize']/text()"
                )[0]
                size_match = re.match(r'([0-9]+(\.[0-9]*)?)\s*(kB|MB|GB)',
                                      size)
                if size_match and size_match.group(3) == 'kB':
                    # https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                    dist.byteSize = int(float(size_match.group(1)) * 1000)
                elif size_match and size_match.group(3) == 'MB':
                    # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                    dist.byteSize = int(float(size_match.group(1)) * 1000000)
                elif size_match and size_match.group(3) == 'GB':
                    # https://en.wikipedia.org/wiki/Gigabyte GB = 10^9 bytes, GiB = 2^30 bytes
                    dist.byteSize = int(
                        float(size_match.group(1)) * 1000000000)
                dataset.distribution.append(dist)
            scraper.catalog.dataset.append(dataset)
def scrape(scraper, tree):
    """
    Scrape a DCNI site-search results page, treating each search result's
    linked spreadsheet (ods/xlsx) as one distribution of a single dataset.

    :raises Exception: if the uri is not a site-search url, or a linked file
        has an unexpected extension.
    """
    # A quick safety in case people are using this scraper incorrectly
    if "?search=" not in scraper.uri:
        raise Exception(
            """Aborting. This scraper is intended to run off the DCNI seach page.
            Please modify your url to use the site search.

            If in doubt, work from this page, change the quoted search text and capture the url
            https://www.communities-ni.gov.uk/publications/topic/8182?search=%22Northern+Ireland+Housing+Bulletin%22&Search-exposed-form=Go&sort_by=field_published_date
            """)

    scraper.dataset.publisher = GOV[
        'department-for-communities-northern-ireland']
    # BUG FIX: the license url previously contained stray '" \ "' characters
    # mid-string; use the clean OGLv3 url as the other scrapers in this file do.
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    # We're taking each search result as a distribution
    search_result_urls = []
    for link in tree.xpath("//h3/a"):
        # Add to distributions url list, get the root from the original url
        search_result_urls.append(
            scraper.uri.split("/publications/topic")[0] + link.get("href"))

    # keep track of dates issued so we can find the latest
    last_issued = None

    for url in search_result_urls:
        # Get the distribution page
        page = scraper.session.get(url)
        distro_tree = html.fromstring(page.text)

        # Get any spreadsheets (ods or excel) linked on the page
        spreadsheet_files = [
            x for x in distro_tree.xpath('//a/@href')
            if x.lower().endswith((".ods", ".xlsx"))
        ]

        for spreadsheet_file in spreadsheet_files:
            this_distribution = Distribution(scraper)

            # The text of the matching anchor is the human-readable title
            this_distribution.title = distro_tree.xpath(
                "//a[@href='{}']/text()".format(spreadsheet_file))[0]
            this_distribution.downloadURL = spreadsheet_file

            if this_distribution.downloadURL.lower().endswith(".xlsx"):
                media_type = Excel
            elif this_distribution.downloadURL.lower().endswith(".ods"):
                media_type = ODS
            else:
                raise Exception(
                    "Aborting. Unexpected media type for url: '{}'".format(
                        this_distribution.downloadURL))
            this_distribution.mediaType = media_type

            # Published and modifed time
            this_distribution.issued = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:published_time']/@content")
                [0]).date()
            this_distribution.modified = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:modified_time']/@content")
                [0]).date()
            this_distribution.description = distro_tree.xpath(
                "//*[@class='field-summary']/p/text()")[0]

            if last_issued is None or this_distribution.issued > last_issued:
                last_issued = this_distribution.issued

            scraper.distributions.append(this_distribution)

    # Whatever date the latest distribution was issued, is the last issued
    # date for this "dataset"
    scraper.dataset.issued = last_issued
def scrape(scraper, tree):
    """
    Scrape the ISD Scotland publications listing into a catalog: one Dataset
    per publication title, with Distributions taken from each publication's
    'Data Tables' document page.
    """
    # e.g. "[123kb]" / "[4Mb]" appended to a download link
    size_re = re.compile(r'\[([0-9]+)(kb|Mb)\]')
    scraper.catalog.title = tree.xpath('//h2/text()')[0].strip()
    scraper.catalog.uri = scraper.uri + "#catalog"
    scraper.catalog.rights = 'http://www.isdscotland.org/Copyright.asp'
    scraper.catalog.publisher = GOV['information-services-division-scotland']
    # Publications repeat per release; group them into one dataset per title.
    title2dataset = {}

    # Cache page fetches: many records link into the same document.
    @lru_cache()
    def fetch_page(url):
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for record in tree.xpath(
            "//div[contains(concat(' ', @class, ' '), ' pubtitlel ')]"):
        dataset_title = record.text.strip()
        if dataset_title not in title2dataset:
            dataset = Dataset(scraper.uri)
            dataset.title = dataset_title
            dataset.publisher = scraper.catalog.publisher
            dataset.rights = scraper.catalog.rights
            dataset.distribution = []
            title2dataset[dataset_title] = dataset
        else:
            dataset = title2dataset[dataset_title]

        datatables_urls = record.xpath(
            "following-sibling::table/descendant::tr[td["
            "contains(text(), 'Data Tables')]]/td["
            "contains(concat(' ', @class, ' '), 'pubcontentr')]/a/@href")
        if len(datatables_urls) == 0:
            continue
        doc_url, frag = urldefrag(urljoin(scraper.uri, datatables_urls[0]))
        # pages appear to have redundant query parameter the same as the fragment id
        doc_url_bits = urlparse(doc_url)
        if doc_url_bits.query is not None and doc_url_bits.query == f'id={frag}':
            doc_url = doc_url_bits._replace(query=None).geturl()
        doc_tree = fetch_page(doc_url)
        # The fragment names the anchor for this publication's tables.
        anchors = doc_tree.xpath(f"//a[@id='{frag}' or @name='{frag}']")
        if len(anchors) == 0:
            logging.warning(f"Broken link to dataset {datatables_urls[0]}")
            continue

        # publication date is in paragraph before!
        # this is actually the issued date of the distribution
        published = anchors[0].xpath(
            "../preceding-sibling::p[1]/child::*/text()")
        dist_issued = None
        if len(published) > 0 and published[0].startswith('Published '):
            dist_issued = parse(published[0][len('Published '):],
                                dayfirst=True)
            # we'll use the latest publication date for the dataset
        # NOTE(review): if dist_issued is None this can set dataset.issued to
        # None, or raise TypeError (None <= date) when dataset.issued already
        # exists - assumes every anchor has a 'Published ...' paragraph.
        if not (hasattr(dataset, 'issued')
                and dist_issued <= dataset.issued):
            dataset.issued = dist_issued
        dist_rows = anchors[0].xpath(
            "../following-sibling::table[1]/descendant::tr")
        for row in dist_rows:
            distribution = Distribution(scraper)
            cells = row.xpath('td')
            # Rows are either title/download/type/size or title/download/type.
            if len(cells) == 4:
                title_node, download_node, type_node, size_node = cells
            elif len(cells) == 3:
                title_node, download_node, type_node = cells
                size_node = None
            else:
                break
            distribution.title = title_node.text
            if dist_issued is not None:
                distribution.issued = dist_issued
            distribution.downloadURL = download_node[0].get('href')
            # The file type is conveyed by the icon image filename.
            type_image = type_node[0].get('src').lower()
            if 'excel' in type_image:
                distribution.mediaType = Excel
            elif 'swf' in type_image:
                distribution.mediaType = 'application/vnd.adobe.flash.movie'
            else:
                distribution.mediaType, encoding = mimetypes.guess_type(
                    distribution.downloadURL)
            if size_node is not None and size_node.text is not None:
                size_match = size_re.match(size_node.text)
                if size_match:
                    if size_match.group(2) == 'Mb':  # should be MB
                        # https://en.wikipedia.org/wiki/Megabyte MB = 10^6 bytes
                        distribution.byteSize = int(
                            size_match.group(1)) * 1000000
                    elif size_match.group(
                            2
                    ) == 'kb':  # should be either kB or KB https://en.wikipedia.org/wiki/Kilobyte kB = 1000 while KB = 1024
                        distribution.byteSize = int(
                            size_match.group(1)) * 1000
            dataset.distribution.append(distribution)
    scraper.catalog.dataset = list(title2dataset.values())
def handler_dataset_landing_page(scraper, landing_page, tree):
    """
    Build distributions for every version of every dataset linked from an
    ONS dataset landing page, attaching the correct release date to each.
    """
    # A dataset landing page has uri's to one or more datasets via it's "datasets" field.
    # We need to look at each in turn, this is an example one as json:
    # https://www.ons.gov.uk//businessindustryandtrade/internationaltrade/datasets/uktradeingoodsbyclassificationofproductbyactivity/current/data
    for dataset_page_url in landing_page["datasets"]:

        this_dataset_page = get_dict_from_json_url(
            ONS_PREFIX + dataset_page_url["uri"] + "/data", scraper)

        # create a list, with each entry a dict of a versions url and update date
        versions_dict_list = []

        # Where the dataset is versioned, use the versions as the distributions
        try:
            all_versions = this_dataset_page["versions"]
        except KeyError:
            all_versions = []

        # Release dates:
        # --------------
        # ONS does this odd thing where each version on the /data api
        # has a updateDate field which is actually the date THE DATA
        # WAS SUPERCEDED (so the release date of the NEXT version of the data).
        # ......this takes a bit of unpicking.

        # If no initial release date for the dataset has been provided
        # we're just going to ignore v1; we don't have a use for it
        # and with no provided release date ... not a lot to be done
        initial_release = this_dataset_page["description"].get(
            "releaseDate", None)
        next_release = None

        # Where there's multiple versions, iterate all and populate a list
        if len(all_versions) != 0:
            try:
                for version_as_dict in all_versions:
                    # Each version's issued date is the *previous* entry's
                    # updateDate (see note above); the first uses the
                    # dataset's own releaseDate.
                    if next_release is None:
                        release_date = initial_release
                    else:
                        release_date = next_release

                    if release_date is not None:
                        versions_dict_list.append({
                            "url":
                            ONS_PREFIX + version_as_dict["uri"] + "/data",
                            "issued":
                            release_date
                        })
                    next_release = version_as_dict["updateDate"]

            except KeyError:
                logging.debug(
                    "No older versions found for {}.".format(
                        dataset_page_url))

        # Add the current release
        versions_dict_list.append({
            "url":
            ONS_PREFIX + this_dataset_page["uri"] + "/data",
            "issued":
            initial_release if next_release is None else next_release
        })

        # NOTE - we've had an issue with the very latest dataset not being updated on the previous versions
        # page (the page we're getting the distributions from) so we're taking the details for it from
        # the landing page to use as a fallback in that scenario.

        # iterate through the lot, we're aiming to create at least one distribution object for each
        for i, version_dict in enumerate(versions_dict_list):

            version_url = version_dict["url"]
            issued = version_dict["issued"]

            logging.debug(
                "Identified distribution url, building distribution object for: "
                + version_url)

            # get the response json into a python dict
            this_page = get_dict_from_json_url(version_url, scraper)

            # Get the download urls, if there's more than 1 format of this version of the dataset
            # each forms a separate distribution
            distribution_formats = this_page["downloads"]
            for dl in distribution_formats:

                # Create an empty Distribution object to represent this distribution
                # from here we're just looking to fill in it's fields
                this_distribution = Distribution(scraper)
                this_distribution.issued = parse_as_local_date(issued)

                # I don't trust dicts with one constant field (they don't make sense), so just in case...
                try:
                    download_url = ONS_DOWNLOAD_PREFIX + this_page[
                        "uri"] + "/" + dl["file"].strip()
                    this_distribution.downloadURL = download_url
                # NOTE(review): bare except - any failure lands here and the
                # distribution is skipped (best-effort by design).
                except:
                    # Throw a warning and abandon this distribution; if we
                    # don't have a downloadURL it's not much use
                    logging.warning(
                        "Unable to create complete download url for {} on page {}"
                        .format(dl, version_url))
                    continue

                # we've had some issues with type-guessing so we're getting the media type
                # by checking the download url ending
                if download_url.endswith(".csdb"):
                    media_type = CSDB
                else:
                    media_type, _ = mimetypes.guess_type(download_url)
                this_distribution.mediaType = media_type

                # inherit metadata from the dataset where it hasn't explicitly been changed
                this_distribution.title = scraper.dataset.title
                this_distribution.description = scraper.dataset.description

                logging.debug("Created distribution for download '{}'.".format(
                    download_url))
                scraper.distributions.append(this_distribution)
def statistics_handler(scraper, tree):
    """
    Scraper for National Records of Scotland statistics pages.

    Populates scraper.dataset metadata from fixed positions on the page, then
    builds distributions via one of three strategies depending on page layout:
      1. Direct 'Excel'/'CSV' download links on the page itself.
      2. Links out to per-publication pages (followed and scraped in turn).
      3. Fallback: any link whose href contains '/files/statistics/'.

    :param scraper: the Scraper object; dataset and distributions are
                    populated in place.
    :param tree: lxml HTML tree of the landing page.
    """
    scraper.dataset.publisher = GOV['national-records-of-scotland']
    scraper.dataset.title = tree.xpath('//div[@property = "dc:title"]/h2/text()')[0].strip()
    # NOTE(review): description relies on a brittle positional xpath into the
    # page layout; breaks silently (IndexError) if the page structure changes.
    scraper.dataset.description = tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[2]/text()')[0].strip()
    contact = tree.xpath('//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p[10]/a')
    # If several contact links are present, the last one wins.
    for i in contact:
        scraper.dataset.contactPoint = i.attrib['href']
    # Strategy 1: direct 'Excel' / 'CSV' anchor-text links on this page.
    if tree.xpath(".//a[text()='Excel']") or tree.xpath(".//a[text()='CSV']"):
        nodes = tree.xpath(".//a[text()='Excel']") + tree.xpath(".//a[text()='CSV']")
        for node in nodes:
            file_type = node.text.lower()
            if file_type in ['excel', 'csv']:
                distribution = Distribution(scraper)
                # Prefer the <strong> label near the link as the title; fall
                # back to dataset title + link text when absent.
                try:
                    distribution.title = node.getparent().xpath('.//strong/text()')[0].strip()
                except:
                    distribution.title = scraper.dataset.title + ' ' + node.text
                distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
                # NOTE(review): `in` here is list membership, so this only
                # matches a text node equal to exactly 'Last update' — confirm
                # that is the intent (not a substring test).
                if 'Last update' in tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()'):
                    distribution.issued = parse(
                        tree.xpath('//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()')[0]).date()
                else:
                    # Fallback: any element whose text mentions 'Updated'.
                    try:
                        distribution.issued = parse(tree.xpath("//*[contains(text(),'Updated')]/text()")[0].lower().replace('last updated:', '')).date()
                    except:
                        logging.warning("No Last Issue Date Found. Please update manually")
                        pass
                # 'csv' gets an explicit media type; everything else (incl.
                # 'excel') is guessed from the download URL's extension.
                distribution.mediaType = {
                    'csv': 'text/csv',
                    'excel': mimetypes.guess_type(distribution.downloadURL)[0]
                }.get(
                    file_type,
                    mimetypes.guess_type(distribution.downloadURL)[0]
                )
                # Silently drop distributions with unrecognised media types.
                if distribution.mediaType in ACCEPTED_MIMETYPES:
                    scraper.distributions.append(distribution)
                else:
                    pass
    # Strategy 2: follow links to individual publication pages.
    elif tree.findall('.//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p/a'):
        for publication in tree.findall('.//*[@id="node-stats-home-page-3022"]/div[2]/div/div/p/a'):
            if publication.attrib['href'].startswith('/statistics-and-data/statistics/'):
                url = urljoin("https://www.nrscotland.gov.uk/", publication.attrib['href'])
                r = scraper.session.get(url)
                if r.status_code != 200:
                    raise Exception(
                        'Failed to get url {url}, with status code "{status_code}".'.format(url=url, status_code=r.status_code))
                pubTree = html.fromstring(r.text)
                # Same Excel/CSV link discovery as strategy 1, but on the
                # publication page.
                if pubTree.xpath(".//a[text()='Excel']") or pubTree.xpath(".//a[text()='CSV']"):
                    nodes = pubTree.xpath(".//a[text()='Excel']") + pubTree.xpath(".//a[text()='CSV']")
                    for node in nodes:
                        file_type = node.text.lower()
                        if file_type in ['excel', 'csv']:
                            distribution = Distribution(scraper)
                            distribution.title = scraper.dataset.title + ' ' + publication.text + ' ' + node.text
                            distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
                            # NOTE(review): unlike strategy 1, this indexes
                            # [0] before the membership test — raises
                            # IndexError if the xpath matches nothing.
                            if 'Last update' in pubTree.xpath(
                                    '//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p/strong/text()')[0]:
                                distribution.issued = parse(pubTree.xpath(
                                    '//*[@id="block-system-main"]/div/div/div/div[2]/div/div/p[1]/text()')[0]).date()
                            else:
                                # Fallback: date in parentheses after the
                                # publication link, e.g. "Title (12 May 2020)".
                                try:
                                    distribution.issued = parse(re.search('\(([^)]+)', publication.getparent().text_content()).group(1)).date()
                                except:
                                    pass
                            distribution.mediaType = {
                                'csv': 'text/csv',
                                'excel': 'application/vnd.ms-excel'
                            }.get(
                                file_type,
                                mimetypes.guess_type(distribution.downloadURL)[0]
                            )
                            if distribution.mediaType in ACCEPTED_MIMETYPES:
                                scraper.distributions.append(distribution)
                            else:
                                pass
                else:
                    pass
            else:
                pass
    # Strategy 3: fallback — any link into /files/statistics/.
    else:
        for dataset in tree.xpath(".//*[@href[contains(.,'/files/statistics/')]]"):
            distribution = Distribution(scraper)
            distribution.title = dataset.text
            # NOTE(review): href used verbatim here (no urljoin), so relative
            # hrefs would produce relative download URLs — confirm intended.
            distribution.downloadURL = dataset.attrib['href']
            distribution.mediaType, encoding = mimetypes.guess_type(distribution.downloadURL)
            logging.warning("No Last Issue Date Found. Please update manually")
            if distribution.mediaType in ACCEPTED_MIMETYPES:
                scraper.distributions.append(distribution)
            else:
                pass
def scrape(scraper, tree):
    """
    This is a scraper intended to use the ONS cmd (customise my data) functionality.

    Populates scraper.dataset from the dataset's JSON API document and creates
    one CSV Distribution per version of each edition.

    :param scraper: the Scraper object; dataset and distributions are
                    populated in place.
    :param tree:    lxml tree (unused — all data comes from the JSON API).
    """
    dataset_document = request_json_data(scraper, scraper.uri)

    scraper.dataset.title = dataset_document["id"]
    scraper.dataset.description = dataset_document["description"]

    # Need to get issued from the associated publication
    publication_document = request_json_data(
        scraper, dataset_document["publications"][0]["href"] + "/data")
    scraper.dataset.issued = parse(
        publication_document["description"]["releaseDate"])

    # Only take next release if it's a date. The field may be absent (KeyError)
    # or hold placeholder text such as "unknown" (parse raises ValueError /
    # TypeError / OverflowError) — both are fine and simply skipped, rather
    # than swallowing every exception with a bare except.
    try:
        scraper.dataset.updateDueOn = parse(dataset_document["next_release"])
    except (KeyError, TypeError, ValueError, OverflowError):
        pass  # it's fine, "unknown" etc

    # Theoretically you can have more than one contact, but I'm just taking the first
    scraper.dataset.contactPoint = "mailto:" + dataset_document["contacts"][0][
        "email"].strip()

    scraper.dataset.publisher = 'https://www.gov.uk/government/organisations/office-for-national-statistics'
    scraper.dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"

    edition_documents = request_json_data(scraper, scraper.uri + "/editions")
    for edition_document in edition_documents["items"]:
        edition_name = edition_document["edition"]

        version_documents = request_json_data(
            scraper, edition_document["links"]["versions"]["href"])
        for version_document in version_documents["items"]:
            version_name = str(version_document["version"])

            # One distribution per (edition, version) pair; CMD always
            # exposes a CSV download.
            this_distribution = Distribution(scraper)
            this_distribution.issued = version_document["release_date"]
            this_distribution.downloadURL = version_document["downloads"][
                "csv"]["href"]
            this_distribution.mediaType = CSV
            this_distribution.title = scraper.dataset.title + ", {}, version {}".format(
                edition_name, version_name)
            this_distribution.description = scraper.dataset.description
            this_distribution.contactPoint = scraper.dataset.contactPoint

            logging.debug("Created distribution for download '{}'.".format(
                this_distribution.downloadURL))
            scraper.distributions.append(this_distribution)
def scrape_dataset(scraper, dataset_uri: str, contact_point: str, identifier: str) -> (Dataset):
    """
    Populate a single dataset using a single dataset page.

    Example page: https://oifdata.defra.gov.uk/2-1-1/

    :param scraper:       the Scraper object (provides the HTTP session and base uri).
    :param dataset_uri:   url of the dataset page to scrape.
    :param contact_point: email address used to build the mailto: contactPoint.
    :param identifier:    dataset identifier used to build the csv download url.
    :return: the populated Dataset, or None if the page could not be fetched.
    """
    dataset = Dataset(scraper.uri)

    r: Response = scraper.session.get(dataset_uri)
    if not r.ok:
        # Bug fix: this message previously lacked the f prefix (and had
        # typos), so the url/status placeholders were never interpolated.
        logging.warning(
            f'Failed to get dataset {dataset_uri} with status code {r.status_code}')
        return None
    tree: HtmlElement = html.fromstring(r.text)

    title_element: HtmlElement = assert_get_one(tree.xpath('//h1'), 'title of dataset')
    dataset.title = title_element.text_content().strip()

    # To create the description, starting with the first <div> of the page content,
    # we want the text from all the paragraph <p> elements
    # between the first and second headings <h2> elements.
    page_content_elements: HtmlElement = assert_get_one(
        tree.xpath("//div[@id='page-content']/div"),
        'element containing bulk of page written content')
    heading_count = 0
    description_text = ""
    for element in page_content_elements:
        if element.tag.startswith("h"):
            heading_count += 1
        elif element.tag == "p":
            description_text += element.text_content() + "\n"
        if heading_count == 2:
            break
    dataset.description = description_text

    # Fall back to OGL v3 when no licence link can be found on the page
    # (narrowed from a bare except so real bugs like KeyboardInterrupt
    # are not swallowed).
    try:
        dataset.license = assert_get_one(
            tree.xpath("//div[@id='oglLicense']/a"), "licence in use").get("href")
    except Exception:
        dataset.license = "http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/"

    # we want the text from a table row <tr> that contains a table header <th> of text "Date last updated"
    issued_row_element = assert_get_one(
        tree.xpath("//tr/th[contains(text(),'Date last updated')]/parent::*"),
        'table row that contains header text of "Date last updated"')
    time_as_text = assert_get_one(
        issued_row_element.xpath('./td[1]'), 'Time from row text').text_content()
    dataset.issued = parse(time_as_text)

    dataset.contactPoint = "mailto:" + contact_point
    dataset.publisher = GOV["department-for-environment-food-rural-affairs"]

    # There's only one distribution of data and that's the source csv.
    distribution = Distribution(scraper)
    # Strip the leading numeric identifier from the title, e.g. "2.1.1 Foo" -> "Foo".
    distribution.title = " ".join(dataset.title.split(" ")[1:])
    distribution.downloadURL = urljoin(scraper.uri, f'/en/data/{identifier}.csv')
    distribution.issued = dataset.issued
    distribution.mediaType, _ = mimetypes.guess_type(distribution.downloadURL)
    dataset.distribution = [distribution]

    return dataset
def content_api_guidance(scraper, metadata):
    """
    Populate scraper.dataset and its distributions from a gov.uk content-api
    "guidance" document.

    :param scraper:  the Scraper object; dataset and distributions are
                     populated in place.
    :param metadata: dict parsed from the content api JSON response.
    """
    # The four simple one-to-one fields share an identical get/warn/set
    # pattern, so drive them from a table instead of repeating the code:
    # (metadata key, dataset attribute, label used in the warning message).
    simple_fields = [
        ("title", "title", "title"),
        ("description", "description", "description"),
        ("first_published_at", "issued", "issued date"),
        ("public_updated_at", "modified", "modified date"),
    ]
    for key, attr, label in simple_fields:
        value = metadata.get(key, None)
        if value is None:
            logging.warning(
                f'The {label} for dataset {scraper.url} not set, {label} field missing from content api'
            )
        else:
            setattr(scraper.dataset, attr, value)

    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        elif len(orgs) >= 1:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            scraper.dataset.publisher = orgs[0]["web_url"]

    for attachment in metadata['details']['attachments']:
        try:
            distro = Distribution(scraper)
            # .get() keeps the original behaviour: a missing title yields
            # None rather than skipping the attachment. (Previously the
            # result was also stored in an unused local.)
            distro.title = attachment.get('title')
            # A missing 'url' raises KeyError and skips this attachment.
            # (Previously a dead `dist_downloadURL = attachment.get('url')`
            # local shadowed this lookup; removed.)
            distro.downloadURL = attachment['url']
            distro.mediaType, _ = mimetypes.guess_type(distro.downloadURL)
            distro.issued = scraper.dataset.issued
            distro.modified = scraper.dataset.modified
            scraper.distributions.append(distro)
        except KeyError:
            logging.warning(
                f'Failed to extract attachment {json.dumps(attachment, indent=2)}'
            )