def eth_facts_service(scraper, tree):
    """Scrape a DfE ethnicity-facts-and-figures dataset page.

    Populates dataset-level metadata (publisher, title, contact point,
    issued/modified dates) from fixed element ids, then creates one
    Distribution per schema.org/DataDownload link on the page.

    :param scraper: project scraper object; dataset metadata and the
        distributions list are mutated in place.
    :param tree: lxml HTML tree of the dataset page.
    """
    scraper.dataset.publisher = GOV['department-for-education']
    scraper.dataset.title = tree.xpath('//*[@id="title"]/text()')[0].strip()
    scraper.dataset.contactPoint = tree.xpath(
        '//*[@id="footer"]/div/div[4]/a[2]/@href')
    scraper.dataset.issued = parse(
        tree.xpath('//*[@id="history"]/p[1]/span/text()')[0]).date()
    scraper.dataset.modified = parse(
        tree.xpath('//*[@id="history"]/p[2]/span/text()')[0]).date()

    for node in tree.xpath(
            "//*//*[@itemtype='http://schema.org/DataDownload']/a"):
        distribution = Distribution(scraper)
        distribution.title = node.attrib['data-event-label']
        distribution.downloadURL = urljoin(scraper.uri, node.attrib['href'])
        distribution.issued = scraper.dataset.issued
        distribution.modified = scraper.dataset.modified
        # BUG FIX: the original searched the stringified list of *all*
        # DataDownload link texts, so every distribution was assigned the
        # file type of the first link on the page. Extract the
        # parenthesised type (e.g. "(csv, 2kb)") from THIS node's text,
        # and tolerate a missing match instead of raising AttributeError.
        type_match = re.search(r'\(([^)]+)', str(node.xpath('text()')))
        file_type = type_match.group(1) if type_match else None
        distribution.mediaType = {
            'csv': CSV,
            'excel': Excel
        }.get(file_type, mimetypes.guess_type(distribution.downloadURL)[0])
        # Only keep distributions whose media type we know how to handle.
        if distribution.mediaType in ACCEPTED_MIMETYPES:
            scraper.distributions.append(distribution)
def scrape(scraper, tree):
    """Scrape a gov.wales statistics release listing page.

    It's not clear whether the pages are collections of datasets or
    datasets with distributions. Assume the latter for simplicity for now.

    :param scraper: project scraper object; dataset metadata and the
        distributions list are mutated in place.
    :param tree: lxml HTML tree of the listing page.
    """
    scraper.dataset.publisher = GOV['welsh-government']
    # OGLv3 license is quoted for the whole site on
    # https://gov.wales/copyright-statement
    scraper.dataset.rights = "https://gov.wales/copyright-statement"
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'
    scraper.dataset.title = tree.xpath('//h1//text()')[0].strip()
    scraper.dataset.description = tree.xpath(
        "//div[contains(concat(' ', @class, ' '), ' hero-block__summary ')]/div/p/text()"
    )[0].strip()
    meta = tree.xpath("//div[@class='header-meta']")[0]
    published = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' first-published ')]/" +
        "div[contains(concat(' ', @class, ' '), ' item ')]/text()")[0].strip()
    # Dates on the page are UK-style day-first, e.g. "01/02/2020".
    scraper.dataset.issued = parse(published, dayfirst=True)
    updated = meta.xpath(
        "div[contains(concat(' ', @class, ' '), ' last-updated ')]/" +
        "div[contains(concat(' ', @class, ' '), ' item ')]//time/@datetime")[0].strip()
    scraper.dataset.modified = isoparse(updated)

    @lru_cache()
    def fetch_page(url):
        # Several articles can link to the same release page; cache fetches.
        page = scraper.session.get(url)
        return html.fromstring(page.text)

    for article in tree.xpath("//div[@role='article']"):
        title_div = article.xpath("div[@class = 'index-list__title']")[0]
        release_page = fetch_page(title_div.xpath('a/@href')[0])
        for details in release_page.xpath(
                "//div[@id = 'release--data']//div[@class = 'document__details']"
        ):
            distribution = Distribution(scraper)
            distribution.downloadURL = details.xpath("h3/a/@href")[0]
            distribution.title = details.xpath("h3/a/div/text()")[0].strip()
            # BUG FIX: the original used absolute '//' XPaths here, which
            # search the whole release page rather than the current
            # 'details' element, so every distribution picked up the first
            # document's dates. './/' restricts the search to this element.
            distribution.issued = isoparse(
                details.xpath(
                    ".//div[contains(concat(' ', @class, ' '), ' meta__released ')]//time/@datetime"
                )[0])
            distribution.modified = isoparse(
                details.xpath(
                    ".//div[contains(concat(' ', @class, ' '), ' meta__update_history ')]//time/@datetime"
                )[0])
            # e.g. "ODS, 123.4 KB" — parsed for media type and size.
            dist_meta = details.xpath("h3/a/span/text()")[0].strip()
            meta_match = FILE_TYPE_AND_SIZE_RE.match(dist_meta)
            if meta_match:
                distribution.mediaType = {'ODS': ODS}.get(meta_match.group(1))
                size_qualifier = meta_match.group(3)
                size = float(meta_match.group(2))
                if size_qualifier == "KB":
                    # The site appears to use "KB" for kibibytes and
                    # "kB" for kilobytes — preserved as found.
                    distribution.byteSize = int(size * 1024)
                elif size_qualifier == "kB":
                    distribution.byteSize = int(size * 1000)
            else:
                # Fall back to guessing the media type from the URL.
                distribution.mediaType, _ = mimetypes.guess_type(
                    distribution.downloadURL)
            scraper.distributions.append(distribution)
def scrape(scraper, tree):
    """Scrape a DCNI publications site-search results page.

    Each search result links to a publication page; every ODS/XLSX file
    linked from such a page becomes one Distribution. The dataset's
    issued date is the latest issued date across all distributions.

    :param scraper: project scraper object; dataset metadata and the
        distributions list are mutated in place.
    :param tree: lxml HTML tree of the search-results page.
    :raises Exception: if the URI is not a site-search URL, or a linked
        file has an unexpected extension.
    """
    # A quick safety in case people are using this scraper incorrectly
    if "?search=" not in scraper.uri:
        raise Exception(
            """Aborting. This scraper is intended to run off the DCNI search page.
            Please modify your url to use the site search.

            If in doubt, work from this page, change the quoted search text and capture the url
            https://www.communities-ni.gov.uk/publications/topic/8182?search=%22Northern+Ireland+Housing+Bulletin%22&Search-exposed-form=Go&sort_by=field_published_date
            """)

    scraper.dataset.publisher = GOV[
        'department-for-communities-northern-ireland']
    # BUG FIX: the license URL previously contained a mangled line
    # continuation ('..."open-" \ "government-...') baked into the
    # literal, producing a corrupt URL.
    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    # We're taking each search result as a distribution. Result hrefs are
    # site-relative, so prepend the site root from the original url.
    site_root = scraper.uri.split("/publications/topic")[0]
    search_result_urls = [
        site_root + link.get("href") for link in tree.xpath("//h3/a")
    ]

    # Keep track of dates issued so we can find the latest.
    last_issued = None

    for url in search_result_urls:
        # Get the distribution page
        page = scraper.session.get(url)
        distro_tree = html.fromstring(page.text)

        # Get any spreadsheets (ods or excel) linked on the page
        spreadsheet_files = [
            x for x in distro_tree.xpath('//a/@href')
            if x.lower().endswith((".ods", ".xlsx"))
        ]

        for spreadsheet_file in spreadsheet_files:
            this_distribution = Distribution(scraper)

            # The link's own text is the distribution title.
            this_distribution.title = distro_tree.xpath(
                "//a[@href='{}']/text()".format(spreadsheet_file))[0]
            this_distribution.downloadURL = spreadsheet_file

            lowered = this_distribution.downloadURL.lower()
            if lowered.endswith(".xlsx"):
                media_type = Excel
            elif lowered.endswith(".ods"):
                media_type = ODS
            else:
                raise Exception(
                    "Aborting. Unexpected media type for url: '{}'".format(
                        this_distribution.downloadURL))
            this_distribution.mediaType = media_type

            # Published and modified time come from the page's OpenGraph
            # article metadata.
            this_distribution.issued = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:published_time']/@content")
                [0]).date()
            this_distribution.modified = isoparse(
                distro_tree.xpath(
                    "//*[@property='article:modified_time']/@content")
                [0]).date()
            this_distribution.description = distro_tree.xpath(
                "//*[@class='field-summary']/p/text()")[0]

            if last_issued is None or this_distribution.issued > last_issued:
                last_issued = this_distribution.issued

            scraper.distributions.append(this_distribution)

    # Whatever date the latest distribution was issued, is the last
    # issued date for this "dataset".
    scraper.dataset.issued = last_issued
def content_api_guidance(scraper, metadata):
    """Populate dataset and distributions from GOV.UK content-api metadata.

    Copies title/description/issued/modified from the content-api JSON
    (warning on each missing field), takes the first listed organisation
    as publisher, and creates one Distribution per attachment.

    :param scraper: project scraper object; dataset metadata and the
        distributions list are mutated in place.
    :param metadata: parsed JSON document from the GOV.UK content api.
    """
    # Each entry: (content-api field, human label for the warning,
    # scraper.dataset attribute to set). The warning text matches the
    # previous hand-written messages exactly.
    for api_field, label, attr in (
            ("title", "title", "title"),
            ("description", "description", "description"),
            ("first_published_at", "issued date", "issued"),
            ("public_updated_at", "modified date", "modified")):
        value = metadata.get(api_field, None)
        if value is None:
            # NOTE(review): other scrapers in this file use scraper.uri —
            # confirm scraper.url is the intended attribute here.
            logging.warning(
                f'The {label} for dataset {scraper.url} not set, {label} field missing from content api'
            )
        else:
            setattr(scraper.dataset, attr, value)

    scraper.dataset.license = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/'

    if 'links' in metadata and 'organisations' in metadata['links']:
        orgs = metadata['links']['organisations']
        if len(orgs) == 0:
            logging.warning("No publishing organisations listed.")
        else:
            if len(orgs) > 1:
                logging.warning(
                    'More than one organisation listed, taking the first.')
            scraper.dataset.publisher = orgs[0]["web_url"]

    for attachment in metadata['details']['attachments']:
        try:
            distro = Distribution(scraper)
            distro.title = attachment.get('title')
            # A missing 'url' raises KeyError, handled below.
            distro.downloadURL = attachment['url']
            distro.mediaType, _ = mimetypes.guess_type(distro.downloadURL)
            distro.issued = scraper.dataset.issued
            distro.modified = scraper.dataset.modified
            scraper.distributions.append(distro)
        except KeyError:
            logging.warning(
                f'Failed to extract attachment {json.dumps(attachment, indent=2)}'
            )